In [1]:
import imblearn
from imblearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### Datasets

In [2]:
random_seed = 1886

In [3]:
# used dataset from imblearn
datasets = imblearn.datasets.fetch_datasets()['wine_quality']

In [4]:
# Convert the dictionary to a pandas dataframe
df = pd.concat([pd.DataFrame(datasets['data'], columns = [f'data_{i}' for i in range(datasets.data.shape[1])]),
                pd.DataFrame(datasets['target'], columns = ['target'])], axis = 1)

In [5]:
# preprocessing for target value
print('before preprocess: ', df['target'].unique())
df['target'] = df['target'].replace(-1, 0)
print(' after preprocess: ', df['target'].unique())

before preprocess:  [-1  1]
 after preprocess:  [0 1]


In [6]:
# define IR
num_major = df['target'].value_counts().sort_values()[0]
num_minor = df['target'].value_counts().sort_values()[1]
IR = int(num_major/num_minor)

print('number of minor class: ', num_minor)
print('number of major class: ', num_major)
print('  IR(Imbalance Ratio): {0:.0f}'.format(IR))

number of minor class:  183
number of major class:  4715
  IR(Imbalance Ratio): 25


In [7]:
# Split data into train test set
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis = 1), 
                                                    df['target'], 
                                                    test_size = 0.2, 
                                                    random_state = random_seed)

In [8]:
X_train.shape, X_test.shape

((3918, 11), (980, 11))

### Define functions

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold, GridSearchCV

In [10]:
# calculate IDCP
def cal_IDCP(y, pred):
    f1 = f1_score(y,pred)
    roc_score = roc_auc_score(y, pred, average='macro')
    IDCP = np.round((2*f1+roc_score)/3, 4)
    return IDCP

In [11]:
# get predicted values from selected model
def model_eval(model, X_train, y_train, weights):
    weights_y = pd.DataFrame(compute_sample_weight(weights, y_train), columns = ['weight'])

    if model == 'LR':
        base_model = LogisticRegression(random_state=random_seed, class_weight=weights)
        base_model.fit(X_train, y_train)
    elif model == 'SVM':
        base_model = SVC(random_state=random_seed, class_weight=weights)
        base_model.fit(X_train, y_train)
    elif model == 'RF':
        base_model = RandomForestClassifier(n_estimators=500, max_depth=8, max_features=5, 
                                            random_state=random_seed, class_weight=weights)
        base_model.fit(X_train, y_train)
    elif model == 'XGB':
        # gridsearch
        XGB = XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)
        param_grid={'n_estimators' : [50,100],
                    'learning_rate' : [0.01,0.05,0.1],
                    'max_depth' : [3,5,7]}
        cv=KFold(n_splits=3)
        Grid_XGB=GridSearchCV(XGB, param_grid=param_grid, cv=cv, scoring=make_scorer(cal_IDCP), n_jobs=4)
        Grid_XGB.fit(X_train, y_train)
        base_model = Grid_XGB.best_estimator_
        base_model.fit(X_train, y_train, sample_weight=weights_y)
    elif model == 'CB':
        # gridsearch
        CBC = CatBoostClassifier()
        param_grid={'depth'         : [4,5,6],
                    'learning_rate' : [0.01,0.05,0.1,0.2,0.3],
                    'iterations'    : [30,40,50,60,70,80]}
        Grid_CBC = GridSearchCV(estimator=CBC, param_grid = param_grid, cv = 2, scoring=make_scorer(cal_IDCP), n_jobs=-1)
        Grid_CBC.fit(X_train, y_train, verbose=False)
        base_model = Grid_CBC.best_estimator_
        base_model.fit(X_train, y_train, sample_weight=weights_y, verbose=False)

    pred = base_model.predict(X_train)

    return base_model, pred

### Select model

In [12]:
# you can choose among 'LR', 'SVM', 'RF', 'XGB', 'CB'

model = 'XGB'

### Evaluate model

In [13]:
weights = {0:1.0, 1:1.0} # initial weight
base_model, pred = model_eval(model, X_train, y_train, weights)
recall = recall_score(y_train, pred)
print('Recall: {0:.2f}'.format(recall))

Recall: 0.39


### Optimal weight

In [14]:
weight_list = []
IDCP_list = []
patience_check = 0 # Record the number of epochs not improving consecutively  
patience_limit = 3 # Decide the number of epochs to allow in early stopping

if recall >= 0.5:
    potential_range = range(1, IR+1)
else:
    potential_range = range(IR, 0, -1)

for i, w in enumerate(potential_range):
    weights = {0:1.0, 1:w}
    weights_y = pd.DataFrame(compute_sample_weight(weights, y_train), columns = ['weight'])

    base_model, pred = model_eval(model, X_train, y_train, weights)
    IDCP = cal_IDCP(y_train, pred)
    print('w: ', w, ', IDCP: ', IDCP)
    weight_list.append(w)
    IDCP_list.append(IDCP)

    # early stopping
    if IDCP_list[i-1] > IDCP:
        patience_check += 1
    if patience_check >= patience_limit:
        break
optimal_w = weight_list[np.argmax(IDCP_list)]
print('---'*10)
print('optimal w: ', optimal_w)

w:  25 , IDCP:  0.6711
w:  24 , IDCP:  0.68
w:  23 , IDCP:  0.6817
w:  22 , IDCP:  0.6813
w:  21 , IDCP:  0.6854
w:  20 , IDCP:  0.6783
w:  19 , IDCP:  0.7006
w:  18 , IDCP:  0.686
------------------------------
optimal w:  19
