In [1]:
# !pip install xgboost
# !pip install lightgbm
# !pip install catboost

In [2]:
import pandas as pd
import numpy as np
import pickle
import random
from pathlib import Path

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, learning_curve
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb, lightgbm as lgbm, catboost as catb

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_test_true, y_test_pred))

In [5]:
def evaluate_preds(model, X_train, X_test, y_train, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

In [6]:
def balance_df_by_target(df, target_name, method='over'):

    assert method in ['over', 'under', 'tomek', 'smote'], 'Неверный метод сэмплирования'
    
    target_counts = df[target_name].value_counts()

    major_class_name = target_counts.argmax()
    minor_class_name = target_counts.argmin()

    disbalance_coeff = int(target_counts[major_class_name] / target_counts[minor_class_name]) - 1
    if method == 'over':
        for i in range(disbalance_coeff):
            sample = df[df[target_name] == minor_class_name].sample(target_counts[minor_class_name])
            df = df.append(sample, ignore_index=True)
            
    elif method == 'under':
        df_ = df.copy()
        df = df_[df_[target_name] == minor_class_name]
        tmp = df_[df_[target_name] == major_class_name]
        df = df.append(tmp.iloc[
            np.random.randint(0, tmp.shape[0], target_counts[minor_class_name])
        ], ignore_index=True)

    elif method == 'tomek':
        from imblearn.under_sampling import TomekLinks
        tl = TomekLinks()
        X_tomek, y_tomek = tl.fit_sample(df.drop(columns=target_name), df[target_name])
        df = pd.concat([X_tomek, y_tomek], axis=1)
    
    elif method == 'smote':
        from imblearn.over_sampling import SMOTE
        smote = SMOTE()
        X_smote, y_smote = smote.fit_sample(df.drop(columns=target_name), df[target_name])
        df = pd.concat([X_smote, y_smote], axis=1)

    return df.sample(frac=1) 

## Paths to directories and files 

In [7]:
from pathlib import Path

DATA_ROOT = Path('./data/training_project/')
MODELS_PATH = Path('./models/')

# input
DATASET_PATH = DATA_ROOT / 'train.csv'
TEST_DATASET_PATH = DATA_ROOT / 'test.csv'

PREP_DATASET_PATH = DATA_ROOT / 'training_project_data_prep.csv'
PREP_TEST_DATASET_PATH = DATA_ROOT / 'final_project_data_prep.csv'


# output
TRAIN_FULL_PATH = DATA_ROOT / 'training_project_train_full.csv'
FINAL_FULL_PATH = DATA_ROOT / 'final_norm_full.csv'
TRAIN_PART_PATH = DATA_ROOT / 'training_project_train_part_b.csv'
TEST_PART_PATH = DATA_ROOT / 'training_project_test_part.csv'

SCALER_FILE_PATH = MODELS_PATH / 'scaler.pkl'


In [8]:
df_base = pd.read_csv(DATASET_PATH)
df = pd.read_csv(PREP_DATASET_PATH)
df_final = pd.read_csv(PREP_TEST_DATASET_PATH)


df.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Bankruptcies,Purpose,...,Credit Score,Credit Default,Home Ownership_1,Home Ownership_2,Home Ownership_3,Purpose_0,Purpose_1,Purpose_2,Term_1,Term_2
0,3,482087.0,10,0,11.0,26.3,685960.0,1.0,1,0,...,749.0,0,0,0,1,1,0,0,0,1
1,3,1025487.0,10,0,15.0,15.3,1181730.0,0.0,0,0,...,737.0,1,0,0,1,1,0,0,1,0
2,1,751412.0,8,0,11.0,35.0,1182434.0,0.0,0,0,...,742.0,0,1,0,0,1,0,0,0,1
3,3,805068.0,6,0,8.0,22.5,147400.0,1.0,1,0,...,694.0,0,0,0,1,1,0,0,0,1
4,2,776264.0,8,0,13.0,13.6,385836.0,1.0,0,0,...,719.0,0,0,1,0,1,0,0,0,1


## Selection of the target variable and feature groups

In [9]:
TARGET_NAME = 'Credit Default'
BASE_FEATURE_NAMES = df_base.columns.drop([TARGET_NAME,'Months since last delinquent']).tolist()
NEW_FEATURE_NAMES = df.columns.drop([TARGET_NAME] + BASE_FEATURE_NAMES).tolist()

In [10]:
df[TARGET_NAME].value_counts(normalize=True)

0    0.718267
1    0.281733
Name: Credit Default, dtype: float64

## Отбор признаков

In [11]:
NUM_FEATURE_NAMES = ['Annual Income','Number of Open Accounts','Years of Credit History',\
       'Maximum Open Credit','Number of Credit Problems','Current Loan Amount','Years in current job',\
                     'Current Credit Balance', 'Monthly Debt','Credit Score']

CAT_FEATURE_NAMES = ['Home Ownership','Tax Liens','Bankruptcies', 'Purpose','Term']
CAT_FEATURE_NAMES_U = ['Bankruptcies']

SELECTED_FEATURE_NAMES = NUM_FEATURE_NAMES + CAT_FEATURE_NAMES_U + NEW_FEATURE_NAMES

## Data scaling

In [12]:
scaler = StandardScaler()

df_norm = df.copy()
df_norm[NUM_FEATURE_NAMES] = scaler.fit_transform(df_norm[NUM_FEATURE_NAMES])

df_final_norm=df_final.copy()
df_final_norm[NUM_FEATURE_NAMES] = scaler.transform(df_final_norm[NUM_FEATURE_NAMES])

## Split into train and test

In [13]:
X = df_norm[SELECTED_FEATURE_NAMES]
y = df_norm[TARGET_NAME]

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    shuffle=True,
                                                    test_size=0.3,
                                                    random_state=21,
                                                    stratify=y)

display(y_train.value_counts(normalize=True), y_test.value_counts(normalize=True))

0    0.718286
1    0.281714
Name: Credit Default, dtype: float64

0    0.718222
1    0.281778
Name: Credit Default, dtype: float64

## Saving training and test datasets

In [14]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [15]:
df_norm.to_csv(TRAIN_FULL_PATH, index=False, encoding='utf-8')
df_final_norm.to_csv(FINAL_FULL_PATH, index=False, encoding='utf-8')

train.to_csv(TRAIN_PART_PATH, index=False, encoding='utf-8')
test.to_csv(TEST_PART_PATH, index=False, encoding='utf-8')

## Building and evaluating base models

### Logistic regression

In [16]:
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)

evaluate_preds(model_lr, X_train, X_test, y_train, y_test)

TRAIN

              precision    recall  f1-score   support

           0       0.77      0.98      0.86      3771
           1       0.83      0.27      0.41      1479

    accuracy                           0.78      5250
   macro avg       0.80      0.62      0.64      5250
weighted avg       0.79      0.78      0.74      5250

TEST

              precision    recall  f1-score   support

           0       0.77      0.98      0.86      1616
           1       0.79      0.24      0.37       634

    accuracy                           0.77      2250
   macro avg       0.78      0.61      0.61      2250
weighted avg       0.77      0.77      0.72      2250

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1576   40
1                483  151


### KNN

In [17]:
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)

evaluate_preds(model_knn, X_train, X_test, y_train, y_test)

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f9a19fec310>
Traceback (most recent call last):
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_m

TRAIN

              precision    recall  f1-score   support

           0       0.82      0.94      0.88      3771
           1       0.75      0.48      0.59      1479

    accuracy                           0.81      5250
   macro avg       0.78      0.71      0.73      5250
weighted avg       0.80      0.81      0.79      5250

TEST

              precision    recall  f1-score   support

           0       0.77      0.90      0.83      1616
           1       0.56      0.33      0.42       634

    accuracy                           0.74      2250
   macro avg       0.66      0.61      0.62      2250
weighted avg       0.71      0.74      0.71      2250

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1448  168
1                424  210


### decision tree

In [18]:
model_tree = DecisionTreeClassifier(random_state=21,
                                    class_weight={0:1, 1:3.6},
                                    max_depth=4
                                    )
model_tree.fit(X_train, y_train)

evaluate_preds(model_tree, X_train, X_test, y_train, y_test)

TRAIN

              precision    recall  f1-score   support

           0       0.92      0.35      0.51      3771
           1       0.36      0.92      0.52      1479

    accuracy                           0.51      5250
   macro avg       0.64      0.64      0.51      5250
weighted avg       0.76      0.51      0.51      5250

TEST

              precision    recall  f1-score   support

           0       0.91      0.35      0.51      1616
           1       0.36      0.91      0.51       634

    accuracy                           0.51      2250
   macro avg       0.63      0.63      0.51      2250
weighted avg       0.75      0.51      0.51      2250

CONFUSION MATRIX

col_0             0     1
Credit Default           
0               570  1046
1                58   576


### Boosted Algorithms

## XGBoost

In [19]:
%%time
model_xgb = xgb.XGBClassifier(random_state=21,
                              scale_pos_weight=3.6,
#                               n_estimators=100
                             )
model_xgb.fit(X_train, y_train)

evaluate_preds(model_xgb, X_train, X_test, y_train, y_test)

TRAIN

              precision    recall  f1-score   support

           0       1.00      0.96      0.98      3771
           1       0.91      1.00      0.95      1479

    accuracy                           0.97      5250
   macro avg       0.95      0.98      0.96      5250
weighted avg       0.97      0.97      0.97      5250

TEST

              precision    recall  f1-score   support

           0       0.81      0.77      0.79      1616
           1       0.48      0.54      0.50       634

    accuracy                           0.70      2250
   macro avg       0.64      0.65      0.65      2250
weighted avg       0.71      0.70      0.71      2250

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1242  374
1                294  340
CPU times: user 4.65 s, sys: 279 ms, total: 4.92 s
Wall time: 962 ms


## LightGBM

In [20]:
%%time
model_lgbm = lgbm.LGBMClassifier(random_state=21, 
                                 class_weight={0:1, 1:3.6},
#                                  n_estimators=100
                                )
model_lgbm.fit(X_train, y_train)

evaluate_preds(model_lgbm, X_train, X_test, y_train, y_test)


TRAIN

              precision    recall  f1-score   support

           0       0.99      0.86      0.92      3771
           1       0.73      0.98      0.83      1479

    accuracy                           0.89      5250
   macro avg       0.86      0.92      0.88      5250
weighted avg       0.92      0.89      0.89      5250

TEST

              precision    recall  f1-score   support

           0       0.82      0.70      0.76      1616
           1       0.45      0.61      0.52       634

    accuracy                           0.68      2250
   macro avg       0.63      0.66      0.64      2250
weighted avg       0.72      0.68      0.69      2250

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1135  481
1                245  389
CPU times: user 784 ms, sys: 913 ms, total: 1.7 s
Wall time: 238 ms


## CatBoost

In [21]:
%%time
model_catb = catb.CatBoostClassifier(silent=True, random_state=21)
model_catb.fit(X_train, y_train)

evaluate_preds(model_catb, X_train, X_test, y_train, y_test)

TRAIN

              precision    recall  f1-score   support

           0       0.86      1.00      0.92      3771
           1       0.98      0.58      0.73      1479

    accuracy                           0.88      5250
   macro avg       0.92      0.79      0.82      5250
weighted avg       0.89      0.88      0.87      5250

TEST

              precision    recall  f1-score   support

           0       0.78      0.95      0.86      1616
           1       0.72      0.33      0.45       634

    accuracy                           0.77      2250
   macro avg       0.75      0.64      0.65      2250
weighted avg       0.77      0.77      0.74      2250

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1535   81
1                426  208
CPU times: user 11.4 s, sys: 2.14 s, total: 13.6 s
Wall time: 2.34 s


## Preparing for catboost

In [22]:
df.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Bankruptcies,Purpose,...,Credit Score,Credit Default,Home Ownership_1,Home Ownership_2,Home Ownership_3,Purpose_0,Purpose_1,Purpose_2,Term_1,Term_2
0,3,482087.0,10,0,11.0,26.3,685960.0,1.0,1,0,...,749.0,0,0,0,1,1,0,0,0,1
1,3,1025487.0,10,0,15.0,15.3,1181730.0,0.0,0,0,...,737.0,1,0,0,1,1,0,0,1,0
2,1,751412.0,8,0,11.0,35.0,1182434.0,0.0,0,0,...,742.0,0,1,0,0,1,0,0,0,1
3,3,805068.0,6,0,8.0,22.5,147400.0,1.0,1,0,...,694.0,0,0,0,1,1,0,0,0,1
4,2,776264.0,8,0,13.0,13.6,385836.0,1.0,0,0,...,719.0,0,0,1,0,1,0,0,0,1


In [23]:
BASE_FEATURE_NAMES

['Home Ownership',
 'Annual Income',
 'Years in current job',
 'Tax Liens',
 'Number of Open Accounts',
 'Years of Credit History',
 'Maximum Open Credit',
 'Number of Credit Problems',
 'Bankruptcies',
 'Purpose',
 'Term',
 'Current Loan Amount',
 'Current Credit Balance',
 'Monthly Debt',
 'Credit Score']

In [24]:
X = df[BASE_FEATURE_NAMES]
y = df[TARGET_NAME]

CAT_FEATURE_NAMES = ['Home Ownership','Term','Purpose','Bankruptcies','Tax Liens']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    shuffle=True,
                                                    test_size=0.3,
                                                    random_state=21,
                                                    stratify=y)

In [25]:
model_catb = catb.CatBoostClassifier(silent=True, random_state=21,
                                     cat_features=CAT_FEATURE_NAMES,
#                                      one_hot_max_size=10
                                     )
model_catb.fit(X_train, y_train)

evaluate_preds(model_catb, X_train, X_test, y_train, y_test)

TRAIN

              precision    recall  f1-score   support

           0       0.84      0.99      0.91      3771
           1       0.97      0.51      0.67      1479

    accuracy                           0.86      5250
   macro avg       0.90      0.75      0.79      5250
weighted avg       0.88      0.86      0.84      5250

TEST

              precision    recall  f1-score   support

           0       0.78      0.95      0.86      1616
           1       0.72      0.32      0.44       634

    accuracy                           0.77      2250
   macro avg       0.75      0.64      0.65      2250
weighted avg       0.77      0.77      0.74      2250

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1539   77
1                431  203


In [26]:
disbalance = y_train.value_counts()[0] / y_train.value_counts()[1]
disbalance

2.5496957403651117

In [27]:
model_catb = catb.CatBoostClassifier(silent=True, random_state=21,
                                     cat_features=CAT_FEATURE_NAMES,
                                     class_weights=[1, disbalance]
                                     )
model_catb.fit(X_train, y_train)

evaluate_preds(model_catb, X_train, X_test, y_train, y_test)

TRAIN

              precision    recall  f1-score   support

           0       0.94      0.86      0.90      3771
           1       0.71      0.87      0.78      1479

    accuracy                           0.86      5250
   macro avg       0.83      0.87      0.84      5250
weighted avg       0.88      0.86      0.87      5250

TEST

              precision    recall  f1-score   support

           0       0.82      0.76      0.79      1616
           1       0.48      0.57      0.52       634

    accuracy                           0.71      2250
   macro avg       0.65      0.66      0.66      2250
weighted avg       0.72      0.71      0.71      2250

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1233  383
1                275  359


## catboost hyperparameters

In [28]:
model_catb = catb.CatBoostClassifier(silent=True, random_state=21,
                                     class_weights=[1, disbalance],
                                     eval_metric='F1',
                                     cat_features=CAT_FEATURE_NAMES,
                                     early_stopping_rounds=20,
                                     use_best_model=True,
                                     custom_metric=['Precision', 'Recall']
                                    )
model_catb.fit(X_train, y_train, plot=True, eval_set=(X_test, y_test))


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f9a70ed3ee0>

In [29]:
model_catb.best_score_

{'learn': {'Recall:use_weights=false': 0.6862745098039216,
  'Logloss': 0.5104432233388413,
  'F1': 0.7244121711901876,
  'Precision:use_weights=false': 0.5649749023982152,
  'Precision:use_weights=true': 0.7680533155417033,
  'Recall:use_weights=true': 0.6862745098039216},
 'validation': {'Recall:use_weights=false': 0.5946372239747634,
  'Logloss': 0.5513387609343764,
  'F1': 0.648663187275781,
  'Precision:use_weights=false': 0.5152487961476726,
  'Precision:use_weights=true': 0.7304659983504388,
  'Recall:use_weights=true': 0.5946372239747634}}

In [30]:
evaluate_preds(model_catb, X_train, X_test, y_train, y_test)

TRAIN

              precision    recall  f1-score   support

           0       0.85      0.79      0.82      3771
           1       0.55      0.66      0.60      1479

    accuracy                           0.75      5250
   macro avg       0.70      0.72      0.71      5250
weighted avg       0.77      0.75      0.76      5250

TEST

              precision    recall  f1-score   support

           0       0.83      0.76      0.79      1616
           1       0.49      0.59      0.54       634

    accuracy                           0.71      2250
   macro avg       0.66      0.68      0.67      2250
weighted avg       0.73      0.71      0.72      2250

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1230  386
1                257  377


## Selecting the best model and selecting hyperparameters

In [31]:
frozen_params = {
     'class_weights':[1, disbalance], 
     'silent':True,
     'random_state':21,
     'cat_features':CAT_FEATURE_NAMES,
     'eval_metric':'F1',
     'early_stopping_rounds':20
}
model_catb = catb.CatBoostClassifier(**frozen_params)

### Selection of hyperparameters

In [32]:
params = {'iterations':[50, 200, 500, 700, 1500],
          'max_depth':[3, 5, 7]}

In [33]:
cv = StratifiedKFold(n_splits=3, random_state=21, shuffle=True)

In [34]:
grid_search = model_catb.grid_search(params, X_train, y_train, cv=cv, stratified=True, plot=True, refit=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.5890131689
bestIteration = 1

0:	loss: 0.5890132	best: 0.5890132 (0)	total: 78.4ms	remaining: 1.1s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6298233569
bestIteration = 117

1:	loss: 0.6298234	best: 0.6298234 (1)	total: 527ms	remaining: 3.43s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6298233569
bestIteration = 117

2:	loss: 0.6298234	best: 0.6298234 (1)	total: 959ms	remaining: 3.84s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6298233569
bestIteration = 117

3:	loss: 0.6298234	best: 0.6298234 (1)	total: 1.57s	remaining: 4.32s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6298233569
bestIteration = 117

4:	loss: 0.6298234	best: 0.6298234 (1)	total: 1.99s	remaining: 3.99s

bestTest = 0.6027448256
bestIteration = 49

5:	loss: 0.6027448	best: 0.6298234 (1)	total: 2.13s	remaining: 3.2s
Stopped by overfitting detector  (20 iterati

In [35]:
grid_search

{'params': {'depth': 3, 'iterations': 200},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21],
              'test-F1-mean': [0.6757758387871485,
               0.6695852700629809,
               0.5913193954949801,
               0.599984297446193,
               0.5934331108342658,
               0.5972290834056538,
               0.6001621861240095,
               0.6000654621009858,
               0.6012281454956164,
               0.6021592510630467,
               0.6005039004530639,
               0.6005866025369607,
               0.601426020894217,
               0.60206665141147

In [36]:
pd.DataFrame(grid_search['cv_results']).sort_values('test-F1-mean', ascending=False).head()

Unnamed: 0,iterations,test-F1-mean,test-F1-std,train-F1-mean,train-F1-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.675776,0.011446,0.675396,0.004504,0.679332,0.000852,0.679278,0.000291
1,1,0.669585,0.016643,0.670446,0.001349,0.669713,0.002962,0.669607,0.001397
15,15,0.603,0.021586,0.603829,0.01101,0.597002,0.002005,0.595826,0.003266
21,21,0.602904,0.021707,0.603459,0.011219,0.589492,0.003011,0.587644,0.002483
19,19,0.602713,0.02165,0.604016,0.011573,0.591332,0.002215,0.589808,0.00312


### Training and evaluation of the final model

In [37]:
%%time

final_model = catb.CatBoostClassifier(**frozen_params, iterations=200, max_depth=7)
final_model.fit(X_train, y_train, plot=True, eval_set=(X_test, y_test))

evaluate_preds(final_model, X_train, X_test, y_train, y_test)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

TRAIN

              precision    recall  f1-score   support

           0       0.85      0.78      0.82      3771
           1       0.54      0.66      0.60      1479

    accuracy                           0.75      5250
   macro avg       0.70      0.72      0.71      5250
weighted avg       0.77      0.75      0.75      5250

TEST

              precision    recall  f1-score   support

           0       0.82      0.75      0.78      1616
           1       0.48      0.59      0.53       634

    accuracy                           0.70      2250
   macro avg       0.65      0.67      0.66      2250
weighted avg       0.73      0.70      0.71      2250

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1211  405
1                261  373
CPU times: user 1.62 s, sys: 165 ms, total: 1.78 s
Wall time: 357 ms


## Target variable balancing

In [38]:
import inspect
print(inspect.getsource(balance_df_by_target))

def balance_df_by_target(df, target_name, method='over'):

    assert method in ['over', 'under', 'tomek', 'smote'], 'Неверный метод сэмплирования'
    
    target_counts = df[target_name].value_counts()

    major_class_name = target_counts.argmax()
    minor_class_name = target_counts.argmin()

    disbalance_coeff = int(target_counts[major_class_name] / target_counts[minor_class_name]) - 1
    if method == 'over':
        for i in range(disbalance_coeff):
            sample = df[df[target_name] == minor_class_name].sample(target_counts[minor_class_name])
            df = df.append(sample, ignore_index=True)
            
    elif method == 'under':
        df_ = df.copy()
        df = df_[df_[target_name] == minor_class_name]
        tmp = df_[df_[target_name] == major_class_name]
        df = df.append(tmp.iloc[
            np.random.randint(0, tmp.shape[0], target_counts[minor_class_name])
        ], ignore_index=True)

    elif method == 'tomek':
        from imblearn.under_sampli

In [39]:
def init_models():
    model_lr = LogisticRegression(random_state=21)
    model_knn = KNeighborsClassifier()
    model_tree = DecisionTreeClassifier(random_state=21)
    model_xgb = xgb.XGBClassifier(random_state=21)
    model_lgbm = lgbm.LGBMClassifier(random_state=21)
    model_catb = catb.CatBoostClassifier(silent=True, random_state=21)

    models = {
        'lr': model_lr,
        'knn': model_knn,
        'tree': model_tree,
        'xgb': model_xgb,
        'lgbm': model_lgbm,
        'cat': model_catb
    }
    return models

In [40]:
def get_metrics(report):
    f1_macro = report['macro avg']['f1-score']
    f1_0 = report['0']['f1-score']
    f1_1 = report['1']['f1-score']
    return f1_macro, f1_0, f1_1

In [41]:
def run_experiment(X_train, X_test, y_train, y_test, method='not'):
    assert method in ['not', 'over', 'under', 'tomek', 'smote'], 'Неправильный метод сэмплирования'
    
    models = init_models()
    
    stata = pd.DataFrame()
    for name, model in models.items():
        model.fit(X_train, y_train)
        pred_train = model.predict(X_train)
        pred_test = model.predict(X_test)

        report_train = classification_report(y_train, pred_train, output_dict=True)
        report_test = classification_report(y_test, pred_test, output_dict=True)
        f1_macro_train, f1_0_train, f1_1_train = get_metrics(report_train)
        f1_macro_test, f1_0_test, f1_1_test = get_metrics(report_test)

        stata = stata.append({
            'model_name': name,
            f'f1_macro_train_{method}': f1_macro_train,
            f'f1_macro_test_{method}': f1_macro_test,
            f'f1_0_train_{method}': f1_0_train,
            f'f1_0_test_{method}': f1_0_test,
            f'f1_1_train_{method}': f1_1_train,
            f'f1_1_test_{method}': f1_1_test
        }, ignore_index=True)
    return stata, models

### Doing nothing with the data

In [42]:
df = pd.read_csv(PREP_DATASET_PATH)
df[NUM_FEATURE_NAMES] = scaler.transform(df[NUM_FEATURE_NAMES])

X = df[SELECTED_FEATURE_NAMES]
y = df[TARGET_NAME]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    shuffle=True,
                                                    test_size=0.3,
                                                    random_state=21,
                                                    stratify=y)


In [43]:
%%time
stata_not_balanced, models_not_balanced = run_experiment(X_train, X_test, y_train, y_test, method='not')
stata_not_balanced

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f9a19fec9d0>
Traceback (most recent call last):
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_m

CPU times: user 19.2 s, sys: 4.6 s, total: 23.8 s
Wall time: 4.61 s


Unnamed: 0,f1_0_test_not,f1_0_train_not,f1_1_test_not,f1_1_train_not,f1_macro_test_not,f1_macro_train_not,model_name
0,0.857687,0.863966,0.366061,0.406537,0.611874,0.635252,lr
1,0.830275,0.87531,0.41502,0.586349,0.622647,0.730829,knn
2,0.788162,1.0,0.472868,1.0,0.630515,1.0,tree
3,0.843429,0.974392,0.452,0.928468,0.647714,0.95143,xgb
4,0.85245,0.944298,0.462332,0.82607,0.657391,0.885184,lgbm
5,0.858261,0.921585,0.450704,0.728201,0.654483,0.824893,cat


### Oversampling

In [44]:
df_for_balancing = pd.concat([X_train, y_train], axis=1)
df_balanced = balance_df_by_target(df_for_balancing, TARGET_NAME, method='over')
    
df_balanced[TARGET_NAME].value_counts()

0    3771
1    2958
Name: Credit Default, dtype: int64

In [45]:
X_train_balanced = df_balanced.drop(columns=TARGET_NAME)
y_train_balanced = df_balanced[TARGET_NAME]

In [46]:
%%time
stata_balanced_over, models_over = run_experiment(X_train_balanced, X_test, y_train_balanced, y_test, method='over')
stata_balanced_over

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f9a70ed84c0>
Traceback (most recent call last):
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_m

CPU times: user 20 s, sys: 4.13 s, total: 24.1 s
Wall time: 4.78 s


Unnamed: 0,f1_0_test_over,f1_0_train_over,f1_1_test_over,f1_1_train_over,f1_macro_test_over,f1_macro_train_over,model_name
0,0.810034,0.75338,0.495532,0.595962,0.652783,0.674671,lr
1,0.779913,0.828337,0.4427,0.762916,0.611307,0.795626,knn
2,0.793337,1.0,0.467409,1.0,0.630373,1.0,tree
3,0.815172,0.98167,0.478778,0.97634,0.646975,0.979005,xgb
4,0.816078,0.943568,0.503289,0.925151,0.659684,0.93436,lgbm
5,0.818841,0.922978,0.494949,0.894977,0.656895,0.908978,cat


### Undersampling

In [47]:
df_for_balancing = pd.concat([X_train, y_train], axis=1)
df_balanced = balance_df_by_target(df_for_balancing, TARGET_NAME, method='under')
    
df_balanced[TARGET_NAME].value_counts()

1    1479
0    1479
Name: Credit Default, dtype: int64

In [48]:
X_train_balanced = df_balanced.drop(columns=TARGET_NAME)
y_train_balanced = df_balanced[TARGET_NAME]

In [49]:
%%time
stata_balanced_under, models_under = run_experiment(X_train_balanced, X_test, y_train_balanced, y_test, method='under')
stata_balanced_under


Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f9a70ed8f70>
Traceback (most recent call last):
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_m

CPU times: user 14.3 s, sys: 3.79 s, total: 18 s
Wall time: 3.21 s


Unnamed: 0,f1_0_test_under,f1_0_train_under,f1_1_test_under,f1_1_train_under,f1_macro_test_under,f1_macro_train_under,model_name
0,0.762159,0.684833,0.523364,0.665505,0.642762,0.675169,lr
1,0.701816,0.771342,0.477995,0.766815,0.589905,0.769079,knn
2,0.675054,1.0,0.473807,1.0,0.57443,1.0,tree
3,0.708318,0.994609,0.519129,0.994573,0.613724,0.994591,xgb
4,0.722653,0.967219,0.528211,0.967196,0.625432,0.967208,lgbm
5,0.745316,0.923543,0.546354,0.922973,0.645835,0.923258,cat


### Undersampling (Tomeklinks)

In [50]:
from imblearn.under_sampling import TomekLinks

tl = TomekLinks()
X_train_balanced, y_train_balanced = tl.fit_sample(X_train, y_train)

y_train_balanced.value_counts()

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f9a5fd54700>
Traceback (most recent call last):
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'


0    3423
1    1479
Name: Credit Default, dtype: int64

In [52]:
%%time
stata_balanced_tomek, models_tomek = run_experiment(X_train_balanced, X_test, y_train_balanced, y_test, method='tomek')
stata_balanced_tomek

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f9a101c0d30>
Traceback (most recent call last):
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_m

CPU times: user 17.7 s, sys: 3.72 s, total: 21.4 s
Wall time: 4.09 s


Unnamed: 0,f1_0_test_tomek,f1_0_train_tomek,f1_1_test_tomek,f1_1_train_tomek,f1_macro_test_tomek,f1_macro_train_tomek,model_name
0,0.848315,0.852235,0.425532,0.482315,0.636923,0.667275,lr
1,0.815774,0.881463,0.431942,0.659558,0.623858,0.77051,knn
2,0.776662,1.0,0.457317,1.0,0.61699,1.0,tree
3,0.831434,0.977924,0.464253,0.945545,0.647843,0.961734,xgb
4,0.838728,0.949436,0.463462,0.861714,0.651095,0.905575,lgbm
5,0.8489,0.926123,0.47047,0.781943,0.659685,0.854033,cat


### Oversampling (SMOTE)

In [53]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_train_balanced, y_train_balanced = smote.fit_sample(X_train, y_train)

y_train_balanced.value_counts()

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f9a5fd54550>
Traceback (most recent call last):
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'


1    3771
0    3771
Name: Credit Default, dtype: int64

In [54]:
%%time
stata_balanced_smote, models_smote = run_experiment(X_train_balanced, X_test, y_train_balanced, y_test, method='smote')
stata_balanced_smote

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f9a5fd548b0>
Traceback (most recent call last):
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/recpi/anaconda3/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_m

CPU times: user 21.9 s, sys: 4.89 s, total: 26.8 s
Wall time: 5.22 s


Unnamed: 0,f1_0_test_smote,f1_0_train_smote,f1_1_test_smote,f1_1_train_smote,f1_macro_test_smote,f1_macro_train_smote,model_name
0,0.782888,0.723181,0.514738,0.687553,0.648813,0.705367,lr
1,0.723476,0.847751,0.463749,0.870398,0.593613,0.859074,knn
2,0.76117,1.0,0.465083,1.0,0.613126,1.0,tree
3,0.834957,0.976848,0.497754,0.976206,0.666356,0.976527,xgb
4,0.828487,0.935914,0.488496,0.931338,0.658491,0.933626,lgbm
5,0.837632,0.925611,0.490809,0.918393,0.66422,0.922002,cat


### Putting the results together

In [55]:
stata_combined = stata_not_balanced.merge(stata_balanced_under, on='model_name') \
                                   .merge(stata_balanced_over, on='model_name') \
                                   .merge(stata_balanced_tomek, on='model_name') \
                                   .merge(stata_balanced_smote, on='model_name')
stata_combined.head()

Unnamed: 0,f1_0_test_not,f1_0_train_not,f1_1_test_not,f1_1_train_not,f1_macro_test_not,f1_macro_train_not,model_name,f1_0_test_under,f1_0_train_under,f1_1_test_under,...,f1_1_test_tomek,f1_1_train_tomek,f1_macro_test_tomek,f1_macro_train_tomek,f1_0_test_smote,f1_0_train_smote,f1_1_test_smote,f1_1_train_smote,f1_macro_test_smote,f1_macro_train_smote
0,0.857687,0.863966,0.366061,0.406537,0.611874,0.635252,lr,0.762159,0.684833,0.523364,...,0.425532,0.482315,0.636923,0.667275,0.782888,0.723181,0.514738,0.687553,0.648813,0.705367
1,0.830275,0.87531,0.41502,0.586349,0.622647,0.730829,knn,0.701816,0.771342,0.477995,...,0.431942,0.659558,0.623858,0.77051,0.723476,0.847751,0.463749,0.870398,0.593613,0.859074
2,0.788162,1.0,0.472868,1.0,0.630515,1.0,tree,0.675054,1.0,0.473807,...,0.457317,1.0,0.61699,1.0,0.76117,1.0,0.465083,1.0,0.613126,1.0
3,0.843429,0.974392,0.452,0.928468,0.647714,0.95143,xgb,0.708318,0.994609,0.519129,...,0.464253,0.945545,0.647843,0.961734,0.834957,0.976848,0.497754,0.976206,0.666356,0.976527
4,0.85245,0.944298,0.462332,0.82607,0.657391,0.885184,lgbm,0.722653,0.967219,0.528211,...,0.463462,0.861714,0.651095,0.905575,0.828487,0.935914,0.488496,0.931338,0.658491,0.933626


In [56]:
stata_combined.describe()[
    ['f1_macro_test_not', 'f1_macro_test_under', 'f1_macro_test_over',
     'f1_macro_test_tomek', 'f1_macro_test_smote',
     'f1_1_test_not', 'f1_1_test_under', 'f1_1_test_over',
     'f1_1_test_tomek', 'f1_1_test_smote']
]

Unnamed: 0,f1_macro_test_not,f1_macro_test_under,f1_macro_test_over,f1_macro_test_tomek,f1_macro_test_smote,f1_1_test_not,f1_1_test_under,f1_1_test_over,f1_1_test_tomek,f1_1_test_smote
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,0.637437,0.615348,0.643003,0.639399,0.64077,0.436498,0.511477,0.480443,0.452163,0.486771
std,0.018517,0.02866,0.018691,0.01655,0.030237,0.039636,0.029108,0.022615,0.018728,0.019608
min,0.611874,0.57443,0.611307,0.61699,0.593613,0.366061,0.473807,0.4427,0.425532,0.463749
25%,0.624614,0.59586,0.634523,0.627124,0.622048,0.423941,0.488279,0.470251,0.438286,0.470936
50%,0.639115,0.619578,0.649879,0.642383,0.653652,0.451352,0.521247,0.486864,0.460389,0.489652
75%,0.652791,0.638429,0.655867,0.650282,0.662788,0.459749,0.527,0.495386,0.464055,0.496018
max,0.657391,0.645835,0.659684,0.659685,0.666356,0.472868,0.546354,0.503289,0.47047,0.514738


In [57]:
stata_combined.sort_values('f1_1_test_smote', ascending=False).iloc[0]

f1_0_test_not           0.857687
f1_0_train_not          0.863966
f1_1_test_not           0.366061
f1_1_train_not          0.406537
f1_macro_test_not       0.611874
f1_macro_train_not      0.635252
model_name                    lr
f1_0_test_under         0.762159
f1_0_train_under        0.684833
f1_1_test_under         0.523364
f1_1_train_under        0.665505
f1_macro_test_under     0.642762
f1_macro_train_under    0.675169
f1_0_test_over          0.810034
f1_0_train_over          0.75338
f1_1_test_over          0.495532
f1_1_train_over         0.595962
f1_macro_test_over      0.652783
f1_macro_train_over     0.674671
f1_0_test_tomek         0.848315
f1_0_train_tomek        0.852235
f1_1_test_tomek         0.425532
f1_1_train_tomek        0.482315
f1_macro_test_tomek     0.636923
f1_macro_train_tomek    0.667275
f1_0_test_smote         0.782888
f1_0_train_smote        0.723181
f1_1_test_smote         0.514738
f1_1_train_smote        0.687553
f1_macro_test_smote     0.648813
f1_macro_t

In [60]:
model = models_smote['lr']

In [61]:
pred_train = model.predict(X_train_balanced)
pred_test = model.predict(X_test)

evaluate_preds(model, X_train_balanced, X_test, y_train_balanced, y_test)


TRAIN

              precision    recall  f1-score   support

           0       0.68      0.77      0.72      3771
           1       0.73      0.65      0.69      3771

    accuracy                           0.71      7542
   macro avg       0.71      0.71      0.71      7542
weighted avg       0.71      0.71      0.71      7542

TEST

              precision    recall  f1-score   support

           0       0.82      0.75      0.78      1616
           1       0.47      0.56      0.51       634

    accuracy                           0.70      2250
   macro avg       0.64      0.66      0.65      2250
weighted avg       0.72      0.70      0.71      2250

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1217  399
1                276  358
