In [0]:
# !pip install xgboost
# !pip install lightgbm
# !pip install catboost

In [0]:
import pandas as pd
import numpy as np
import pickle
import random

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, learning_curve
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb, lightgbm as lgbm, catboost as catb

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [0]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the pandas / scikit-learn /
# LightGBM calls used below, so API drift will go unnoticed.
warnings.simplefilter('ignore')

In [0]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Print per-split classification reports followed by the test confusion matrix.

    Args:
        y_train_true: True labels of the training split.
        y_train_pred: Predicted labels for the training split.
        y_test_true: True labels of the test split.
        y_test_pred: Predicted labels for the test split.

    Returns:
        None. Everything is written to stdout.
    """
    splits = (
        ('TRAIN', y_train_true, y_train_pred),
        ('TEST', y_test_true, y_test_pred),
    )
    for title, y_true, y_pred in splits:
        print(title + '\n\n' + classification_report(y_true, y_pred))

    # Rows are the true classes, columns the predicted classes.
    print('CONFUSION MATRIX TEST\n')
    confusion = pd.crosstab(y_test_true, y_test_pred)
    print(confusion)

In [0]:
def balance_df_by_target(df, target_name='Credit Default', random_state=None):
    """Oversample the least frequent class by whole-copy duplication, then shuffle.

    Every row of the least frequent class is appended
    ``int(n_major / n_minor) - 1`` extra times, where ``n_major`` and ``n_minor``
    are the sizes of the most and least frequent classes. With more than two
    classes only the least frequent one is oversampled. The input frame is not
    modified.

    Args:
        df: DataFrame that contains the target column.
        target_name: Name of the target (class label) column.
        random_state: Optional seed passed to ``DataFrame.sample`` for the final
            shuffle; ``None`` keeps the shuffle non-deterministic.

    Returns:
        A new DataFrame with the duplicated minority rows, in shuffled row order.
        When rows were added the index is reset before shuffling; otherwise the
        original index labels are kept.

    Raises:
        KeyError: If ``target_name`` is not a column of ``df``.
    """
    target_counts = df[target_name].value_counts()

    # With fewer than two classes (including an empty frame) there is nothing to balance.
    if len(target_counts) < 2:
        return df.sample(frac=1, random_state=random_state)

    # idxmax/idxmin return class *labels*. argmax/argmin return integer positions
    # in current pandas, which only coincide with the labels by accident.
    major_class_name = target_counts.idxmax()
    minor_class_name = target_counts.idxmin()

    disbalance_coeff = int(target_counts.loc[major_class_name] / target_counts.loc[minor_class_name]) - 1

    if disbalance_coeff > 0:
        minority_rows = df[df[target_name] == minor_class_name]
        # DataFrame.append was removed in pandas 2.0; a single concat also avoids
        # re-copying the growing frame on every iteration.
        df = pd.concat([df] + [minority_rows] * disbalance_coeff, ignore_index=True)

    return df.sample(frac=1, random_state=random_state)  # shuffle the rows

In [0]:
# Input CSV files read in the data-loading cell below.
DATASET_PATH = 'course_project_train.csv'
PREP_DATASET_PATH = 'course_project_train_prep_2.csv'

# Output files: the fitted scaler and the final model are pickled to these paths
# at the end of the notebook.
SCALER_FILE_PATH = 'scaler.pkl'
MODEL_FILE_PATH = 'model.pkl'

**Описание датасета**

* **Home Ownership** - Домовладение (Home Mortgage, Rent, Own Home, Have Mortgage)
* **Annual Income** - Годовой доход
* **Years in current job** - Количество лет на текущем месте работы
* **Tax Liens** - Налоговые обременения
* **Number of Open Accounts** - Количество открытых счетов
* **Years of Credit History** - Количество лет кредитной истории
* **Maximum Open Credit** - Наибольший открытый кредит
* **Number of Credit Problems** - Количество проблем с кредитом
* **Months since last delinquent** - Количество месяцев с последней просрочки платежа
* **Bankruptcies** - Количество банкротств
* **Purpose** - Цель кредита (debt consolidation, other, home improvements, business loan, buy a car, medical bills, major purchase, take a trip, buy house, small business, wedding, moving, educational expenses, vacation, renewable energy)
* **Term** - Срок кредита (Long Term, Short Term)
* **Current Loan Amount** - Текущая сумма кредита
* **Current Credit Balance** - Текущий кредитный баланс
* **Monthly Debt** - Ежемесячный долг
* **Credit Score** - Кредитный рейтинг (скоринговый балл)
* **Credit Default** - Факт невыполнения кредитных обязательств (0 - погашен вовремя, 1 - просрочка)

In [620]:
# Load both input tables: `df_base` from DATASET_PATH and `df` from PREP_DATASET_PATH.
df_base = pd.read_csv(DATASET_PATH)
df = pd.read_csv(PREP_DATASET_PATH)

# NOTE(review): the preview shows an 'Unnamed: 0' column, which usually means the CSV was
# written together with its index — confirm upstream before relying on that column.
df.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default,ID,Years in current job_1 year,Years in current job_10+ years,Years in current job_2 years,Years in current job_3 years,Years in current job_4 years,Years in current job_5 years,Years in current job_6 years,Years in current job_7 years,Years in current job_8 years,Years in current job_9 years,Years in current job_< 1 year,Tax Liens_0.0,Tax Liens_1.0,Tax Liens_2.0,Tax Liens_3.0,Tax Liens_4.0,Number of Open Accounts_10.0,Number of Open Accounts_11.0,Number of Open Accounts_12.0,Number of Open Accounts_13.0,Number of Open Accounts_14.0,Number of Open Accounts_15.0,...,Number of Open Accounts_20.0,Number of Open Accounts_21.0,Number of Open Accounts_22.0,Number of Open Accounts_23.0,Number of Open Accounts_24.0,Number of Open Accounts_25.0,Number of Open Accounts_3.0,Number of Open Accounts_4.0,Number of Open Accounts_5.0,Number of Open Accounts_6.0,Number of Open Accounts_7.0,Number of Open Accounts_8.0,Number of Open Accounts_9.0,Number of Credit Problems_0.0,Number of Credit Problems_1.0,Number of Credit Problems_2.0,Number of Credit Problems_3.0,Number of Credit Problems_4.0,Bankruptcies_0.0,Bankruptcies_1.0,Bankruptcies_2.0,Bankruptcies_nan,Purpose_business loan,Purpose_buy a car,Purpose_buy house,Purpose_debt consolidation,Purpose_educational expenses,Purpose_home improvements,Purpose_major purchase,Purpose_medical bills,Purpose_moving,Purpose_other,Purpose_renewable energy,Purpose_small business,Purpose_take a trip,Purpose_vacation,Purpose_wedding,Term_Long Term,Term_Short Term,Average amount on 1 account
0,Own Home,482087.0,10+ years,0.0,1.0,26.3,685960.0,1.0,15.8152,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,749.0
1,Own Home,1025487.0,10+ years,0.0,1.0,15.3,1181730.0,1.0,15.8152,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,737.0
2,Home Mortgage,751412.0,8 years,0.0,1.0,35.0,1182434.0,1.0,15.8152,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0,2,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,742.0
3,Own Home,805068.0,6 years,0.0,1.0,22.5,147400.0,1.0,15.8152,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0,3,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,694.0
4,Rent,776264.0,8 years,0.0,1.0,13.6,385836.0,1.0,15.8152,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0,4,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,719.0


In [0]:
# len(df) - df.count()

In [0]:
# Impute missing values in two steps: back-fill each column from the next non-missing
# row, then zero-fill whatever is still missing (trailing gaps with no later value).
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` argument is deprecated
# in recent pandas; the result is the same.
# NOTE(review): back-filling makes the imputed value depend on row order — confirm that
# this is intended rather than a per-column statistic.
df = df.bfill().fillna(0)
df_base = df_base.bfill().fillna(0)

**Выделение целевой переменной и групп признаков**

In [623]:
# Name of the label column.
TARGET_NAME = 'Credit Default'
# Every column of `df_base` except the target.
BASE_FEATURE_NAMES = df_base.columns.drop(TARGET_NAME).tolist()
# Columns that exist only in `df`: everything left after removing the target, 'ID' and the
# `df_base` columns. Index.drop raises KeyError if any of those labels is missing from `df`.
NEW_FEATURE_NAMES = df.columns.drop([TARGET_NAME, 'ID'] + BASE_FEATURE_NAMES).tolist()
NEW_FEATURE_NAMES

['Years in current job_1 year',
 'Years in current job_10+ years',
 'Years in current job_2 years',
 'Years in current job_3 years',
 'Years in current job_4 years',
 'Years in current job_5 years',
 'Years in current job_6 years',
 'Years in current job_7 years',
 'Years in current job_8 years',
 'Years in current job_9 years',
 'Years in current job_< 1 year',
 'Tax Liens_0.0',
 'Tax Liens_1.0',
 'Tax Liens_2.0',
 'Tax Liens_3.0',
 'Tax Liens_4.0',
 'Number of Open Accounts_10.0',
 'Number of Open Accounts_11.0',
 'Number of Open Accounts_12.0',
 'Number of Open Accounts_13.0',
 'Number of Open Accounts_14.0',
 'Number of Open Accounts_15.0',
 'Number of Open Accounts_16.0',
 'Number of Open Accounts_17.0',
 'Number of Open Accounts_18.0',
 'Number of Open Accounts_19.0',
 'Number of Open Accounts_2.0',
 'Number of Open Accounts_20.0',
 'Number of Open Accounts_21.0',
 'Number of Open Accounts_22.0',
 'Number of Open Accounts_23.0',
 'Number of Open Accounts_24.0',
 'Number of Open A

### Отбор признаков

In [0]:
# list(df)

In [625]:
# Continuous / count columns that are standardised by the scaler cell below.
# NOTE(review): 'Number of Open Accounts', 'Number of Credit Problems' and 'Bankruptcies'
# are listed here and also appear one-hot encoded in NEW_FEATURE_NAMES, so the model
# receives the same information twice — confirm this is intended.
NUMERIC_FEATURE_NAMES = [
 'Annual Income',
 'Number of Open Accounts',
 'Maximum Open Credit',
 'Number of Credit Problems',
 'Bankruptcies',
 'Current Credit Balance',
 'Monthly Debt',
 'Credit Score',
 'Average amount on 1 account']

# Final model inputs: the scaled numeric columns followed by the columns found only in `df`.
FEATURE_NAMES_SELECTED = NUMERIC_FEATURE_NAMES + NEW_FEATURE_NAMES
FEATURE_NAMES_SELECTED

['Annual Income',
 'Number of Open Accounts',
 'Maximum Open Credit',
 'Number of Credit Problems',
 'Bankruptcies',
 'Current Credit Balance',
 'Monthly Debt',
 'Credit Score',
 'Average amount on 1 account',
 'Years in current job_1 year',
 'Years in current job_10+ years',
 'Years in current job_2 years',
 'Years in current job_3 years',
 'Years in current job_4 years',
 'Years in current job_5 years',
 'Years in current job_6 years',
 'Years in current job_7 years',
 'Years in current job_8 years',
 'Years in current job_9 years',
 'Years in current job_< 1 year',
 'Tax Liens_0.0',
 'Tax Liens_1.0',
 'Tax Liens_2.0',
 'Tax Liens_3.0',
 'Tax Liens_4.0',
 'Number of Open Accounts_10.0',
 'Number of Open Accounts_11.0',
 'Number of Open Accounts_12.0',
 'Number of Open Accounts_13.0',
 'Number of Open Accounts_14.0',
 'Number of Open Accounts_15.0',
 'Number of Open Accounts_16.0',
 'Number of Open Accounts_17.0',
 'Number of Open Accounts_18.0',
 'Number of Open Accounts_19.0',
 'Num

In [0]:
# Standardise the numeric columns with StandardScaler.
# NOTE(review): the scaler is fitted on every row, before the train/test split made in the
# next section, so statistics of the test rows leak into the scaling (and into the scaler
# that is pickled at the end). Consider fitting on the training rows only.
scaler = StandardScaler()

df_norm = df.copy()
df_norm[NUMERIC_FEATURE_NAMES] = scaler.fit_transform(df_norm[NUMERIC_FEATURE_NAMES])


# Rebind `df` to the scaled copy; every later cell works with the scaled values.
df = df_norm.copy()

### Разбиение на train и test

In [0]:
# Feature matrix and label vector taken from the scaled table.
X = df[FEATURE_NAMES_SELECTED]
y = df[TARGET_NAME]

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified, so class proportions may differ between
# the two parts — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.25, random_state=211)

### Балансировка целевой переменной

In [628]:
# Oversample the minority class on the training part only; the test part is left untouched.
df_for_balancing = pd.concat([X_train, y_train], axis=1)
df_balanced = balance_df_by_target(df_for_balancing, TARGET_NAME)
    
# Class sizes after balancing.
df_balanced[TARGET_NAME].value_counts()

0    4047
1    3156
Name: Credit Default, dtype: int64

In [0]:
# Rebind the training features/labels to the balanced (oversampled and shuffled) frame.
# Re-running the balancing cell after this one would balance already-balanced data.
X_train = df_balanced.drop(columns=TARGET_NAME)
y_train = df_balanced[TARGET_NAME]

### Построение и оценка базовых моделей

**Логистическая регрессия**

In [630]:
# Baseline 1: logistic regression with default settings, fitted on the balanced training set.
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)

# `y_train_pred` / `y_test_pred` are reused (overwritten) by every model cell below.
y_train_pred = model_lr.predict(X_train)
y_test_pred = model_lr.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.67      0.86      0.75      4047
           1       0.72      0.46      0.56      3156

    accuracy                           0.69      7203
   macro avg       0.70      0.66      0.66      7203
weighted avg       0.69      0.69      0.67      7203

TEST

              precision    recall  f1-score   support

           0       0.80      0.86      0.83      1340
           1       0.57      0.46      0.51       535

    accuracy                           0.75      1875
   macro avg       0.69      0.66      0.67      1875
weighted avg       0.74      0.75      0.74      1875

CONFUSION MATRIX TEST

col_0              0    1
Credit Default           
0               1159  181
1                291  244


**k ближайших соседей**

In [631]:
# Leftover inspection cell: evaluating the bare class name only displays the class object.
KNeighborsClassifier

sklearn.neighbors._classification.KNeighborsClassifier

In [632]:
# Baseline 2: k-nearest neighbours with k=12, fitted on the balanced training set.
model_knn = KNeighborsClassifier(n_neighbors=12)
model_knn.fit(X_train, y_train)

y_train_pred = model_knn.predict(X_train)
y_test_pred = model_knn.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.69      0.90      0.78      4047
           1       0.79      0.49      0.61      3156

    accuracy                           0.72      7203
   macro avg       0.74      0.69      0.69      7203
weighted avg       0.74      0.72      0.71      7203

TEST

              precision    recall  f1-score   support

           0       0.77      0.84      0.81      1340
           1       0.49      0.38      0.43       535

    accuracy                           0.71      1875
   macro avg       0.63      0.61      0.62      1875
weighted avg       0.69      0.71      0.70      1875

CONFUSION MATRIX TEST

col_0              0    1
Credit Default           
0               1132  208
1                333  202


**Бустинговые алгоритмы**

*XGBoost*

In [0]:
# model_xgb = xgb.XGBClassifier(random_state=21)
# model_xgb.fit(X_train, y_train)

# y_train_pred = model_xgb.predict(X_train)
# y_test_pred = model_xgb.predict(X_test)

# get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

*LightGBM*

In [634]:
# Baseline 3: LightGBM with default hyper-parameters and a fixed seed.
model_lgbm = lgbm.LGBMClassifier(random_state=21)
model_lgbm.fit(X_train, y_train)

y_train_pred = model_lgbm.predict(X_train)
y_test_pred = model_lgbm.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.85      0.95      0.90      4047
           1       0.93      0.78      0.85      3156

    accuracy                           0.88      7203
   macro avg       0.89      0.87      0.87      7203
weighted avg       0.88      0.88      0.88      7203

TEST

              precision    recall  f1-score   support

           0       0.79      0.86      0.82      1340
           1       0.54      0.42      0.48       535

    accuracy                           0.73      1875
   macro avg       0.66      0.64      0.65      1875
weighted avg       0.72      0.73      0.72      1875

CONFUSION MATRIX TEST

col_0              0    1
Credit Default           
0               1147  193
1                308  227


*CatBoost*

In [0]:
# model_catb = catb.CatBoostClassifier(silent=True, random_state=21)
# model_catb.fit(X_train, y_train)

# y_train_pred = model_catb.predict(X_train)
# y_test_pred = model_catb.predict(X_test)

# get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

### Выбор лучшей модели и подбор гиперпараметров

In [0]:
# Fresh, unfitted LightGBM classifier used as the estimator for the hyper-parameter search.
# This rebinds `model_lgbm`, replacing the baseline model fitted earlier.
# NOTE(review): `silent` appears to be deprecated/removed in newer LightGBM releases
# (verbose=-1 is the usual replacement) — confirm against the installed version.
model_lgbm = lgbm.LGBMClassifier(silent=True, random_state=21)

**Подбор гиперпараметров**

In [0]:
# Hyper-parameter grid for the LightGBM search (4 x 4 x 4 = 64 combinations).
# The name must be `params`: the GridSearchCV cell passes `params` as its grid, and the
# previous spelling `arams` left that name undefined on a fresh kernel.
params = {'n_estimators': [50, 100, 200, 500],
          'max_depth': [3, 5, 7, 9],
          'learning_rate': [0.001, 0.01, 0.05, 0.1]}

In [0]:
# 3-fold cross-validation splitter; shuffling with a fixed seed keeps the folds repeatable.
cv=KFold(n_splits=3, random_state=21, shuffle=True)

In [639]:
%%time

# Exhaustive grid search over the LightGBM grid, scored by F1 on each of the `cv` folds.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, i.e. the search
# did not finish in the saved run.
# NOTE(review): X_train already contains duplicated minority rows from the balancing step,
# so copies of the same row can land in both the fitting and the validation folds, which
# makes the cross-validated F1 optimistic.
rs = GridSearchCV(model_lgbm, params, scoring='f1', cv=cv, n_jobs=-1)
rs.fit(X_train, y_train)

KeyboardInterrupt: ignored

In [0]:
# Best parameter combination found by the grid search; only available after `rs.fit` completes.
rs.best_params_

**Обучение и оценка финальной модели**

In [645]:
%%time

# Final model with hard-coded hyper-parameters (they are not read from `rs.best_params_`).
# NOTE(review): in the recorded output the train F1 is ~1.00 while the test F1 for class 1
# is 0.46, below the default LightGBM (0.48) and logistic regression (0.51) baselines
# above, which points to overfitting. Revisit these values.
final_model = lgbm.LGBMClassifier(n_estimators=500, max_depth=9, learning_rate=0.1,
                                      silent=True, random_state=21)
final_model.fit(X_train, y_train)

y_train_pred = final_model.predict(X_train)
y_test_pred = final_model.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

TRAIN

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      4047
           1       1.00      0.99      0.99      3156

    accuracy                           1.00      7203
   macro avg       1.00      1.00      1.00      7203
weighted avg       1.00      1.00      1.00      7203

TEST

              precision    recall  f1-score   support

           0       0.78      0.85      0.82      1340
           1       0.53      0.41      0.46       535

    accuracy                           0.73      1875
   macro avg       0.66      0.63      0.64      1875
weighted avg       0.71      0.73      0.72      1875

CONFUSION MATRIX TEST

col_0              0    1
Credit Default           
0               1141  199
1                313  222
CPU times: user 2.76 s, sys: 46 ms, total: 2.81 s
Wall time: 1.45 s


### Сохранение модели

**Scaler**

In [0]:
# Serialise the fitted scaler so the same transformation can be applied at inference time.
with open(SCALER_FILE_PATH, mode='wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

**Model**

In [0]:
# Serialise the trained final model next to the scaler.
with open(MODEL_FILE_PATH, mode='wb') as model_file:
    pickle.dump(final_model, model_file)