# GeekBrains Credit Default Classification Course Project 2020-09-17

## Установка библиотек (если отсутствуют)

In [1]:
# %conda install --file requirements.txt

# Подключение библиотек и скриптов

In [109]:
import datetime
import os
import pickle

import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import norm

import sklearn
import xgboost as xgb
# 1. Разделение датасета и поиск гиперпараметров
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
# 2. Трансформация данных
from sklearn.preprocessing import StandardScaler
# 3. Модели
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# 4. Метрики качества
from sklearn.metrics import (classification_report, accuracy_score,
                             precision_score, f1_score,
                             confusion_matrix, plot_confusion_matrix,
                             auc, roc_auc_score, roc_curve, plot_roc_curve,
                             precision_recall_curve, plot_precision_recall_curve)

%matplotlib inline

In [3]:
import warnings
#warnings.filterwarnings('ignore')

**Скрипт для уменьшения занимаемой памяти DataFrame**

In [4]:
def reduce_mem_usage(df, *, verbose=False):
    """ Iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
        From Kaggle Notebook:
        https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
    """
    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2
        print(f"Memory usage of dataframe is {start_mem:.2f} MB")
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    
    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        mem_decrease_percent = 100 * (start_mem - end_mem) / start_mem
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        print(f"Decreased by {mem_decrease_percent:.1f}%")
    
    return df

**Скрипт для логирования каждой тренировки, оценки и кросс-валидации модели**

In [5]:
def log_model_results(model:sklearn.base.BaseEstimator,
                      accuracy:np.float64 = None,
                      f1_score:np.float64 = None,
                      cv_score:np.float64 = None,
                      comment:str = None,
                      train_log_path:str = "./training_log.csv",
                      *args, **kwargs) -> None:
    """ Function for logging trained model results on Credit Default dataset.
    Arguments:
        model - a Scikit-learn model that was trained,
        f1_score - F1 score calculated for the model on the training part of the train dataset,
        cv_score - cross-validation F1 score calculated for the model on the full train dataset,
        comment - a comment message for this model training,
        train_log_path - filepath for the training log of the models
    
    Returns:
        None
    """
    
    model_name = type(model).__name__
    model_params = repr(model.get_params())
    curr_datetime = datetime.datetime.now()
    date = curr_datetime.date()
    # Microseconds are not needed
    time = curr_datetime.time().replace(microsecond=0)
    
    log_string = f"{date};{time};{model_name};{accuracy if accuracy else ''};" + \
                 f"{f1_score if f1_score else ''};{cv_score if cv_score else ''};{model_params};" + \
                 f"{comment if comment else ''}\n"
    
    # Setup the log file if it's not created or is blank
    if not os.path.isfile(train_log_path):
        with open(train_log_path, 'w', encoding='utf-8'):
            pass
    if os.stat(train_log_path).st_size == 0:
        with open(train_log_path, 'w', encoding='utf-8') as file:
            file.write(f"date;time;model_type;accuracy;f1;cv_f1;hypers;comment\n")
    
    with open(train_log_path, 'a', encoding='utf-8') as file:
        file.write(log_string)
        print(log_string)

**Скрипт визуализации предсказаний модели**

In [6]:
def eval_predictions(train_true: pd.Series, train_pred: pd.Series, test_true: pd.Series, test_pred: pd.Series) -> None:
    
    """ THIS IS A TEMPLATE FOR REGRESSORS NOW"""
    
#     """ Evaluate model predictions with MSE, RMSE and R2 score.
#         Function from the GeekBrains tutorial notebook.
#     """
#     fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(18,10))
#     ax = ax.flatten()
    
#     print(f"Train R2:\t{round(r2(train_true, train_pred), 3)}", 
#           f"Train RMSE:\t{round(np.sqrt(mse(train_true, train_pred)), 3)}", 
#           f"Train MSE:\t{round(mse(train_true, train_pred), 3)}", 
#           sep='\n')
#     print()
#     print(f"Test R2:\t{round(r2(test_true, test_pred), 3)}", 
#           f"Test RMSE:\t{round(np.sqrt(mse(test_true, test_pred)), 3)}", 
#           f"Test MSE:\t{round(mse(test_true, test_pred), 3)}", 
#           sep='\n')
    
#     min_coord = min(train_true.min(), train_pred.min(), test_true.min(), test_pred.min())
#     max_coord = max(train_true.max(), train_pred.max(), test_true.max(), test_pred.max())
    
#     sns.scatterplot(x=train_pred, y=train_true, ax=ax[0])
#     ax[0].plot([min_coord, max_coord], [min_coord, max_coord], linestyle='--', color='black')
#     ax[0].set_title("Train dataset Prediction")
#     ax[0].set_xlabel("Predicted")
#     ax[0].set_ylabel("True")
    
#     sns.scatterplot(x=test_pred, y=test_true, ax=ax[1])
#     ax[1].plot([min_coord, max_coord], [min_coord, max_coord], linestyle='--', color='black')
#     ax[1].set_title("Test dataset Prediction")
#     ax[1].set_xlabel("Predicted")
#     ax[1].set_ylabel("True")
    
#     plt.show()

**Пути к файлам**

In [7]:
TRAIN_FILEPATH = './data/train.csv'
PREPARED_TRAIN_FILEPATH = './data/prepared_train.csv'
TEST_FILEPATH = './data/test.csv'
SCALER_FILEPATH = './scaler.pkl'
MODEL_FILEPATH = './model.pkl'
LOGFILE_FILEPATH = './training_log.csv'

# Загрузка данных в DataFrame

**Описание задачи**

Цель - предсказать, сможет ли человек выполнить свои кредитные обязательства.\
Тренировка модели классификации будет проходить на датасете course_project_train.csv с имеющимися целевыми значениями.\
Модель будет предсказывать результаты для датасета course_project_test.csv.

Эта цель может быть полезна для:
* банков и кредитных организаций, выдающих кредиты.

**Описание датасета**

В датасете описаны люди, взявшие кредиты, с различной информацией об их финансовом состоянии (признаки) и целевой переменной - факт невыполнения обязательств по кредиту Credit Default.\
Датасет является табличными, структурированными данными.

Признаки датасета:
* **Home Ownership** - домовладение
* **Annual Income** - годовой доход
* **Years in current job** - количество лет на текущем месте работы
* **Tax Liens** - налоговые обременения
* **Number of Open Accounts** - количество открытых счетов
* **Years of Credit History** - количество лет кредитной истории
* **Maximum Open Credit** - наибольший открытый кредит
* **Number of Credit Problems** - количество проблем с кредитом
* **Months since last delinquent** - количество месяцев с последней просрочки платежа
* **Bankruptcies** - банкротства
* **Purpose** - цель кредита
* **Term** - срок кредита
* **Current Loan Amount** - текущая сумма кредита
* **Current Credit Balance** - текущий кредитный баланс
* **Monthly Debt** - ежемесячный долг
* **Credit Default** - факт невыполнения кредитных обязательств (0 - погашен вовремя, 1 - просрочка)

In [8]:
train_df = pd.read_csv(TRAIN_FILEPATH, sep=',')
train_df.tail()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
7495,Rent,402192.0,< 1 year,0.0,3.0,8.5,107866.0,0.0,,0.0,other,Short Term,129360.0,73492.0,1900.0,697.0,0
7496,Home Mortgage,1533984.0,1 year,0.0,10.0,26.5,686312.0,0.0,43.0,0.0,debt consolidation,Long Term,444048.0,456399.0,12783.0,7410.0,1
7497,Rent,1878910.0,6 years,0.0,12.0,32.1,1778920.0,0.0,,0.0,buy a car,Short Term,99999999.0,477812.0,12479.0,748.0,0
7498,Home Mortgage,,,0.0,21.0,26.5,1141250.0,0.0,,0.0,debt consolidation,Short Term,615274.0,476064.0,37118.0,,0
7499,Rent,,4 years,0.0,8.0,9.4,480832.0,0.0,,0.0,debt consolidation,Short Term,26928.0,288192.0,9061.0,,0


In [9]:
test_df = pd.read_csv(TEST_FILEPATH, sep=',')
test_df.tail()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score
2495,Home Mortgage,1020053.0,10+ years,0.0,14.0,29.1,559152.0,1.0,68.0,1.0,debt consolidation,Short Term,99999999.0,162735.0,15046.0,745.0
2496,Home Mortgage,,2 years,0.0,15.0,17.0,1737780.0,0.0,77.0,0.0,debt consolidation,Short Term,468512.0,1439269.0,32996.0,
2497,Home Mortgage,1171806.0,2 years,0.0,48.0,12.8,1706430.0,0.0,,0.0,debt consolidation,Short Term,430496.0,676438.0,36912.0,695.0
2498,Rent,723520.0,10+ years,0.0,14.0,28.8,945780.0,0.0,,0.0,debt consolidation,Short Term,257774.0,391248.0,13506.0,744.0
2499,Rent,1694439.0,10+ years,0.0,12.0,18.4,1199748.0,1.0,72.0,0.0,debt consolidation,Long Term,763004.0,559531.0,23440.0,6820.0


Количество наблюдений в датасете.

In [10]:
print(f"Train samples: {train_df.shape[0]}")
print(f"Test samples: {test_df.shape[0]}")

Train samples: 7500
Test samples: 2500


Количество признаков в датасете (вместе с целевой переменной Price).

In [11]:
train_df.shape[1]

17

Проверяем, что в train и test одинаковое количество признаков.

In [12]:
train_df.shape[1] - 1 == test_df.shape[1]

True

**Уменьшаем занимаемую память DataFrame**

In [13]:
train_df = reduce_mem_usage(train_df, verbose=True)

Memory usage of dataframe is 0.97 MB
Memory usage after optimization is: 0.38 MB
Decreased by 60.9%


**Типы данных переменных**

In [14]:
train_df.dtypes

Home Ownership                  category
Annual Income                    float32
Years in current job            category
Tax Liens                        float32
Number of Open Accounts          float32
Years of Credit History          float32
Maximum Open Credit              float32
Number of Credit Problems        float32
Months since last delinquent     float32
Bankruptcies                     float32
Purpose                         category
Term                            category
Current Loan Amount              float32
Current Credit Balance           float32
Monthly Debt                     float32
Credit Score                     float32
Credit Default                      int8
dtype: object

# 1. EDA и Preprocessing

### Примерное описание этапов выполнения курсового проекта<a class="anchor" id="course_project_steps"></a>

**Построение модели классификации**
1. Обзор обучающего датасета
2. Обработка выбросов
3. Обработка пропусков
4. Анализ данных
5. Отбор признаков
6. Балансировка классов
7. Подбор моделей, получение бейзлана
8. Выбор наилучшей модели, настройка гиперпараметров
9. Проверка качества, борьба с переобучением
10. Интерпретация результатов

**Прогнозирование на тестовом датасете**
1. Выполнить для тестового датасета те же этапы обработки и постронияния признаков
2. Спрогнозировать целевую переменную, используя модель, построенную на обучающем датасете
3. Прогнозы должны быть для всех примеров из тестового датасета (для всех строк)
4. Соблюдать исходный порядок примеров из тестового датасета

## 1.1. Анализ таргета Credit Default

In [15]:
target = 'Credit Default'

In [16]:
train_df[target].value_counts()

0    5387
1    2113
Name: Credit Default, dtype: int64

Классы несбалансированны.

## 1.2. Анализ признаков датасета

### 1.2.1. Количественные признаки

In [17]:
train_df.describe()

Unnamed: 0,Annual Income,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
count,5943.0,7500.0,7500.0,7500.0,7500.0,7500.0,3419.0,7486.0,7500.0,7500.0,7500.0,5943.0,7500.0
mean,1366392.0,0.030133,11.130934,18.317467,945153.8,0.17,34.6926,0.117152,11873178.0,289833.2,18314.453125,1151.087524,0.281733
std,845339.2,0.271604,4.908924,7.041946,16026220.0,0.498598,21.688807,0.347192,31926124.0,317871.4,11926.764648,1604.451416,0.449874
min,164597.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,11242.0,0.0,0.0,585.0,0.0
25%,844341.0,0.0,8.0,13.5,279229.5,0.0,16.0,0.0,180169.0,114256.5,10067.5,711.0,0.0
50%,1168386.0,0.0,10.0,17.0,478159.0,0.0,32.0,0.0,309573.0,209323.0,16076.5,731.0,0.0
75%,1640137.0,0.0,14.0,21.799999,793501.5,0.0,50.0,0.0,519882.0,360406.2,23818.0,743.0,1.0
max,10149340.0,7.0,43.0,57.700001,1304726000.0,7.0,118.0,4.0,100000000.0,6506797.0,136679.0,7510.0,1.0


### 1.2.2. Категориальные признаки

In [18]:
for feature in train_df.select_dtypes(include='category').columns:
    print(f"{feature}\n\n{train_df[feature].value_counts()}")
    print('-' * 100 + '\n')

Home Ownership

Home Mortgage    3637
Rent             3204
Own Home          647
Have Mortgage      12
Name: Home Ownership, dtype: int64
----------------------------------------------------------------------------------------------------

Years in current job

10+ years    2332
2 years       705
3 years       620
< 1 year      563
5 years       516
1 year        504
4 years       469
6 years       426
7 years       396
8 years       339
9 years       259
Name: Years in current job, dtype: int64
----------------------------------------------------------------------------------------------------

Purpose

debt consolidation      5944
other                    665
home improvements        412
business loan            129
buy a car                 96
medical bills             71
major purchase            40
take a trip               37
buy house                 34
small business            26
wedding                   15
moving                    11
educational expenses      10
vacation  

## 1.3. Обработка пропущенных значений

In [19]:
train_df.isna().sum()

Home Ownership                     0
Annual Income                   1557
Years in current job             371
Tax Liens                          0
Number of Open Accounts            0
Years of Credit History            0
Maximum Open Credit                0
Number of Credit Problems          0
Months since last delinquent    4081
Bankruptcies                      14
Purpose                            0
Term                               0
Current Loan Amount                0
Current Credit Balance             0
Monthly Debt                       0
Credit Score                    1557
Credit Default                     0
dtype: int64

### Annual Income

In [20]:
train_df.loc[train_df['Annual Income'].isna(), 'Annual Income'] = train_df['Annual Income'].median()

### Bankruptcies

In [21]:
train_df.loc[train_df['Bankruptcies'].isna(), 'Bankruptcies'] = train_df['Bankruptcies'].median()

### Credit Score

In [22]:
train_df.loc[train_df['Credit Score'].isna(), 'Credit Score'] = train_df['Credit Score'].median()

### Years in current job

In [23]:
train_df.loc[train_df['Years in current job'].isna(), 'Years in current job'] = train_df['Years in current job'].mode()[0]

### Months since last delinquent

In [24]:
train_df = train_df.drop(columns=['Months since last delinquent'])

## 1.4. Обработка выбросов

## 1.5. Зависимости между признаками, таргетом

## 1.6. Трансформация переменных

### Term

In [25]:
train_df['Term'] = pd.Series(train_df['Term'].map({'Short Term': 0, 'Long Term': 1}), dtype=np.int8)

## 1.7. Генерация новых признаков

## 1.8. Отбор признаков

In [26]:
train_df.columns.tolist()

['Home Ownership',
 'Annual Income',
 'Years in current job',
 'Tax Liens',
 'Number of Open Accounts',
 'Years of Credit History',
 'Maximum Open Credit',
 'Number of Credit Problems',
 'Bankruptcies',
 'Purpose',
 'Term',
 'Current Loan Amount',
 'Current Credit Balance',
 'Monthly Debt',
 'Credit Score',
 'Credit Default']

In [27]:
# features = ['ft1', 'ft2', 'ft3', 'ft4']
features = train_df.columns.tolist()
features.remove(target)
features

['Home Ownership',
 'Annual Income',
 'Years in current job',
 'Tax Liens',
 'Number of Open Accounts',
 'Years of Credit History',
 'Maximum Open Credit',
 'Number of Credit Problems',
 'Bankruptcies',
 'Purpose',
 'Term',
 'Current Loan Amount',
 'Current Credit Balance',
 'Monthly Debt',
 'Credit Score']

In [28]:
new_features = []

## 1.9. Стандартизация

In [29]:
features_to_standard = train_df[features].select_dtypes(include=
                                                        ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']
                                                       ).columns.tolist()
features_to_standard

['Annual Income',
 'Tax Liens',
 'Number of Open Accounts',
 'Years of Credit History',
 'Maximum Open Credit',
 'Number of Credit Problems',
 'Bankruptcies',
 'Term',
 'Current Loan Amount',
 'Current Credit Balance',
 'Monthly Debt',
 'Credit Score']

In [30]:
scaler = StandardScaler()
standard_features = scaler.fit_transform(train_df[features_to_standard])

In [31]:
train_df[features_to_standard] = pd.DataFrame(standard_features, columns=features_to_standard)
train_df[features_to_standard]

Unnamed: 0,Annual Income,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Bankruptcies,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score
0,-1.114304,-0.110953,-0.026674,1.133645,-0.016174,1.664779,2.545732,-0.591517,2.760520,-0.762772,-0.872085,-0.218932
1,-0.396190,-0.110953,0.788223,-0.428528,0.014763,-0.340979,-0.337099,1.690569,-0.363620,0.330781,0.004909,-0.227276
2,-0.758385,-0.110953,-0.026674,2.369181,0.014807,-0.340979,-0.337099,-0.591517,2.760520,0.058379,-0.391034,-0.223799
3,-0.687478,-0.110953,-0.637847,0.593985,-0.049781,1.664779,2.545732,-0.591517,-0.368118,-0.610282,-0.584980,-0.257174
4,-0.725543,-0.110953,0.380774,-0.669954,-0.034903,1.664779,-0.337099,-0.591517,-0.367978,-0.618292,-0.933631,-0.239791
...,...,...,...,...,...,...,...,...,...,...,...,...
7495,-1.219886,-0.110953,-1.656468,-1.394234,-0.052248,-0.340979,-0.337099,-0.591517,-0.367868,-0.680639,-1.376362,-0.255088
7496,0.275799,-0.110953,-0.230398,1.162048,-0.016152,-0.340979,-0.337099,1.690569,-0.358011,0.524039,-0.463816,4.412423
7497,0.731625,-0.110953,0.177050,1.957336,0.052029,-0.340979,-0.337099,-0.591517,2.760520,0.591407,-0.489306,-0.219628
7498,-0.207346,-0.110953,2.010568,1.162048,0.012237,-0.340979,-0.337099,-0.591517,-0.352647,0.585907,1.576689,-0.231448


## 1.10. Dummy-переменные

In [32]:
train_df = pd.get_dummies(train_df, drop_first=True)

У признака Years in current job есть значение "< 1 year".\
Этот значение конвертируется в dummy-признак с таким именем.

Алгоритм XGBoost не поддерживает символы '[', ']', '<'. в именах признаках.\
Заменим все запрещенные символы в названиях столбцов на приемлемые.

In [33]:
problem_chars = '[]<'
replacement = {'[': '(',
               ']': ')',
               '<': 'less_than'}

In [34]:
for colname in train_df.columns:
    for char in problem_chars:
        if char in colname:
            train_df = train_df.rename(columns={colname: colname.replace(char, replacement[char])})

In [35]:
problem_col_name = 'Years in current job_< 1 year'
train_df = train_df.rename(columns={problem_col_name: problem_col_name.replace('<', 'less_than')})
train_df.columns

Index(['Annual Income', 'Tax Liens', 'Number of Open Accounts',
       'Years of Credit History', 'Maximum Open Credit',
       'Number of Credit Problems', 'Bankruptcies', 'Term',
       'Current Loan Amount', 'Current Credit Balance', 'Monthly Debt',
       'Credit Score', 'Credit Default', 'Home Ownership_Home Mortgage',
       'Home Ownership_Own Home', 'Home Ownership_Rent',
       'Years in current job_10+ years', 'Years in current job_2 years',
       'Years in current job_3 years', 'Years in current job_4 years',
       'Years in current job_5 years', 'Years in current job_6 years',
       'Years in current job_7 years', 'Years in current job_8 years',
       'Years in current job_9 years', 'Years in current job_less_than 1 year',
       'Purpose_buy a car', 'Purpose_buy house', 'Purpose_debt consolidation',
       'Purpose_educational expenses', 'Purpose_home improvements',
       'Purpose_major purchase', 'Purpose_medical bills', 'Purpose_moving',
       'Purpose_other', 'Pur

## 1.11. Описание обработанных данных

In [36]:
train_df.describe()

Unnamed: 0,Annual Income,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Bankruptcies,Term,Current Loan Amount,Current Credit Balance,...,Purpose_home improvements,Purpose_major purchase,Purpose_medical bills,Purpose_moving,Purpose_other,Purpose_renewable energy,Purpose_small business,Purpose_take a trip,Purpose_vacation,Purpose_wedding
count,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,...,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0
mean,1.780192e-09,-1.449585e-08,6.357829e-09,-8.138021e-09,-1.525879e-09,-1.678467e-08,-7.629395e-09,3.026327e-08,3.560384e-09,-6.103515e-09,...,0.054933,0.005333,0.009467,0.001467,0.088667,0.000267,0.003467,0.004933,0.001067,0.002
std,1.000067,1.000067,1.000067,1.000067,1.000067,1.000067,1.000067,1.000067,1.000067,1.000067,...,0.227865,0.07284,0.096842,0.038272,0.284281,0.016329,0.05878,0.070069,0.032645,0.04468
min,-1.533873,-0.1109534,-1.860192,-2.033305,-0.05897941,-0.3409789,-0.337099,-0.5915167,-0.371568,-0.9118549,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.5208805,-0.1109534,-0.6378469,-0.6841558,-0.04155495,-0.3409789,-0.337099,-0.5915167,-0.3662765,-0.5523883,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.2073461,-0.1109534,-0.2303985,-0.1871009,-0.02914137,-0.3409789,-0.337099,-0.5915167,-0.362223,-0.2532962,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.2308539,-0.1109534,0.5844983,0.4945742,-0.00946339,-0.3409789,-0.337099,1.690569,-0.3556352,0.2220323,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,11.66116,25.66362,6.4925,5.592937,81.35844,13.69933,11.19423,1.690569,2.76052,19.55942,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# 2. Классы генерации признаков и предобработки данных

## 2.1. Класс генерации признаков

In [37]:
class FeatureGenerator:
    """Class for generating new features for the Credit Default dataset."""
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        pass
    
    def transform(self, X):
        pass

## 2.2. Класс предобработки данных

In [38]:
class DataPipeline:
    """Class for cleaning data, transforming data, generating new features in the Credit Default dataset."""
    scaler_filepath = './scaler.pkl'
    scaler_type = StandardScaler
    
    def __init__(self, scaler_path=SCALER_FILEPATH):
        """"""
        self.NUM_FEATURE_TYPES = ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']
        self.FEATURES = ['Home Ownership', 'Annual Income', 'Years in current job', 'Tax Liens', 
                         'Number of Open Accounts', 'Years of Credit History', 'Maximum Open Credit', 
                         'Number of Credit Problems', 'Bankruptcies', 'Purpose', 'Term', 'Current Loan Amount', 
                         'Current Credit Balance', 'Monthly Debt', 'Credit Score']
        self.NEW_FEATURES = []
        self.TARGET_VARIABLE = 'Credit Default'
        
        self.medians = None
        self.modes = None
        
        self.problem_col_name = None
        
        self.features_to_standard = None
        self.scaler_filepath = scaler_path
        self.scaler = self.scaler_type()
        
        self.feature_gen = None
    
    def fit(self, X, y=None):
        """Fit the pipeline to the dataset."""
        self.medians = X.median()
        self.modes = X.select_dtypes(include='category').mode()
        
        self.problem_colname = 'Years in current job_< 1 year'
        
        # Train scaler
        self.features_to_standard = X[self.FEATURES].select_dtypes(include=self.NUM_FEATURE_TYPES).columns.tolist()
        self.scaler.fit(X[self.features_to_standard])
        # Save the scaler
        with open(self.scaler_filepath, 'wb') as file:
            pickle.dump(scaler, file)
    
    def transform(self, X):
        """Transform the dataset in order to make a better ML model."""
        X = self.fix_nan(X)
        X = self.term_to_int(X)
        
        X = self.standardize_features(X)
        
        X = pd.get_dummies(X, drop_first=True)
        X = self.replace_problem_colnames(X)
        
        return X
    
    def fix_nan(self, X):
        """"""
        # Numerical features
        X.loc[X['Annual Income'].isna(), 'Annual Income'] = X['Annual Income'].median()
        X.loc[X['Bankruptcies'].isna(), 'Bankruptcies'] = X['Bankruptcies'].median()
        X.loc[X['Credit Score'].isna(), 'Credit Score'] = X['Credit Score'].median()
        
        # Categorical features
        X.loc[X['Years in current job'].isna(), 'Years in current job'] = X['Years in current job'].mode()[0]
        
        # Drop Months since last delinquent since it is missing > 50% of values
        X = X.drop(columns=['Months since last delinquent'])
        
        return X
    
    def term_to_int(self, X):
        """Convert categorical string values of feature Term to binary 0 and 1."""
        X['Term'] = pd.Series(X['Term'].map({'Short Term': 0, 'Long Term': 1}), dtype=np.int8)
        
        return X
    
    def standardize_features(self, X):
        """Standardize numerical features of a dataset."""
        standard_features = self.scaler.transform(X[self.features_to_standard])
        X[self.features_to_standard] = pd.DataFrame(standard_features, columns=self.features_to_standard)
        
        return X
    
    def replace_problem_colnames(self, X):
        """Replace banned in XGBoost characters in column names."""
        problem_chars = '[]<'
        replacement = {'[': '(',
                       ']': ')',
                       '<': 'less_than'}
        
        for colname in X.columns:
            for char in problem_chars:
                if char in colname:
                    X = X.rename(columns={colname: colname.replace(char, replacement[char])})
        
        return X

**Обработка датасетов классом DataPipeline**

Загрузим изначальные датасеты train, test и обработаем их с помощью класса DataPipeline.

In [39]:
train_df = pd.read_csv(TRAIN_FILEPATH)
test_df = pd.read_csv(TEST_FILEPATH)
train_df.shape, test_df.shape

((7500, 17), (2500, 16))

In [40]:
pipeline = DataPipeline()
pipeline.fit(train_df)

train_df = pipeline.transform(train_df)
test_df = pipeline.transform(test_df)
train_df.shape, test_df.shape

((7500, 40), (2500, 38))

In [41]:
train_df = pd.get_dummies(train_df, drop_first=True)
test_df = pd.get_dummies(test_df, drop_first=True)

train_df.shape, test_df.shape

((7500, 40), (2500, 38))

In [42]:
trn = set(train_df.columns.tolist())
tst = set(test_df.columns.tolist())

trn.remove('Credit Default')
trn.difference(tst)

{'Purpose_renewable energy'}

Есть признаки в dummy-переменных, которые есть не в обоих датасетах.\
Исправим это.

In [43]:
"Train" if 'Purpose_renewable energy' in trn else "Test"

'Train'

Этот признак есть в train датасете, но не в test.\
Добавим его с нулевыми значениями, так как это dummy-переменная общего признака Purpose.

In [44]:
test_df['Purpose_renewable energy'] = 0

train_df.shape, test_df.shape

((7500, 40), (2500, 39))

Теперь все признаки в датасетах одинаковы.

In [45]:
train_df.columns, test_df.columns

(Index(['Annual Income', 'Tax Liens', 'Number of Open Accounts',
        'Years of Credit History', 'Maximum Open Credit',
        'Number of Credit Problems', 'Bankruptcies', 'Term',
        'Current Loan Amount', 'Current Credit Balance', 'Monthly Debt',
        'Credit Score', 'Credit Default', 'Home Ownership_Home Mortgage',
        'Home Ownership_Own Home', 'Home Ownership_Rent',
        'Years in current job_10+ years', 'Years in current job_2 years',
        'Years in current job_3 years', 'Years in current job_4 years',
        'Years in current job_5 years', 'Years in current job_6 years',
        'Years in current job_7 years', 'Years in current job_8 years',
        'Years in current job_9 years', 'Years in current job_less_than 1 year',
        'Purpose_buy a car', 'Purpose_buy house', 'Purpose_debt consolidation',
        'Purpose_educational expenses', 'Purpose_home improvements',
        'Purpose_major purchase', 'Purpose_medical bills', 'Purpose_moving',
        'Purpo

# 3. Сохранение обработанного датасета

In [46]:
train_df.to_csv(PREPARED_TRAIN_FILEPATH, sep=',', index=False, encoding='utf-8')

# 4. Разбиение на train, test

In [47]:
X = train_df.drop(target, axis=1)
y = train_df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

# 5. Построение модели

### LogisticRegression

In [110]:
model_params = {'solver': 'liblinear'}

In [111]:
lr_model = LogisticRegression(**model_params)

lr_model.fit(X_train, y_train)

LogisticRegression(solver='liblinear')

In [112]:
y_pred = lr_model.predict(X_train)

In [113]:
accuracy = accuracy_score(y_train, y_pred)
accuracy

0.781

In [114]:
f1 = f1_score(y_train, y_pred)
f1

0.3933518005540167

In [115]:
cv_score = cross_val_score(lr_model, X, y, scoring='f1', cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))
cv_score

array([0.42369838, 0.3706422 , 0.39718805, 0.38545455, 0.40998217])

In [116]:
mean_cv_score = cv_score.mean()
mean_cv_score

0.39739307107753624

In [117]:
comment = "Replaced KFold to StratifiedKFold"

In [117]:
log_model_results(lr_model, accuracy, f1, mean_cv_score, comment)

2020-09-25;21:42:25;LogisticRegression;0.781;0.3933518005540167;0.39739307107753624;{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False};Base LogisticRegression



### SupportVectorClassifier

In [118]:
model_params = {'kernel': 'sigmoid'}

In [119]:
svc_model = SVC(**model_params)

svc_model.fit(X_train, y_train)

SVC(kernel='sigmoid')

In [120]:
y_pred = svc_model.predict(X_train)

In [121]:
accuracy = accuracy_score(y_train, y_pred)
accuracy

0.6856666666666666

In [122]:
f1 = f1_score(y_train, y_pred)
f1

0.4186189889025894

In [123]:
cv_score = cross_val_score(svc_model, X, y, scoring='f1', cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))
cv_score

array([0.42574257, 0.41990291, 0.43985419, 0.42496999, 0.41636582])

In [124]:
mean_cv_score = cv_score.mean()
mean_cv_score

0.42536709823252095

In [125]:
comment = "Replaced KFold to StratifiedKFold"

In [125]:
log_model_results(svc_model, accuracy, f1, mean_cv_score, comment)

2020-09-25;21:42:44;SVC;0.6856666666666666;0.4186189889025894;0.42536709823252095;{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'sigmoid', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False};Base SupportVectorClassifier



### KNN

In [126]:
model_params = {'n_neighbors': 5}

In [127]:
knn_model = KNeighborsClassifier(**model_params)

knn_model.fit(X_train, y_train)

KNeighborsClassifier()

In [128]:
y_pred = knn_model.predict(X_train)

In [129]:
accuracy = accuracy_score(y_train, y_pred)
accuracy

0.8136666666666666

In [130]:
f1 = f1_score(y_train, y_pred)
f1

0.5815868263473054

In [131]:
cv_score = cross_val_score(knn_model, X, y, scoring='f1', cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))
cv_score

array([0.448737  , 0.43373494, 0.45037037, 0.40483384, 0.4260355 ])

In [132]:
mean_cv_score = cv_score.mean()
mean_cv_score

0.4327423296920217

In [133]:
comment = "Replaced KFold to StratifiedKFold"

In [133]:
log_model_results(knn_model, accuracy, f1, mean_cv_score, comment)

2020-09-25;21:42:56;KNeighborsClassifier;0.8136666666666666;0.5815868263473054;0.4327423296920217;{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'};Base KNNClassifier



### DecisionTree

In [134]:
model_params = {'max_depth': 5,
                'min_samples_leaf': 10,
                'random_state': 42}

In [135]:
tree_model = DecisionTreeClassifier(**model_params)

tree_model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=5, min_samples_leaf=10, random_state=42)

In [136]:
y_pred = tree_model.predict(X_train)

In [137]:
accuracy = accuracy_score(y_train, y_pred)
accuracy

0.7761666666666667

In [138]:
f1 = f1_score(y_train, y_pred)
f1

0.4971920628977911

In [139]:
cv_score = cross_val_score(tree_model, X, y, scoring='f1', cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))
cv_score

array([0.46496815, 0.41237113, 0.44333333, 0.48367953, 0.46587537])

In [140]:
mean_cv_score = cv_score.mean()
mean_cv_score

0.4540455032725255

In [141]:
comment = "Replaced KFold to StratifiedKFold"

In [141]:
log_model_results(tree_model, accuracy, f1, mean_cv_score, comment)

2020-09-25;21:43:07;DecisionTreeClassifier;0.7761666666666667;0.4971920628977911;0.4540455032725255;{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 10, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': 'deprecated', 'random_state': 42, 'splitter': 'best'};Base DecisionTreeClassifier



### RandomForest

In [142]:
model_params = {'n_estimators': 100,
                'max_features': 20,
                'max_depth': 20,
                'min_samples_leaf': 3,
                'random_state': 42}

In [143]:
rf_model = RandomForestClassifier(**model_params)

rf_model.fit(X_train, y_train)

RandomForestClassifier(max_depth=20, max_features=20, min_samples_leaf=3,
                       random_state=42)

In [144]:
y_pred = rf_model.predict(X_train)

In [145]:
accuracy = accuracy_score(y_train, y_pred)
accuracy

0.9368333333333333

In [146]:
f1 = f1_score(y_train, y_pred)
f1

0.8724335240659711

In [147]:
cv_score = cross_val_score(rf_model, X, y, scoring='f1', cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))
cv_score

array([0.48287113, 0.42760943, 0.48220065, 0.44039735, 0.44732577])

In [148]:
mean_cv_score = cv_score.mean()
mean_cv_score

0.45608086426357486

In [149]:
comment = "Replaced KFold to StratifiedKFold"

In [149]:
log_model_results(rf_model, accuracy, f1, mean_cv_score, comment)

2020-09-25;21:43:27;RandomForestClassifier;0.9368333333333333;0.8724335240659711;0.45608086426357486;{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 20, 'max_features': 20, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False};Base RandomForestClassifier



### GradientBoosting

In [150]:
model_params = {'criterion': 'mse',
                'n_estimators': 300,
                'max_depth': 5,
                'min_samples_leaf': 5,
                'random_state': 42}

In [151]:
gb_model = GradientBoostingClassifier(**model_params)

gb_model.fit(X_train, y_train)

GradientBoostingClassifier(criterion='mse', max_depth=5, min_samples_leaf=5,
                           n_estimators=300, random_state=42)

In [152]:
y_pred = gb_model.predict(X_train)

In [153]:
accuracy = accuracy_score(y_train, y_pred)
accuracy

0.9381666666666667

In [154]:
f1 = f1_score(y_train, y_pred)
f1

0.8761268781302171

In [155]:
cv_score = cross_val_score(rf_model, X, y, scoring='f1', cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))
cv_score

array([0.48287113, 0.42760943, 0.48220065, 0.44039735, 0.44732577])

In [156]:
mean_cv_score = cv_score.mean()
mean_cv_score

0.45608086426357486

In [157]:
comment = "Replaced KFold to StratifiedKFold"

In [157]:
log_model_results(gb_model, accuracy, f1, mean_cv_score, comment)

2020-09-25;21:43:53;GradientBoostingClassifier;0.9381666666666667;0.8761268781302171;0.45608086426357486;{'ccp_alpha': 0.0, 'criterion': 'mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 5, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 5, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 300, 'n_iter_no_change': None, 'presort': 'deprecated', 'random_state': 42, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False};Base GradientBoostingClassifier



### XGBoost

In [158]:
model_params = {'objective': 'binary:logistic',
                'n_estimators': 10,
                'seed': 42,
                'max_depth': 6}

In [159]:
xgb_model = xgb.XGBClassifier(**model_params)

xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=10, n_jobs=0, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [160]:
y_pred = xgb_model.predict(X_train)

In [161]:
accuracy = accuracy_score(y_train, y_pred)
accuracy

0.7983333333333333

In [162]:
f1 = f1_score(y_train, y_pred)
f1

0.4655477031802121

In [163]:
cv_score = cross_val_score(xgb_model, X, y, scoring='f1', cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))
cv_score

array([0.42179262, 0.38686131, 0.4130809 , 0.39431616, 0.42248722])

In [164]:
mean_cv_score = cv_score.mean()
mean_cv_score

0.4077076428170698

In [165]:
comment = "Replaced KFold to StratifiedKFold"

In [165]:
log_model_results(xgb_model, accuracy, f1, mean_cv_score, comment)

2020-09-25;21:44:03;XGBClassifier;0.7983333333333333;0.4655477031802121;0.4077076428170698;{'objective': 'binary:logistic', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'gamma': 0, 'gpu_id': -1, 'importance_type': 'gain', 'interaction_constraints': '', 'learning_rate': 0.300000012, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 1, 'missing': nan, 'monotone_constraints': '()', 'n_estimators': 10, 'n_jobs': 0, 'num_parallel_tree': 1, 'random_state': 42, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'subsample': 1, 'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': None, 'seed': 42};Base XGBoostClassifier



### adaboost

In [166]:
# ?

### catboost

In [None]:
# ?

# 6. Настройка и валидация конечной модели

## 6.1. Оценка модели

## 6.2. Кросс-валидация

## 6.3. Логирование результатов валидации

In [None]:
# Just pass the trained model into a function that will evaluate the model, cross-validate it, then log it into training_log.csv.
# It should basically run every time a model is validated, evaluated so that it logs every experiment automatically.
# Afterwards I can just load this as a DataFrame and find better scores to choose the model.

In [78]:
# log_model_results(model, f1_score, mean_cv_score, comment=None)

# 7. Анализ прогнозов модели

## 7.1. Важность признаков

## 7.2. Топ-3 фичи

## 7.3. Худшие 3 фичи

# 8. Сохранение модели

**Создание файла requirements.txt для проекта**

In [None]:
%conda list -e > requirements.txt

# 9. Прогнозирование на тестовом датасете

# 