In [1]:
import numpy as np
import pandas as pd

from scipy.stats import shapiro
from scipy.stats import probplot
from scipy.stats import ttest_ind, mannwhitneyu
from scipy.stats import chi2_contingency
from statsmodels.stats.weightstats import zconfint

import seaborn as sns
from matplotlib import pyplot as plt

import pickle
import random

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, learning_curve
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb, lightgbm as lgbm, catboost as catb

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline


%matplotlib inline

In [2]:
import warnings
warnings.simplefilter('ignore')

**Пути к директориям и файлам**

In [3]:
TRAIN_DATASET_PATH = 'data/course_project_train.csv'
TEST_DATASET_PATH = 'data/course_project_test.csv'

**Описание датасета**

* **Home Ownership** - домовладение
* **Annual Income** - годовой доход
* **Years in current job** - количество лет на текущем месте работы
* **Tax Liens** - налоговые льготы
* **Number of Open Accounts** - количество открытых счетов
* **Years of Credit History** - количество лет кредитной истории
* **Maximum Open Credit** - наибольший открытый кредит
* **Number of Credit Problems** - количество проблем с кредитом
* **Months since last delinquent** - количество месяцев с последней просрочки платежа
* **Bankruptcies** - банкротства
* **Purpose** - цель кредита
* **Term** - срок кредита
* **Current Loan Amount** - текущая сумма кредита
* **Current Credit Balance** - текущий кредитный баланс
* **Monthly Debt** - ежемесячный долг
* **Credit Default** - факт невыполнения кредитных обязательств (0 - погашен вовремя, 1 - просрочка)

In [4]:
df = pd.read_csv(TRAIN_DATASET_PATH)
df_test = pd.read_csv(TEST_DATASET_PATH)

df.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0


**Выделение целевой переменной и групп признаков**

In [5]:
TARGET_NAME = 'Credit Default'
BASE_FEATURE_NAMES = df.columns.drop(TARGET_NAME).tolist()
NEW_FEATURE_NAMES = df.columns.drop([TARGET_NAME] + BASE_FEATURE_NAMES)

### PreProcessing

### Заполнение пропущенных признаков

In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 16 columns):
Home Ownership                  2500 non-null object
Annual Income                   1987 non-null float64
Years in current job            2414 non-null object
Tax Liens                       2500 non-null float64
Number of Open Accounts         2500 non-null float64
Years of Credit History         2500 non-null float64
Maximum Open Credit             2500 non-null float64
Number of Credit Problems       2500 non-null float64
Months since last delinquent    1142 non-null float64
Bankruptcies                    2497 non-null float64
Purpose                         2500 non-null object
Term                            2500 non-null object
Current Loan Amount             2500 non-null float64
Current Credit Balance          2500 non-null float64
Monthly Debt                    2500 non-null float64
Credit Score                    1987 non-null float64
dtypes: float64(12), object(4)
me

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
Home Ownership                  7500 non-null object
Annual Income                   5943 non-null float64
Years in current job            7129 non-null object
Tax Liens                       7500 non-null float64
Number of Open Accounts         7500 non-null float64
Years of Credit History         7500 non-null float64
Maximum Open Credit             7500 non-null float64
Number of Credit Problems       7500 non-null float64
Months since last delinquent    3419 non-null float64
Bankruptcies                    7486 non-null float64
Purpose                         7500 non-null object
Term                            7500 non-null object
Current Loan Amount             7500 non-null float64
Current Credit Balance          7500 non-null float64
Monthly Debt                    7500 non-null float64
Credit Score                    5943 non-null float64
Credit Default                  7

In [8]:
# "Credit Score"
def credit_score(df, cs_median):
    df.loc[df['Credit Score'].isnull() == True]
    df['Credit Score'].unique()
    df['Credit Score'] = df['Credit Score'].fillna(cs_median)
    return df
cs_median = df.loc[df['Credit Score'].isnull() == False, 'Credit Score'].unique().mean().round(1)
df = credit_score(df, cs_median)
df_test = credit_score(df_test, cs_median)


In [9]:
# "Bankruptcies"
def bankruptcies(df):
    df.loc[df['Bankruptcies'].isnull() == True]
    df['Bankruptcies'].unique()
    df['Bankruptcies'] = df['Bankruptcies'].fillna(1)
    return df
df = bankruptcies(df)
df_test = bankruptcies(df_test)

In [10]:
# "Annual Income"
def annual_income(df, ai_median):
    df['Annual Income'] = df['Annual Income'].fillna(ai_median)
    return df
ai_median = df.loc[(df['Annual Income'].isnull() == False), 'Annual Income'].mean()

# df = annual_income(df, ai_median)
df_test = annual_income(df_test, ai_median)

In [11]:
# "Years in current job"
def year_job(df):
    df.loc[df['Years in current job'].isnull() == False, 'Years in current job'].value_counts()
    job_median = '10+ years'
    df['Years in current job'] = df['Years in current job'].fillna(job_median)
    return df
df = year_job(df)
df_test = year_job(df_test)

In [12]:
# Удалим признак у которого половина значений пропущена
df.drop(columns=['Months since last delinquent'], inplace=True, axis=1)
df_test.drop(columns=['Months since last delinquent'], inplace=True, axis=1)

### Categorical Variables

In [13]:
# Само значение 9999999, заменим на значение из "текущего кредитный баланс"
def cla_categolial(df):
    df['CLA_categolial'] = 0
    df.loc[df['Current Loan Amount'] == 99999999, 'CLA_categolial'] = 1
    df.loc[df['Current Loan Amount'] == 99999999, 'Current Loan Amount'] =\
    df.loc[df['Current Loan Amount'] == 99999999, 'Current Credit Balance']
    return df
df = cla_categolial(df)
df_test = cla_categolial(df_test)

In [14]:
# Конвертируем стаж работы в числовые

unique_items = df['Term'].unique().tolist()
unique_num = [0, 1]
years_dict = dict(zip(unique_items, unique_num))
df['Term'] = df['Term'].map(years_dict)
df_test['Term'] = df_test['Term'].map(years_dict).astype(int)

In [15]:
# Конвертируем стаж работы

years_job = df['Years in current job']
encoder = LabelBinarizer()
new_years_job = encoder.fit_transform(years_job)
tmp = pd.DataFrame(new_years_job)
tmp_col = ['Years in current job_'+str(i) for i in tmp.columns]
tmp.columns = tmp_col
df = pd.concat([df, tmp], axis=1)
df.drop(columns=['Years in current job'], inplace=True, axis=1)

tmp_test = pd.DataFrame(encoder.transform(df_test['Years in current job']), columns=tmp_col)
df_test = pd.concat([df_test, tmp_test], axis=1)
df_test.drop(columns=['Years in current job'], inplace=True, axis=1)

In [16]:
# Конвертируем Цель кредита

purpose_job = df['Purpose']
encoder_p = LabelBinarizer()
new_years_job = encoder_p.fit_transform(purpose_job)
tmp = pd.DataFrame(new_years_job)
tmp_col = ['Purpose_'+str(i) for i in tmp.columns]
tmp.columns = tmp_col
df = pd.concat([df, tmp], axis=1)
df.drop(columns=['Purpose'], inplace=True, axis=1)

tmp_test = pd.DataFrame(encoder_p.transform(df_test['Purpose']), columns=tmp_col)
df_test = pd.concat([df_test, tmp_test], axis=1)
df_test.drop(columns=['Purpose'], inplace=True, axis=1)

In [17]:
# Конвертируем Домовладение "Home Ownership"

purpose_job = df['Home Ownership']
encoder_h = LabelBinarizer()
new_years_job = encoder_h.fit_transform(purpose_job)
tmp = pd.DataFrame(new_years_job)
tmp_col = ['Home Ownership_'+str(i) for i in tmp.columns]
tmp.columns = tmp_col
df = pd.concat([df, tmp], axis=1)
df.drop(columns=['Home Ownership'], inplace=True, axis=1)

tmp_test = pd.DataFrame(encoder_h.transform(df_test['Home Ownership']), columns=tmp_col)
df_test = pd.concat([df_test, tmp_test], axis=1)
df_test.drop(columns=['Home Ownership'], inplace=True, axis=1)

In [18]:
len(df.drop(columns=TARGET_NAME).columns) == len(df_test.columns)

True

In [19]:
# # Удаляем пропущенные значения для балансировки классов
# df_short = df.loc[df['Annual Income'].isnull()  == True]
# df_short.index
# df.drop(index=df_short.index, inplace=True)
# df = df.reset_index()

In [20]:
# Удаляем пропущенные значения для балансировки классов
df_short = df.loc[(df['Annual Income'].isnull()  == True) & (df[TARGET_NAME] == 0)]
df.drop(index=df_short.index, inplace=True)
df = annual_income(df, ai_median)
df = df.reset_index()

### Models

In [21]:
# a = df.select_dtypes(include='object').columns.tolist()
# b = df.columns.tolist()
# NUMERIC_FEATURE_NAMES = list(set(b).difference(a))
# # NUMERIC_FEATURE_NAMES = temp_columns

In [22]:
# df.info()

### Нормализация данных<a class="anchor" id="normalization"></a>

In [23]:
# Нормализация числовых признаков
scaler = StandardScaler()

df_norm_col = df.drop(columns=TARGET_NAME).columns
df_norm = scaler.fit_transform(df[df_norm_col])

df[df_norm_col] = pd.DataFrame(df_norm, columns=df_norm_col)

df_norm_test_col = df_test.columns
df_test[df_norm_test_col] = pd.DataFrame(scaler.fit_transform(df_test[df_norm_test_col]), columns=df_norm_test_col)


In [24]:
# df.loc[df['Annual Income'].isnull()  == True]

In [25]:
df[TARGET_NAME].value_counts()

0    4359
1    2113
Name: Credit Default, dtype: int64

### Разбиение на train и test<a class="anchor" id="train_and_test"></a>

In [26]:
df = df.drop(columns=['index'])

In [27]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_test_true, y_test_pred))

In [28]:
X = df.drop(columns=TARGET_NAME)
y = df[TARGET_NAME]

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.24, random_state=21)

### Балансировка целевой переменной<a class="anchor" id="target_balancing"></a>

In [29]:
def balance_df_by_target(df, target_name):

    target_counts = df[target_name].value_counts()

    major_class_name = target_counts.argmax()
    minor_class_name = target_counts.argmin()

    disbalance_coeff = int(target_counts[major_class_name] / target_counts[minor_class_name]) - 1

    for i in range(disbalance_coeff):
        sample = df[df[target_name] == minor_class_name].sample(target_counts[minor_class_name])
        df = df.append(sample, ignore_index=True)

    return df.sample(frac=1) 

In [30]:
df_for_balancing = pd.concat([X_train, y_train], axis=1)
df_balanced = balance_df_by_target(df_for_balancing, TARGET_NAME)
    
df_balanced[TARGET_NAME].value_counts()

0    3294
1    3248
Name: Credit Default, dtype: int64

In [31]:
X = df_balanced.drop(columns=TARGET_NAME)
y = df_balanced[TARGET_NAME]

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.24, random_state=21)

# Models

In [32]:
# Создаем датафрейм для создания ансамбля моделей.
predict_one_level = pd.DataFrame(y_train.tolist(), columns=['target'])

### LogRegression

In [33]:
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)

y_train_pred = model_lr.predict(X_train)
y_test_pred = model_lr.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

predict_one_level['model_lr'] = model_lr.predict(X_train)  
predict_one_level_test = pd.DataFrame(model_lr.predict(df_test), columns=['model_lr'])

TRAIN

              precision    recall  f1-score   support

           0       0.70      0.85      0.77      2495
           1       0.81      0.64      0.71      2476

   micro avg       0.74      0.74      0.74      4971
   macro avg       0.76      0.74      0.74      4971
weighted avg       0.76      0.74      0.74      4971

TEST

              precision    recall  f1-score   support

           0       0.71      0.84      0.77       799
           1       0.80      0.64      0.71       772

   micro avg       0.75      0.75      0.75      1571
   macro avg       0.75      0.74      0.74      1571
weighted avg       0.75      0.75      0.74      1571

CONFUSION MATRIX

col_0             0    1
Credit Default          
0               674  125
1               275  497


In [34]:
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)

y_train_pred = model_knn.predict(X_train)
y_test_pred = model_knn.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)
predict_one_level['model_knn'] = model_knn.predict(X_train)
predict_one_level_test['model_knn'] = model_knn.predict(df_test)

TRAIN

              precision    recall  f1-score   support

           0       0.79      0.83      0.81      2495
           1       0.82      0.78      0.80      2476

   micro avg       0.80      0.80      0.80      4971
   macro avg       0.80      0.80      0.80      4971
weighted avg       0.80      0.80      0.80      4971

TEST

              precision    recall  f1-score   support

           0       0.72      0.72      0.72       799
           1       0.71      0.72      0.71       772

   micro avg       0.72      0.72      0.72      1571
   macro avg       0.72      0.72      0.72      1571
weighted avg       0.72      0.72      0.72      1571

CONFUSION MATRIX

col_0             0    1
Credit Default          
0               572  227
1               218  554


**Бустинговые алгоритмы**

*XGBoost*

In [35]:
model_xgb = xgb.XGBClassifier(random_state=21)
model_xgb.fit(X_train, y_train)

y_train_pred = model_xgb.predict(X_train)
y_test_pred = model_xgb.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

predict_one_level['model_xgb'] = model_xgb.predict(X_train)
predict_one_level_test['model_xgb'] = model_xgb.predict(df_test)

TRAIN

              precision    recall  f1-score   support

           0       0.74      0.89      0.81      2495
           1       0.86      0.69      0.77      2476

   micro avg       0.79      0.79      0.79      4971
   macro avg       0.80      0.79      0.79      4971
weighted avg       0.80      0.79      0.79      4971

TEST

              precision    recall  f1-score   support

           0       0.73      0.85      0.79       799
           1       0.82      0.68      0.74       772

   micro avg       0.77      0.77      0.77      1571
   macro avg       0.77      0.77      0.77      1571
weighted avg       0.77      0.77      0.77      1571

CONFUSION MATRIX

col_0             0    1
Credit Default          
0               680  119
1               246  526


*LightGBM*

In [36]:
model_lgbm = lgbm.LGBMClassifier(random_state=21)
model_lgbm.fit(X_train, y_train)

y_train_pred = model_lgbm.predict(X_train)
y_test_pred = model_lgbm.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

predict_one_level['model_lgbm'] = model_lgbm.predict(X_train)
predict_one_level_test['model_lgbm'] = model_lgbm.predict(df_test)

TRAIN

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      2495
           1       0.99      0.96      0.97      2476

   micro avg       0.97      0.97      0.97      4971
   macro avg       0.97      0.97      0.97      4971
weighted avg       0.97      0.97      0.97      4971

TEST

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       799
           1       0.83      0.82      0.82       772

   micro avg       0.83      0.83      0.83      1571
   macro avg       0.83      0.83      0.83      1571
weighted avg       0.83      0.83      0.83      1571

CONFUSION MATRIX

col_0             0    1
Credit Default          
0               665  134
1               136  636


*CatBoost*

In [46]:
model_catb1 =catb.CatBoostClassifier(silent=True, random_state=21)

In [47]:
model_catb1.fit(X_train, y_train)

y_train_pred = model_catb1.predict(X_train)
y_test_pred = model_catb1.predict(X_test)

get_classification_report(y_train, y_train_pred, y_test, y_test_pred)
predict_one_level['model_catb'] = model_catb1.predict(X_train)
predict_one_level_test['model_catb'] = model_catb1.predict(df_test)

TRAIN

              precision    recall  f1-score   support

           0       0.83      0.93      0.88      2495
           1       0.91      0.81      0.86      2476

   micro avg       0.87      0.87      0.87      4971
   macro avg       0.87      0.87      0.87      4971
weighted avg       0.87      0.87      0.87      4971

TEST

              precision    recall  f1-score   support

           0       0.78      0.84      0.81       799
           1       0.82      0.75      0.78       772

   micro avg       0.79      0.79      0.79      1571
   macro avg       0.80      0.79      0.79      1571
weighted avg       0.80      0.79      0.79      1571

CONFUSION MATRIX

col_0           0.0  1.0
Credit Default          
0               669  130
1               193  579


In [48]:
# Посторим модель обучения, основанная на ансамбле алгоритмов
from sklearn.linear_model import LinearRegression
meta_model = LinearRegression(n_jobs=-1)
meta_model.fit(predict_one_level.drop('target', axis=1), predict_one_level['target'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [49]:
final_predictions = meta_model.predict(predict_one_level_test)

### Применение модели на тестовую выборку

In [54]:
submission = pd.DataFrame({'Id':np.arange(7500, 7500+2500), 'Credit Default': final_predictions})

In [56]:
submission['Credit Default'] = [True if i>0.5 else False for i in submission['Credit Default']]

In [52]:
submission.to_csv('out_test/pred_test_v09.csv', index = False)