In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score as r2
from sklearn.metrics import classification_report, plot_confusion_matrix

import catboost as catb
import xgboost as xgb
import lightgbm as lgbm
import warnings 

warnings.filterwarnings('ignore')

%config InlineBackend.figure_format = 'svg'
%matplotlib inline

plt.figure(figsize=(18, 18))

<Figure size 1296x1296 with 0 Axes>

<Figure size 1296x1296 with 0 Axes>

In [2]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_test_true, y_test_pred))

In [3]:
def evaluate_preds(model, X_train, X_test, y_train, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

In [4]:
DATA_TRAIN = 'train.csv'
DATA_TEST = 'test.csv'

In [5]:
train_df = pd.read_csv(DATA_TRAIN)

In [6]:
train_df[
    ["Credit Default"]
].value_counts()

Credit Default
0                 5387
1                 2113
dtype: int64

In [7]:
train_df.dtypes

Home Ownership                   object
Annual Income                   float64
Years in current job             object
Tax Liens                       float64
Number of Open Accounts         float64
Years of Credit History         float64
Maximum Open Credit             float64
Number of Credit Problems       float64
Months since last delinquent    float64
Bankruptcies                    float64
Purpose                          object
Term                             object
Current Loan Amount             float64
Current Credit Balance          float64
Monthly Debt                    float64
Credit Score                    float64
Credit Default                    int64
dtype: object

In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 5943 non-null   float64
 2   Years in current job          7129 non-null   object 
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  3419 non-null   float64
 9   Bankruptcies                  7486 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [9]:
test_df = pd.read_csv(DATA_TEST)

In [10]:
test_df.tail()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score
2495,Home Mortgage,1020053.0,10+ years,0.0,14.0,29.1,559152.0,1.0,68.0,1.0,debt consolidation,Short Term,99999999.0,162735.0,15046.0,745.0
2496,Home Mortgage,,2 years,0.0,15.0,17.0,1737780.0,0.0,77.0,0.0,debt consolidation,Short Term,468512.0,1439269.0,32996.0,
2497,Home Mortgage,1171806.0,2 years,0.0,48.0,12.8,1706430.0,0.0,,0.0,debt consolidation,Short Term,430496.0,676438.0,36912.0,695.0
2498,Rent,723520.0,10+ years,0.0,14.0,28.8,945780.0,0.0,,0.0,debt consolidation,Short Term,257774.0,391248.0,13506.0,744.0
2499,Rent,1694439.0,10+ years,0.0,12.0,18.4,1199748.0,1.0,72.0,0.0,debt consolidation,Long Term,763004.0,559531.0,23440.0,6820.0


## 1. EDA

Делаем EDA для:

 * Заполнения
 

### **Количественные переменнные**

In [11]:
train_df.describe()

Unnamed: 0,Annual Income,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
count,5943.0,7500.0,7500.0,7500.0,7500.0,7500.0,3419.0,7486.0,7500.0,7500.0,7500.0,5943.0,7500.0
mean,1366392.0,0.030133,11.130933,18.317467,945153.7,0.17,34.6926,0.117152,11873180.0,289833.2,18314.454133,1151.087498,0.281733
std,845339.2,0.271604,4.908924,7.041946,16026220.0,0.498598,21.688806,0.347192,31926120.0,317871.4,11926.764673,1604.451418,0.449874
min,164597.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,11242.0,0.0,0.0,585.0,0.0
25%,844341.0,0.0,8.0,13.5,279229.5,0.0,16.0,0.0,180169.0,114256.5,10067.5,711.0,0.0
50%,1168386.0,0.0,10.0,17.0,478159.0,0.0,32.0,0.0,309573.0,209323.0,16076.5,731.0,0.0
75%,1640137.0,0.0,14.0,21.8,793501.5,0.0,50.0,0.0,519882.0,360406.2,23818.0,743.0,1.0
max,10149340.0,7.0,43.0,57.7,1304726000.0,7.0,118.0,4.0,100000000.0,6506797.0,136679.0,7510.0,1.0


### **Номинативные переменные**

In [12]:
for column_name, column_values in train_df.select_dtypes(include='object').iteritems():
    print()
    print("-----------------")
    print()
    print(column_name)
    print()
    print(column_values.value_counts())


-----------------

Home Ownership

Home Mortgage    3637
Rent             3204
Own Home          647
Have Mortgage      12
Name: Home Ownership, dtype: int64

-----------------

Years in current job

10+ years    2332
2 years       705
3 years       620
< 1 year      563
5 years       516
1 year        504
4 years       469
6 years       426
7 years       396
8 years       339
9 years       259
Name: Years in current job, dtype: int64

-----------------

Purpose

debt consolidation      5944
other                    665
home improvements        412
business loan            129
buy a car                 96
medical bills             71
major purchase            40
take a trip               37
buy house                 34
small business            26
wedding                   15
moving                    11
educational expenses      10
vacation                   8
renewable energy           2
Name: Purpose, dtype: int64

-----------------

Term

Short Term    5556
Long Term     1944
Name

In [13]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 5943 non-null   float64
 2   Years in current job          7129 non-null   object 
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  3419 non-null   float64
 9   Bankruptcies                  7486 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

Пропуски в столбцах типа **object** заполним `модами`

In [14]:
for column_name, column in train_df.select_dtypes(include='object').iteritems():
    if train_df[column_name].isna().sum() > 0:
        train_df[column_name].fillna(train_df[column_name].mode()[0], inplace=True)

Пропуски в столбцах **числовых типов** заполним `медианами`

In [15]:
for column_name, column in train_df.select_dtypes(exclude='object').iteritems():
    if train_df[column_name].isna().sum() > 0:
        train_df[column_name].fillna(train_df[column_name].median(), inplace=True)

In [16]:
class DataPreprocessing:
    """ Подготовка исходных данных """
    
    def __init__(self):
        self.columns_digit_nan_mode = {}
        self.columns_object_nan_mode = {}
        
    def fit(self, X):
        """ Сохранение статистик """
        X = X.copy()
        
        for column_name in X.select_dtypes(exclude='object').columns.tolist():
            if train_df[column_name].isna().sum() > 0:
                self.columns_digit_nan_mode[column_name] = X[column_name].median()
        
        for column_name in X.select_dtypes(include='object').columns.tolist():
            if train_df[column_name].isna().sum() > 0:
                self.columns_object_nan_mode[column_name] = X[column_name].mode()[0]
        
        return X
    
    def transform(self, X):
        """ Трансформация данных """
        X = X.copy()
        
        for column, value in self.columns_digit_nan_mode.items():
            X[column].fillna(value, inplace=True)
            
        for column, value in self.columns_object_nan_mode.items():
            X[column].fillna(value, inplace=True)
        
        return X

In [17]:
class FeatureGenerator:
    """ Генератор признаков """
    
    def __init__(self):
        self.column_name_ho = 'Home Ownership'
        self.column_name_yicj = 'Years in current job'
        self.column_name_t = 'Term'
        self.column_name_p = 'Purpose'
        
        self.home_ownership_mode = None
        self.years_current_job_mode = None
        
        self.home_ownership = {}
        self.years_current_job = {}
    
    def fit(self, X):
        """ Сохранение статистик """
        X = X.copy()
        
        self.home_ownership = {
            'Home Ownership':
            {
                'Own Home': 4,
                'Rent': 3,            
                'Have Mortgage': 2,
                'Home Mortgage': 1,
            }
        }
        
        self.years_current_job = {
            self.column_name_yicj: {
                '10+ years': 11,
                '9 years': 10,
                '8 years': 9,
                '7 years': 8,
                '6 years': 7,
                '5 years': 6,
                '4 years': 5,
                '3 years': 4,
                '2 years': 3,
                '1 year': 2,
                '< 1 year': 1,
            }
        }
        
        self.term = {
            self.column_name_t: {
                'Short Term': 1,
                'Long Term': 0,
            }
        }
        
        self.purpose = {
            self.column_name_p: {
                'major purchase': 5,
                'educational expenses': 5,
                'buy a car': 5,
                'medical bills': 5,
                'take a trip': 5,
                'wedding': 5,
                'moving': 5,
                'vacation': 5,
                'home improvements': 4,
                'buy house': 4,
                'business loan': 3,
                'small business': 3,
                'other': 2,
                'renewable energy': 2,
                'debt consolidation': 1,
            }
        }
        
    def transform(self, X):
        """ Трансформация данных """
        X = X.copy()
        
        # replace 'Home Ownership'
        X.replace(self.home_ownership, inplace=True)
        
        # replace 'Years in current job'
        X.replace(self.years_current_job, inplace=True)
            
        # replace 'Term'
        X.replace(self.term, inplace=True)
        
        # replace 'Purpose'
        X.replace(self.purpose, inplace=True)
            
        return X
       

In [18]:
train_df = pd.read_csv(DATA_TRAIN)

In [19]:
train_df


Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,Rent,402192.0,< 1 year,0.0,3.0,8.5,107866.0,0.0,,0.0,other,Short Term,129360.0,73492.0,1900.0,697.0,0
7496,Home Mortgage,1533984.0,1 year,0.0,10.0,26.5,686312.0,0.0,43.0,0.0,debt consolidation,Long Term,444048.0,456399.0,12783.0,7410.0,1
7497,Rent,1878910.0,6 years,0.0,12.0,32.1,1778920.0,0.0,,0.0,buy a car,Short Term,99999999.0,477812.0,12479.0,748.0,0
7498,Home Mortgage,,,0.0,21.0,26.5,1141250.0,0.0,,0.0,debt consolidation,Short Term,615274.0,476064.0,37118.0,,0


In [20]:
test_df = pd.read_csv(DATA_TEST)

## Отбор признаков

In [21]:
TARGET_NAME = 'Credit Default'

FEATURE_NAMES = train_df.drop(TARGET_NAME, axis=1).columns.tolist()

CATEGORY_NAMES = train_df.drop(TARGET_NAME, axis=1).select_dtypes(include='object').columns.tolist()

NUMBER_NAMES = train_df.drop(TARGET_NAME, axis=1).select_dtypes(exclude='object').columns.tolist()

In [22]:
# Создадим обучающий и целевой датасет
X = train_df[FEATURE_NAMES]
y = train_df[TARGET_NAME]

In [23]:
# Разобьем датасет на тренировочный и проверочный
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train.shape, X_test.shape, test_df.shape

((5025, 16), (2475, 16), (2500, 16))

In [24]:
# Исправление датасета

dp = DataPreprocessing()
dp.fit(X_train)

X_train = dp.transform(X_train)
X_test = dp.transform(X_test)
test_df = dp.transform(test_df)
X_train.shape, X_test.shape, test_df.shape

((5025, 16), (2475, 16), (2500, 16))

In [25]:
X_train

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score
3151,Home Mortgage,1168329.0,10+ years,0.0,9.0,16.5,1271952.0,0.0,32.0,0.0,debt consolidation,Short Term,490864.0,741494.0,28118.0,731.0
4019,Home Mortgage,1428135.0,10+ years,0.0,16.0,19.5,846230.0,0.0,32.0,0.0,debt consolidation,Short Term,270094.0,489668.0,28444.0,717.0
4194,Home Mortgage,748904.0,10+ years,0.0,6.0,23.9,353474.0,0.0,32.0,0.0,debt consolidation,Short Term,173426.0,157206.0,9361.0,747.0
1825,Own Home,1947500.0,3 years,0.0,11.0,9.4,583924.0,0.0,32.0,0.0,buy a car,Long Term,99999999.0,274854.0,24344.0,743.0
7363,Home Mortgage,1237470.0,10+ years,0.0,16.0,22.9,365794.0,1.0,68.0,1.0,debt consolidation,Short Term,264528.0,186713.0,18665.0,747.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,Rent,1168329.0,8 years,0.0,5.0,12.0,351186.0,0.0,32.0,0.0,buy house,Short Term,157190.0,73701.0,4054.0,731.0
5226,Rent,4288680.0,5 years,0.0,10.0,25.0,650188.0,0.0,34.0,0.0,debt consolidation,Short Term,264836.0,183065.0,27626.0,748.0
5390,Rent,741912.0,2 years,0.0,9.0,9.3,678326.0,0.0,32.0,0.0,debt consolidation,Short Term,279202.0,319276.0,15147.0,738.0
860,Rent,3926046.0,3 years,0.0,8.0,13.0,349492.0,0.0,18.0,0.0,debt consolidation,Short Term,470272.0,280098.0,18093.0,742.0


In [26]:
# Добавлние признаков в датасет

fg = FeatureGenerator()
fg.fit(X_train)

X_train = fg.transform(X_train)
X_test = fg.transform(X_test)
test_df = fg.transform(test_df)

In [27]:
X_train.shape, X_test.shape, test_df.shape

((5025, 16), (2475, 16), (2500, 16))

In [28]:
model_catb = catb.CatBoostClassifier(
    random_state=21,
    silent=True,
    n_estimators=180,
    max_depth=7,  # количество уровней дерева
    class_weights= [1, 2.55],  # веса классов
    eval_metric='F1',
    early_stopping_rounds=20, 
    cat_features=CATEGORY_NAMES, # категориальные признаки
    use_best_model=True,
    custom_metric=['Precision', 'Recall']
)


model_catb.fit(X_train, y_train, CATEGORY_NAMES, eval_set=(X_test, y_test))

evaluate_preds(model_catb, X_train, X_test, y_train, y_test)

TRAIN

              precision    recall  f1-score   support

           0       0.85      0.77      0.81      3631
           1       0.52      0.65      0.58      1394

    accuracy                           0.74      5025
   macro avg       0.68      0.71      0.69      5025
weighted avg       0.76      0.74      0.74      5025

TEST

              precision    recall  f1-score   support

           0       0.82      0.76      0.79      1756
           1       0.50      0.59      0.54       719

    accuracy                           0.71      2475
   macro avg       0.66      0.68      0.67      2475
weighted avg       0.73      0.71      0.72      2475

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1335  421
1                292  427


In [29]:
model_xgb = xgb.XGBClassifier(
    random_state=21,
    n_estimators=180, 
    max_depth=2,  # количество уровней дерева
    reg_lambda=100,  # L2 регуляризация
)

In [30]:
model_xgb.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=180, n_jobs=4, num_parallel_tree=1, random_state=21,
              reg_alpha=0, reg_lambda=100, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [31]:
evaluate_preds(model_xgb, X_train, X_test, y_train, y_test)

TRAIN

              precision    recall  f1-score   support

           0       0.79      0.98      0.87      3631
           1       0.86      0.32      0.47      1394

    accuracy                           0.80      5025
   macro avg       0.82      0.65      0.67      5025
weighted avg       0.81      0.80      0.76      5025

TEST

              precision    recall  f1-score   support

           0       0.78      0.97      0.86      1756
           1       0.79      0.32      0.45       719

    accuracy                           0.78      2475
   macro avg       0.78      0.64      0.66      2475
weighted avg       0.78      0.78      0.74      2475

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1697   59
1                492  227


In [32]:
model_lgbm = lgbm.LGBMClassifier(
    random_state=21,
    silent=True,
    n_estimators=200,
    class_weights= [1, 2.55],  #  веса классов
    max_depth=3,  # количество уровней дерева
    reg_lambda=15,  # L2 регуляризация
    num_leaves=3,  # максимальное количество листьев на ветке 
    # categorical_feature=CATEGORY_NAMES,  # категориальные признаки
)

In [33]:
model_lgbm.fit(X_train, y_train)



LGBMClassifier(class_weights=[1, 2.55], max_depth=3, n_estimators=200,
               num_leaves=3, random_state=21, reg_lambda=15)

In [34]:
evaluate_preds(model_lgbm, X_train, X_test, y_train, y_test)

TRAIN

              precision    recall  f1-score   support

           0       0.78      0.99      0.87      3631
           1       0.88      0.27      0.42      1394

    accuracy                           0.79      5025
   macro avg       0.83      0.63      0.64      5025
weighted avg       0.81      0.79      0.74      5025

TEST

              precision    recall  f1-score   support

           0       0.77      0.98      0.86      1756
           1       0.84      0.28      0.41       719

    accuracy                           0.77      2475
   macro avg       0.80      0.63      0.64      2475
weighted avg       0.79      0.77      0.73      2475

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1717   39
1                521  198


In [35]:
predictions = model_catb.predict(test_df)
predictions

array([1, 1, 1, ..., 0, 0, 1])

In [36]:
SUBMISSION = 'sample_submission.csv'
submit = pd.read_csv(SUBMISSION)

In [37]:
submit['Credit Default'] = predictions

In [38]:
submit.to_csv('credict_classification.csv', index=False)