## Classification course project

In [1]:
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import StackingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_test_true, y_test_pred))

## Data loading

In [5]:
DATASET_PATH = "../../data/classification/train.csv"
df_train = pd.read_csv(DATASET_PATH)
df_train = reduce_mem_usage(df_train)

Memory usage of dataframe is 0.97 MB
Memory usage after optimization is: 0.38 MB
Decreased by 60.9%


In [6]:
TEST_DATASET_PATH = "../../data/regression/test.csv"
df_test = pd.read_csv(TEST_DATASET_PATH)
df_test = reduce_mem_usage(df_test)

Memory usage of dataframe is 0.72 MB
Memory usage after optimization is: 0.22 MB
Decreased by 70.3%


### Features

- Home Ownership - домовладение
- Annual Income - годовой доход
- Years in current job - количество лет на текущем месте работы
- Tax Liens - налоговые обременения
- Number of Open Accounts - количество открытых счетов
- Years of Credit History - количество лет кредитной истории
- Maximum Open Credit - наибольший открытый кредит (максимальная сумма, которая когда-либо была доступна клиенту)
- Number of Credit Problems - количество проблем с кредитом
- Months since last delinquent - количество месяцев с последней просрочки платежа
- Bankruptcies - банкротства
- Purpose - цель кредита
- Term - срок кредита
- Current Loan Amount - текущая сумма кредита (сумма, которую еще предстоит выплатить клиенту)
- Current Credit Balance - текущий кредитный баланс (сумма, которую может тратить клиент с кредитного счета)
- Monthly Debt - ежемесячный долг
- Credit Score - баллы кредитного рейтинга
- Credit Default - факт невыполнения кредитных обязательств (0 - погашен вовремя, 1 - просрочка)

## EDA

In [7]:
df_train.describe()

Unnamed: 0,Annual Income,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
count,5943.0,7500.0,7500.0,7500.0,7500.0,7500.0,3419.0,7486.0,7500.0,7500.0,7500.0,5943.0,7500.0
mean,1366392.0,0.030133,11.130934,18.317467,945153.8,0.17,34.6926,0.117152,11873178.0,289833.2,18314.453125,1151.087524,0.281733
std,845339.2,0.271604,4.908924,7.041946,16026220.0,0.498598,21.688807,0.347192,31926124.0,317871.4,11926.764648,1604.451416,0.449874
min,164597.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,11242.0,0.0,0.0,585.0,0.0
25%,844341.0,0.0,8.0,13.5,279229.5,0.0,16.0,0.0,180169.0,114256.5,10067.5,711.0,0.0
50%,1168386.0,0.0,10.0,17.0,478159.0,0.0,32.0,0.0,309573.0,209323.0,16076.5,731.0,0.0
75%,1640137.0,0.0,14.0,21.799999,793501.5,0.0,50.0,0.0,519882.0,360406.2,23818.0,743.0,1.0
max,10149340.0,7.0,43.0,57.700001,1304726000.0,7.0,118.0,4.0,100000000.0,6506797.0,136679.0,7510.0,1.0


In [8]:
df_train.dtypes

Home Ownership                  category
Annual Income                    float32
Years in current job            category
Tax Liens                        float32
Number of Open Accounts          float32
Years of Credit History          float32
Maximum Open Credit              float32
Number of Credit Problems        float32
Months since last delinquent     float32
Bankruptcies                     float32
Purpose                         category
Term                            category
Current Loan Amount              float32
Current Credit Balance           float32
Monthly Debt                     float32
Credit Score                     float32
Credit Default                      int8
dtype: object

In [9]:
df_train.isna().sum()

Home Ownership                     0
Annual Income                   1557
Years in current job             371
Tax Liens                          0
Number of Open Accounts            0
Years of Credit History            0
Maximum Open Credit                0
Number of Credit Problems          0
Months since last delinquent    4081
Bankruptcies                      14
Purpose                            0
Term                               0
Current Loan Amount                0
Current Credit Balance             0
Monthly Debt                       0
Credit Score                    1557
Credit Default                     0
dtype: int64

In [10]:
df_train.shape

(7500, 17)

In [11]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   Home Ownership                7500 non-null   category
 1   Annual Income                 5943 non-null   float32 
 2   Years in current job          7129 non-null   category
 3   Tax Liens                     7500 non-null   float32 
 4   Number of Open Accounts       7500 non-null   float32 
 5   Years of Credit History       7500 non-null   float32 
 6   Maximum Open Credit           7500 non-null   float32 
 7   Number of Credit Problems     7500 non-null   float32 
 8   Months since last delinquent  3419 non-null   float32 
 9   Bankruptcies                  7486 non-null   float32 
 10  Purpose                       7500 non-null   category
 11  Term                          7500 non-null   category
 12  Current Loan Amount           7500 non-null   fl

In [12]:
df_train["Credit Default"].value_counts()

0    5387
1    2113
Name: Credit Default, dtype: int64

### Categories

In [13]:
for cat_colname in df_train.select_dtypes(include='category').columns:
    print(str(cat_colname) + '\n\n' + "Mode: " + str(df_train[cat_colname].mode()) + '\n\n' + str(df_train[cat_colname].value_counts()) + '\n' + '*' * 100 + '\n')

Home Ownership

Mode: 0    Home Mortgage
Name: Home Ownership, dtype: category
Categories (4, object): ['Have Mortgage', 'Home Mortgage', 'Own Home', 'Rent']

Home Mortgage    3637
Rent             3204
Own Home          647
Have Mortgage      12
Name: Home Ownership, dtype: int64
****************************************************************************************************

Years in current job

Mode: 0    10+ years
Name: Years in current job, dtype: category
Categories (11, object): ['1 year', '10+ years', '2 years', '3 years', ..., '7 years', '8 years', '9 years', '< 1 year']

10+ years    2332
2 years       705
3 years       620
< 1 year      563
5 years       516
1 year        504
4 years       469
6 years       426
7 years       396
8 years       339
9 years       259
Name: Years in current job, dtype: int64
****************************************************************************************************

Purpose

Mode: 0    debt consolidation
Name: Purpose, dtype: categ

## Data Preprocessing

In [14]:
class ClassificationDataPreprocessor:
    
    def __init__(self):
        self.target_name = 'Credit Default'
        self.cat_features = ['Home Ownership', 'Years in current job','Purpose','Term']
        self.float_features = ['Annual Income','Tax Liens','Number of Open Accounts', 'Years of Credit History','Maximum Open Credit','Number of Credit Problems', 'Months since last delinquent', 'Bankruptcies', 'Current Loan Amount', 'Current Credit Balance', 'Monthly Debt', 'Credit Score']
        self.medians = []
        self.modes = []
    
    def fit(self, inner_df):
        # Расчет медиан
        self.medians = inner_df[self.float_features].median()
        # Расчет мод
        self.modes = inner_df[self.cat_features].mode().squeeze()
    
    
    def transform(self, inner_df, is_test=False):
        
        # Замена пропусков
        inner_df[self.float_features] = inner_df[self.float_features].fillna(self.medians[self.float_features])
        inner_df[self.cat_features] = inner_df[self.cat_features].fillna(self.modes[self.cat_features])
        
        return inner_df
        
                
    def replace_with_median(self, df:pd.DataFrame, condition:pd.Series, feature:str):
        df.loc[condition, feature] = self.medians[feature]


In [15]:
dataPreprocessor = ClassificationDataPreprocessor()
dataPreprocessor.fit(df_train)
df_train = dataPreprocessor.transform(df_train)

In [17]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   Home Ownership                7500 non-null   category
 1   Annual Income                 7500 non-null   float32 
 2   Years in current job          7500 non-null   category
 3   Tax Liens                     7500 non-null   float32 
 4   Number of Open Accounts       7500 non-null   float32 
 5   Years of Credit History       7500 non-null   float32 
 6   Maximum Open Credit           7500 non-null   float32 
 7   Number of Credit Problems     7500 non-null   float32 
 8   Months since last delinquent  7500 non-null   float32 
 9   Bankruptcies                  7500 non-null   float32 
 10  Purpose                       7500 non-null   category
 11  Term                          7500 non-null   category
 12  Current Loan Amount           7500 non-null   fl

## Balancing

In [26]:
class ClassificationBalancer:
    
    def __init__(self):
        self.target_name = 'Credit Default'
        self.disbalance_coeff = None
        self.major_class_name = None
        self.minor_class_name = None
        self.target_counts = None
        
    
    def fit(self, inner_df):
        # Расчет коэффициента отклонения
        self.target_counts = inner_df[self.target_name].value_counts()

        self.major_class_name = self.target_counts.argmax()
        self.minor_class_name = self.target_counts.argmin()
        
        self.disbalance_coeff = int(self.target_counts[self.major_class_name] / self.target_counts[self.minor_class_name]) - 1
    
    
    def transform(self, inner_df, is_test=False):
        # Балансировка
        for i in range(self.disbalance_coeff):
            sample = inner_df[inner_df[self.target_name] == self.minor_class_name].sample(self.target_counts[self.minor_class_name])
            inner_df = inner_df.append(sample, ignore_index=True)

        return inner_df.sample(frac=1)

In [29]:
balancer = ClassificationBalancer()
balancer.fit(df_train)
df_train = balancer.transform(df_train)

In [30]:
df_train['Credit Default'].value_counts()

0    5387
1    4226
Name: Credit Default, dtype: int64

## Training

## Predicting

## Evaluating