In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.base import TransformerMixin
from sklearn.preprocessing import MinMaxScaler, StandardScaler,OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,roc_auc_score,classification_report,roc_curve,auc, f1_score

import import_ipynb
from function_for_eda import *

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


importing Jupyter notebook from function_for_eda.ipynb


# 4. INSTALLMENT_PAYMENT.CSV
Table installment_payment cung cấp những thông tin chi tiết về installment của những khoan vay trước đây tại Home Credit. Từ các bước EDA trước đó, chúng ta tiến hành các bước feature engineering sau:
- <b>Bước 1</b>: Tạo ra một số feature mới như the number of days the payment was delayed, the difference in amount of payment required vs paid.
- <b>Bước 2</b>: Thực hiện các aggregations thông qua SK_ID_PREV bằng các phép aggregation như min, max, sum, count,....Đầu tiên nhóm 12 aggregate trên toàn bộ data, sau đó aggregate qua những installment trong 1 năm gần nhất (dựa vào column DAYS_INSTALLMENT), và aggregate qua 5 installment đầu tiên cho từng khoản vay. Thứ tứ installment dựa vào column NUMBER_INSTALLMENT_NUMBER. Việc lựa chọn các phép aggregation dựa trên domain knowledge, quá trình eda, cũng như một số solutions tham khảo khác
- <b>Bước 3</b>: Thực hiện các aggregations thông qua SK_ID_CURR bằng các phép aggregation như min, max, sum, count,....cho toàn bộ data.


In [2]:
installments_payments = pd.read_csv('../dseb63_final_project_DP_dataset/dseb63_installments_payments.csv')
installments_payments

Unnamed: 0,SK_ID_PREV,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,SK_ID_CURR
0,1054186,1.0,6,-1180.0,-1187.0,6948.360,6948.360,147397.0
1,2452854,1.0,21,-546.0,-552.0,11302.605,11302.605,147397.0
2,1054186,1.0,2,-1300.0,-1307.0,6948.360,6948.360,147397.0
3,1682318,1.0,2,-240.0,-243.0,7374.510,7374.510,147397.0
4,2452854,1.0,10,-876.0,-882.0,11302.605,11302.605,147397.0
...,...,...,...,...,...,...,...,...
7744753,2192667,1.0,6,-2352.0,-2352.0,5322.240,5322.240,21216.0
7744754,2208281,1.0,4,-452.0,-466.0,63195.435,63195.435,21216.0
7744755,2657771,0.0,3,-2907.0,-2932.0,3375.000,3375.000,21216.0
7744756,2657771,0.0,47,-1871.0,-1871.0,4915.890,4915.890,21216.0


In [3]:
installments_payments = installments_payments.sort_values(by = ['SK_ID_CURR','SK_ID_PREV','NUM_INSTALMENT_NUMBER'], ascending = True)
        
#getting the total NaN values in the table
installments_payments['MISSING_VALS_TOTAL_INSTAL'] = installments_payments.isna().sum(axis = 1)
#engineering new features based on some domain based polynomial operations
installments_payments['DAYS_PAYMENT_RATIO'] = installments_payments['DAYS_INSTALMENT'] / (installments_payments['DAYS_ENTRY_PAYMENT'] + 0.00001)
installments_payments['DAYS_PAYMENT_DIFF'] = installments_payments['DAYS_INSTALMENT'] - installments_payments['DAYS_ENTRY_PAYMENT']
installments_payments['AMT_PAYMENT_RATIO'] = installments_payments['AMT_PAYMENT'] / (installments_payments['AMT_INSTALMENT'] + 0.00001)
installments_payments['AMT_PAYMENT_DIFF'] = installments_payments['AMT_INSTALMENT'] - installments_payments['AMT_PAYMENT']
installments_payments['EXP_DAYS_PAYMENT_RATIO'] = installments_payments['DAYS_PAYMENT_RATIO'].transform(lambda x: x.ewm(alpha = 0.5).mean())
installments_payments['EXP_DAYS_PAYMENT_DIFF'] = installments_payments['DAYS_PAYMENT_DIFF'].transform(lambda x: x.ewm(alpha = 0.5).mean())
installments_payments['EXP_AMT_PAYMENT_RATIO'] = installments_payments['AMT_PAYMENT_RATIO'].transform(lambda x: x.ewm(alpha = 0.5).mean())
installments_payments['EXP_AMT_PAYMENT_DIFF'] = installments_payments['AMT_PAYMENT_DIFF'].transform(lambda x: x.ewm(alpha = 0.5).mean())
        

In [4]:
overall_aggregations = {
            'MISSING_VALS_TOTAL_INSTAL' : ['sum'],
            'NUM_INSTALMENT_VERSION' : ['mean','sum'],
            'NUM_INSTALMENT_NUMBER' : ['max'],
            'DAYS_INSTALMENT' : ['max','min'],
            'DAYS_ENTRY_PAYMENT' : ['max','min'],
            'AMT_INSTALMENT' : ['mean', 'sum', 'max'],
            'AMT_PAYMENT' : ['mean', 'sum', 'max'],
            'DAYS_PAYMENT_RATIO' : ['mean', 'min','max'],
            'DAYS_PAYMENT_DIFF' : ['mean','min','max'],
            'AMT_PAYMENT_RATIO' : ['mean','min','max'],
            'AMT_PAYMENT_DIFF' : ['mean','min','max'],
            'EXP_DAYS_PAYMENT_RATIO' : ['last'],
            'EXP_DAYS_PAYMENT_DIFF' : ['last'],
            'EXP_AMT_PAYMENT_RATIO' : ['last'],
            'EXP_AMT_PAYMENT_DIFF' : ['last']
        }
limited_period_aggregations = {
    'NUM_INSTALMENT_VERSION' : ['mean','sum'],
    'AMT_INSTALMENT' : ['mean', 'sum', 'max'],
    'AMT_PAYMENT' : ['mean', 'sum', 'max'],
    'DAYS_PAYMENT_RATIO' : ['mean', 'min','max'],
    'DAYS_PAYMENT_DIFF' : ['mean','min','max'],
    'AMT_PAYMENT_RATIO' : ['mean','min','max'],
    'AMT_PAYMENT_DIFF' : ['mean','min','max'],
    'EXP_DAYS_PAYMENT_RATIO' : ['last'],
    'EXP_DAYS_PAYMENT_DIFF' : ['last'],
    'EXP_AMT_PAYMENT_RATIO' : ['last'],
    'EXP_AMT_PAYMENT_DIFF' : ['last']
}


In [5]:
#aggregating installments_payments over SK_ID_PREV for last 1 year installments
group_last_1_year = installments_payments[installments_payments['DAYS_INSTALMENT'] > -365].groupby('SK_ID_PREV').agg(limited_period_aggregations)
group_last_1_year.columns = ['_'.join(ele).upper() + '_LAST_1_YEAR' for ele in group_last_1_year.columns]

#aggregating installments_payments over SK_ID_PREV for first 5 installments
group_first_5_instalments = installments_payments.groupby('SK_ID_PREV', as_index = False).head(5).groupby('SK_ID_PREV').agg(limited_period_aggregations)
group_first_5_instalments.columns = ['_'.join(ele).upper() + '_FIRST_5_INSTALLMENTS' for ele in group_first_5_instalments.columns]
#overall aggregation of installments_payments over SK_ID_PREV
group_overall = installments_payments.groupby(['SK_ID_PREV','SK_ID_CURR'], as_index = False).agg(overall_aggregations)
group_overall.columns = ['_'.join(ele).upper() for ele in group_overall.columns]
group_overall.rename(columns = {'SK_ID_PREV_': 'SK_ID_PREV','SK_ID_CURR_' : 'SK_ID_CURR'}, inplace = True)

#merging all of the above aggregations together
installments_payments_agg_prev = group_overall.merge(group_last_1_year, on = 'SK_ID_PREV', how = 'outer')
installments_payments_agg_prev = installments_payments_agg_prev.merge(group_first_5_instalments, on = 'SK_ID_PREV', how = 'outer')


In [6]:
#aggregating over SK_ID_CURR
main_features_aggregations = {
    'MISSING_VALS_TOTAL_INSTAL_SUM' : ['sum'],
    'NUM_INSTALMENT_VERSION_MEAN' : ['mean'],
    'NUM_INSTALMENT_VERSION_SUM' : ['mean'],
    'NUM_INSTALMENT_NUMBER_MAX' : ['mean','sum','max'],
    'AMT_INSTALMENT_MEAN' : ['mean','sum','max'],
    'AMT_INSTALMENT_SUM' : ['mean','sum','max'],
    'AMT_INSTALMENT_MAX' : ['mean'],
    'AMT_PAYMENT_MEAN' : ['mean','sum','max'],
    'AMT_PAYMENT_SUM' : ['mean','sum','max'],
    'AMT_PAYMENT_MAX' : ['mean'],
    'DAYS_PAYMENT_RATIO_MEAN' : ['mean','min','max'],
    'DAYS_PAYMENT_RATIO_MIN' : ['mean','min'],
    'DAYS_PAYMENT_RATIO_MAX' : ['mean','max'],
    'DAYS_PAYMENT_DIFF_MEAN' : ['mean','min','max'],
    'DAYS_PAYMENT_DIFF_MIN' : ['mean','min'],
    'DAYS_PAYMENT_DIFF_MAX' : ['mean','max'],
    'AMT_PAYMENT_RATIO_MEAN' : ['mean', 'min','max'],
    'AMT_PAYMENT_RATIO_MIN' : ['mean','min'],
    'AMT_PAYMENT_RATIO_MAX' : ['mean','max'],
    'AMT_PAYMENT_DIFF_MEAN' : ['mean','min','max'],
    'AMT_PAYMENT_DIFF_MIN' : ['mean','min'],
    'AMT_PAYMENT_DIFF_MAX' : ['mean','max'],
    'EXP_DAYS_PAYMENT_RATIO_LAST' : ['mean'],
    'EXP_DAYS_PAYMENT_DIFF_LAST' : ['mean'],
    'EXP_AMT_PAYMENT_RATIO_LAST' : ['mean'],
    'EXP_AMT_PAYMENT_DIFF_LAST' : ['mean']
}


In [7]:
grouped_main_features = installments_payments_agg_prev.groupby('SK_ID_CURR').agg(main_features_aggregations)
grouped_main_features.columns = ['_'.join(ele).upper() for ele in grouped_main_features.columns]

#group remaining ones
grouped_remaining_features = installments_payments_agg_prev.iloc[:,[1] + list(range(31,len(installments_payments_agg_prev.columns)))].groupby('SK_ID_CURR').mean()

installments_payments_aggregated = grouped_main_features.merge(grouped_remaining_features, on = 'SK_ID_CURR', how = 'inner')


In [8]:
installments_payments_aggregated.reset_index(inplace = True)

In [9]:
installments_payments_aggregated

Unnamed: 0,SK_ID_CURR,MISSING_VALS_TOTAL_INSTAL_SUM_SUM,NUM_INSTALMENT_VERSION_MEAN_MEAN,NUM_INSTALMENT_VERSION_SUM_MEAN,NUM_INSTALMENT_NUMBER_MAX_MEAN,NUM_INSTALMENT_NUMBER_MAX_SUM,NUM_INSTALMENT_NUMBER_MAX_MAX,AMT_INSTALMENT_MEAN_MEAN,AMT_INSTALMENT_MEAN_SUM,AMT_INSTALMENT_MEAN_MAX,...,AMT_PAYMENT_RATIO_MEAN_FIRST_5_INSTALLMENTS,AMT_PAYMENT_RATIO_MIN_FIRST_5_INSTALLMENTS,AMT_PAYMENT_RATIO_MAX_FIRST_5_INSTALLMENTS,AMT_PAYMENT_DIFF_MEAN_FIRST_5_INSTALLMENTS,AMT_PAYMENT_DIFF_MIN_FIRST_5_INSTALLMENTS,AMT_PAYMENT_DIFF_MAX_FIRST_5_INSTALLMENTS,EXP_DAYS_PAYMENT_RATIO_LAST_FIRST_5_INSTALLMENTS,EXP_DAYS_PAYMENT_DIFF_LAST_FIRST_5_INSTALLMENTS,EXP_AMT_PAYMENT_RATIO_LAST_FIRST_5_INSTALLMENTS,EXP_AMT_PAYMENT_DIFF_LAST_FIRST_5_INSTALLMENTS
0,0.0,0,1.008333,3.250000,7.750000,31,21,17514.131812,70056.527250,40878.975000,...,1.000000,1.000000,1.000000,0.0000,0.00000,0.0000,0.885887,16.022582,1.000000,0.000000e+00
1,1.0,0,0.686275,9.333333,11.333333,34,13,20305.859118,60917.577353,45713.427353,...,0.866667,0.668215,1.000000,3466.1820,0.00000,8625.1950,1.002157,-1.566338,0.906331,2.435065e+03
2,3.0,0,1.195833,4.500000,10.500000,42,31,9270.664760,37082.659040,13318.569000,...,1.181049,1.000000,1.377647,-731.2500,-1319.95125,0.0000,0.940710,20.832817,1.117856,-4.311071e+02
3,4.0,0,1.000000,6.000000,6.000000,6,6,8953.522500,8953.522500,8953.522500,...,1.000000,1.000000,1.000000,0.0000,0.00000,0.0000,0.991546,15.186440,1.000000,-2.555627e-08
4,5.0,0,1.000000,10.000000,10.000000,10,10,9818.644500,9818.644500,9818.644500,...,1.000000,1.000000,1.000000,0.0000,0.00000,0.0000,0.995584,8.815413,1.000000,-3.993167e-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180728,307501.0,0,1.250000,10.500000,8.250000,33,12,9476.366562,37905.466250,17511.905000,...,1.000000,1.000000,1.000000,0.0000,0.00000,0.0000,0.991956,9.770036,1.000000,2.063957e-13
180729,307503.0,0,1.000000,10.000000,10.000000,10,10,4494.393000,4494.393000,4494.393000,...,1.000000,1.000000,1.000000,0.0000,0.00000,0.0000,0.994004,13.010195,1.000000,9.592018e-23
180730,307504.0,0,0.833333,4.250000,60.250000,241,123,4120.147321,16480.589286,6151.450000,...,0.974339,0.871694,1.000000,189.7785,0.00000,948.8925,0.862019,6.253863,0.935121,4.798152e+02
180731,307506.0,0,1.000000,10.500000,10.500000,42,12,11162.210625,44648.842500,17037.277500,...,1.000000,1.000000,1.000000,0.0000,0.00000,0.0000,0.980942,5.118906,0.999977,1.704013e-01


In [10]:
installments_payments_aggregated.to_csv('installments_payments_final.csv', index = False)
print('done')

done
