In [1]:
import pandas as pd
import numpy as np
import pickle

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# feature selection
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import chi2_contingency

# modeling
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

import warnings
import os

In [2]:
# Read the two source workbooks: bank_data.xlsx into df1, cibil_data.xlsx into df2.
df1, df2 = (
    pd.read_excel(workbook)
    for workbook in ('../dataset/bank_data.xlsx', '../dataset/cibil_data.xlsx')
)

In [3]:
# Combined customer data: inner join of both tables on PROSPECTID, so only
# IDs present in both df1 and df2 are kept.
df = df1.merge(df2, how='inner', on='PROSPECTID')

In [4]:
# Persist the merged frame: pickle for fast reloading, CSV as a plain-text copy.
# Create the output directory first so the writes do not fail on a fresh
# checkout where ../artifacts does not exist yet.
os.makedirs('../artifacts', exist_ok=True)
df.to_pickle('../artifacts/full_data.pkl')
df.to_csv('../artifacts/full_data.csv', index=False)

In [5]:
# Reload the merged frame from the pickle file written by the previous cell.
df = pd.read_pickle('../artifacts/full_data.pkl')

The value -99999 is used as a placeholder for null values and should be replaced with proper nulls.

In [6]:
# -99999 is the placeholder for missing values; turn it into NaN so that
# pandas null handling (isna / dropna / fillna) applies to it.
df.replace(to_replace=-99999, value=np.nan, inplace=True)

In [7]:
# null_col: labels of every column that contains at least one null
null_col = df.columns[df.isna().any()]
# null count per affected column, smallest first
df[null_col].isna().sum().sort_values()

Age_Oldest_TL                      40
Age_Newest_TL                      40
pct_currentBal_all_TL              72
time_since_recent_payment        4291
enq_L3m                          6321
enq_L6m                          6321
enq_L12m                         6321
time_since_recent_enq            6321
PL_enq_L12m                      6321
PL_enq_L6m                       6321
PL_enq                           6321
CC_enq_L12m                      6321
CC_enq_L6m                       6321
CC_enq                           6321
tot_enq                          6321
max_deliq_12mts                 10832
max_deliq_6mts                  12890
max_unsec_exposure_inPct        23178
max_delinquency_level           35949
time_since_recent_deliquency    35949
time_since_first_deliquency     35949
PL_utilization                  44435
CC_utilization                  47636
dtype: int64

* PL_utilization and CC_utilization are the only features associated with utilization of TL, and they cannot be imputed with confidence. These features need to be dropped.
* time_since_first_deliquency, time_since_recent_deliquency, and max_delinquency_level are related to customer default/delinquency. There are many more features related to delinquency, so further analysis is required before making any decision.
* max_unsec_exposure_inPct has 2 related features (Secured_TL, Unsecured_TL), which do not show any direct correlation with `maximum %(exposure to unsecured TL)`. This column cannot be imputed and should be dropped.
* max_deliq_6mts and max_deliq_12mts are, like the features in point 2, about default/delinquency.
* A lot of features have 6321 null values (-99999).
* 

In [8]:
### Dropping null age and balance entries
# Rows with a null in any of the three age / balance columns are removed.
df.dropna(
    subset=['Age_Oldest_TL', 'Age_Newest_TL', 'pct_currentBal_all_TL'],
    inplace=True,
)

# Remove the two utilization columns and the max unsecured-exposure column.
df.drop(
    columns=['PL_utilization', 'CC_utilization', 'max_unsec_exposure_inPct'],
    inplace=True,
)

### Analysis of delinquency features

In [9]:
# Delinquency columns in which more than 50% of the values are null
null_deq_col = [
    'time_since_first_deliquency',
    'time_since_recent_deliquency',
    'max_delinquency_level',
]
# The other delinquency-related columns
rest_deq_col = [
    'num_times_delinquent',
    'max_recent_level_of_deliq',
    'num_deliq_6mts',
    'num_deliq_12mts',
    'num_deliq_6_12mts',
    'max_deliq_6mts',
    'max_deliq_12mts',
    'recent_level_of_deliq',
]

# Per row: how many of the three mostly-null columns are null
df[null_deq_col].isna().sum(axis='columns').value_counts()

3    35845
0    15381
dtype: int64

In [10]:
# For rows where all three columns are null, check whether the customer has
# any delinquency on record at all.
df.loc[
    df[null_deq_col].isna().sum(axis=1) == 3,
    'num_times_delinquent',
].value_counts()

0    35845
Name: num_times_delinquent, dtype: int64

In [11]:
# Reverse check: among customers with zero delinquencies, count how many have
# a null max_delinquency_level.
df.loc[df['num_times_delinquent'] == 0, 'max_delinquency_level'].isna().sum()

35845

In [12]:
# Replace the remaining nulls in the three mostly-null delinquency columns with 0.
df[null_deq_col] = df[null_deq_col].fillna(value=0)

In [13]:
# Per row: number of nulls across the two max_deliq_* columns (0, 1 or 2).
# Despite its name, other_null_deq_index is a Series of counts, not an index.
other_null_deq_index = (
    df[['max_deliq_12mts', 'max_deliq_6mts']].isna().sum(axis='columns')
)
other_null_deq_index.value_counts()

0    38428
2    10741
1     2057
dtype: int64

In [14]:
# Delinquency counts of the customers that have at least one null max_deliq_* value.
df.loc[other_null_deq_index > 0, 'num_times_delinquent']

0        11
3         0
4         0
6         0
9         0
         ..
51322     0
51327     1
51328     0
51329     0
51335     0
Name: num_times_delinquent, Length: 12798, dtype: int64

In [15]:
# For customers with no delinquency on record (num_times_delinquent == 0),
# fill a missing max_deliq_* value with 0.
other_null_deq_col = ['max_deliq_6mts', 'max_deliq_12mts']
other_null_no_deq_index = df.index[df['num_times_delinquent'] == 0]

df.loc[other_null_no_deq_index, other_null_deq_col] = (
    df.loc[other_null_no_deq_index, other_null_deq_col].fillna(0)
)


In [16]:
# Nulls still left in each max_deliq_* column after the zero-fill
df.loc[:, other_null_deq_col].isna().sum()

max_deliq_6mts     2612
max_deliq_12mts    1677
dtype: int64

In [17]:
# Per row: how many of the two max_deliq_* columns are still null
reamining_null_deq_series = df[other_null_deq_col].isna().sum(axis='columns')
reamining_null_deq_series.value_counts()

0    48614
2     1677
1      935
dtype: int64

In [18]:
# Per column: remaining null count (same figures as the earlier per-column check)
df[other_null_deq_col].isna().sum(axis='index')

max_deliq_6mts     2612
max_deliq_12mts    1677
dtype: int64

In [19]:
# Show the customer records that still have at least one null max_deliq_* value
df.loc[reamining_null_deq_series.ge(1)]

Unnamed: 0,PROSPECTID,Total_TL,Tot_Closed_TL,Tot_Active_TL,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_open_L6M,pct_tl_closed_L6M,pct_active_tl,pct_closed_tl,...,pct_PL_enq_L6m_of_L12m,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,last_prod_enq2,first_prod_enq2,Credit_Score,Approved_Flag
0,1,5,4,1,0,0,0.000,0.000,0.200,0.800,...,0.0,0.0,0.0,0.0,1,0,PL,PL,696,P2
12,13,2,2,0,0,0,0.000,0.000,0.000,1.000,...,0.0,0.0,0.0,0.0,0,0,ConsumerLoan,others,669,P2
19,20,7,5,2,1,0,0.143,0.000,0.286,0.714,...,0.0,0.0,0.0,0.0,1,0,others,others,720,P1
26,27,1,1,0,0,0,0.000,0.000,0.000,1.000,...,0.0,1.0,0.0,0.5,0,0,CC,others,660,P3
28,29,42,34,8,2,2,0.048,0.048,0.190,0.810,...,0.0,0.0,0.0,0.0,1,1,HL,HL,740,P1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51188,51189,1,1,0,0,0,0.000,0.000,0.000,1.000,...,0.0,0.0,0.0,0.0,0,0,others,others,671,P2
51203,51204,22,11,11,0,0,0.000,0.000,0.500,0.500,...,0.0,0.0,0.0,0.0,1,0,HL,AL,716,P1
51258,51259,4,4,0,0,1,0.000,0.250,0.000,1.000,...,0.0,0.0,0.0,0.0,1,0,ConsumerLoan,ConsumerLoan,667,P3
51310,51311,3,1,2,0,0,0.000,0.000,0.667,0.333,...,0.0,1.0,0.0,1.0,0,1,CC,HL,677,P2


The remaining nulls in the delinquency features cannot be imputed with confidence, because no other column carries information about the delinquency level; these features need to be dropped from the dataset.

In [20]:
# Drop left disabled: both columns are still used further down, where
# no_null_df1 keeps them and no_null_df2 drops them.
#df = df.drop(['max_deliq_6mts', 'max_deliq_12mts'], axis=1)

### Analysis of enquiry features

In [21]:
# Recompute the columns that still contain nulls, with their counts in ascending order
null_col = df.columns[df.isna().any(axis=0)]
df[null_col].isna().sum(axis=0).sort_values()

max_deliq_12mts              1677
max_deliq_6mts               2612
time_since_recent_payment    4248
tot_enq                      6277
CC_enq                       6277
CC_enq_L6m                   6277
CC_enq_L12m                  6277
PL_enq                       6277
PL_enq_L6m                   6277
PL_enq_L12m                  6277
time_since_recent_enq        6277
enq_L12m                     6277
enq_L6m                      6277
enq_L3m                      6277
dtype: int64

In [22]:
# Enquiry columns that are null for the same set of customers
# (6277 at this point; it was 6321 before the earlier row drops).
null_enq_col = [
    'tot_enq',
    'CC_enq',
    'CC_enq_L6m',
    'enq_L3m',
    'PL_enq',
    'PL_enq_L6m',
    'PL_enq_L12m',
    'time_since_recent_enq',
    'enq_L12m',
    'enq_L6m',
    'CC_enq_L12m',
]
# Per row: how many of the 11 enquiry columns are null
df[null_enq_col].isna().sum(axis='columns').value_counts()

0     44949
11     6277
dtype: int64

These 6277 customers (6321 before the earlier row drops) are about 12% of the dataset, and each of them has all 11 enquiry-related entries missing. These customer records need to be removed, since there is no other variable from which the data could be imputed with confidence.

Dropping these customer records to preserve the enquiry data is a viable and better approach, as dropping the features instead would leave no absolute enquiry-related feature.

In [23]:
# Any single enquiry column works as the subset here: the 11 enquiry columns
# are always null together (every row has either 0 or 11 of them missing).
df.dropna(axis=0, subset=['tot_enq'], inplace=True)
df.shape

(44949, 84)

### Analysing remaining features with NULL

In [24]:
# Columns that still contain nulls after the enquiry rows were dropped, with counts ascending
null_col = df.columns[df.isna().any(axis=0)]
df[null_col].isna().sum(axis=0).sort_values()

max_deliq_12mts              1258
max_deliq_6mts               2021
time_since_recent_payment    2885
dtype: int64

In [25]:
# Distribution of the number of null fields per customer record
df.isna().sum(axis='columns').value_counts()

0    40192
1     3440
2     1227
3       90
dtype: int64

To keep a feature, at least 90 customer records will have to be dropped. There is no specific reason to keep any one feature, so we can compare the model's performance before and after removing the remaining features.

In [26]:
# Customers with exactly 2 nulls in their record: which columns are the nulls in?
temp = df.loc[df.isna().sum(axis=1).eq(2)]
temp_col = temp.columns[temp.isna().any()]
temp[temp_col].isna().sum()

time_since_recent_payment      59
max_deliq_6mts               1227
max_deliq_12mts              1168
dtype: int64

In [27]:
# Customers with exactly 1 null in their record: which columns are the nulls in?
temp = df.loc[df.isna().sum(axis=1).eq(1)]
temp_col = temp.columns[temp.isna().any()]
temp[temp_col].isna().sum()

time_since_recent_payment    2736
max_deliq_6mts                704
dtype: int64

In [28]:
# Two variants for the multi-collinearity check:
#   no_null_df1 - keeps every column, drops rows with a null in any of the
#                 three listed columns;
#   no_null_df2 - drops rows missing time_since_recent_payment, then removes
#                 the two max_deliq_* columns.
no_null_df1 = df.dropna(
    subset=['time_since_recent_payment', 'max_deliq_6mts', 'max_deliq_12mts']
)
no_null_df2 = (
    df.dropna(subset=['time_since_recent_payment'])
    .drop(columns=['max_deliq_6mts', 'max_deliq_12mts'])
)