In [1]:
import pandas as pd
import numpy as np
import pickle

# visulization
import matplotlib.pyplot as plt
import seaborn as sns

# feature selection
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import chi2_contingency

# modeling
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

import warnings
import os

In [2]:
df1 = pd.read_excel('../dataset/bank_data.xlsx')
df2 = pd.read_excel('../dataset/cibil_data.xlsx')

In [3]:
# extensive data of the customers 
df = pd.merge(df1, df2, 'inner')

In [4]:
# saving dataframe with pickle, faster upload
df.to_pickle('../artifacts/full_data.pkl')
df.to_csv('../artifacts/full_data.csv', index=False)

In [5]:
df = pd.read_pickle('../artifacts/full_data.pkl')

The -99999 are used for null value, and should be rectified

In [6]:
# impute -99999 with null
df.replace(-99999, np.nan, inplace=True)

In [7]:
# null_col: column with null values
null_col = df.isna().sum()[lambda x: x > 0].index
df[null_col].isna().sum().sort_values()

Age_Oldest_TL                      40
Age_Newest_TL                      40
pct_currentBal_all_TL              72
time_since_recent_payment        4291
enq_L3m                          6321
enq_L6m                          6321
enq_L12m                         6321
time_since_recent_enq            6321
PL_enq_L12m                      6321
PL_enq_L6m                       6321
PL_enq                           6321
CC_enq_L12m                      6321
CC_enq_L6m                       6321
CC_enq                           6321
tot_enq                          6321
max_deliq_12mts                 10832
max_deliq_6mts                  12890
max_unsec_exposure_inPct        23178
max_delinquency_level           35949
time_since_recent_deliquency    35949
time_since_first_deliquency     35949
PL_utilization                  44435
CC_utilization                  47636
dtype: int64

* PL_utilization and CC_utilization are only features associated with utilization of TL, and cannot be imputed with confidence.This features needs to be dropped.
* time_since_first_deliquency, time_since_recent_deliquency, and max_delinquency_level are related to customer default/deliquency. There are many more features related to deliquency, require further analysis before making any decision.
* max_unsec_exposure_inPct have 2 related features (Secured_TL, Unsecured_TL) which does not show any direction correlation with `maximum %(exposure to unsecured TL)`. This column cannot be immputed and should be dropped.
* max_deliq_6mts, and max_deliq_12mts are similar to point 2, about default/deliquency. 
* Alot of features have 6321 null values (-99999). 
* 

In [8]:
# dropping utilization features.
df.drop(['PL_utilization', 'CC_utilization'], axis=1, inplace=True)

### Analysis for deliquency features

In [9]:
# delinquency features with more than 50% null values

null_deq_col = ['time_since_first_deliquency', 'time_since_recent_deliquency', 'max_delinquency_level']
rest_deq_col = ['num_times_delinquent', 'max_recent_level_of_deliq', 'num_deliq_6mts', 'num_deliq_12mts', 'num_deliq_6_12mts', 'max_deliq_6mts', 'max_deliq_12mts', 'recent_level_of_deliq']

df[null_deq_col].isna().sum(axis=1).value_counts()

3    35949
0    15387
dtype: int64

In [10]:
# check for any deq in customer history
df.num_times_delinquent[df[null_deq_col].isna().sum(axis=1) == 3].value_counts()

0    35949
Name: num_times_delinquent, dtype: int64

In [11]:
# cross verify / double check
df.max_delinquency_level[df['num_times_delinquent'] == 0].isna().sum()

35949

In [12]:
# fill null deq with 0
df[null_deq_col] = df[null_deq_col].fillna(0)

In [13]:
other_null_deq_index = df[['max_deliq_12mts', 'max_deliq_6mts']].isna().sum(axis=1)
other_null_deq_index.value_counts()

0    38446
2    10832
1     2058
dtype: int64

In [14]:
df.loc[other_null_deq_index > 0]['num_times_delinquent']

0        11
3         0
4         0
6         0
9         0
         ..
51322     0
51327     1
51328     0
51329     0
51335     0
Name: num_times_delinquent, Length: 12890, dtype: int64

In [15]:
# impute 0 for other null deq if num of deq is 0
other_null_deq_col = ['max_deliq_6mts', 'max_deliq_12mts']
other_null_no_deq_index = df[df.num_times_delinquent == 0].index
df.loc[other_null_no_deq_index][other_null_deq_col] = df.loc[other_null_no_deq_index].fillna(0)[other_null_deq_col]

In [16]:
# remaining null deq
df[other_null_deq_col].isna().sum()

max_deliq_6mts     12890
max_deliq_12mts    10832
dtype: int64

In [17]:
reamining_null_deq_series = df[other_null_deq_col].isna().sum(axis=1)
reamining_null_deq_series.value_counts()

0    38446
2    10832
1     2058
dtype: int64

In [18]:
(df[reamining_null_deq_series == 1][['num_times_delinquent']] == 0).sum()

num_times_delinquent    1123
dtype: int64

Remaining null in deq features could not be imputed with confidence, and needs to be rejected from dataset.

In [20]:
null_col = df.isna().sum(axis=0)[lambda x: x>0].index
df[null_col].isna().sum(axis=0).sort_values()

Age_Oldest_TL                   40
Age_Newest_TL                   40
pct_currentBal_all_TL           72
time_since_recent_payment     4291
enq_L3m                       6321
enq_L6m                       6321
enq_L12m                      6321
time_since_recent_enq         6321
PL_enq_L12m                   6321
PL_enq_L6m                    6321
CC_enq_L12m                   6321
CC_enq_L6m                    6321
CC_enq                        6321
tot_enq                       6321
PL_enq                        6321
max_deliq_12mts              10832
max_deliq_6mts               12890
max_unsec_exposure_inPct     23178
dtype: int64