# Data Modelling

In [237]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd


# visualization
import matplotlib.pyplot as plt
%matplotlib inline

# Import statements required for Plotly 
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, log_loss
from imblearn.over_sampling import SMOTE

# for warnings 
import warnings
warnings.filterwarnings('ignore')



from collections import Counter

In [238]:
# Load the train and test workbooks (paths are relative to the notebook's working directory)
CIBIL_train, CIBIL_test = (
    pd.read_excel(workbook) for workbook in ('CIBIL Train.xlsx', 'CIBIL Test.xlsx')
)

In [239]:
# Summarise the test frame: index range, per-column dtype and non-null counts
CIBIL_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9026 entries, 0 to 10239
Data columns (total 60 columns):
feature_1                         9026 non-null object
feature_3                         9026 non-null int64
feature_4                         9026 non-null int64
feature_6                         9026 non-null int64
feature_7                         9026 non-null int64
feature_8                         9026 non-null object
feature_11                        9026 non-null object
feature_12                        9026 non-null object
feature_13                        9026 non-null object
feature_15                        9026 non-null object
feature_16                        9026 non-null object
feature_17                        9026 non-null object
feature_19                        9026 non-null int64
feature_21                        9026 non-null int64
feature_23                        9026 non-null object
feature_25                        9026 non-null int64
feature_26         

In [240]:
# (rows, columns) of the training frame
CIBIL_train.shape

(21053, 60)

## Join train and test data

In [277]:
# Stack the train rows followed by the test rows into one frame so that the
# encoding steps below are applied to both sets at once.
# pd.concat is used instead of DataFrame.append, which is deprecated and
# absent from pandas >= 2.0. Row labels of both frames are kept as-is.
final = pd.concat([CIBIL_train, CIBIL_test])
final.head()

Unnamed: 0,feature_1,feature_3,feature_4,feature_6,feature_7,feature_8,feature_11,feature_12,feature_13,feature_15,...,count_enquiry_recency_365,count_enquiry_recency_90,mean_diff_open_enquiry_dt,max_freq_enquiry,mean_diff_lastpaymt_opened_dt,Ratio_currbalance_creditlimit,utilisation_trend,payment_history_mean_length,avg_enq_amount,total_diff_lastpaymt_opened_dt
0,Insignia,650,2,14,500000,,Y,IS1,,159,...,1,0,1443,10,1496,7,0,34,276730,14960
1,Insignia,760,1,14,1200000,,Y,IS1,,91,...,19,2,812,54,2035,0,0,60,98186500,4070
2,Insignia,774,1,14,700000,,Y,IS1,,91,...,0,0,438,1,2454,0,0,60,3400000,2454
3,Insignia,770,1,14,500000,,Y,IS1,,157,...,0,0,1920,52,2757,1,0,43,1494411,38600
5,Insignia,779,1,14,1000000,,Y,IS1,,88,...,3,0,1725,10,1640,10,0,35,5023111,41021


## Feature Engineering using one-hot encoding

In [242]:
# Names of the columns holding categorical (object-dtype) data.
# Iterates over `final.dtypes` rather than DataFrame.iteritems(), which was
# removed in pandas 2.0.
categorical_final = [
    col for col, dtype in final.dtypes.items() if dtype == 'object'
]

# Every remaining column is treated as numerical (returned as a sorted Index)
numerical_final = final.columns.difference(categorical_final)

In [243]:
# Keep only the categorical columns in a dataframe called final_cat
final_cat = final.loc[:, categorical_final]

In [244]:
# One-hot encode every categorical column (one indicator column per level)
final_cat = pd.get_dummies(data=final_cat)
final_cat.head(n=3)


Unnamed: 0,feature_1_Golf Card,feature_1_Insignia,feature_1_Platinum Cricke,feature_1_Platinum Deligh,feature_1_Platinum Maxima,feature_1_RBL Bank Fun+,feature_1_Titanium Deligh,feature_8_None,feature_8_RCBL,feature_8_RCBSR,...,feature_51_Standard Chartered Bank,feature_51_State Bank of India,feature_58_N,feature_58_Y,feature_59_N,feature_59_Y,feature_60_N,feature_60_Y,feature_72_O,feature_72_R
0,0,1,0,0,0,0,0,1,0,0,...,0,1,0,1,0,1,1,0,0,1
1,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,1,1,0,0,1
2,0,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,1,0,0,1


## Creating new features from Numerical data

In [245]:
# Keep the numerical columns in final_num, leaving out the target column
# 'Bad_label' so it is not used as a feature
final_num = final[numerical_final].drop(columns=['Bad_label'])

In [246]:
# Place the numerical and the one-hot encoded columns side by side
final1 = pd.concat(objs=[final_num, final_cat], axis='columns')
final1.head(n=3)

Unnamed: 0,Ratio_currbalance_creditlimit,avg_enq_amount,count_enquiry_recency_365,count_enquiry_recency_90,feature_19,feature_21,feature_25,feature_26,feature_3,feature_30,...,feature_51_Standard Chartered Bank,feature_51_State Bank of India,feature_58_N,feature_58_Y,feature_59_N,feature_59_Y,feature_60_N,feature_60_Y,feature_72_O,feature_72_R
0,7,276730,1,0,1,53,1,2,650,53,...,0,1,0,1,0,1,1,0,0,1
1,0,98186500,19,2,1,56,1,2,760,49,...,0,0,0,1,0,1,1,0,0,1
2,0,3400000,0,0,2,52,1,0,774,52,...,0,0,1,0,0,1,1,0,0,1


## Split the data into train & test by index

In [247]:
# `final1` holds the train rows first, then the test rows (order of the join
# above), so split by position. The boundary is derived from the loaded frame
# instead of hard-coded row counts, so the split stays correct if the input
# files change size.
n_train = len(CIBIL_train)
train = final1.iloc[:n_train]  # x_train
test = final1.iloc[n_train:]   # x_test

# Guard against a mismatch between the engineered frame and the raw inputs
assert len(train) == len(CIBIL_train)
assert len(test) == len(CIBIL_test)

In [248]:
# Target column taken from the raw frames, kept as single-column DataFrames
target_train = CIBIL_train[['Bad_label']]  # y_train
target_test = CIBIL_test[['Bad_label']]    # y_test

### SMOTE to oversample due to the skewness in target


In [250]:
from imblearn.over_sampling import SMOTE

# Oversample the training set with SMOTE (fixed seed for reproducibility).
# fit_resample replaces the deprecated fit_sample alias, which newer
# imbalanced-learn releases no longer provide.
oversampler = SMOTE(random_state=0)
smote_train, smote_target = oversampler.fit_resample(train, target_train)
# NOTE(review): the Random Forest cell below fits on `train` / `target_train`,
# not on `smote_train` / `smote_target` — confirm whether the resampled data
# was meant to be used for training.

## Random Forest Classifier - Modelling


In [251]:
seed = 0   # We set our random seed to zero for reproducibility

# Random Forest parameters.
# The original dict listed 'max_features' twice (0.3, then 'sqrt'); Python keeps
# only the last value for a repeated key, so 'sqrt' was the setting actually in
# effect. The dead 0.3 entry is removed to make that explicit.
rf_params = {
    'n_jobs': -1,                # use all available cores
    'n_estimators': 800,
    'warm_start': True,
    'max_features': 'sqrt',
    'max_depth': 9,
    'min_samples_leaf': 2,
    'random_state': seed,
    'verbose': 0
}

In [252]:
# Build the (unfitted) classifier from the parameter dict defined above
rf = RandomForestClassifier(**rf_params)

In [253]:
# Fit the forest on the training features.
# `target_train` is a single-column DataFrame; flatten it to a 1-D array so
# scikit-learn receives the expected target shape instead of a column vector
# (which otherwise raises a DataConversionWarning).
# NOTE(review): this trains on the original, non-oversampled data; the SMOTE
# output (`smote_train`, `smote_target`) computed earlier is not used here —
# confirm which training set is intended.
rf.fit(train, target_train.values.ravel())
print("Fitting of Random Forest as finished")

Fitting of Random Forest as finished


In [254]:
# Predict the class label for every row of the held-out test features
rf_predictions = rf.predict(test)
print("Predictions finished")

Predictions finished


In [255]:
# Fraction of test rows whose predicted label matches the true 'Bad_label'.
# NOTE(review): the notebook describes the target as skewed, so accuracy alone
# may overstate model quality — consider also reporting per-class metrics.
accuracy_score(target_test, rf_predictions)

0.9559051628628407

## Feature Ranking via the Random Forest


In [256]:
# Scatter plot of the fitted forest's feature importances, one marker per column
importances = rf.feature_importances_
feature_names = final1.columns.values

# Marker colour encodes the importance value; size is constant
marker_style = dict(
    sizemode='diameter',
    sizeref=1,
    size=13,
    color=importances,
    colorscale='Portland',
    showscale=True,
)

trace = go.Scatter(
    x=feature_names,
    y=importances,
    mode='markers',
    marker=marker_style,
    text=feature_names,  # hover label
)
data = [trace]

xaxis_style = dict(
    ticklen=5,
    showgrid=False,
    zeroline=False,
    showline=False,
)
yaxis_style = dict(
    title='Feature Importance Level',
    showgrid=False,
    zeroline=False,
    ticklen=5,
    gridwidth=2,
)

layout = go.Layout(
    autosize=True,
    title='Random Forest Feature Importance',
    hovermode='closest',
    xaxis=xaxis_style,
    yaxis=yaxis_style,
    showlegend=False,
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter2010')