This notebook contains Exploratory Data Analysis followed by a Credit Score Model that predicts which loans are going to default.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas_datareader as pdr
import datetime as dt
import seaborn as sns

In [None]:
# Read the training data from the path below.
data = pd.read_csv("../input/lt-vehicle-loan-default-prediction/train.csv")
# NOTE: df is an alias of data, not a copy. Column assignments made through df in the
# conversion cell therefore also change data, and a later cell reads the converted
# date columns back from data.
df=data
data.head()

In [None]:
# Column dtypes and non-null counts before any conversion.
df.info()

Now, we will convert object datatype into an appropriate data type.

In [None]:
# Cast the two descriptive text columns to pandas' categorical dtype.
for categorical_column in ('Employment.Type', 'PERFORM_CNS.SCORE.DESCRIPTION'):
    df[categorical_column] = data[categorical_column].astype('category')

def dateconv(x,format):
    """Parse date strings and repair two-digit years that landed in the future.

    With a ``%y`` format, small two-digit years are parsed as 20xx, so a birth
    year written as ``68`` becomes 2068. Every parsed date whose year is later
    than the current year is therefore moved back by 100 years.

    Parameters
    ----------
    x : pandas.Series
        Date strings to parse.
    format : str
        ``strftime``-style format handed to ``pd.to_datetime``.

    Returns
    -------
    pandas.Series
        Parsed datetimes with future years shifted back one century.
    """
    # pd.datetime no longer exists in current pandas; pd.Timestamp is always available.
    current_year = pd.Timestamp.today().year
    dob = pd.to_datetime(x, format=format)
    in_future = dob.dt.year.gt(current_year)
    dob.loc[in_future] -= pd.DateOffset(years=100)
    return dob

def _duration_to_months(durations):
    """Convert duration strings to a total number of months.

    The text before "yrs" is read as whole years and the text between "yrs"
    and "mon" as months; the result is ``years * 12 + months``.
    """
    years_and_rest = durations.str.split("yrs", expand=True)
    # np.int was removed from NumPy; the builtin int is the drop-in replacement.
    years = years_and_rest[0].astype(int)
    months = years_and_rest[1].str.split("mon", expand=True)[0].astype(int)
    return years * 12 + months


# Both date columns are parsed as day-month-two-digit-year.
df['Date.of.Birth'] = dateconv(data['Date.of.Birth'], '%d-%m-%y')
df['DisbursalDate'] = dateconv(data['DisbursalDate'], '%d-%m-%y')

# Express both duration columns as a number of months.
df['CREDIT.HISTORY.LENGTH'] = _duration_to_months(data['CREDIT.HISTORY.LENGTH'])
df['AVERAGE.ACCT.AGE'] = _duration_to_months(data['AVERAGE.ACCT.AGE'])
df.info()

Now, we will remove id columns

In [None]:
# Identifier columns are removed before the analysis.
id_columns = [
    'UniqueID',
    'branch_id',
    'supplier_id',
    'manufacturer_id',
    'Current_pincode_ID',
    'Employee_code_ID',
]
df = df.drop(columns=id_columns)
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

Now, we will check if there are missing values in the dataset. Any missing values will be filled using **Mode Replacement**.

In [None]:
def _fill_with_mode(column):
    """Fill the missing entries of one column with its most frequent value."""
    most_frequent = column.value_counts().index[0]
    return column.fillna(most_frequent)


# Mode replacement, applied column by column.
df = df.apply(_fill_with_mode)
df.isnull().sum()

Below are some EDA charts and analysis

In [None]:
# Univariate view of the target: counts per loan_default value.
default_counts = df.loan_default.value_counts()
print(default_counts)
ax = default_counts.plot.bar()
ax.set_title('Default Count')

In [None]:
# Frequency of each employment type.
employment_counts = df['Employment.Type'].value_counts()
print(employment_counts)
ax = employment_counts.plot.bar()
ax.set_title('Employment Type')

In [None]:
# Frequency of each CNS score description.
score_description_counts = df['PERFORM_CNS.SCORE.DESCRIPTION'].value_counts()
print(score_description_counts)
ax = score_description_counts.plot.bar()
ax.set_title('CNS Score Description')

In [None]:
# Number of loans per state.
state_counts = df['State_ID'].value_counts()
ax = state_counts.plot.bar()
ax.set_title('State')

In [None]:
# Share of loans per state.
state_shares = df.State_ID.value_counts(normalize=True)
ax = state_shares.plot.bar()
ax.set_title('State - Relative count')

In [None]:
# Distribution of the disbursed amount, in 100 bins.
ax = df['disbursed_amount'].plot.hist(bins=100)
ax.set_title('Disbursed Amount Histogram')

In [None]:
# Box plot of the disbursed amount.
ax = df.boxplot(column='disbursed_amount')
ax.set_title('Disbursed Amount BoxPlot')

In [None]:
# Distribution of the ltv column.
ax = df['ltv'].plot.hist()
ax.set_title('LTV Histogram')

In [None]:
# Box plot of the ltv column.
ax = df.boxplot(column='ltv')
ax.set_title('LTV BoxPlot')

In [None]:
# Age = calendar year of disbursal minus calendar year of birth (months and days are ignored).
disbursal_year = pd.DatetimeIndex(data['DisbursalDate']).year
birth_year = pd.DatetimeIndex(data['Date.of.Birth']).year
df['age'] = disbursal_year - birth_year

ax = df['age'].plot.hist()
ax.set_title('Age Histogram')

In [None]:
# Box plot of the derived age column.
ax = df.boxplot(column='age')
ax.set_title('Age BoxPlot')

In [None]:
# Age distribution split by the target.
ax = sns.boxplot(data=df, x='loan_default', y='age')
ax.set_title('Age BoxPlot')

In [None]:
# ltv distribution split by the target.
ax = sns.boxplot(data=df, x='loan_default', y='ltv')
ax.set_title('LTV BoxPlot')

In [None]:
# Loan counts per employment type, coloured by the target.
ax = sns.countplot(data=df, x='Employment.Type', hue='loan_default')
ax.set_title('Employment Bar Graph')

In [None]:
# Row-normalised cross-tab: share of each loan_default value within every employment type.
Employment_default = pd.crosstab(
    index=df['Employment.Type'],
    columns=df['loan_default'],
    normalize='index',
)
ax = Employment_default.plot.bar()
ax.set_title('Normalized Employment Bar Graph')

In [None]:
# ltv distribution for each employment type.
ax = sns.boxplot(data=df, x='Employment.Type', y='ltv')
ax.set_title('Employment Type BoxPlot')

In [None]:
# Row-normalised cross-tab: share of each loan_default value within every CNS score description.
score_default = pd.crosstab(
    index=df['PERFORM_CNS.SCORE.DESCRIPTION'],
    columns=df['loan_default'],
    normalize='index',
)
ax = score_default.plot.bar()
# Title corrected: this is a normalised bar chart, not a box plot.
ax.set_title('Normalized CNS Score Description Bar Graph')

In [None]:
# Correlations are only defined for numeric columns. Recent pandas versions raise on
# categorical/text columns instead of silently skipping them, so select them explicitly.
numeric_columns = df.select_dtypes(include=['number', 'bool'])
corr = numeric_columns.corr()
ax = sns.heatmap(corr)
ax.set_title('Heat Map')

Now we will start with model creation. Before we start, we have to convert the categorical data into codes and preprocess the data: we drop some variables, keep only the 6 most frequent states while combining all other states into 'Other', and then create dummy variables for those states.

In [None]:
# Fix the category order first so the integer codes are: Self employed -> 0, Salaried -> 1.
employment_ordered = df['Employment.Type'].cat.reorder_categories(['Self employed', 'Salaried'])
df['Employment.Type'] = employment_ordered.cat.codes

In [None]:
# Preprocessing: remove columns that carry no additional information.
#   MobileNo_Avl_Flag              - all values are 1
#   Date.of.Birth, DisbursalDate   - already used to compute age
#   PERFORM_CNS.SCORE.DESCRIPTION  - the score itself is already in the dataset
redundant_columns = [
    'MobileNo_Avl_Flag',
    'Date.of.Birth',
    'DisbursalDate',
    'PERFORM_CNS.SCORE.DESCRIPTION',
]
df = df.drop(columns=redundant_columns)


In [None]:
# Share of rows per state. This is not the last expression of the cell, so it must be
# printed explicitly for the figures behind the comment below to appear in the output.
print(df.State_ID.value_counts()/len(df))

#70% of the data is in the first 6 states, hence every other state is relabelled as 'Other'
def stid(i):
    """Return the state id as a string for the six retained states, otherwise 'Other'."""
    retained_states = {state: str(state) for state in (4, 3, 6, 13, 9, 8)}
    return retained_states.get(i, 'Other')
# Collapse the state ids into the retained labels, then discard the raw id column.
df['State_ID_new'] = df['State_ID'].map(stid)
df = df.drop(columns=['State_ID'])

In [None]:
# One-hot encode the remaining non-numeric columns; drop_first omits the first level of each.
df_new = pd.get_dummies(df,drop_first=True) 
print(df_new.columns)

Now, we will first standardize the data and then split the data into train and test using stratify option.

In [None]:
#train test split
X = df_new.drop('loan_default',axis=1)
y = df_new['loan_default']

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Split BEFORE scaling. Fitting the scaler on all rows would let the means and standard
# deviations of the test rows leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, stratify=y, random_state=5)

# The scaler learns its statistics from the training rows only.
zs = StandardScaler()
zs.fit(X_train_raw)

X_train = pd.DataFrame(zs.transform(X_train_raw), columns=X.columns, index=X_train_raw.index)
X_test = pd.DataFrame(zs.transform(X_test_raw), columns=X.columns, index=X_test_raw.index)

# Kept for backward compatibility: all rows scaled with the train-fitted scaler.
Xt_z = pd.DataFrame(zs.transform(X), columns =X.columns)

Below are 5 default models and 5 Fine Tuned Models.

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline logistic regression with scikit-learn's default hyper-parameters.
logreg = LogisticRegression()
# fit() returns the estimator, so `fit` refers to the same fitted model as `logreg`.
fit=logreg.fit(X_train, y_train)

In [None]:
# One coefficient per feature, indexed by the training column names.
feature_list = list(X_train.columns)
importance = pd.DataFrame(
    logreg.coef_.T,
    index=feature_list,
    columns=["feature coefficient"],
)
importance

In [None]:
# Hard class predictions for the held-out split.
y_pred= logreg.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn import metrics
# Predictions are passed first, so rows are predicted labels and columns are actual labels
# (the transpose of scikit-learn's usual y_true-first layout).
print(confusion_matrix(y_pred,y_test))

In [None]:
cm = confusion_matrix(y_pred,y_test)
# Rows are predicted labels and columns actual labels, so ravel() yields TN, FN, FP, TP.
tn, fn, fp, tp = cm.ravel()

# accuracy: share of all rows classified correctly
accuracy_lr = (tn + tp) / (tn + fn + fp + tp)
print("accuracy" ,accuracy_lr)
# sensitivity (TPR): share of actual positives predicted positive
sensitivity_lr = tp / (fn + tp)
print("sensitivity" ,sensitivity_lr)
# specificity (TNR): share of actual negatives predicted negative
Specificity_lr = tn / (tn + fp)
print("Specificity" ,Specificity_lr)
# FPR: share of actual negatives predicted positive
FPR_lr = fp / (tn + fp)
print("FPR" ,FPR_lr)

In [None]:
from sklearn.metrics import roc_curve

# Scores for the class in column 1 of predict_proba drive the ROC curve.
y_pred_prob = logreg.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='Logistic Regression')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Logistic Regression ROC Curve')
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve for the y_pred_prob computed in the ROC-curve cell above;
# this value is compared across models in the summary table.
roc_lr = roc_auc_score(y_test, y_pred_prob)
roc_lr

## Logistic Regression Fine Tuned

In [None]:
# Hyper-parameter grid for the logistic-regression grid search.
max_iter = [100, 250, 500, 750]
C = [0.01, 0.1, 1.0]
param_grid = {'max_iter': max_iter, 'C': C}
param_grid

In [None]:
from sklearn.model_selection import GridSearchCV
lr = LogisticRegression()
# Cross-validated search over every combination in param_grid on the training split.
grid = GridSearchCV(estimator=lr, param_grid=param_grid)
grid_result = grid.fit(X_train, y_train)
print("Best Score: ",grid_result.best_score_)
# Label corrected: this line reports the winning parameters, not a score.
print("Best Params: ",grid_result.best_params_)

In [None]:
# Hard class predictions from the fitted grid search for the held-out split.
y_pred= grid.predict(X_test)
# Predictions are passed first, so rows are predicted labels and columns are actual labels.
cm_ft = confusion_matrix(y_pred,y_test)
cm_ft

In [None]:
# cm_ft has predicted labels on rows and actual labels on columns, so ravel() yields TN, FN, FP, TP.
tn, fn, fp, tp = cm_ft.ravel()

# accuracy: share of all rows classified correctly
accuracy_lrft = (tn + tp) / (tn + fn + fp + tp)
print("accuracy" ,accuracy_lrft)
# sensitivity (TPR): share of actual positives predicted positive
sensitivity_lrft = tp / (fn + tp)
print("sensitivity" ,sensitivity_lrft)
# specificity (TNR): share of actual negatives predicted negative
Specificity_lrft = tn / (tn + fp)
print("Specificity" ,Specificity_lrft)
# FPR: share of actual negatives predicted positive
FPR_lrft = fp / (tn + fp)
print("FPR" ,FPR_lrft)

In [None]:
from sklearn.metrics import roc_curve

# Scores for the class in column 1 of predict_proba drive the ROC curve.
y_pred_prob = grid.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Curve label now names the model actually plotted, and the legend makes it visible.
plt.plot(fpr, tpr, label='Logistic Regression Fine Tuned')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Logistic Regression Fine Tuned ROC Curve')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve for the y_pred_prob computed in the ROC-curve cell above;
# this value is compared across models in the summary table.
roc_lrft = roc_auc_score(y_test, y_pred_prob)
roc_lrft

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default hyper-parameters; random_state makes the fit repeatable.
DT = DecisionTreeClassifier(random_state=42)
DT.fit(X_train, y_train)

In [None]:
# Importance of every feature in the fitted tree, most important first.
feature_importances = DT.feature_importances_
feature_list = list(X_train.columns)
importance = pd.DataFrame({"feature importance": feature_importances}, index=feature_list)
importance.sort_values(by='feature importance', ascending=False)

In [None]:
# Hard class predictions for the held-out split.
y_pred= DT.predict(X_test)
# Predictions are passed first, so rows are predicted labels and columns are actual labels.
cm_dt = confusion_matrix(y_pred,y_test)
cm_dt

In [None]:
# cm_dt has predicted labels on rows and actual labels on columns, so ravel() yields TN, FN, FP, TP.
tn, fn, fp, tp = cm_dt.ravel()

# accuracy: share of all rows classified correctly
accuracy_dt = (tn + tp) / (tn + fn + fp + tp)
print("accuracy" ,accuracy_dt)
# sensitivity (TPR): share of actual positives predicted positive
sensitivity_dt = tp / (fn + tp)
print("sensitivity" ,sensitivity_dt)
# specificity (TNR): share of actual negatives predicted negative
Specificity_dt = tn / (tn + fp)
print("Specificity" ,Specificity_dt)
# FPR: share of actual negatives predicted positive
FPR_dt = fp / (tn + fp)
print("FPR" ,FPR_dt)

In [None]:
from sklearn.metrics import roc_curve

# Scores for the class in column 1 of predict_proba drive the ROC curve.
y_pred_prob = DT.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Curve label corrected (it was copy-pasted as 'Logistic Regression'); the legend makes it visible.
plt.plot(fpr, tpr, label='Decision Tree')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Decision Tree ROC Curve')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve for the y_pred_prob computed in the ROC-curve cell above;
# this value is compared across models in the summary table.
roc_dt = roc_auc_score(y_test, y_pred_prob)
roc_dt

## Decision Tree Fine Tuned

In [None]:
# Search space for the decision-tree grid search.
from sklearn.model_selection import GridSearchCV
max_depth = list(range(5, 10))
min_samples_leaf = list(range(500, 2500, 500))
max_features = [10, 15, 20, 25, 30]
param_grid = {
    'max_depth': max_depth,
    'min_samples_leaf': min_samples_leaf,
    'max_features': max_features,
}
param_grid

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Seeded so that the feature subsets drawn when max_features is below the number of
# features, and therefore the search result, are reproducible across runs.
DT = DecisionTreeClassifier(random_state=42)
grid = GridSearchCV(estimator=DT, param_grid=param_grid)

In [None]:
# Cross-validated search over every combination in param_grid on the training split.
grid_result = grid.fit(X_train, y_train)
print("Best Score: ",grid_result.best_score_)
# Label corrected: this line reports the winning parameters, not a score.
print("Best Params: ",grid_result.best_params_)

In [None]:
# Hard class predictions from the fitted grid search for the held-out split.
y_pred= grid.predict(X_test)
# Predictions are passed first, so rows are predicted labels and columns are actual labels.
cm_dt_ft = confusion_matrix(y_pred,y_test)
cm_dt_ft

In [None]:
# cm_dt_ft has predicted labels on rows and actual labels on columns, so ravel() yields TN, FN, FP, TP.
tn, fn, fp, tp = cm_dt_ft.ravel()

# accuracy: share of all rows classified correctly
accuracy_dt_ft = (tn + tp) / (tn + fn + fp + tp)
print("accuracy" ,accuracy_dt_ft)
# sensitivity (TPR): share of actual positives predicted positive
sensitivity_dt_ft = tp / (fn + tp)
print("sensitivity" ,sensitivity_dt_ft)
# specificity (TNR): share of actual negatives predicted negative
Specificity_dt_ft = tn / (tn + fp)
print("Specificity" ,Specificity_dt_ft)
# FPR: share of actual negatives predicted positive
FPR_dt_ft = fp / (tn + fp)
print("FPR" ,FPR_dt_ft)

In [None]:
from sklearn.metrics import roc_curve

# Scores for the class in column 1 of predict_proba drive the ROC curve.
y_pred_prob = grid.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Curve label corrected (it was copy-pasted as 'Logistic Regression'); the legend makes it visible.
plt.plot(fpr, tpr, label='Decision Tree Fine Tuned')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Decision Tree Fine Tuned ROC Curve')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve for the y_pred_prob computed in the ROC-curve cell above;
# this value is compared across models in the summary table.
roc_dt_ft = roc_auc_score(y_test, y_pred_prob)
roc_dt_ft

## BaggingClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Unfitted, seeded tree; the next cell passes it to BaggingClassifier as the base estimator.
DT = DecisionTreeClassifier(random_state=42)

In [None]:
from sklearn.ensemble import BaggingClassifier
# The base estimator is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn and the old name was later removed.
# random_state makes the bootstrap samples, and therefore the metrics, reproducible.
BC = BaggingClassifier(DT,oob_score=True,random_state=42)
BC.fit(X_train, y_train)

In [None]:
# Hard class predictions for the held-out split.
y_pred= BC.predict(X_test)
# Predictions are passed first, so rows are predicted labels and columns are actual labels.
cm_dt_bg = confusion_matrix(y_pred,y_test)
cm_dt_bg

In [None]:
# cm_dt_bg has predicted labels on rows and actual labels on columns, so ravel() yields TN, FN, FP, TP.
tn, fn, fp, tp = cm_dt_bg.ravel()

# accuracy: share of all rows classified correctly
accuracy_dt_bg = (tn + tp) / (tn + fn + fp + tp)
print("accuracy" ,accuracy_dt_bg)
# sensitivity (TPR): share of actual positives predicted positive
sensitivity_dt_bg = tp / (fn + tp)
print("sensitivity" ,sensitivity_dt_bg)
# specificity (TNR): share of actual negatives predicted negative
Specificity_dt_bg = tn / (tn + fp)
print("Specificity" ,Specificity_dt_bg)
# FPR: share of actual negatives predicted positive
FPR_dt_bg = fp / (tn + fp)
print("FPR" ,FPR_dt_bg)

In [None]:
from sklearn.metrics import roc_curve

# Scores for the class in column 1 of predict_proba drive the ROC curve.
y_pred_prob = BC.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Curve label corrected (it was copy-pasted as 'Logistic Regression'); the legend makes it visible.
plt.plot(fpr, tpr, label='Bagging Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Bagging Classifier ROC Curve')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve for the y_pred_prob computed in the ROC-curve cell above;
# this value is compared across models in the summary table.
roc_dt_bg = roc_auc_score(y_test, y_pred_prob)
roc_dt_bg

## Bagging Classifier Fine Tuned

In [None]:
# Search space for the bagging grid search.
from sklearn.model_selection import GridSearchCV
max_features = [15, 20, 25, 30]
n_estimators = list(range(8, 15, 2))
param_grid = {
    'max_features': max_features,
    'n_estimators': n_estimators,
}
param_grid

In [None]:
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
from sklearn.ensemble import BaggingClassifier
# The base estimator is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn and the old name was later removed.
BC = BaggingClassifier(DT,oob_score=True,random_state =42)
grid = GridSearchCV(estimator=BC, param_grid=param_grid)

In [None]:
# Cross-validated search over every combination in param_grid on the training split.
grid_result = grid.fit(X_train, y_train)
print("Best Score: ",grid_result.best_score_)
# Label corrected: this line reports the winning parameters, not a score.
print("Best Params: ",grid_result.best_params_)

In [None]:
# Hard class predictions from the fitted grid search for the held-out split.
y_pred= grid.predict(X_test)
# Predictions are passed first, so rows are predicted labels and columns are actual labels.
cm_dt_bg_ft = confusion_matrix(y_pred,y_test)
cm_dt_bg_ft

In [None]:
# cm_dt_bg_ft has predicted labels on rows and actual labels on columns, so ravel() yields TN, FN, FP, TP.
tn, fn, fp, tp = cm_dt_bg_ft.ravel()

# accuracy: share of all rows classified correctly
accuracy_bg_ft = (tn + tp) / (tn + fn + fp + tp)
print("accuracy" ,accuracy_bg_ft)
# sensitivity (TPR): share of actual positives predicted positive
sensitivity_bg_ft = tp / (fn + tp)
print("sensitivity" ,sensitivity_bg_ft)
# specificity (TNR): share of actual negatives predicted negative
Specificity_bg_ft = tn / (tn + fp)
print("Specificity" ,Specificity_bg_ft)
# FPR: share of actual negatives predicted positive
FPR_bg_ft = fp / (tn + fp)
print("FPR" ,FPR_bg_ft)

In [None]:
from sklearn.metrics import roc_curve

# Scores for the class in column 1 of predict_proba drive the ROC curve.
y_pred_prob = grid.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Curve label corrected (it was copy-pasted as 'Logistic Regression'); the legend makes it visible.
plt.plot(fpr, tpr, label='Bagging Classifier Fine Tuned')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Bagging Classifier Fine Tuned ROC Curve')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve for the y_pred_prob computed in the ROC-curve cell above;
# this value is compared across models in the summary table.
roc_bg_ft = roc_auc_score(y_test, y_pred_prob)
roc_bg_ft

## RandomForest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seeded so that the bootstrap samples and feature subsets, and therefore the reported
# metrics, are reproducible across runs.
RF = RandomForestClassifier(oob_score=True,random_state=42)
RF.fit(X_train, y_train)

In [None]:
# Hard class predictions for the held-out split.
y_pred= RF.predict(X_test)
# Predictions are passed first, so rows are predicted labels and columns are actual labels.
cm_rf = confusion_matrix(y_pred,y_test)
cm_rf

In [None]:
# cm_rf has predicted labels on rows and actual labels on columns, so ravel() yields TN, FN, FP, TP.
tn, fn, fp, tp = cm_rf.ravel()

# accuracy: share of all rows classified correctly
accuracy_rf = (tn + tp) / (tn + fn + fp + tp)
print("accuracy" ,accuracy_rf)
# sensitivity (TPR): share of actual positives predicted positive
sensitivity_rf = tp / (fn + tp)
print("sensitivity" ,sensitivity_rf)
# specificity (TNR): share of actual negatives predicted negative
Specificity_rf = tn / (tn + fp)
print("Specificity" ,Specificity_rf)
# FPR: share of actual negatives predicted positive
FPR_rf = fp / (tn + fp)
print("FPR" ,FPR_rf)

In [None]:
from sklearn.metrics import roc_curve

# Scores for the class in column 1 of predict_proba drive the ROC curve.
y_pred_prob = RF.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Curve label corrected (it was copy-pasted as 'Logistic Regression'); the legend makes it visible.
plt.plot(fpr, tpr, label='Random Forest')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Random Forest ROC Curve')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve for the y_pred_prob computed in the ROC-curve cell above;
# this value is compared across models in the summary table.
roc_rf = roc_auc_score(y_test, y_pred_prob)
roc_rf

## Random Forest Classifier Fine Tuned

In [None]:
# Search space for the random-forest grid search.
from sklearn.model_selection import GridSearchCV
max_depth = list(range(5, 10, 2))
min_samples_leaf = list(range(500, 2500, 1000))
max_features = [20, 25, 30]
n_estimators = list(range(10, 15, 2))
param_grid = {
    'max_depth': max_depth,
    'min_samples_leaf': min_samples_leaf,
    'max_features': max_features,
    'n_estimators': n_estimators,
}
param_grid

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seeded forest used as the template estimator for the grid search.
RF = RandomForestClassifier(oob_score=True,random_state =42)
# Rebinds `grid`, which earlier sections also used for their own searches.
grid = GridSearchCV(estimator=RF, param_grid=param_grid)

In [None]:
# Cross-validated search over every combination in param_grid on the training split.
grid_result = grid.fit(X_train, y_train)
print("Best Score: ",grid_result.best_score_)
# Label corrected: this line reports the winning parameters, not a score.
print("Best Params: ",grid_result.best_params_)

In [None]:
# Hard class predictions from the fitted grid search for the held-out split.
y_pred= grid.predict(X_test)
# Predictions are passed first, so rows are predicted labels and columns are actual labels.
cm_rf_ft = confusion_matrix(y_pred,y_test)
cm_rf_ft

In [None]:
# cm_rf_ft has predicted labels on rows and actual labels on columns, so ravel() yields TN, FN, FP, TP.
tn, fn, fp, tp = cm_rf_ft.ravel()

# accuracy: share of all rows classified correctly
accuracy_rf_ft = (tn + tp) / (tn + fn + fp + tp)
print("accuracy" ,accuracy_rf_ft)
# sensitivity (TPR): share of actual positives predicted positive
sensitivity_rf_ft = tp / (fn + tp)
print("sensitivity" ,sensitivity_rf_ft)
# specificity (TNR): share of actual negatives predicted negative
Specificity_rf_ft = tn / (tn + fp)
print("Specificity" ,Specificity_rf_ft)
# FPR: share of actual negatives predicted positive
FPR_rf_ft = fp / (tn + fp)
print("FPR" ,FPR_rf_ft)

In [None]:
from sklearn.metrics import roc_curve

# Scores for the class in column 1 of predict_proba drive the ROC curve.
y_pred_prob = grid.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Curve label corrected (it was copy-pasted as 'Logistic Regression'); the legend makes it visible.
plt.plot(fpr, tpr, label='Random Forest Fine Tuned')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Random Forest Fine Tuned ROC Curve')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve for the y_pred_prob computed in the ROC-curve cell above;
# this value is compared across models in the summary table.
roc_rf_ft = roc_auc_score(y_test, y_pred_prob)
roc_rf_ft

## GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with default hyper-parameters and a fixed seed.
clf = GradientBoostingClassifier(random_state=100)
clf.fit(X_train, y_train)


In [None]:
# Hard class predictions for the held-out split.
y_pred= clf.predict(X_test)
# Predictions are passed first, so rows are predicted labels and columns are actual labels.
cm_gbf = confusion_matrix(y_pred,y_test)
cm_gbf

In [None]:
# cm_gbf has predicted labels on rows and actual labels on columns, so ravel() yields TN, FN, FP, TP.
tn, fn, fp, tp = cm_gbf.ravel()

# accuracy: share of all rows classified correctly
accuracy_gbf = (tn + tp) / (tn + fn + fp + tp)
print("accuracy" ,accuracy_gbf)
# sensitivity (TPR): share of actual positives predicted positive
sensitivity_gbf = tp / (fn + tp)
print("sensitivity" ,sensitivity_gbf)
# specificity (TNR): share of actual negatives predicted negative
Specificity_gbf = tn / (tn + fp)
print("Specificity" ,Specificity_gbf)
# FPR: share of actual negatives predicted positive
FPR_gbf = fp / (tn + fp)
print("FPR" ,FPR_gbf)

In [None]:
from sklearn.metrics import roc_curve

# Scores for the class in column 1 of predict_proba drive the ROC curve.
y_pred_prob = clf.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Curve label corrected (it was copy-pasted as 'Logistic Regression'); the legend makes it visible.
plt.plot(fpr, tpr, label='Gradient Boosting')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Gradient Boosting ROC Curve')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve for the y_pred_prob computed in the ROC-curve cell above;
# this value is compared across models in the summary table.
roc_gbf = roc_auc_score(y_test, y_pred_prob)
roc_gbf

## GradientBoostingClassifier Fine Tuned

In [None]:
# Search space for the gradient-boosting grid search.
from sklearn.model_selection import GridSearchCV
max_depth = list(range(3, 6, 2))
min_samples_leaf = list(range(500, 2500, 1000))
max_features = [20, 25, 30]
n_estimators = list(range(10, 15, 2))
param_grid = {
    'max_depth': max_depth,
    'min_samples_leaf': min_samples_leaf,
    'max_features': max_features,
    'n_estimators': n_estimators,
}
param_grid

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Rebinds clf to a new, unfitted estimator that serves as the template for the grid search.
clf = GradientBoostingClassifier(random_state=100)
grid = GridSearchCV(estimator=clf, param_grid=param_grid)

In [None]:
# Cross-validated search over every combination in param_grid on the training split.
grid_result = grid.fit(X_train, y_train)
print("Best Score: ",grid_result.best_score_)
# Label corrected: this line reports the winning parameters, not a score.
print("Best Params: ",grid_result.best_params_)

In [None]:
# Hard class predictions from the fitted grid search for the held-out split.
y_pred= grid.predict(X_test)
# Predictions are passed first, so rows are predicted labels and columns are actual labels.
cm_gbf_ft = confusion_matrix(y_pred,y_test)
cm_gbf_ft

In [None]:
# cm_gbf_ft has predicted labels on rows and actual labels on columns, so ravel() yields TN, FN, FP, TP.
tn, fn, fp, tp = cm_gbf_ft.ravel()

# accuracy: share of all rows classified correctly
accuracy_gbf_ft = (tn + tp) / (tn + fn + fp + tp)
print("accuracy" ,accuracy_gbf_ft)
# sensitivity (TPR): share of actual positives predicted positive
sensitivity_gbf_ft = tp / (fn + tp)
print("sensitivity" ,sensitivity_gbf_ft)
# specificity (TNR): share of actual negatives predicted negative
Specificity_gbf_ft = tn / (tn + fp)
print("Specificity" ,Specificity_gbf_ft)
# FPR: share of actual negatives predicted positive
FPR_gbf_ft = fp / (tn + fp)
print("FPR" ,FPR_gbf_ft)

In [None]:
from sklearn.metrics import roc_curve

# Scores for the class in column 1 of predict_proba drive the ROC curve.
y_pred_prob = grid.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Curve label corrected (it was copy-pasted as 'Logistic Regression'); the legend makes it visible.
plt.plot(fpr, tpr, label='Gradient Boosting Fine Tuned')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Gradient Boosting Fine Tuned ROC Curve')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve for the y_pred_prob computed in the ROC-curve cell above;
# this value is compared across models in the summary table.
roc_gbf_ft = roc_auc_score(y_test, y_pred_prob)
roc_gbf_ft

# ALL MODEL COMPUTATION

In [None]:
def _as_percent(value):
    """Format a fraction as a percentage string with two decimals."""
    return "{0:.2f}%".format(value * 100)


# Row labels of the summary table, in the order the scores are listed below.
_metric_names = ('Accuracy', 'Sensitivity', 'Specificity', 'FPR', 'ROC')

# One entry per model: (accuracy, sensitivity, specificity, FPR, ROC AUC).
_model_scores = {
    'Logistic Regression': (accuracy_lr, sensitivity_lr, Specificity_lr, FPR_lr, roc_lr),
    'Logistic Regression Fine Tuned': (accuracy_lrft, sensitivity_lrft, Specificity_lrft, FPR_lrft, roc_lrft),
    'Decision Tree': (accuracy_dt, sensitivity_dt, Specificity_dt, FPR_dt, roc_dt),
    'Decision Tree Fine Tuned': (accuracy_dt_ft, sensitivity_dt_ft, Specificity_dt_ft, FPR_dt_ft, roc_dt_ft),
    'Bagging Classifier': (accuracy_dt_bg, sensitivity_dt_bg, Specificity_dt_bg, FPR_dt_bg, roc_dt_bg),
    'Bagging Classifier Fine Tuned': (accuracy_bg_ft, sensitivity_bg_ft, Specificity_bg_ft, FPR_bg_ft, roc_bg_ft),
    'Random Forest': (accuracy_rf, sensitivity_rf, Specificity_rf, FPR_rf, roc_rf),
    'Random Forest Fine Tuned': (accuracy_rf_ft, sensitivity_rf_ft, Specificity_rf_ft, FPR_rf_ft, roc_rf_ft),
    'Gradient Boosting': (accuracy_gbf, sensitivity_gbf, Specificity_gbf, FPR_gbf, roc_gbf),
    'Gradient Boosting Fine Tuned': (accuracy_gbf_ft, sensitivity_gbf_ft, Specificity_gbf_ft, FPR_gbf_ft, roc_gbf_ft),
}

# Columns are models, rows are metrics, and every cell is a formatted percentage.
model = pd.DataFrame({
    model_name: {
        metric: _as_percent(score)
        for metric, score in zip(_metric_names, scores)
    }
    for model_name, scores in _model_scores.items()
})
model.head()

Since the objective of the model is to predict the number of loans which are going to default, we will focus on **Sensitivity** rather than the accuracy of the model. To weigh sensitivity against specificity, we will compare the ROC values of the different models.

Based on above table, we can observe that **Gradient Boosting** has the highest ROC value and hence, we will select Gradient Boosting as our model for prediction.


Because the model's default threshold for dividing the predicted probabilities into Default or Not Default is 0.5, we are getting a very low Sensitivity value. To find the optimal Sensitivity value, we will plot Sensitivity and Specificity for different threshold values.

In [None]:
# Gradient Boosting is best

from sklearn.ensemble import GradientBoostingClassifier
# clf was rebound to a new, unfitted estimator in the fine-tuning section, so the
# default gradient-boosting model is created and fitted again here.
clf = GradientBoostingClassifier(random_state=100)
clf.fit(X_train, y_train)

In [None]:
#Checking optimal threshold value to get decent sensitivity value

from sklearn.metrics import roc_curve

# Scores for the class in column 1 of predict_proba, evaluated at every threshold roc_curve returns.
y_pred_prob = clf.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Sensitivity is the TPR and specificity is 1 - FPR at each threshold.
plt.plot(thresholds, tpr, label='Sensitivity')
plt.plot(thresholds, 1-fpr, label='Specificity')
plt.xlabel('Thresholds')
plt.ylabel('Rate')
# Title corrected: this plot shows rates against the threshold, not a ROC curve.
plt.title('Gradient Boosting Sensitivity and Specificity vs Threshold')
# NOTE(review): 0.23 was chosen by trial and error using the test split; consider
# selecting it on a separate validation split.
plt.axvline(0.23, c='black', ls='dashed')
plt.xlim(0,1)
plt.legend()
plt.show()

We need to find a threshold value that gives a high Sensitivity value without compromising the Specificity value. From the above graph, it is evident that Sensitivity is a downward-sloping curve whereas Specificity is an upward-sloping curve, and at their intersection point we get the highest values of Sensitivity and Specificity simultaneously.

The two curves intersect at **threshold = 0.23**, and hence we will select 0.23 as the threshold value for deciding whether a loan is going to default or not.

In [None]:
# Apply the 0.23 cut-off read from the threshold plot instead of the default 0.5.
default_probability = clf.predict_proba(X_test)[:, 1]
y_pred = (default_probability >= 0.23).astype(int)
# Predictions are passed first, so rows are predicted labels and columns are actual labels.
cm_final = confusion_matrix(y_pred, y_test)
cm_final

In [None]:
# cm_final has predicted labels on rows and actual labels on columns, so ravel() yields TN, FN, FP, TP.
tn, fn, fp, tp = cm_final.ravel()

# accuracy: share of all rows classified correctly
accuracy_final = (tn + tp) / (tn + fn + fp + tp)
print("accuracy" ,accuracy_final)
# sensitivity (TPR): share of actual positives predicted positive
sensitivity_final = tp / (fn + tp)
print("sensitivity" ,sensitivity_final)
# specificity (TNR): share of actual negatives predicted negative
Specificity_final = tn / (tn + fp)
print("Specificity" ,Specificity_final)
# FPR: share of actual negatives predicted positive
FPR_final = fp / (tn + fp)
print("FPR" ,FPR_final)

From the above values, we can observe that although the accuracy of the model may have gone down, the model now gives very good sensitivity as well as specificity values.