In [None]:
#Importing Required Library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#SMOTE to balance the Imbalance Data
from imblearn.over_sampling import SMOTE

#for Spliting Data and Hyperparameter Tuning 
from sklearn.model_selection import train_test_split, GridSearchCV

#Importing Machine Learning Model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from catboost import CatBoostClassifier
    
#Bagging Algo
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier

from sklearn.neural_network import MLPClassifier

#To tranform data
from sklearn import preprocessing

#statistical Tools
from sklearn.metrics import roc_auc_score,accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix, roc_curve, auc

#Setting Format
pd.options.display.float_format = '{:.5f}'.format
pd.options.display.max_columns = None
pd.options.display.max_rows = None

import warnings
warnings.filterwarnings("ignore")

In [None]:
train = pd.read_csv("../input/lt-vehicle-loan-default-prediction/train.csv")
test = pd.read_csv("../input/lt-vehicle-loan-default-prediction/test.csv")

In [None]:
test.shape,train.shape

In [None]:
train.loan_default.value_counts().plot(kind='bar')

In [None]:
#Lets looks at data description
info = pd.read_csv("../input/lt-vehicle-loan-default-prediction/data_dictionary.csv")
info

In [None]:
train.describe().T

In [None]:
train.info()

In [None]:
#Replacing every '.' in the column names with '_'
# regex=False makes '.' a literal dot; as a regular expression it would match
# any character and turn whole column names into underscores.
train.columns = train.columns.str.replace('.', '_', regex=False)

In [None]:
train.isna().sum()

#So only Employment Type data is missing

In [None]:
#Data Correlation
# Only numeric columns can be correlated. At this point the frame still holds
# text columns (e.g. AVERAGE_ACCT_AGE, PERFORM_CNS_SCORE_DESCRIPTION), and
# newer pandas versions raise instead of silently skipping them.
plt.figure(figsize=(12,8))
sns.heatmap(train.select_dtypes(include='number').corr())

# Digging Few Columns for Insight

In [None]:
#Lets Look at few columns

columns_unique = ['UniqueID','MobileNo_Avl_Flag',
         'Current_pincode_ID','Employee_code_ID',
         'NO_OF_INQUIRIES','State_ID',
         'branch_id','manufacturer_id','supplier_id']


unique_col = train[columns_unique]

In [None]:
unique_col.head()

In [None]:
#Looking at all unique values
for i in unique_col.columns:
    print(i," : distinct_value")
    print(unique_col[i].nunique()," : No. of unique Items")
    #print(unique_col[i].unique())
    print("-"*30)
    print("")

In [None]:
unique_col.hist(bins=5, figsize=(16,12))
plt.show()

UniqueID = It is provided to every customer so its Unique and will always be different

MobileNo_Avl_Flag = Whether person provided Mobile No. Doesn't tell us if loan will default

Current_pincode_ID = It is Customers address we don't need that for Prediction

Employee_code_ID = Employee ID is not required as it isn't related to loan_default

NO_OF_INQUIRIES = The number of loan inquiries doesn't help us determine whether the loan will default or not

State_ID = It is where loan is availed and doesn't add much to prediction to loan default

branch_id = Branch ID isn't relevant to Data Processing

manufacturer_id = Manufacturer ID doesn't add much to the data

supplier_id = Supplier ID doesn't add much to the data


In [None]:
def columns_drop(data):
    """Drop the identifier-like columns listed in ``columns_unique`` from ``data`` in place.

    The label list is passed explicitly rather than handing a whole DataFrame
    to ``drop`` and relying on it being iterated into its column names.
    """
    data.drop(columns=columns_unique, inplace=True)

In [None]:
columns_drop(train)

In [None]:
#Now we have 2 Columns named "AVERAGE_ACCT_AGE" & "CREDIT_HISTORY_LENGTH".
#They have AplhNumeric Values Lets change them to Months

def change_col_month(col):
    """Convert a duration string such as '1yrs 11mon' into a total month count.

    The first whitespace-separated token gives the years (with 'yrs' removed)
    and the second gives the months (with 'mon' removed).
    """
    tokens = col.split()
    years = int(tokens[0].replace('yrs', ''))
    months = int(tokens[1].replace('mon', ''))
    return 12 * years + months

def months_transformation(data):
    """Rewrite the two duration columns of ``data`` in place as month counts.

    Each value of 'CREDIT_HISTORY_LENGTH' and 'AVERAGE_ACCT_AGE' is passed
    through ``change_col_month``.
    """
    for duration_col in ('CREDIT_HISTORY_LENGTH', 'AVERAGE_ACCT_AGE'):
        data[duration_col] = data[duration_col].apply(change_col_month)

In [None]:
months_transformation(train)

In [None]:
train.head()

In [None]:
# Keep only rows with an average account age below 175 months, and plot that
# filtered frame (previously the filter was built but the full frame was plotted).
plot = train[train['AVERAGE_ACCT_AGE']<175]
sns.lineplot(x=plot['AVERAGE_ACCT_AGE'],y=plot['loan_default'])

In [None]:
# Keep only rows with a credit history below 200 months, and plot that
# filtered frame (previously the filter was built but the full frame was plotted).
plot = train[train['CREDIT_HISTORY_LENGTH']<200]
sns.lineplot(x=plot['CREDIT_HISTORY_LENGTH'],y=plot['loan_default'])

# Transform CNS Score And Create New Columns

Now let's look at the CNS Score Description




In [None]:
train.PERFORM_CNS_SCORE_DESCRIPTION.value_counts()

In [None]:
def replace_not_scored(n):
    """Return the text before the first '-' in ``n``, or 'N' if ``n`` has no '-'."""
    prefix, dash, _ = n.partition("-")
    return prefix if dash else 'N'

def transform_CNS_Description(data):
    """Add a numeric 'CNS_SCORE_DESCRIPTION' column to ``data`` in place.

    The letter prefix of each 'PERFORM_CNS_SCORE_DESCRIPTION' value is taken
    with ``replace_not_scored`` (descriptions without a '-' become 'N') and then
    encoded through ``sub_risk``.

    Raises:
        KeyError: if a prefix is not a key of ``sub_risk``.
    """
    # Builtin `object` replaces the `np.object` alias, which newer NumPy
    # releases no longer provide.
    data['CNS_SCORE_DESCRIPTION'] = data['PERFORM_CNS_SCORE_DESCRIPTION'].apply(replace_not_scored).astype(object)

    #Now Transform CNS Score Description data into Numbers

    sub_risk = {'N':-1, 'K':0, 'J':1, 'I':2, 'H':3, 'G':4, 'E':5,'F':6, 'L':7, 'M':8, 'B':9, 'D':10, 'A':11, 'C':12}

    data['CNS_SCORE_DESCRIPTION'] = data['CNS_SCORE_DESCRIPTION'].apply(lambda x: sub_risk[x])
    
transform_CNS_Description(train)

In [None]:
train.head()

In [None]:
def transform_PERFORM_CNS_SCORE_DESCRIPTION(data):
    """Encode 'PERFORM_CNS_SCORE_DESCRIPTION' in ``data`` in place as a numeric risk level.

    Letter-prefixed grades are first merged into their common risk group, then
    every group is mapped to a number: 4 (very low risk) down to 0 (very high
    risk), and -1 for all "no history" / "not scored" descriptions. Values
    found in neither mapping become NaN.
    """
    column = 'PERFORM_CNS_SCORE_DESCRIPTION'

    #Replacing all the values into Common Group
    grouped_labels = {'C-Very Low Risk':'Very Low Risk',
                      'A-Very Low Risk':'Very Low Risk',
                      'D-Very Low Risk':'Very Low Risk',
                      'B-Very Low Risk':'Very Low Risk',
                      'M-Very High Risk':'Very High Risk',
                      'L-Very High Risk':'Very High Risk',
                      'F-Low Risk':'Low Risk',
                      'E-Low Risk':'Low Risk',
                      'G-Low Risk':'Low Risk',
                      'H-Medium Risk':'Medium Risk',
                      'I-Medium Risk':'Medium Risk',
                      'J-High Risk':'High Risk',
                      'K-High Risk':'High Risk'}

    #Transforming them into Numeric Features
    risk_map = {'No Bureau History Available':-1,
                'Not Scored: No Activity seen on the customer (Inactive)':-1,
                'Not Scored: Sufficient History Not Available':-1,
                'Not Scored: No Updates available in last 36 months':-1,
                'Not Scored: Only a Guarantor':-1,
                'Not Scored: More than 50 active Accounts found':-1,
                'Not Scored: Not Enough Info available on the customer':-1,
                'Very Low Risk':4,
                'Low Risk':3,
                'Medium Risk':2,
                'High Risk':1,
                'Very High Risk':0}

    # Assign the result back to the frame. Calling replace(inplace=True) on a
    # column selection mutates a temporary object, so the grouping can be lost
    # and the letter-prefixed grades would then map to NaN.
    data[column] = data[column].replace(grouped_labels).map(risk_map)

transform_PERFORM_CNS_SCORE_DESCRIPTION(train)

In [None]:
train.head()

In [None]:
sns.countplot(x = train['PERFORM_CNS_SCORE_DESCRIPTION'])

# Treating Missing Values

In [None]:
train.Employment_Type.value_counts()

In [None]:
defa = pd.crosstab(train['Employment_Type'], train['loan_default'])
print(defa)

In [None]:
def fill_employment_type(data):
    """Fill missing 'Employment_Type' values with 'Salaried' and encode the column in place.

    Encoding: 'Self employed' -> 0, 'Salaried' -> 1, 'Not_employed' -> -1.

    Raises:
        KeyError: if the column holds any other label.
    """
    employment_map = {'Self employed':0, 'Salaried':1, 'Not_employed':-1}

    def _encode(label):
        return employment_map[label]

    data['Employment_Type'] = data['Employment_Type'].fillna('Salaried')
    data['Employment_Type'] = data['Employment_Type'].apply(_encode)
fill_employment_type(train)

In [None]:
import scipy.stats as stats
chi_sq, p_value, deg_freedom, exp_freq = stats.chi2_contingency(defa)
print('Chi Square Statistics',chi_sq)
print('p-value',p_value)
print('Degree of freedom',deg_freedom)

In [None]:
sns.countplot(x = train['Employment_Type'])

# Transforming Primary and Secondary Accounts

In [None]:
pri_columns = ['PRI_NO_OF_ACCTS','SEC_NO_OF_ACCTS',
           'PRI_ACTIVE_ACCTS','SEC_ACTIVE_ACCTS',
           'PRI_OVERDUE_ACCTS','SEC_OVERDUE_ACCTS',
           'PRI_CURRENT_BALANCE','SEC_CURRENT_BALANCE',
           'PRI_SANCTIONED_AMOUNT','SEC_SANCTIONED_AMOUNT',
           'PRI_DISBURSED_AMOUNT','SEC_DISBURSED_AMOUNT',
           'PRIMARY_INSTAL_AMT', 'SEC_INSTAL_AMT']

pri_df = train[pri_columns]

In [None]:
def new_col(data):
    """Merge each primary/secondary account column pair into one total column, in place.

    For every pair a new column holding ``primary + secondary`` is added, after
    which all fourteen source columns are dropped from ``data``.
    """
    # (new total column, primary source column, secondary source column)
    column_pairs = [
        ('NO_OF_ACCTS', 'PRI_NO_OF_ACCTS', 'SEC_NO_OF_ACCTS'),
        ('ACTIVE_ACCTS', 'PRI_ACTIVE_ACCTS', 'SEC_ACTIVE_ACCTS'),
        ('OVERDUE_ACCTS', 'PRI_OVERDUE_ACCTS', 'SEC_OVERDUE_ACCTS'),
        ('CURRENT_BALANCE', 'PRI_CURRENT_BALANCE', 'SEC_CURRENT_BALANCE'),
        ('SANCTIONED_AMOUNT', 'PRI_SANCTIONED_AMOUNT', 'SEC_SANCTIONED_AMOUNT'),
        ('DISBURSED_AMOUNT', 'PRI_DISBURSED_AMOUNT', 'SEC_DISBURSED_AMOUNT'),
        # The instalment total must add the secondary *instalment* amount; it
        # previously added SEC_SANCTIONED_AMOUNT by mistake.
        ('INSTAL_AMT', 'PRIMARY_INSTAL_AMT', 'SEC_INSTAL_AMT'),
    ]

    for total_col, primary_col, secondary_col in column_pairs:
        data[total_col] = data[primary_col] + data[secondary_col]

    source_columns = [name for _, primary_col, secondary_col in column_pairs
                      for name in (primary_col, secondary_col)]
    data.drop(source_columns, axis=1, inplace=True)

new_col(train)

In [None]:
new_columns = ['NO_OF_ACCTS', 'ACTIVE_ACCTS', 'OVERDUE_ACCTS', 'CURRENT_BALANCE',
       'SANCTIONED_AMOUNT', 'DISBURSED_AMOUNT', 'INSTAL_AMT']

for i in new_columns:
    print(i," : distinct_value")
    print(train[i].nunique()," : No. of unique Items")
    #print(data[i].unique())
    print("-"*30)
    print("")

# Visualization and Treating Outliers

In [None]:
sns.scatterplot(data=train['ACTIVE_ACCTS'])

In [None]:
sns.scatterplot(data=train['NO_OF_ACCTS'])

In [None]:
sns.scatterplot(data=train['OVERDUE_ACCTS'])

In [None]:
sns.scatterplot(data = train['CURRENT_BALANCE'])

In [None]:
def mode_impute_outlier(data):
    """Overwrite the largest values of four account columns with the column mode, in place.

    For each column the ``n`` largest entries are treated as outliers and are
    replaced by the mode of the remaining rows, where ``n`` is 3 for
    'ACTIVE_ACCTS', 4 for 'NO_OF_ACCTS', 10 for 'OVERDUE_ACCTS' and 15 for
    'CURRENT_BALANCE'. A column is left untouched when no rows remain to take
    a mode from.
    """
    # column -> how many of its largest values are treated as outliers
    outlier_counts = {
        'ACTIVE_ACCTS': 3,
        'NO_OF_ACCTS': 4,
        'OVERDUE_ACCTS': 10,
        'CURRENT_BALANCE': 15,
    }

    for column, n_largest in outlier_counts.items():
        outlier_idx = data[column].sort_values().index[-n_largest:]
        modes = data[column].drop(outlier_idx).mode()
        if modes.empty:
            continue
        # mode() returns a Series; on ties take the first (smallest) value
        # explicitly instead of calling int() on the whole Series.
        replacement = int(modes.iloc[0])
        # A single .loc assignment writes into the frame itself, unlike the
        # chained data[column][idx] = ... form.
        data.loc[outlier_idx, column] = replacement

In [None]:
mode_impute_outlier(train)

In [None]:
train.head()

# Lets take a look at Date of Birth Column

In [None]:
train.Date_of_Birth.min(), train.Date_of_Birth.max()

In [None]:
# Explicit copy: columns of df_age are reassigned later in the notebook, so it
# must own its data rather than be a selection taken from train.
df_age = train[['disbursed_amount', 'asset_cost', 'ltv', 'Date_of_Birth','DisbursalDate','loan_default']].copy()
df_age.tail()

Date_of_Birth = Date of birth of the customer	

Disbursal_Date = Date of disbursement

Disbursement means the payment of money from a fund.

In [None]:
def age(dob):
    """Expand the two-digit year at the end of a date string into a four-digit year.

    The last two characters of ``dob`` are read as an integer; values 0-19 are
    placed in the 2000s and all other values in the 1900s. Despite its name,
    the function returns a calendar year, not an age.
    """
    two_digit_year = int(dob[-2:])
    century = 2000 if 0 <= two_digit_year < 20 else 1900
    return century + two_digit_year
        
# Age = disbursal year minus birth year; the two date columns are only needed
# to derive it and are removed afterwards.
df_age = (
    df_age
    .assign(Age=lambda frame: frame['DisbursalDate'].apply(age) - frame['Date_of_Birth'].apply(age))
    .drop(columns=['DisbursalDate', 'Date_of_Birth'])
)

df_age.head()

In [None]:
def calculate_age(data):
    """Add an 'Age' column to ``data`` in place and remove the two date columns.

    'Age' is the disbursal year minus the birth year, i.e. the applicant's age
    when the loan was disbursed. Both years are parsed with ``age``.
    """
    birth_year = data['Date_of_Birth'].apply(age)
    disbursal_year = data['DisbursalDate'].apply(age)
    # Age of applicant when he/she applied for Loan
    data['Age'] = disbursal_year - birth_year
    # Drop in place. Rebinding the local name (data = data.drop(...)) left both
    # date columns in the caller's frame.
    data.drop(['DisbursalDate', 'Date_of_Birth'], axis=1, inplace=True)
    
calculate_age(train)

In [None]:
train.describe().T

In [None]:
# Bookkeeping of what column_to_transform did; entries are appended on every call.
transformed = []            # Box-Cox applied directly (column minimum > 0)
transformed_with_one = []   # Box-Cox applied to column + 1 (column minimum == 0)
not_transformed = []        # left unchanged (negative or NaN minimum)

def column_to_transform(data):
    """Box-Cox transform the numeric feature columns of ``data`` in place.

    Each column in ``num_col`` is handled according to its minimum:
    strictly positive columns are transformed directly, columns whose minimum
    is 0 are shifted by +1 first, and all other columns are left untouched.
    The column name is appended to the matching module-level list.

    NOTE(review): ``boxcox`` is called without a fixed lambda, so every call
    estimates its own parameter per column; frames processed in separate calls
    are therefore not guaranteed to receive the same transformation.

    Raises:
        KeyError: if any expected column is missing from ``data``.
    """
    from scipy.stats import boxcox

    num_col = ['disbursed_amount', 'asset_cost', 'ltv', 'PERFORM_CNS_SCORE',
            'NEW_ACCTS_IN_LAST_SIX_MONTHS', 'CREDIT_HISTORY_LENGTH',
           'DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS', 'AVERAGE_ACCT_AGE',
           'NO_OF_ACCTS', 'ACTIVE_ACCTS', 'OVERDUE_ACCTS', 'CURRENT_BALANCE',
           'SANCTIONED_AMOUNT', 'DISBURSED_AMOUNT', 'INSTAL_AMT', 'Age']

    # Fail before touching anything if the frame is incomplete.
    missing = [name for name in num_col if name not in data.columns]
    if missing:
        raise KeyError(f"columns missing from data: {missing}")

    for column in num_col:
        lowest = data[column].min()
        if lowest > 0:
            data[column] = boxcox(data[column])[0]
            transformed.append(column)
        elif lowest == 0:
            # Box-Cox needs strictly positive input, hence the +1 shift.
            data[column] = boxcox(data[column] + 1)[0]
            transformed_with_one.append(column)
        else:
            not_transformed.append(column)
    print("Successful")

column_to_transform(train)

In [None]:
train.describe().T

In [None]:
def data_processing(data):
    """Run the full training-time preprocessing on ``data`` in place and scale it.

    The steps match those applied to the training frame earlier in the
    notebook: column renaming, dropping identifier columns, duration parsing,
    CNS score encoding, employment encoding, account-column merging, outlier
    imputation, age derivation and Box-Cox transformation. Finally the values
    are scaled with the module-level ``scaler``, which must already be fitted
    when this function is called.

    Returns:
        The shape of the processed frame.
    """
    # Rename the columns of the frame being processed (previously the global
    # `test` was renamed regardless of the argument); '.' is matched literally.
    data.columns = data.columns.str.replace('.', '_', regex=False)
    columns_drop(data)
    months_transformation(data)
    transform_CNS_Description(data)
    transform_PERFORM_CNS_SCORE_DESCRIPTION(data)
    fill_employment_type(data)
    new_col(data)
    mode_impute_outlier(data)
    calculate_age(data)
    column_to_transform(data)
    # Write the scaled values back into the frame. The transform result used to
    # be discarded, so the caller's frame stayed unscaled.
    scaled = scaler.transform(data)
    for position, column in enumerate(data.columns):
        data[column] = scaled[:, position]
    return data.shape

In [None]:
X = train.drop(['loan_default'], axis=1)
y = train['loan_default']

In [None]:
X.head()

# Balance Data using SMOTE

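SMOTE (Synthetic Minority Over-sampling Technique) is an over-sampling method, used here through the imbalanced-learn (`imblearn`) library, that balances an imbalanced dataset by generating synthetic samples of the minority class.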

In [None]:
# Fixed seed so the synthetic samples, and every score derived from them, are
# reproducible across runs.
# NOTE(review): the resampling happens before the train/test split performed
# later in the notebook, so synthetic points built from hold-out rows can end
# up in the training part. Consider splitting first and resampling only the
# training portion.
smote = SMOTE(random_state=3300)
X_tf,y_tf = smote.fit_resample(X,y)
X_tf.shape, y_tf.shape

# Transforming Data

In [None]:
# Fit the scaler on the SMOTE-resampled features and rebind X_tf to the scaled
# output. `scaler` is kept as a global because data_processing() reuses it.
# NOTE(review): the scaler is fitted before the split below, so it also sees
# the rows that end up in the hold-out set.
scaler = preprocessing.RobustScaler()
X_tf = scaler.fit_transform(X_tf)

# Split the data into training and testing sets: 10% held out, fixed seed.
x_train,x_test,y_train,y_test = train_test_split(X_tf,y_tf,test_size = .1, random_state = 3300)

# Number of rows in the training part and in the hold-out part.
print(x_train.shape[0], x_test.shape[0])

# Training Our Model

In [None]:
# Per-model results (in percent), keyed by the display name given to train_model.
accuracy = {}
roc_r = {}

def train_model(model, model_name):
    """Fit ``model`` on the training split, then report and plot its hold-out metrics.

    Uses the module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
    Prints accuracy, precision, recall, F1, ROC AUC and the confusion matrix,
    stores accuracy and ROC AUC (as percentages) in ``accuracy`` and ``roc_r``
    under ``model_name``, and draws the ROC curve.

    Args:
        model: an unfitted estimator exposing ``fit`` and ``predict``.
        model_name: label used in the printed report and as the result key.
    """
    print(model_name)

    # Fitting model
    model = model.fit(x_train, y_train)
    pred = model.predict(x_test)

    # ROC analysis needs a continuous score per sample. With hard 0/1
    # predictions the curve collapses to a single operating point and the AUC
    # is distorted. Fall back to the labels only when the model offers no score.
    if hasattr(model, 'predict_proba'):
        scores = model.predict_proba(x_test)[:, 1]   # probability of the positive class
    elif hasattr(model, 'decision_function'):
        scores = model.decision_function(x_test)
    else:
        scores = pred

    #Model accuracy
    acc = accuracy_score(y_test, pred)*100
    accuracy[model_name] = acc
    print('accuracy_score',acc)
    print('precision_score',precision_score(y_test, pred)*100)
    print('recall_score',recall_score(y_test, pred)*100)
    print('f1_score',f1_score(y_test, pred)*100)

    #ROC Score
    roc_score = roc_auc_score(y_test, scores)*100
    roc_r[model_name] = roc_score
    print('roc_auc_score',roc_score)

    # Confusion matrix
    print('confusion_matrix')
    print(pd.DataFrame(confusion_matrix(y_test, pred)))

    #ROC Curve
    fpr, tpr, threshold = roc_curve(y_test, scores)
    roc_auc = auc(fpr, tpr)*100

    #ROC Plot
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

In [None]:
knn = KNeighborsClassifier(weights='distance', algorithm='auto', n_neighbors=15, n_jobs=4)

train_model(knn, 'K Nearest Neighbour')

In [None]:
lr = LogisticRegression(C=5.0, solver='saga')

train_model(lr, 'Logistic Regression')

In [None]:
dtc = DecisionTreeClassifier(criterion='gini', splitter='random', max_depth=25, min_samples_split=4,
                            min_samples_leaf=2)

train_model(dtc, 'Decision Tree Classifier')

In [None]:
bnb = BernoulliNB()

train_model(bnb, 'Bernolli Naive Bayes')

In [None]:
rfc = RandomForestClassifier(n_estimators = 1500, n_jobs=-1, max_depth=15, 
                             min_samples_split=5, min_samples_leaf=3)

train_model(rfc, 'Random Forest Classifier')

In [None]:
lgbm = LGBMClassifier(n_estimators=720, n_jobs=-1, max_depth=15, min_child_weight=5, 
                      min_child_samples=5, num_leaves=10, learning_rate=0.15)

train_model(lgbm, 'LGBMClassifier')

In [None]:
cat = CatBoostClassifier(verbose = 0)

train_model(cat, "Cat Boost")

In [None]:
mlp = MLPClassifier(hidden_layer_sizes = (200,3), activation = 'relu', solver = 'adam', learning_rate = 'adaptive',
                   max_iter = 1000)

train_model(mlp, 'Multi-layer Perceptron Classifier')

In [None]:
xgb = XGBClassifier(n_estimators = 1500, nthread  = 4, max_depth = 15, min_child_weight = 5, learning_rate=0.1)

train_model(xgb, 'XGBClassifier')

In [None]:
xgbr = XGBRFClassifier(n_estimators = 2000, nthread  = 4, max_depth = 10, min_child_weight = 4, learning_rate=0.1)

train_model(xgbr, 'XGBRFClassifier')

In [None]:
gbc = GradientBoostingClassifier(n_estimators=1000, min_samples_split=5, max_depth=15)

train_model(gbc, 'GradientBoostingClassifier')

In [None]:
ada = AdaBoostClassifier(n_estimators=1000, learning_rate=0.1)

train_model(ada, 'AdaBoostClassifier')

In [None]:
'''from sklearn.ensemble import StackingClassifier

estimator = [('Lgbr', lgbm), ('xgb', xgb), ('gbc', gbc), ('mlp', mlp)]

sc = StackingClassifier(estimators = estimator, final_estimator = lgbm, n_jobs=-1)

train_model(sc, 'StackingClassifier')'''

In [None]:
# Predicted values
y_head_lr = lr.predict(x_test)
y_head_knn = knn.predict(x_test)
y_head_xgb = xgb.predict(x_test)
y_head_nb = bnb.predict(x_test)
y_head_dtc = dtc.predict(x_test)
y_head_rfc = rfc.predict(x_test)
y_head_lgbm = lgbm.predict(x_test)
y_head_ada = ada.predict(x_test)
y_head_gbc = gbc.predict(x_test)
y_head_mlp = mlp.predict(x_test)
y_head_cat = cat.predict(x_test)

In [None]:
cm_lr = confusion_matrix(y_test,y_head_lr)
cm_knn = confusion_matrix(y_test,y_head_knn)
cm_xgb = confusion_matrix(y_test,y_head_xgb)
cm_nb = confusion_matrix(y_test,y_head_nb)
cm_dtc = confusion_matrix(y_test,y_head_dtc)
cm_rfc = confusion_matrix(y_test,y_head_rfc)
cm_lgbm = confusion_matrix(y_test,y_head_lgbm)
cm_ada = confusion_matrix(y_test,y_head_ada)
cm_gbc = confusion_matrix(y_test,y_head_gbc)
cm_mlp = confusion_matrix(y_test,y_head_mlp)
cm_cat = confusion_matrix(y_test,y_head_cat)

In [None]:
plt.figure(figsize=(24,12))

plt.suptitle("Confusion Matrixes",fontsize=24)
plt.subplots_adjust(wspace = 0.4, hspace= 0.4)

# One heatmap per model on a 4x3 grid:
# (grid position, panel title, confusion matrix), listed in drawing order.
_panels = [
    (5, "Logistic Regression Confusion Matrix", cm_lr),
    (6, "K Nearest Neighbors Confusion Matrix", cm_knn),
    (2, "XGB Confusion Matrix", cm_xgb),
    (4, "Naive Bayes Confusion Matrix", cm_nb),
    (3, "Decision Tree Classifier Confusion Matrix", cm_dtc),
    (1, "Random Forest Gini Confusion Matrix", cm_rfc),
    (7, "LightGB Confusion Matrix", cm_lgbm),
    (8, "Ada Boost Confusion Matrix", cm_ada),
    (9, "Gradient boost Classifier Confusion Matrix", cm_gbc),
    (10, "Multi-layer Perceptron Classifier Confusion Matrix", cm_mlp),
    (11, "Cat boost Classifier Confusion Matrix", cm_cat),
]

for _position, _title, _matrix in _panels:
    plt.subplot(4,3,_position)
    plt.title(_title)
    sns.heatmap(_matrix,annot=True,cmap="Blues",fmt="d",cbar=False, annot_kws={"size": 24})


plt.show()

In [None]:
sns.set_style("whitegrid")
plt.figure(figsize=(16,5))
plt.yticks(np.arange(0,100,5))
plt.ylabel("Accuracy %")
plt.xlabel("Algorithms")
plt.xticks(rotation = 90)
sns.barplot(x=list(accuracy.keys()), y=list(accuracy.values()), palette="cubehelix")
plt.show()

# Now train it on the whole Training dataset

In [None]:
cat.fit(X_tf, y_tf)

# Now we will predict on Test Data

In [None]:
submission = pd.DataFrame()
submission['UniqueID'] = test['UniqueID']

In [None]:
test.head()

In [None]:
data_processing(test)
test.head()

In [None]:
submission['loan_default'] = cat.predict(test)

In [None]:
submission.head()