In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Since we have to accurately predict the probability of a loanee/borrower defaulting on a vehicle loan in the first EMI on the due date, we use the F1-score (for class 1) and binary log loss as performance metrics, along with the AUC-ROC score.**

In [None]:
df = pd.read_csv('/kaggle/input/lt-vehicle-loan-default-prediction/train.csv')
df.head()

In [None]:
df.shape

In [None]:
# Normalise the column names in one pass: lower-case, with dots replaced by underscores
df.columns = [name.lower().replace('.', '_') for name in df.columns]

In [None]:
df.columns

In [None]:
print('The number of duplicates:',df.duplicated().sum())

In [None]:
# Dropping the unnecessary features

df.drop(['uniqueid','branch_id','supplier_id','mobileno_avl_flag','current_pincode_id','employee_code_id','manufacturer_id','state_id'],axis=1,inplace=True)

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
# Dropping the rows with Employment Type Null as it constitutes just 3% of data
# NOTE(review): dropna() without `subset` removes rows that have a missing value in ANY
# column, not only employment_type — confirm against the null counts above that no other
# column contains nulls.
df = df.dropna()

In [None]:
df.describe(include='all')

# Dependent variable distribution

In [None]:
pct_loan_default = df['loan_default'].value_counts(normalize=True)*100
pct_loan_default

In [None]:
import plotly.express as px
# Derive each slice label from the class code in the index rather than relying on the
# order in which value_counts() happens to return the classes, so labels cannot be swapped.
default_labels = pct_loan_default.index.map({0: 'Not defaulted', 1: 'Defaulted'})
fig = px.pie(values=pct_loan_default.values, names=default_labels)
fig.show()

**78.3% of the observations in the dataset have not defaulted on their vehicle loan, while 21.7% have defaulted. It is a slightly imbalanced dataset.**

# Categorical Features Analysis

## Employment_Type

In [None]:
df1 = pd.crosstab(df['employment_type'],df['loan_default'])
fig = px.bar(df1, barmode = 'group',width=600,height=400)
fig.show()

In [None]:
df['employment_type'].value_counts(normalize=True)*100

In [None]:
# Default rate (%) within each employment type, computed from boolean row masks
is_default = df['loan_default'] == 1
is_salaried = df['employment_type'] == 'Salaried'
is_self_employed = df['employment_type'] == 'Self employed'
print('Percentage of salaried people who have defaulted:',
      np.round(int((is_salaried & is_default).sum()) / int(is_salaried.sum()) * 100, 3))
print('Percentage of self employed people who have defaulted:',
      np.round(int((is_self_employed & is_default).sum()) / int(is_self_employed.sum()) * 100, 3))

## Aadhar

In [None]:
df1 = pd.crosstab(df['aadhar_flag'],df['loan_default'])
fig = px.bar(df1, barmode = 'group',width=600,height=400)
fig.show()

In [None]:
df['aadhar_flag'].value_counts(normalize=True)*100

In [None]:
# Default rate (%) for customers with and without the Aadhar flag set
is_default = df['loan_default'] == 1
with_aadhar = df['aadhar_flag'] == 1
without_aadhar = df['aadhar_flag'] == 0
print('Percentage of people who have given their Aadhar details and defaulted:',
      np.round(int((with_aadhar & is_default).sum()) / int(with_aadhar.sum()) * 100, 3))
print('Percentage of people who have not given their Aadhar details and defaulted:',
      np.round(int((without_aadhar & is_default).sum()) / int(without_aadhar.sum()) * 100, 3))

## PAN

In [None]:
df1 = pd.crosstab(df['pan_flag'],df['loan_default'])
fig = px.bar(df1, barmode = 'group',width=600,height=400)
fig.show()

In [None]:
df['pan_flag'].value_counts(normalize=True)*100

In [None]:
# Default rate (%) for customers with and without the PAN flag set
is_default = df['loan_default'] == 1
with_pan = df['pan_flag'] == 1
without_pan = df['pan_flag'] == 0
print('Percentage of people who have given their PAN details and defaulted:',
      np.round(int((with_pan & is_default).sum()) / int(with_pan.sum()) * 100, 3))
print('Percentage of people who have not given their PAN details and defaulted:',
      np.round(int((without_pan & is_default).sum()) / int(without_pan.sum()) * 100, 3))

## Voter ID

In [None]:
df1 = pd.crosstab(df['voterid_flag'],df['loan_default'])
fig = px.bar(df1, barmode = 'group',width=600,height=400)
fig.show()

In [None]:
df['voterid_flag'].value_counts(normalize=True)*100

In [None]:
# Default rate (%) for customers with and without the voter-ID flag set
is_default = df['loan_default'] == 1
with_voterid = df['voterid_flag'] == 1
without_voterid = df['voterid_flag'] == 0
print('Percentage of people who have given their voter_id details and defaulted:',
      np.round(int((with_voterid & is_default).sum()) / int(with_voterid.sum()) * 100, 3))
print('Percentage of people who have not given their voter_id details and defaulted:',
      np.round(int((without_voterid & is_default).sum()) / int(without_voterid.sum()) * 100, 3))

## DL

In [None]:
df1 = pd.crosstab(df['driving_flag'],df['loan_default'])
fig = px.bar(df1, barmode = 'group',width=600,height=400)
fig.show()

In [None]:
df['driving_flag'].value_counts(normalize=True)*100

In [None]:
# Default rate (%) for customers with and without the driving-licence flag set
is_default = df['loan_default'] == 1
with_dl = df['driving_flag'] == 1
without_dl = df['driving_flag'] == 0
print('Percentage of people who have given their DL details and defaulted:',
      np.round(int((with_dl & is_default).sum()) / int(with_dl.sum()) * 100, 3))
print('Percentage of people who have not given their DL details and defaulted:',
      np.round(int((without_dl & is_default).sum()) / int(without_dl.sum()) * 100, 3))

## Passport

In [None]:
df1 = pd.crosstab(df['passport_flag'],df['loan_default'])
fig = px.bar(df1, barmode = 'group',width=600,height=400)
fig.show()

In [None]:
df['passport_flag'].value_counts(normalize=True)*100

In [None]:
# Default rate (%) for customers with and without the passport flag set
is_default = df['loan_default'] == 1
with_passport = df['passport_flag'] == 1
without_passport = df['passport_flag'] == 0
print('Percentage of people who have given their Passport details and defaulted:',
      np.round(int((with_passport & is_default).sum()) / int(with_passport.sum()) * 100, 3))
print('Percentage of people who have not given their Passport details and defaulted:',
      np.round(int((without_passport & is_default).sum()) / int(without_passport.sum()) * 100, 3))

In [None]:
#Encoding Employment Type
# 1 for 'Self employed', 0 otherwise. An explicit comparison always yields exactly one
# column; assigning the DataFrame returned by get_dummies(drop_first=True) only works
# while the column holds exactly two categories and fails as soon as a third appears.
df['self_employed'] = (df['employment_type'] == 'Self employed').astype(int)

In [None]:
df.drop('employment_type',axis=1,inplace=True)
df.head()

**Feature importances of different categorical features**

In [None]:
df1 = df[['self_employed','aadhar_flag','pan_flag','voterid_flag','driving_flag',
         'passport_flag']]

In [None]:
# Feature importances using SelectKBest algorithm using chi2
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest,chi2
# k='all' keeps every feature: the selector is used here only to obtain the chi2 scores,
# not to discard any column.
n = SelectKBest(score_func=chi2, k='all')
# Score each categorical flag in df1 against the loan_default target
catcols=n.fit(df1,df['loan_default'])
plt.figure(figsize=(7,5))
# One bar per feature; bar length is the fitted chi2 score
sns.barplot(x=catcols.scores_,y=df1.columns)
plt.title('Best Categorical Features')
plt.show()

In [None]:
# Feature importances using Extra Trees Classifier
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
# Extra-trees draws random splits, so fix the seed; otherwise the importance ranking
# (and the feature-dropping decision based on it) changes on every run.
model=ExtraTreesClassifier(random_state=42)
model.fit(df1,df['loan_default'])

In [None]:
pd.DataFrame(model.feature_importances_,index=df1.columns,columns=['Feature_Importance']).sort_values(by='Feature_Importance',ascending=False)

In [None]:
ranked_features=pd.Series(model.feature_importances_,index=df1.columns)
ranked_features.nlargest(6).plot(kind='barh')
plt.show()

In [None]:
# Dropping unncessary features based on the above analysis

df.drop(['pan_flag','driving_flag','passport_flag'],axis=1,inplace=True)

# Numerical Features Analysis

## Disbursed Amount

In [None]:
df['disbursed_amount'].describe()

In [None]:
disbursed_amount_non_default = df[df['loan_default']==0]['disbursed_amount']
disbursed_amount_default = df[df['loan_default']==1]['disbursed_amount']

In [None]:
pd.DataFrame([disbursed_amount_non_default.describe(), disbursed_amount_default.describe()], index=['non_defaulters','defaulters'])

The maximum disbursed amount is way higher for non-defaulters.

In [None]:
import warnings
# Suppresses every warning from here on for the rest of the kernel session
warnings.filterwarnings('ignore')
plt.figure(figsize=(15,8))

plt.subplot(1,2,1)
sns.distplot(df['disbursed_amount'])

plt.subplot(1,2,2)
# Pass the Series as `x=`: a bare positional Series is treated as `x` (with a deprecation
# warning) by seaborn 0.11 but as `data` from 0.12 on, which changes the plot orientation.
sns.boxplot(x=df['disbursed_amount'])

plt.show()

The distribution is highly right skewed and there are extreme values

In [None]:
plt.figure(figsize=(15,6))
sns.violinplot(x ='loan_default',y='disbursed_amount',data=df)
plt.show()

More outliers/extreme values are present for non defaulters

## Asset Cost

In [None]:
df['asset_cost'].describe()

In [None]:
asset_cost_non_default = df[df['loan_default']==0]['asset_cost']
asset_cost_default = df[df['loan_default']==1]['asset_cost']

In [None]:
pd.DataFrame([asset_cost_non_default.describe(), asset_cost_default.describe()], index=['non_defaulters','defaulters'])

The maximum asset cost of non defaulters is way higher than that of defaulters

In [None]:
plt.figure(figsize=(15,8))

plt.subplot(1,2,1)
sns.distplot(df['asset_cost'])

plt.subplot(1,2,2)
# Pass the Series as `x=`: a bare positional Series is treated as `x` (with a deprecation
# warning) by seaborn 0.11 but as `data` from 0.12 on, which changes the plot orientation.
sns.boxplot(x=df['asset_cost'])

plt.show()

The distribution is highly right skewed and there are extreme values.

In [None]:
plt.figure(figsize=(15,6))
sns.violinplot(x ='loan_default',y='asset_cost',data=df)
plt.show()

Extreme/outlier values of asset_cost are present among non defaulters

## Ltv

In [None]:
df['ltv'].describe()

In [None]:
ltv_non_default = df[df['loan_default']==0]['ltv']
ltv_default = df[df['loan_default']==1]['ltv']

In [None]:
pd.DataFrame([ltv_non_default.describe(), ltv_default.describe()], index=['non_defaulters','defaulters'])

The ltv is almost same for both defaulters and non defaulters

In [None]:
plt.figure(figsize=(15,8))

plt.subplot(1,2,1)
sns.distplot(df['ltv'])

plt.subplot(1,2,2)
# Pass the Series as `x=`: a bare positional Series is treated as `x` (with a deprecation
# warning) by seaborn 0.11 but as `data` from 0.12 on, which changes the plot orientation.
sns.boxplot(x=df['ltv'])

plt.show()

In [None]:
plt.figure(figsize=(15,6))
sns.violinplot(x ='loan_default',y='ltv',data=df)
plt.show()

## Age

In [None]:
# We have DOB of the customer and the date of disbursal, from which we need to calculate the age of the customer at the
# time of loan disbursal

def age(dob, pivot=21):
    """Return the full calendar year taken from a dash-separated date string.

    Despite its name, this helper returns a year, not an age; the caller
    subtracts two such years to obtain the age.

    Parameters
    ----------
    dob : str
        Date string whose third dash-separated field is the year,
        e.g. '01-01-84' or '01-01-1984'.
    pivot : int, default 21
        Two-digit years below this value are placed in the 2000s,
        all other two-digit years in the 1900s.

    Returns
    -------
    int
        The four-digit year.

    Raises
    ------
    IndexError
        If the string has fewer than three dash-separated fields.
    ValueError
        If the year field is not an integer.
    """
    yr = int(dob.split('-')[2])
    if yr >= 100:
        # Already a full year; adding a century would corrupt it
        return yr
    if yr < pivot:
        return yr + 2000
    return yr + 1900
        
# Keep the extracted years in temporaries instead of overwriting the raw date columns:
# age() calls .split() on its argument, so re-running this cell after the columns had been
# replaced by integers would fail.
birth_year = df['date_of_birth'].apply(age)
disbursal_year = df['disbursaldate'].apply(age)
# Age of the customer at the time of disbursement of fund (difference of calendar years)
df['age'] = disbursal_year - birth_year

In [None]:
# Dropping the DOB and Disbursal Date
df.drop(['date_of_birth','disbursaldate'],axis=1,inplace=True)

In [None]:
df['age'].describe()

In [None]:
age_non_defaulters = df[df['loan_default'] == 0]['age']
age_defaulters = df[df['loan_default'] == 1]['age']

In [None]:
pd.DataFrame([age_non_defaulters.describe(), age_defaulters.describe()], index=['non_defaulters','defaulters'])

All the stats of age are almost same for defaulters and non defaulters

In [None]:
plt.figure(figsize=(15,8))

plt.subplot(1,2,1)
sns.distplot(df['age'])

plt.subplot(1,2,2)
# Pass the Series as `x=`: a bare positional Series is treated as `x` (with a deprecation
# warning) by seaborn 0.11 but as `data` from 0.12 on, which changes the plot orientation.
sns.boxplot(x=df['age'])

plt.show()

In [None]:
plt.figure(figsize=(15,6))
sns.boxplot(x ='loan_default',y='age',data=df)
plt.show()

The Age at the time of loan disbursement is almost similar among defaulters and non defaulters

## Perform CNS score / CIBIL score

In [None]:
df['perform_cns_score'].describe()

In [None]:
cibil_non_default = df[df['loan_default']==0]['perform_cns_score']
cibil_default = df[df['loan_default']==1]['perform_cns_score']

In [None]:
pd.DataFrame([cibil_non_default.describe(), cibil_default.describe()], index=['non_defaulters','defaulters'])

Here we can observe a difference in the mean and median cibil scores among the defaulters and non defaulters. The mean and median cibil scores are higher for non defaulters.

In [None]:
plt.figure(figsize=(15,6))

plt.subplot(1,2,1)
sns.distplot(df['perform_cns_score'])

plt.subplot(1,2,2)
# Pass the Series as `x=`: a bare positional Series is treated as `x` (with a deprecation
# warning) by seaborn 0.11 but as `data` from 0.12 on, which changes the plot orientation.
sns.boxplot(x=df['perform_cns_score'])

plt.show()

In [None]:
plt.figure(figsize=(15,6))

sns.distplot(cibil_non_default, color='blue', label = 'Non Defaulter')
sns.distplot(cibil_default, color='red', label = 'Defaulter')

plt.legend()
plt.show()

CIBIL score distribution is looking almost similar for defaulters and non defaulters

In [None]:
plt.figure(figsize=(15,6))
sns.boxplot(x ='loan_default',y='perform_cns_score',data=df)
plt.show()

We can see that the mean and median CIBIL scores of non defaulters are slightly higher than those of defaulters. Also, the 75th percentile value of the CIBIL score is higher for defaulters. The max score is nearly the same for defaulters and non defaulters.

In [None]:
# x and y must be passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the two-positional-argument form raises a TypeError there.
sns.scatterplot(x=df['age'], y=df['perform_cns_score'])

plt.show()

# Here we can see that irrespective of age CIBIL score variation is same 

## Perform cns score description

In [None]:
df['perform_cns_score_description'].describe()

In [None]:
df['perform_cns_score_description'].value_counts()

In [None]:
cnsd = pd.crosstab(index=df['perform_cns_score_description'], columns=df['loan_default'])
cnsd['Percent of Defaulters'] = (cnsd[1] / (cnsd[0] + cnsd[1]))*100
cnsd

In [None]:
# Score range covered by each bureau description. Aggregations are named as strings
# because passing the builtins min/max to agg() is deprecated in recent pandas.
df.groupby(by='perform_cns_score_description')['perform_cns_score'].agg(['min','max']).sort_values(by='min')

We can see that as the risk increases, the percent of default also increases

In [None]:
# Collapse the detailed bureau descriptions into six classes
# e.g A-Very Low Risk to Very Low Risk
#     B-Very Low Risk to Very Low Risk
# Keywords are tested in this order and the first match wins, so 'Very Low' / 'Very High'
# are listed before the shorter 'Low' / 'High' that they contain.
risk_keywords = [('Very Low', 'Very Low Risk'),
                 ('Low', 'Low Risk'),
                 ('Medium', 'Medium Risk'),
                 ('Very High', 'Very High Risk'),
                 ('High', 'High Risk')]

# Descriptions that match no keyword fall into 'Not Scored'
risk = [next((label for keyword, label in risk_keywords if keyword in description), 'Not Scored')
        for description in df['perform_cns_score_description']]

In [None]:
df['risk'] = risk

In [None]:
risk_counts = pd.Series(risk).value_counts().sort_values()
risk_counts

In [None]:
plt.barh(y = risk_counts.index,width=risk_counts.values)
plt.show()

In [None]:
risk_counts = pd.crosstab(index=df['risk'], columns=df['loan_default'])

risk_counts['Percent_of_default'] = round((risk_counts[1]/risk_counts.sum(axis=1))*100,2)

risk_counts.sort_values(by='Percent_of_default',ascending=False)

In [None]:
# Score range covered by each risk class. Aggregations are named as strings because
# passing the builtins min/max to agg() is deprecated in recent pandas.
df.groupby(by='risk')['perform_cns_score'].agg(['min','max']).sort_values(by='min')

In [None]:
risk_map = {'Not Scored':-1, 
            'Very Low Risk':4,
            'Low Risk':3,
            'Medium Risk':2, 
            'High Risk':1,
            'Very High Risk':0}

df['risk'] = df['risk'].map(risk_map)

In [None]:
risk_counts = pd.crosstab(index=df['risk'], columns=df['loan_default'])
risk_counts['Percent of Defaluters'] = (risk_counts[1] / (risk_counts[0] + risk_counts[1]))*100
risk_counts.sort_values(by='Percent of Defaluters', ascending=False)

We can see that the percentage of defaulters is lower for the low risk and very low risk categories.

In [None]:
pd.crosstab(index=df['risk'], columns=df['loan_default']).plot(kind='bar')
plt.show()

In [None]:
df.drop('perform_cns_score_description',axis=1,inplace=True)

## Average Account Age and Credit History Length

In [None]:
# The columns "AVERAGE_ACCT_AGE" and "CREDIT_HISTORY_LENGTH" hold alphanumeric
# durations such as '1yrs 11mon'; this helper converts them to a number of months.

def duration(dur):
    """Convert a '<N>yrs <M>mon' string into a total number of months."""
    tokens = dur.split(' ')
    years = int(tokens[0].replace('yrs', ''))
    months = int(tokens[1].replace('mon', ''))
    return years * 12 + months

In [None]:
df['credit_history_length'] = df['credit_history_length'].apply(duration)
df['average_acct_age'] = df['average_acct_age'].apply(duration)

In [None]:
df['average_acct_age'].describe()

In [None]:
acct_age_non_defaulters = df[df['loan_default'] == 0]['average_acct_age']
acct_age_defaulters = df[df['loan_default'] == 1]['average_acct_age']

In [None]:
pd.DataFrame([acct_age_non_defaulters.describe(), acct_age_defaulters.describe()], index=['non_defaulters','defaulters'])

The maximum average account age is higher for non defaulters

In [None]:
plt.figure(figsize=(15,6))

plt.subplot(1,2,1)
sns.distplot(df['average_acct_age'])

plt.subplot(1,2,2)
# Pass the Series as `x=`: a bare positional Series is treated as `x` (with a deprecation
# warning) by seaborn 0.11 but as `data` from 0.12 on, which changes the plot orientation.
sns.boxplot(x=df['average_acct_age'])

plt.show()

In [None]:
plt.figure(figsize=(15,6))
sns.boxplot(x ='loan_default',y='average_acct_age',data=df)
plt.show()

It is highly right skewed

## Credit History Length

In [None]:
df['credit_history_length'].describe()

In [None]:
credit_non_default = df[df['loan_default'] == 0]['credit_history_length']
credit_default = df[df['loan_default'] == 1]['credit_history_length']

In [None]:
pd.DataFrame([credit_non_default.describe(), credit_default.describe()], index=['non_defaulters','defaulters'])

The mean and std is slightly higher for non defaulters

In [None]:
plt.figure(figsize=(15,6))

plt.subplot(1,2,1)
sns.distplot(df['credit_history_length'])

plt.subplot(1,2,2)
# Pass the Series as `x=`: a bare positional Series is treated as `x` (with a deprecation
# warning) by seaborn 0.11 but as `data` from 0.12 on, which changes the plot orientation.
sns.boxplot(x=df['credit_history_length'])

plt.show()

Highly right skewed

## New accounts in last six months

In [None]:
counts = df['new_accts_in_last_six_months'].value_counts()
percent = df['new_accts_in_last_six_months'].value_counts(normalize=True)*100

pd.DataFrame({'counts':counts,'percent_of_data':percent})

Most of them have not opened any new account in the last 6 months

## Delinquent Accounts in last six months

In [None]:
counts = df['delinquent_accts_in_last_six_months'].value_counts()
percent = df['delinquent_accts_in_last_six_months'].value_counts(normalize=True)*100

pd.DataFrame({'counts':counts,'percent_of_data':percent})

We can see that 92% of customers have not defaulted on loans in the last six months. 8% of customers have defaulted on loans at least once.

## No of Inquiries

In [None]:
counts = df['no_of_inquiries'].value_counts()
percent = df['no_of_inquiries'].value_counts(normalize=True)*100

pd.DataFrame({'counts':counts,'percent_of_data':percent})

Most of the customers have not made any enquiries regarding loans

In [None]:
no_inquiries = pd.crosstab(index=df['no_of_inquiries'], columns=df['loan_default'])
no_inquiries['pct_default'] = (no_inquiries[1]/no_inquiries.sum(axis=1))*100
no_inquiries

Here, except for a few cases, as the number of enquiries increases, the percentage of defaults also increases.

In [None]:
plt.figure(figsize=(15,8))
plt.bar(no_inquiries.index,no_inquiries['pct_default'])
plt.xticks(no_inquiries.index)
plt.xlabel('No of Enquires')
plt.ylabel('Percent of default')
plt.show()

**Feature importances of different numerical features**

In [None]:
df2 = df[['disbursed_amount','asset_cost', 'ltv','perform_cns_score', 'pri_no_of_accts', 'pri_active_accts',
       'pri_overdue_accts', 'pri_current_balance', 'pri_sanctioned_amount',
       'pri_disbursed_amount', 'sec_no_of_accts', 'sec_active_accts',
       'sec_overdue_accts', 'sec_current_balance', 'sec_sanctioned_amount',
       'sec_disbursed_amount', 'primary_instal_amt', 'sec_instal_amt',
       'new_accts_in_last_six_months', 'delinquent_accts_in_last_six_months',
       'average_acct_age', 'credit_history_length', 'no_of_inquiries',
       'age', 'risk']]

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
# Extra-trees draws random splits, so fix the seed to make the importance ranking of the
# numerical features reproducible between runs.
model=ExtraTreesClassifier(random_state=42)
model.fit(df2,df['loan_default'])

In [None]:
pd.DataFrame(model.feature_importances_,index=df2.columns,columns=['Feature_Importances']).sort_values(by='Feature_Importances',ascending=False)

In [None]:
plt.figure(figsize=(8,8))
ranked_features=pd.Series(model.feature_importances_,index=df2.columns)
ranked_features.nlargest(25).plot(kind='barh')
plt.show()

We cannot drop the secondary account details as they are asked by the institutions before granting loan.

In [None]:
# Checking the correlation between primary and secondary accounts
plt.figure(figsize=(12,8))
sns.heatmap(df[['pri_no_of_accts','pri_active_accts','pri_overdue_accts','pri_current_balance','pri_sanctioned_amount',
               'pri_disbursed_amount','primary_instal_amt','sec_no_of_accts','sec_active_accts','sec_overdue_accts',
               'sec_current_balance','sec_sanctioned_amount','sec_disbursed_amount','sec_instal_amt']].corr(),annot=True)
plt.show()

There is no correlation between primary and secondary accounts

## Primary and Secondary Accounts

In [None]:
# Combining the Primary and Secondary Accounts

df['no_of_accts'] = df['pri_no_of_accts'] + df['sec_no_of_accts']
df['active_accts'] = df['pri_active_accts'] + df['sec_active_accts']
df['overdue_accts'] = df['pri_overdue_accts'] + df['sec_overdue_accts']
df['outstanding_amount'] = df['pri_current_balance'] + df['sec_current_balance']
df['sanctioned_amount'] = df['pri_sanctioned_amount'] + df['sec_sanctioned_amount']
df['psdisbursed_amount'] = df['pri_disbursed_amount'] + df['sec_disbursed_amount']
df['install_amt'] = df['primary_instal_amt'] + df['sec_instal_amt']

In [None]:
df.drop(['pri_no_of_accts','sec_no_of_accts','pri_active_accts','sec_active_accts',
        'pri_overdue_accts','sec_overdue_accts','pri_current_balance','sec_current_balance',
        'pri_sanctioned_amount','sec_sanctioned_amount','pri_disbursed_amount','sec_disbursed_amount',
        'primary_instal_amt','sec_instal_amt'],axis=1,inplace=True)

In [None]:
# Account and amount description

df[['no_of_accts','active_accts','overdue_accts','outstanding_amount','sanctioned_amount','psdisbursed_amount','install_amt']].describe()

## Total number of accounts

In [None]:
df['no_of_accts'].describe()

In [None]:
na_non_default = df[df['loan_default']==0]['no_of_accts']
na_default = df[df['loan_default']==1]['no_of_accts']

In [None]:
pd.DataFrame([na_non_default.describe(), na_default.describe()], index=['non_defaulters','defaulters'])

## Active Accounts

In [None]:
counts = df['active_accts'].value_counts()
percent = df['active_accts'].value_counts(normalize=True)*100

pd.DataFrame({'counts':counts,'percent_of_data':percent})

There are over 50 percent inactive accounts. Around 18 percent have 1 active account present

## Overdue Accounts

In [None]:
counts = df['overdue_accts'].value_counts()
percent = df['overdue_accts'].value_counts(normalize=True)*100

pd.DataFrame({'counts':counts,'percent_of_data':percent})

Most of the accounts are not overdue. Around 9 percent of data contain 1 overdue account, and around 2 percent of data contain 2 overdue accounts

In [None]:
# Default rate per number of overdue accounts.
# NOTE: the variable name `no_inquiries` is reused from the inquiries analysis; here it holds
# the overdue_accts x loan_default crosstab and overwrites the earlier table.
no_inquiries = pd.crosstab(index=df['overdue_accts'], columns=df['loan_default'])
# Share of defaulters (column 1) in each row, in percent; the row sum is taken before the
# new column is added, so it covers only the two class-count columns.
no_inquiries['pct_default'] = (no_inquiries[1]/no_inquiries.sum(axis=1))*100
no_inquiries

Up to 5 overdue accounts, we can see that as the number of overdue accounts increases, the percentage of default also increases. However, we do not observe the same pattern (or any pattern) beyond 5 overdue accounts.

## Outstanding Amount

In [None]:
counts = df['outstanding_amount'].value_counts()
percent = df['outstanding_amount'].value_counts(normalize=True)*100

pd.DataFrame({'counts':counts,'percent_of_data':percent})

Around 60 percent of data do not have any outstanding amount

## Sanctioned Amount

In [None]:
counts = df['sanctioned_amount'].value_counts()
percent = df['sanctioned_amount'].value_counts(normalize=True)*100

pd.DataFrame({'counts':counts,'percent_of_data':percent})

For around 58 percent of the accounts, no amount was sanctioned for all the loans at the time of disbursement

## Psdisbursed Amount

In [None]:
counts = df['psdisbursed_amount'].value_counts()
percent = df['psdisbursed_amount'].value_counts(normalize=True)*100

pd.DataFrame({'counts':counts,'percent_of_data':percent})

For 58 percent of all accounts, no amount was disbursed for all the loans at the time of disbursement

## Instalment Amount

In [None]:
counts = df['install_amt'].value_counts()
percent = df['install_amt'].value_counts(normalize=True)*100

pd.DataFrame({'counts':counts,'percent_of_data':percent})

Around 68 percent of data do not have any installment amount to pay

In [None]:
df3 = df[['disbursed_amount', 'asset_cost', 'ltv','perform_cns_score', 'new_accts_in_last_six_months',
       'delinquent_accts_in_last_six_months', 'average_acct_age',
       'credit_history_length', 'no_of_inquiries',
       'age', 'risk', 'no_of_accts', 'active_accts',
       'overdue_accts', 'outstanding_amount', 'sanctioned_amount',
       'psdisbursed_amount', 'install_amt']]

In [None]:
plt.figure(figsize=(20,15))
sns.heatmap(df3.corr(),annot=True,cmap='Blues')
plt.show()

From the above correlation heatmap, we can see that some of the features are highly correlated(>0.75) with each other.
* --- disbursed amount and asset cost - 0.75
* --- perform_cns_score and risk - 0.98
* --- average_acct_age and credit_history_length - 0.83
* --- no_of_accts and active_accts - 0.76
* --- sanctioned_amount and psdisbursed_amount - 1

**Feature importances of above numerical features**

In [None]:
# Feature importance using Extra Trees classifier
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
# Fix the seed: the features dropped after this analysis are chosen from this ranking, so
# it has to be reproducible between runs.
model=ExtraTreesClassifier(random_state=42)
model.fit(df3,df['loan_default'])

In [None]:
pd.DataFrame(model.feature_importances_,index=df3.columns,columns=['Feature_Importances']).sort_values(by='Feature_Importances',ascending=False)

In [None]:
plt.figure(figsize=(8,8))
ranked_features=pd.Series(model.feature_importances_,index=df3.columns)
ranked_features.nlargest(18).plot(kind='barh')
plt.show()

In [None]:
df.drop(['asset_cost','perform_cns_score','average_acct_age','no_of_accts','psdisbursed_amount','delinquent_accts_in_last_six_months'],axis=1,inplace=True)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.columns

## Checking and Handling outliers

In [None]:
df12 = df[['disbursed_amount', 'ltv',
       'new_accts_in_last_six_months',
       'credit_history_length', 'no_of_inquiries',
       'age', 'active_accts', 'overdue_accts',
       'outstanding_amount', 'sanctioned_amount', 'install_amt']]

In [None]:
plt.figure(figsize=(20,20))
# One horizontal box plot per column of df12 on a 5x3 grid (up to 15 panels)
for position, column in enumerate(df12.columns, start=1):
    plt.subplot(5,3,position)
    # `x=` pins the orientation; a positional Series is interpreted differently across
    # seaborn versions
    sns.boxplot(x=df[column])
plt.show()

In [None]:
# `x=` pins the horizontal orientation; a positional Series is read as `data` by seaborn >= 0.12
sns.boxplot(x=df['disbursed_amount'])
plt.show()

In [None]:
# Printing the 0th-100th percentiles in steps of 10 to choose a cut-off for outlier removal.
# The values are sorted once up front; the k-th percentile is read as the element at
# index int(n * k / 100) of the sorted array.
var = np.sort(df['disbursed_amount'].values,axis = None)
for i in range(0,100,10):
    print("{} percentile value is {}".format(i,var[int(len(var)*(float(i)/100))]))
print ("100 percentile value is ",var[-1])

In [None]:
# Zooming in on the upper tail: the 90th to 99th percentiles in steps of 1, then the maximum.
# The values are sorted once up front rather than on every iteration.
var = np.sort(df['disbursed_amount'].values,axis = None)
for i in range(90,100):
    print("{} percentile value is {}".format(i,var[int(len(var)*(float(i)/100))]))
print ("100 percentile value is ",var[-1])

In [None]:
df = df[df['disbursed_amount']<max(df['disbursed_amount'])]

In [None]:
df = df[df['new_accts_in_last_six_months']<25]
df = df[df['credit_history_length']<400]
df = df[df['no_of_inquiries']<25]
df = df[df['active_accts']<50]
df = df[df['sanctioned_amount']<max(df['sanctioned_amount'])]

In [None]:
# `x=` pins the horizontal orientation; a positional Series is read as `data` by seaborn >= 0.12
sns.boxplot(x=df['outstanding_amount'])
plt.show()

In [None]:
# Printing the 0th-100th percentiles in steps of 10 to choose a cut-off for outlier removal.
# The values are sorted once up front; the k-th percentile is read as the element at
# index int(n * k / 100) of the sorted array.
var = np.sort(df['outstanding_amount'].values,axis = None)
for i in range(0,100,10):
    print("{} percentile value is {}".format(i,var[int(len(var)*(float(i)/100))]))
print ("100 percentile value is ",var[-1])

In [None]:
# Zooming in on the upper tail: the 90th to 99th percentiles in steps of 1, then the maximum.
# The values are sorted once up front rather than on every iteration.
var = np.sort(df['outstanding_amount'].values,axis = None)
for i in range(90,100):
    print("{} percentile value is {}".format(i,var[int(len(var)*(float(i)/100))]))
print ("100 percentile value is ",var[-1])

In [None]:
df = df[(df['outstanding_amount']>-6678296) & (df['outstanding_amount']<75603400)]

In [None]:
# `x=` pins the horizontal orientation; a positional Series is read as `data` by seaborn >= 0.12
sns.boxplot(x=df['install_amt'])
plt.show()

In [None]:
# Printing the 0th-100th percentiles in steps of 10 to choose a cut-off for outlier removal.
# The values are sorted once up front; the k-th percentile is read as the element at
# index int(n * k / 100) of the sorted array.
var = np.sort(df['install_amt'].values,axis = None)
for i in range(0,100,10):
    print("{} percentile value is {}".format(i,var[int(len(var)*(float(i)/100))]))
print ("100 percentile value is ",var[-1])

In [None]:
# Zooming in on the upper tail: the 90th to 99th percentiles in steps of 1, then the maximum.
# The values are sorted once up front rather than on every iteration.
var = np.sort(df['install_amt'].values,axis = None)
for i in range(90,100):
    print("{} percentile value is {}".format(i,var[int(len(var)*(float(i)/100))]))
print ("100 percentile value is ",var[-1])

In [None]:
df = df[df['install_amt']<10000000]

In [None]:
# Checking the distributions again

plt.figure(figsize=(20,20))
# One horizontal box plot per column of df12 on a 5x3 grid (up to 15 panels)
for position, column in enumerate(df12.columns, start=1):
    plt.subplot(5,3,position)
    # `x=` pins the orientation; a positional Series is interpreted differently across
    # seaborn versions
    sns.boxplot(x=df[column])
plt.show()

In [None]:
# Removing few more outliers/extreme values

df = df[df['disbursed_amount']<250000]
df = df[df['outstanding_amount']<40000000]
df = df[df['sanctioned_amount']<0.800000e+08]
df = df[df['install_amt']<=5.000000e+06]

In [None]:
plt.figure(figsize=(20,20))
# One horizontal box plot per column of df12 on a 5x3 grid (up to 15 panels)
for position, column in enumerate(df12.columns, start=1):
    plt.subplot(5,3,position)
    # `x=` pins the orientation; a positional Series is interpreted differently across
    # seaborn versions
    sns.boxplot(x=df[column])
plt.show()

In [None]:
# Transforming the features outstanding amount, sanctioned amount and install_amt
# The column is shifted so that its minimum maps to 1 before the log is taken, which keeps
# the argument of the log strictly positive.
df['log_outstanding_amount'] = np.log(df['outstanding_amount']+1-df['outstanding_amount'].min())
# `x=` pins the horizontal orientation; a positional Series is read as `data` by seaborn >= 0.12
sns.boxplot(x=df['log_outstanding_amount'])
plt.show()

In [None]:
# Keep log values above 12 and turn everything else into NaN. Only this column is masked:
# assigning the whole filtered DataFrame to a single column either raises or silently
# copies the frame's first column into it.
df['log_outstanding_amount'] = df['log_outstanding_amount'].where(df['log_outstanding_amount']>12)
# `x=` pins the horizontal orientation; a positional Series is read as `data` by seaborn >= 0.12
sns.boxplot(x=df['log_outstanding_amount'])
plt.show()

In [None]:
df['log_outstanding_amount'].describe()

In [None]:
df.isnull().sum()

In [None]:
# Fill the missing values of this column with its own median. fillna is applied to the
# column itself: df.fillna(...) returns the whole frame, which must not be assigned to one column.
df['log_outstanding_amount'] = df['log_outstanding_amount'].fillna(df['log_outstanding_amount'].median())

In [None]:
# log(1 + x) transform of the sanctioned amount
df['log_sanctioned_amount'] = np.log(df['sanctioned_amount']+1)
# `x=` pins the horizontal orientation; a positional Series is read as `data` by seaborn >= 0.12
sns.boxplot(x=df['log_sanctioned_amount'])
plt.show()

In [None]:
# log(1 + x) transform of the instalment amount
df['log_install_amt'] = np.log(df['install_amt']+1)
# `x=` pins the horizontal orientation; a positional Series is read as `data` by seaborn >= 0.12
sns.boxplot(x=df['log_install_amt'])
plt.show()

In [None]:
df.drop(['outstanding_amount','sanctioned_amount','install_amt'],axis=1,inplace=True)

In [None]:
df.shape

In [None]:
df.columns

**Pairplot**

In [None]:
n = df.shape[0]
sns.pairplot(df[['disbursed_amount', 'ltv','new_accts_in_last_six_months', 
                 'loan_default']][0:n], hue='loan_default', 
             vars=['disbursed_amount', 'ltv','new_accts_in_last_six_months'])
plt.show()

In [None]:
n = df.shape[0]
sns.pairplot(df[['credit_history_length', 'no_of_inquiries',
       'self_employed', 'age', 'risk','loan_default']][0:n], hue='loan_default', 
             vars=['credit_history_length', 'no_of_inquiries',
       'self_employed', 'age', 'risk'])
plt.show()

In [None]:
n = df.shape[0]
sns.pairplot(df[['active_accts', 'overdue_accts',
       'log_outstanding_amount', 'log_sanctioned_amount', 'log_install_amt','loan_default']][0:n], hue='loan_default', 
             vars=['active_accts', 'overdue_accts',
       'log_outstanding_amount', 'log_sanctioned_amount', 'log_install_amt'])
plt.show()

In [None]:
df.columns

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(df[['disbursed_amount', 'ltv','new_accts_in_last_six_months','credit_history_length','no_of_inquiries',
               'age','risk','active_accts', 'overdue_accts', 'log_outstanding_amount','log_sanctioned_amount', 
                'log_install_amt']].corr(),annot=True)
plt.show()

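There is a correlation of 1 between disbursed_amount and log_outstanding_amount. Hence the log_outstanding_amount feature is removed after comparing the feature importances. (Note: a correlation of exactly 1 between these two columns is suspicious; verify that log_outstanding_amount was not overwritten with disbursed_amount values in the outlier-masking and imputation cells before relying on this conclusion.)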

In [None]:
df.drop('log_outstanding_amount',axis=1,inplace=True)

In [None]:
df.shape

In [None]:
df.columns

# Modelling

## Logistic Regression

In [None]:
y = df['loan_default']
X = df.drop('loan_default',axis=1)

In [None]:
y = list(y)

In [None]:
# Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
Xscaled = sc.fit_transform(X)
Xscaled = pd.DataFrame(Xscaled,columns=X.columns)

In [None]:
import statsmodels.api as sm
Xc = sm.add_constant(Xscaled)
model = sm.Logit(y, Xc).fit()
model.summary()

In [None]:
from sklearn.metrics import confusion_matrix,roc_auc_score,log_loss,roc_curve,accuracy_score

In [None]:
# In-sample predicted probabilities next to the observed label.
y_pred = model.predict(Xc)
prob = pd.DataFrame(y_pred, columns=['probability'])
prob['loan_default'] = y
# Hard label: 0 when the probability is below 0.5, otherwise 1.
prob['y_est'] = prob['probability'].lt(0.5).map({True: 0, False: 1})
prob.head()

In [None]:
# Confusion matrix
confusion_matrix(prob['loan_default'], prob['y_est'])

In [None]:
# AUC score
roc_auc_score(prob['loan_default'],prob['probability'])

In [None]:
# Checking for multicollinearity

from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
# One variance inflation factor per feature column of the scaled matrix.
scaled_values = Xscaled.values
vf = [vif(scaled_values, col_idx) for col_idx in range(X.shape[1])]
pd.DataFrame({'vif': vf}, index=X.columns)

In [None]:
# Building sklearn Logistic Regression model
# y is re-read from df because it was turned into a plain list for the statsmodels fit above.
y = df['loan_default']
X = df.drop('loan_default',axis=1)

In [None]:
# Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# NOTE(review): the scaler is fit on every row of X, including rows that are
# later held out for testing, so any split of X_scaled carries test-set
# mean/variance information.
X_scaled = sc.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Split the raw features first, then fit the scaler on the training rows only,
# so the held-out rows contribute nothing to the mean/variance used for scaling.
# The same random_state on the same number of rows selects the same rows as a
# split of the pre-scaled matrix would.
X_train_raw,X_test_raw,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=120)
sc = StandardScaler().fit(X_train_raw)
X_train = sc.transform(X_train_raw)
X_test = sc.transform(X_test_raw)

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='liblinear',random_state=42)
lr.fit(X_train,y_train)

In [None]:
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)
y_train_prob = lr.predict_proba(X_train)
y_test_prob = lr.predict_proba(X_test)

In [None]:
print('The train AUC score is:',roc_auc_score(y_train,y_train_prob[:,1]))
print('The test AUC score is:',roc_auc_score(y_test,y_test_prob[:,1]))

In [None]:
# Test-set ROC curve against the no-skill diagonal, with labelled axes and a legend.
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob[:,1])
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.plot(fpr, tpr, label='Logistic Regression')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.grid()
plt.title('Test ROC curve')
plt.show()

In [None]:
confusion_matrix(y_test,y_test_pred)

In [None]:
# fmt='d' prints the integer counts as-is; seaborn's default '.2g' abbreviates
# large cells to scientific notation. Rows are true labels, columns predictions.
ax = sns.heatmap(confusion_matrix(y_test,y_test_pred),annot=True,fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:
# FNs are too high, TPs are too low. Maybe Applying SMOTE and balancing the data might help.
from sklearn.metrics import classification_report
print('Test Classification Report\n')
print(classification_report(y_test,y_test_pred))

In [None]:
# For class 1 the F1-score is really low.

# Before computing the binary log loss, the predicted probabilities are calibrated.

# https://machinelearningmastery.com/calibrated-classification-model-in-scikit-learn/
# https://scikit-learn.org/stable/modules/generated/sklearn.calibration.CalibratedClassifierCV.html

from sklearn.calibration import CalibratedClassifierCV
# Isotonic calibration of lr using 3-fold cross-validation on the training split.
model_isotonic = CalibratedClassifierCV(lr, cv=3, method='isotonic')
model_isotonic.fit(X_train,y_train)

In [None]:
isotonic_predict_prob_test = model_isotonic.predict_proba(X_test)

In [None]:
log_loss(y_test,isotonic_predict_prob_test)

**Using SMOTE to handle imbalance**

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the minority class on the training split only. The sampler is
# seeded so the synthetic rows, and every SMOTE metric computed from them,
# are the same on each run.
smote = SMOTE(random_state=42)
X_train_sm,y_train_sm = smote.fit_resample(X_train,y_train)
X_train_sm.shape, y_train_sm.shape

In [None]:
lr1 = LogisticRegression(solver='liblinear',random_state=42)
lr1.fit(X_train_sm,y_train_sm)

In [None]:
y_train_pred = lr1.predict(X_train_sm)
y_test_pred = lr1.predict(X_test)
y_train_prob = lr1.predict_proba(X_train_sm)
y_test_prob = lr1.predict_proba(X_test)

In [None]:
print('The train AUC score is:',roc_auc_score(y_train_sm,y_train_prob[:,1]))
print('The test AUC score is:',roc_auc_score(y_test,y_test_prob[:,1]))

In [None]:
# Test-set ROC curve against the no-skill diagonal, with labelled axes and a legend.
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob[:,1])
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.plot(fpr, tpr, label='Logistic Regression + SMOTE')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.grid()
plt.title('Test ROC curve')
plt.show()

In [None]:
confusion_matrix(y_test,y_test_pred)

In [None]:
# fmt='d' prints the integer counts as-is; seaborn's default '.2g' abbreviates
# large cells to scientific notation. Rows are true labels, columns predictions.
ax = sns.heatmap(confusion_matrix(y_test,y_test_pred),annot=True,fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:
from sklearn.metrics import classification_report
print('Test Classification Report\n')
print(classification_report(y_test,y_test_pred))

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from imblearn.pipeline import Pipeline as ImbPipeline
# Calibrate on the original training split with SMOTE inside the pipeline:
# each CV fold oversamples only its own training part and the isotonic map is
# learned on untouched held-out rows. Cross-validating on X_train_sm instead
# calibrates to the oversampled class balance and lets synthetic points leak
# across folds, which distorts the test log loss.
smote_lr = ImbPipeline([('smote', SMOTE(random_state=42)), ('model', lr1)])
model_isotonic = CalibratedClassifierCV(smote_lr, cv=3, method='isotonic')
model_isotonic.fit(X_train,y_train)

In [None]:
isotonic_predict_prob_test = model_isotonic.predict_proba(X_test)

In [None]:
log_loss(y_test,isotonic_predict_prob_test)

## Random Forest Classifier

### Modelling without SMOTE

Hyperparameters were tuned with a randomized search with cross-validation; the best parameters found are listed below and used for the modelling.

In [None]:
# Tuned random-forest settings for the run without SMOTE.
rsearch1_best_params = dict(
    max_depth=13,
    min_samples_leaf=10,
    min_samples_split=11,
    n_estimators=374,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc1 = RandomForestClassifier(**rsearch1_best_params, random_state=300)
rfc1.fit(X_train, y_train)

In [None]:
y_train_pred = rfc1.predict(X_train)
y_test_pred = rfc1.predict(X_test)
y_train_prob = rfc1.predict_proba(X_train)
y_test_prob = rfc1.predict_proba(X_test)

In [None]:
print('The train AUC score is:',roc_auc_score(y_train,y_train_prob[:,1]))
print('The test AUC score is:',roc_auc_score(y_test,y_test_prob[:,1]))

In [None]:
# Test-set ROC curve against the no-skill diagonal, with labelled axes and a legend.
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob[:,1])
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.plot(fpr, tpr, label='Random Forest')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.grid()
plt.title('Test ROC curve')
plt.show()

In [None]:
confusion_matrix(y_test,y_test_pred)

In [None]:
# fmt='d' prints the integer counts as-is; seaborn's default '.2g' abbreviates
# large cells to scientific notation. Rows are true labels, columns predictions.
ax = sns.heatmap(confusion_matrix(y_test,y_test_pred),annot=True,fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:
from sklearn.metrics import classification_report
print('Test Classification Report\n')
print(classification_report(y_test,y_test_pred))

In [None]:
from sklearn.calibration import CalibratedClassifierCV
model_isotonic = CalibratedClassifierCV(rfc1, cv=3, method='isotonic')
model_isotonic.fit(X_train,y_train)

In [None]:
isotonic_predict_prob_test = model_isotonic.predict_proba(X_test)
log_loss(y_test,isotonic_predict_prob_test)

### Modelling with SMOTE

In [None]:
# Tuned random-forest settings for the run on SMOTE-resampled data.
rsearch_best_params = dict(
    max_depth=17,
    min_samples_leaf=2,
    min_samples_split=4,
    n_estimators=317,
)

In [None]:
rfc = RandomForestClassifier(**rsearch_best_params, random_state=300)
rfc.fit(X_train_sm, y_train_sm)

In [None]:
y_train_pred = rfc.predict(X_train_sm)
y_test_pred = rfc.predict(X_test)
y_train_prob = rfc.predict_proba(X_train_sm)
y_test_prob = rfc.predict_proba(X_test)

In [None]:
print('The train AUC score is:',roc_auc_score(y_train_sm,y_train_prob[:,1]))
print('The test AUC score is:',roc_auc_score(y_test,y_test_prob[:,1]))

In [None]:
# Test-set ROC curve against the no-skill diagonal, with labelled axes and a legend.
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob[:,1])
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.plot(fpr, tpr, label='Random Forest + SMOTE')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.grid()
plt.title('Test ROC curve')
plt.show()

In [None]:
confusion_matrix(y_test,y_test_pred)

In [None]:
# fmt='d' prints the integer counts as-is; seaborn's default '.2g' abbreviates
# large cells to scientific notation. Rows are true labels, columns predictions.
ax = sns.heatmap(confusion_matrix(y_test,y_test_pred),annot=True,fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:
from sklearn.metrics import classification_report
print('Test Classification Report\n')
print(classification_report(y_test,y_test_pred))

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from imblearn.pipeline import Pipeline as ImbPipeline
# Calibrate on the original training split with SMOTE inside the pipeline:
# each CV fold oversamples only its own training part and the isotonic map is
# learned on untouched held-out rows. Cross-validating on X_train_sm instead
# calibrates to the oversampled class balance and lets synthetic points leak
# across folds, which distorts the test log loss.
smote_rfc = ImbPipeline([('smote', SMOTE(random_state=42)), ('model', rfc)])
model_isotonic = CalibratedClassifierCV(smote_rfc, cv=3, method='isotonic')
model_isotonic.fit(X_train,y_train)

In [None]:
isotonic_predict_prob_test = model_isotonic.predict_proba(X_test)
log_loss(y_test,isotonic_predict_prob_test)

## LightGBM

### Modelling without SMOTE

In [None]:
import lightgbm as lgb

In [None]:
# Tuned LightGBM settings for the run without SMOTE.
rsearch1_best_params = dict(
    learning_rate=0.10308835171850986,
    max_depth=3,
    n_estimators=275,
    num_leaves=18,
)

In [None]:
lgbmc1 = lgb.LGBMClassifier(**rsearch1_best_params, importance_type='gain',random_state=300)
lgbmc1.fit(X_train, y_train)

In [None]:
y_train_pred = lgbmc1.predict(X_train)
y_test_pred = lgbmc1.predict(X_test)
y_train_prob = lgbmc1.predict_proba(X_train)
y_test_prob = lgbmc1.predict_proba(X_test)

In [None]:
print('The train AUC score is:',roc_auc_score(y_train,y_train_prob[:,1]))
print('The test AUC score is:',roc_auc_score(y_test,y_test_prob[:,1]))

In [None]:
# Test-set ROC curve against the no-skill diagonal, with labelled axes and a legend.
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob[:,1])
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.plot(fpr, tpr, label='LightGBM')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.grid()
plt.title('Test ROC curve')
plt.show()

In [None]:
confusion_matrix(y_test,y_test_pred)

In [None]:
# fmt='d' prints the integer counts as-is; seaborn's default '.2g' abbreviates
# large cells to scientific notation. Rows are true labels, columns predictions.
ax = sns.heatmap(confusion_matrix(y_test,y_test_pred),annot=True,fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:
from sklearn.metrics import classification_report
print('Test Classification Report\n')
print(classification_report(y_test,y_test_pred))

In [None]:
from sklearn.calibration import CalibratedClassifierCV
model_isotonic = CalibratedClassifierCV(lgbmc1, cv=3, method='isotonic')
model_isotonic.fit(X_train,y_train)

In [None]:
isotonic_predict_prob_test = model_isotonic.predict_proba(X_test)
log_loss(y_test,isotonic_predict_prob_test)

### Modelling with SMOTE

In [None]:
# Tuned LightGBM settings for the run on SMOTE-resampled data.
rsearch_best_params = dict(
    learning_rate=0.32585614358745185,
    max_depth=12,
    n_estimators=540,
    num_leaves=31,
)

In [None]:
lgbmc = lgb.LGBMClassifier(**rsearch_best_params, importance_type='gain',random_state=300)
lgbmc.fit(X_train_sm, y_train_sm)

In [None]:
y_train_pred = lgbmc.predict(X_train_sm)
y_test_pred = lgbmc.predict(X_test)
y_train_prob = lgbmc.predict_proba(X_train_sm)
y_test_prob = lgbmc.predict_proba(X_test)

In [None]:
print('The train AUC score is:',roc_auc_score(y_train_sm,y_train_prob[:,1]))
print('The test AUC score is:',roc_auc_score(y_test,y_test_prob[:,1]))

In [None]:
# Test-set ROC curve against the no-skill diagonal, with labelled axes and a legend.
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob[:,1])
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.plot(fpr, tpr, label='LightGBM + SMOTE')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.grid()
plt.title('Test ROC curve')
plt.show()

In [None]:
confusion_matrix(y_test,y_test_pred)

In [None]:
# fmt='d' prints the integer counts as-is; seaborn's default '.2g' abbreviates
# large cells to scientific notation. Rows are true labels, columns predictions.
ax = sns.heatmap(confusion_matrix(y_test,y_test_pred),annot=True,fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:
from sklearn.metrics import classification_report
print('Test Classification Report\n')
print(classification_report(y_test,y_test_pred))

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from imblearn.pipeline import Pipeline as ImbPipeline
# Calibrate on the original training split with SMOTE inside the pipeline:
# each CV fold oversamples only its own training part and the isotonic map is
# learned on untouched held-out rows. Cross-validating on X_train_sm instead
# calibrates to the oversampled class balance and lets synthetic points leak
# across folds, which distorts the test log loss.
smote_lgbmc = ImbPipeline([('smote', SMOTE(random_state=42)), ('model', lgbmc)])
model_isotonic = CalibratedClassifierCV(smote_lgbmc, cv=3, method='isotonic')
model_isotonic.fit(X_train,y_train)

In [None]:
isotonic_predict_prob_test = model_isotonic.predict_proba(X_test)
log_loss(y_test,isotonic_predict_prob_test)

## XGBoost

### Modelling without SMOTE

In [None]:
import xgboost
from xgboost import XGBClassifier
# Tuned XGBoost settings for the run without SMOTE.
rsearch1_best_params = dict(
    eval_metric='auc',
    gamma=0.3,
    learning_rate=0.1,
    max_depth=3,
    n_estimators=270,
    reg_alpha=0.01,
)

In [None]:
xgbc1 = XGBClassifier(**rsearch1_best_params, random_state=300)
xgbc1.fit(X_train, y_train)

In [None]:
y_train_pred = xgbc1.predict(X_train)
y_test_pred = xgbc1.predict(X_test)
y_train_prob = xgbc1.predict_proba(X_train)
y_test_prob = xgbc1.predict_proba(X_test)

In [None]:
print('The train AUC score is:',roc_auc_score(y_train,y_train_prob[:,1]))
print('The test AUC score is:',roc_auc_score(y_test,y_test_prob[:,1]))

In [None]:
# Test-set ROC curve against the no-skill diagonal, with labelled axes and a legend.
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob[:,1])
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.plot(fpr, tpr, label='XGBoost')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.grid()
plt.title('Test ROC curve')
plt.show()

In [None]:
confusion_matrix(y_test,y_test_pred)

In [None]:
# fmt='d' prints the integer counts as-is; seaborn's default '.2g' abbreviates
# large cells to scientific notation. Rows are true labels, columns predictions.
ax = sns.heatmap(confusion_matrix(y_test,y_test_pred),annot=True,fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:
from sklearn.metrics import classification_report
print('Test Classification Report\n')
print(classification_report(y_test,y_test_pred))

In [None]:
from sklearn.calibration import CalibratedClassifierCV
model_isotonic = CalibratedClassifierCV(xgbc1, cv=3, method='isotonic')
model_isotonic.fit(X_train,y_train)

In [None]:
isotonic_predict_prob_test = model_isotonic.predict_proba(X_test)
log_loss(y_test,isotonic_predict_prob_test)

### Modelling with SMOTE

In [None]:
# Tuned XGBoost settings for the run on SMOTE-resampled data.
rsearch_best_params = dict(
    eval_metric='auc',
    gamma=0.2,
    learning_rate=0.2,
    max_depth=9,
    n_estimators=192,
    reg_alpha=0.1,
)

In [None]:
xgbc = XGBClassifier(**rsearch_best_params, random_state=300)
xgbc.fit(X_train_sm, y_train_sm)

In [None]:
y_train_pred = xgbc.predict(X_train_sm)
y_test_pred = xgbc.predict(X_test)
y_train_prob = xgbc.predict_proba(X_train_sm)
y_test_prob = xgbc.predict_proba(X_test)

In [None]:
print('The train AUC score is:',roc_auc_score(y_train_sm,y_train_prob[:,1]))
print('The test AUC score is:',roc_auc_score(y_test,y_test_prob[:,1]))

In [None]:
# Test-set ROC curve against the no-skill diagonal, with labelled axes and a legend.
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob[:,1])
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.plot(fpr, tpr, label='XGBoost + SMOTE')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.grid()
plt.title('Test ROC curve')
plt.show()

In [None]:
confusion_matrix(y_test,y_test_pred)

In [None]:
# fmt='d' prints the integer counts as-is; seaborn's default '.2g' abbreviates
# large cells to scientific notation. Rows are true labels, columns predictions.
ax = sns.heatmap(confusion_matrix(y_test,y_test_pred),annot=True,fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:
from sklearn.metrics import classification_report
print('Test Classification Report\n')
print(classification_report(y_test,y_test_pred))

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from imblearn.pipeline import Pipeline as ImbPipeline
# Calibrate on the original training split with SMOTE inside the pipeline:
# each CV fold oversamples only its own training part and the isotonic map is
# learned on untouched held-out rows. Cross-validating on X_train_sm instead
# calibrates to the oversampled class balance and lets synthetic points leak
# across folds, which distorts the test log loss.
smote_xgbc = ImbPipeline([('smote', SMOTE(random_state=42)), ('model', xgbc)])
model_isotonic = CalibratedClassifierCV(smote_xgbc, cv=3, method='isotonic')
model_isotonic.fit(X_train,y_train)

In [None]:
isotonic_predict_prob_test = model_isotonic.predict_proba(X_test)
log_loss(y_test,isotonic_predict_prob_test)

## Stacking

### Modelling without SMOTE

All the hyperparameters are tuned values and are the same as those used above for the individual models.

In [None]:
from sklearn.ensemble import StackingClassifier
# Base learners for the stack, with the tuned non-SMOTE settings. Each one is
# seeded with random_state=300, as the standalone fits are, so repeated runs
# build the same stack (previously the learners were unseeded).
estimators = [
    ('rfc', RandomForestClassifier(max_depth=13,
                                   min_samples_leaf=10,
                                   min_samples_split=11,
                                   n_estimators=374,
                                   random_state=300)),

    ('lgbmc', lgb.LGBMClassifier(learning_rate=0.10308835171850986,
                                 max_depth=3,
                                 n_estimators=275,
                                 num_leaves=18,
                                 random_state=300)),

    ('xgbc', XGBClassifier(eval_metric='auc',
                           gamma=0.3,
                           learning_rate=0.1,
                           max_depth=3,
                           n_estimators=270,
                           reg_alpha=0.01,
                           random_state=300)),
]


In [None]:
# Stack the base learners under a liblinear logistic-regression meta-model
# (cv=5, all cores) and fit on the original training split.
meta_model = LogisticRegression(solver='liblinear')
clf1 = StackingClassifier(estimators=estimators, final_estimator=meta_model,
                          cv=5, n_jobs=-1)
clf1.fit(X_train,y_train)

In [None]:
y_train_pred = clf1.predict(X_train)
y_test_pred = clf1.predict(X_test)
y_train_prob = clf1.predict_proba(X_train)
y_test_prob = clf1.predict_proba(X_test)

In [None]:
print('The train AUC score is:',roc_auc_score(y_train,y_train_prob[:,1]))
print('The test AUC score is:',roc_auc_score(y_test,y_test_prob[:,1]))

In [None]:
# Test-set ROC curve against the no-skill diagonal, with labelled axes and a legend.
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob[:,1])
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.plot(fpr, tpr, label='Stacking')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.grid()
plt.title('Test ROC curve')
plt.show()

In [None]:
confusion_matrix(y_test,y_test_pred)

In [None]:
# fmt='d' prints the integer counts as-is; seaborn's default '.2g' abbreviates
# large cells to scientific notation. Rows are true labels, columns predictions.
ax = sns.heatmap(confusion_matrix(y_test,y_test_pred),annot=True,fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:
from sklearn.metrics import classification_report
print('Test Classification Report\n')
print(classification_report(y_test,y_test_pred))

In [None]:
from sklearn.calibration import CalibratedClassifierCV
model_isotonic = CalibratedClassifierCV(clf1, cv=3, method='isotonic')
model_isotonic.fit(X_train,y_train)

In [None]:
isotonic_predict_prob_test = model_isotonic.predict_proba(X_test)
log_loss(y_test,isotonic_predict_prob_test)

### Modelling with SMOTE

In [None]:
from sklearn.ensemble import StackingClassifier
# Base learners for the stack, with the tuned SMOTE settings. Each one is
# seeded with random_state=300, as the standalone fits are, so repeated runs
# build the same stack (previously the learners were unseeded).
estimators = [
    ('rfc', RandomForestClassifier(max_depth=17,
                                   min_samples_leaf=2,
                                   min_samples_split=4,
                                   n_estimators=317,
                                   random_state=300)),

    ('lgbmc', lgb.LGBMClassifier(learning_rate=0.32585614358745185,
                                 max_depth=12,
                                 n_estimators=540,
                                 num_leaves=31,
                                 random_state=300)),

    ('xgbc', XGBClassifier(eval_metric='auc',
                           gamma=0.2,
                           learning_rate=0.2,
                           max_depth=9,
                           n_estimators=192,
                           reg_alpha=0.1,
                           random_state=300)),
]

In [None]:
# Stack the base learners under a liblinear logistic-regression meta-model
# (cv=5, all cores) and fit on the SMOTE-resampled training split.
meta_model = LogisticRegression(solver='liblinear')
clf = StackingClassifier(estimators=estimators, final_estimator=meta_model,
                         cv=5, n_jobs=-1)
clf.fit(X_train_sm,y_train_sm)

In [None]:
y_train_pred = clf.predict(X_train_sm)
y_test_pred = clf.predict(X_test)
y_train_prob = clf.predict_proba(X_train_sm)
y_test_prob = clf.predict_proba(X_test)

In [None]:
print('The train AUC score is:',roc_auc_score(y_train_sm,y_train_prob[:,1]))
print('The test AUC score is:',roc_auc_score(y_test,y_test_prob[:,1]))

In [None]:
# Test-set ROC curve against the no-skill diagonal, with labelled axes and a legend.
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob[:,1])
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.plot(fpr, tpr, label='Stacking + SMOTE')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.grid()
plt.title('Test ROC curve')
plt.show()

In [None]:
confusion_matrix(y_test,y_test_pred)

In [None]:
# fmt='d' prints the integer counts as-is; seaborn's default '.2g' abbreviates
# large cells to scientific notation. Rows are true labels, columns predictions.
ax = sns.heatmap(confusion_matrix(y_test,y_test_pred),annot=True,fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:
from sklearn.metrics import classification_report
print('Test Classification Report\n')
print(classification_report(y_test,y_test_pred))

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from imblearn.pipeline import Pipeline as ImbPipeline
# Calibrate on the original training split with SMOTE inside the pipeline:
# each CV fold oversamples only its own training part and the isotonic map is
# learned on untouched held-out rows. Cross-validating on X_train_sm instead
# calibrates to the oversampled class balance and lets synthetic points leak
# across folds, which distorts the test log loss.
smote_clf = ImbPipeline([('smote', SMOTE(random_state=42)), ('model', clf)])
model_isotonic = CalibratedClassifierCV(smote_clf, cv=3, method='isotonic')
model_isotonic.fit(X_train,y_train)

In [None]:
isotonic_predict_prob_test = model_isotonic.predict_proba(X_test)
log_loss(y_test,isotonic_predict_prob_test)

## Summary stats of all Models

In [None]:
# http://zetcode.com/python/prettytable/
from prettytable import PrettyTable

x = PrettyTable()
x.field_names = ["Model","Train-AUC","Test-AUC","Test-Binary_Log_Loss","F1-Score(1)","SMOTE-applied"]

# Metrics are typed in by hand, one row per (model, SMOTE yes/no) combination;
# they are not recomputed from the cells above.
summary_rows = [
    ["Logistic Regression", 0.623, 0.623, 0.509, 0.01, 'No'],
    ["Logistic Regression", 0.624, 0.623, 0.670, 0.39, 'Yes'],
    ["Random Forest Classifier", 0.727, 0.637, 0.503, 0.00, 'No'],
    ["Random Forest Classifier", 0.855, 0.625, 0.614, 0.38, 'Yes'],
    ["LightGBM Classifier", 0.652, 0.639, 0.503, 0.01, 'No'],
    ["LightGBM Classifier", 0.928, 0.601, 0.527, 0.22, 'Yes'],
    ["XGBoost Classifier", 0.652, 0.638, 0.503, 0.01, 'No'],
    ["XGBoost Classifier", 0.936, 0.610, 0.523, 0.20, 'Yes'],
    ["Stacked Classifier", 0.690, 0.639, 0.503, 0.03, 'No'],
    ["Stacked Classifier", 0.936, 0.618, 0.523, 0.29, 'Yes'],
]
for summary_row in summary_rows:
    x.add_row(summary_row)

print(x)

Inference:
The given problem statement requires us to determine the probability of a loanee/borrower defaulting on a vehicle loan in the first EMI (Equated Monthly Instalment) on the due date. Hence, along with predicting whether or not a person is a defaulter, we also need to predict the probability that the person will default on the loan.

Hence to measure the performance of models, we have taken AUC-score, F1-score of 1's and Binary Log Loss as the performance metrics.

Without SMOTE, all the models give a very low F1-score for class 1. Applying SMOTE resolves this issue (though the F1-score can also be controlled by selecting an appropriate threshold from the ROC curve).

Looking at the performance metrics in the table above, Logistic Regression with SMOTE performs best compared with the other models. It gives good AUC scores (not overfitting) and the best F1-score for class 1, although its binary log loss is somewhat higher than that of the other models.

The next best model is the Random Forest Classifier with SMOTE. Compared to Logistic Regression, it is overfitting, as seen from the AUC scores. However, it also shows a good F1-score for class 1, slightly lower than that of Logistic Regression, and it has a better (lower) binary log loss than Logistic Regression.

Apart from these, we have used LightGBM Classifier, XGBoost Classifier and a Stacked Classifier and their performance metrics are displayed in the pretty table above.