# Vehicle Loan Default - LTFS

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats, integrate
from sklearn.model_selection import train_test_split
from sklearn import metrics
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt 
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
%matplotlib inline
from sklearn.linear_model import LinearRegression
# Notebook-wide display defaults: two-decimal float formatting for pandas output,
# and a default figure size and font size for matplotlib plots.
pd.options.display.float_format = '{:.2f}'.format
plt.rcParams['figure.figsize'] = (10, 8)
plt.rcParams['font.size'] = 14

In [None]:
# Read the train and test CSV files from the input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train_LTFS.csv", "../input/test_LTFS.csv")
)

In [None]:
# Keep untouched copies of both frames before any transformation is applied.
train_original, test_original = train.copy(), test.copy()

In [None]:
# Column names and shape of the training frame.
train.columns, train.shape

We have 40 independent variables and 1 target variable, i.e. `loan_default`, in the train dataset. Let’s also have a look at the columns of the test dataset.

In [None]:
# Column names and shape of the test frame.
test.columns, test.shape

In [None]:
# Print the pandas dtype of every column in the training frame.
print(train.dtypes)

We can see that the dataset contains three data types:

object:variables in our dataset are: Date.of.Birth, Employment.Type, DisbursalDate, PERFORM_CNS.SCORE.DESCRIPTION, AVERAGE.ACCT.AGE, CREDIT.HISTORY.LENGTH     

int64:represents the integer variables. 
UniqueID, disbursed_amount, asset_cost, branch_id, supplier_id, manufacturer_id, Current_pincode_ID , State_ID, Employee_code_ID, MobileNo_Avl_Flag, Aadhar_flag, PAN_flag, VoterID_flag, Driving_flag, Passport_flag, PERFORM_CNS.SCORE, PRI.NO.OF.ACCTS, PRI.ACTIVE.ACCTS, PRI.OVERDUE.ACCTS, PRI.CURRENT.BALANCE, PRI.SANCTIONED.AMOUNT, PRI.DISBURSED.AMOUNT, SEC.NO.OF.ACCTS, SEC.ACTIVE.ACCTS, SEC.OVERDUE.ACCTS, SEC.CURRENT.BALANCE, SEC.SANCTIONED.AMOUNT, SEC.DISBURSED.AMOUNT, PRIMARY.INSTAL.AMT, SEC.INSTAL.AMT, NEW.ACCTS.IN.LAST.SIX.MONTHS, DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS, NO.OF_INQUIRIES, loan_default                                                          
                                        
float64:represents the variable which have some decimal values involved: ltv



In [None]:
# Per-column count of missing values in the training frame, plus its shape.
train.isnull().sum(),train.shape

In [None]:
# Per-column count of missing values in the test frame, plus its shape.
test.isnull().sum(),test.shape

In [None]:
# Filling missing values.
# Missing Employment.Type entries are labelled 'Unemployed' in both frames.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that form mutates a possibly
# temporary object and is deprecated by pandas.
train['Employment.Type'] = train['Employment.Type'].fillna('Unemployed')
test['Employment.Type'] = test['Employment.Type'].fillna('Unemployed')

# Check whether any null values are still left in either frame.
print("Null values left in the train set:", train.isnull().sum().sum())
print("Null values left in the test set:", test.isnull().sum().sum())

# Target Variable
We will first look at the target variable, i.e., `loan_default`. As it is an int64 variable, let us look at its frequency table, percentage distribution and bar plot.

Frequency table of a variable will give us the count of each category in that variable

In [None]:
# Frequency table of the target. A notebook cell only renders its last
# expression, so the two tables are printed explicitly.
print(train['loan_default'].value_counts())

# normalize=True gives proportions instead of absolute counts.
print(train['loan_default'].value_counts(normalize=True))

# Bar plot of the class counts.
train['loan_default'].value_counts().plot.bar()

Not Default(0) =  182543 Count <br>
    Default(1) =   50611 Count

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still contains object (string) columns at this point, so the numeric
# columns are selected explicitly; recent pandas versions raise on non-numeric
# data instead of silently dropping it.
matrix = train.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(30, 18))
# Correlation coefficients cannot exceed 1, so the colour scale is capped there.
sns.heatmap(matrix, vmax=1.0, square=True, annot=True, fmt=".1f", cmap="BuPu", ax=ax)
plt.show()

**The most correlated variables are:<br>**

- PRI Sanctioned Amount & PRI Disbursed Amount. (1.0)<br> 
- PRI No. of Accts & PRI Active Accts. (0.8)<br>
- SEC. No. of Accts & SEC.Active Accts. (1.0)<br>
- SEC. Current Balance & SEC Sanctioned Amount. (0.9)<br>
- SEC. Current Balance & SEC. Disbursed Amount. (0.9)<br>
- SEC Sanctioned Amount & SEC. Disbursed Amount. (1.0)<br>
- Aadhar_flag & VoterID_flag. (-0.9)<br>
- New Accts in last 6 months & PRI Active Accts.(0.7)<br>
- Disbursed_Amount & Asset_cost.(0.7)<br>


    

In [None]:
# Apply log1p to disbursed amount, ltv and asset cost in both the train and test frames.
# Note: running this cell twice applies the transform twice.
amount_cols = ['disbursed_amount', 'ltv', 'asset_cost']
for frame in (train, test):
    for col in amount_cols:
        frame[col] = np.log1p(frame[col])

# Plot the transformed training distributions side by side.
plt.rcParams['figure.figsize'] = (18, 5)

amount_panels = [
    ('disbursed_amount', 'orange', 'Disburesed Amount'),
    ('asset_cost', 'pink', 'Asset Cost'),
    ('ltv', 'red', 'Loan to value of the asset'),
]
for slot, (col, colour, heading) in enumerate(amount_panels, start=1):
    plt.subplot(1, 3, slot)
    sns.distplot(train[col], color=colour)
    plt.title(heading)

plt.show()

In [None]:
# Number of distinct values in each identifier-like column of the training frame.
train_id_labels = [
    ("Total no. of Unique Ids :", 'UniqueID'),
    ("Total no. of Unique Branches :", 'branch_id'),
    ("Total no. of Unique Suppliers :", 'supplier_id'),
    ("Total no. of Unique Manufactures :", 'manufacturer_id'),
    ("Total no. of Unique Current pincode Ids :", 'Current_pincode_ID'),
    ("Total no. of Unique State IDs :", 'State_ID'),
    ("Total no. of Unique Employee code IDs :", 'Employee_code_ID'),
]
for label, col in train_id_labels:
    print(label, train[col].nunique())


In [None]:
# Number of distinct values in each identifier-like column of the test frame.
test_id_labels = [
    ("Total no. of Unique Ids :", 'UniqueID'),
    ("Total no. of Unique Branches :", 'branch_id'),
    ("Total no. of Unique Suppliers :", 'supplier_id'),
    ("Total no. of Unique Manufactures :", 'manufacturer_id'),
    ("Total no. of Unique Current pincode Ids :", 'Current_pincode_ID'),
    ("Total no. of Unique State IDs :", 'State_ID'),
    ("Total no. of Unique Employee code IDs :", 'Employee_code_ID'),
]
for label, col in test_id_labels:
    print(label, test[col].nunique())

In [None]:
# Relative frequency (value_counts with normalize=True) of manufacturer, state
# and branch ids, drawn as three stacked bar charts on one figure.
plt.figure(1)
share_panels = [
    (311, 'manufacturer_id', {'figsize': (24, 10)}),  # only the first panel sets the figure size
    (312, 'State_ID', {}),
    (313, 'branch_id', {}),
]
for slot, col, extra_kwargs in share_panels:
    plt.subplot(slot)
    train[col].value_counts(normalize=True).plot.bar(title=col, fontsize=14, **extra_kwargs)

plt.show()

In [None]:
# Parse Date.of.Birth into datetimes; entries that cannot be parsed become NaT.
# NOTE(review): no explicit format is passed, so day/month order and the century
# of two-digit years depend on pandas' inference — confirm against the raw data.
birth_dates = pd.to_datetime(train['Date.of.Birth'], errors='coerce')
train['Date.of.Birth'] = birth_dates

# Extract the year of birth of each customer.
train['Year_of_birth'] = birth_dates.dt.year

# Inspect the distribution of the extracted years.
sns.distplot(train['Year_of_birth'], color='green')
plt.title('Distribution of Year of birth')

In [None]:
# Encode Employment.Type as integers in the training frame:
# 'Unemployed' -> 0, 'Salaried' -> 1, 'Self employed' -> 2.
employment_codes = {'Self employed': 2, 'Salaried': 1, 'Unemployed': 0}
train['Employment.Type'] = train['Employment.Type'].replace(employment_codes)

# Check the encoded values.
train['Employment.Type'].value_counts()

In [None]:
# Encode Employment.Type as integers in the test frame:
# 'Unemployed' -> 0, 'Salaried' -> 1, 'Self employed' -> 2.
employment_codes = {'Self employed': 2, 'Salaried': 1, 'Unemployed': 0}
test['Employment.Type'] = test['Employment.Type'].replace(employment_codes)

# Check the encoded values.
test['Employment.Type'].value_counts()

In [None]:



# Bar chart of Employment.Type in the training frame, followed by the raw counts.
employment_col = 'Employment.Type'
sns.countplot(data=train, x=employment_col)

plt.show()

train[employment_col].value_counts()

In [None]:
# Feature extraction from the disbursal date. Only the month is extracted
# (per the original note, all disbursements were made in 2018).
disbursal_dates = pd.to_datetime(train['DisbursalDate'], errors='coerce')
train['DisbursalDate'] = disbursal_dates

# Month of disbursement, for the training frame only.
train['DisbursalMonth'] = disbursal_dates.dt.month

train['DisbursalMonth'].value_counts()



In [None]:
# Number of disbursals per month in the training frame.
plt.rcParams['figure.figsize'] = (18, 5)
# The column is passed via x=/data=: seaborn's first positional parameter is
# `data`, so a positionally-passed Series is not plotted as the x variable
# in current seaborn releases.
sns.countplot(x='DisbursalMonth', data=train, palette = 'colorblind')
plt.title('Months', fontsize = 30)


In [None]:
# Distribution of Aadhar_flag: bar chart plus the raw counts.
aadhar_col = "Aadhar_flag"
sns.countplot(data=train, x=aadhar_col)

train[aadhar_col].value_counts()


In [None]:
# Distribution of MobileNo_Avl_Flag: bar chart plus the raw counts.
mobile_col = "MobileNo_Avl_Flag"
sns.countplot(data=train, x=mobile_col)

train[mobile_col].value_counts()

In [None]:
# Distribution of PAN_flag: bar chart plus the raw counts.
pan_col = "PAN_flag"
sns.countplot(data=train, x=pan_col)
train[pan_col].value_counts()

In [None]:
# Distribution of VoterID_flag: bar chart plus the raw counts.
voter_col = "VoterID_flag"
sns.countplot(data=train, x=voter_col)
train[voter_col].value_counts()

In [None]:
# Distribution of Driving_flag: bar chart plus the raw counts.
driving_col = "Driving_flag"
sns.countplot(data=train, x=driving_col)
train[driving_col].value_counts()

In [None]:
# Distribution of Passport_flag: bar chart plus the raw counts.
passport_col = "Passport_flag"
sns.countplot(data=train, x=passport_col)
train[passport_col].value_counts()

In [None]:
# Bar chart of the bureau score description categories, followed by the raw counts.
cns_desc_col = 'PERFORM_CNS.SCORE.DESCRIPTION'
sns.countplot(data=train, x=cns_desc_col)
plt.xticks(rotation=90)  # category labels are long, so rotate them
plt.show()

train[cns_desc_col].value_counts()

In [None]:
# Encode the bureau score description (PERFORM_CNS.SCORE.DESCRIPTION) of the
# training frame as integers:
# 0 = no score available, 1 = very high risk, 2 = high risk,
# 3 = medium risk, 4 = low risk, 5 = very low risk.
cns_description_codes = {
    'No Bureau History Available': 0,
    'Not Scored: Sufficient History Not Available': 0,
    'Not Scored: Not Enough Info available on the customer': 0,
    'Not Scored: No Activity seen on the customer (Inactive)': 0,
    'Not Scored: No Updates available in last 36 months': 0,
    'Not Scored: Only a Guarantor': 0,
    'Not Scored: More than 50 active Accounts found': 0,
    'M-Very High Risk': 1,
    'L-Very High Risk': 1,
    'K-High Risk': 2,
    'J-High Risk': 2,
    'I-Medium Risk': 3,
    'H-Medium Risk': 3,
    'G-Low Risk': 4,
    'F-Low Risk': 4,
    'E-Low Risk': 4,
    'D-Very Low Risk': 5,
    'C-Very Low Risk': 5,
    'B-Very Low Risk': 5,
    'A-Very Low Risk': 5,
}
train['PERFORM_CNS.SCORE.DESCRIPTION'] = train['PERFORM_CNS.SCORE.DESCRIPTION'].replace(cns_description_codes)

# Check the encoded values.
train['PERFORM_CNS.SCORE.DESCRIPTION'].value_counts()

In [None]:
# Encode the bureau score description (PERFORM_CNS.SCORE.DESCRIPTION) of the
# test frame with the same integer scale used for the training frame:
# 0 = no score available, 1 = very high risk, 2 = high risk,
# 3 = medium risk, 4 = low risk, 5 = very low risk.
# The mapping includes 'Not Scored: More than 50 active Accounts found', which
# the training encoding maps to 0; without it such rows would stay as strings
# and leave the test column with mixed types.
cns_description_codes = {
    'No Bureau History Available': 0,
    'Not Scored: Sufficient History Not Available': 0,
    'Not Scored: Not Enough Info available on the customer': 0,
    'Not Scored: No Activity seen on the customer (Inactive)': 0,
    'Not Scored: No Updates available in last 36 months': 0,
    'Not Scored: Only a Guarantor': 0,
    'Not Scored: More than 50 active Accounts found': 0,
    'M-Very High Risk': 1,
    'L-Very High Risk': 1,
    'K-High Risk': 2,
    'J-High Risk': 2,
    'I-Medium Risk': 3,
    'H-Medium Risk': 3,
    'G-Low Risk': 4,
    'F-Low Risk': 4,
    'E-Low Risk': 4,
    'D-Very Low Risk': 5,
    'C-Very Low Risk': 5,
    'B-Very Low Risk': 5,
    'A-Very Low Risk': 5,
}
test['PERFORM_CNS.SCORE.DESCRIPTION'] = test['PERFORM_CNS.SCORE.DESCRIPTION'].replace(cns_description_codes)

In [None]:
# Compare the bureau score (PERFORM_CNS.SCORE) before and after a log1p
# transform, then store the transformed values in both frames.
score_col = 'PERFORM_CNS.SCORE'
plt.rcParams['figure.figsize'] = (15, 5)

raw_score = train[score_col]
log_score = np.log1p(raw_score)

score_panels = [
    (raw_score, 'green', 'Before Log transformations'),
    (log_score, 'blue', 'After Log transformations'),
]
for slot, (values, colour, heading) in enumerate(score_panels, start=1):
    plt.subplot(1, 2, slot)
    sns.distplot(values, color=colour)
    plt.title(heading)
plt.show()

# Persist the transform for train and apply the same transform to test.
train[score_col] = log_score
test[score_col] = np.log1p(test[score_col])


In [None]:
# Apply log1p to the primary-account count columns and the disbursed amount,
# in both frames. PRI.CURRENT.BALANCE and PRI.SANCTIONED.AMOUNT are left on
# their original scale and only have missing values filled.
pri_log_cols = ['PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.DISBURSED.AMOUNT']
pri_fill_cols = ['PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT']

for frame in (train, test):
    for col in pri_log_cols:
        frame[col] = np.log1p(frame[col])

    # Fill missing values in the untransformed primary-account columns with
    # that frame's own column mean. The result is assigned back rather than
    # using fillna(inplace=True) on a column selection, which pandas deprecates.
    for col in pri_fill_cols:
        frame[col] = frame[col].fillna(frame[col].mean())




In [None]:

# Distribution plots of the primary-account attributes on a 2x3 grid.
plt.rcParams['figure.figsize'] = (20, 16)

pri_panels = [
    ('PRI.NO.OF.ACCTS', 'Total loan taken by customer'),
    ('PRI.ACTIVE.ACCTS', 'Active loan taken by customer'),
    ('PRI.OVERDUE.ACCTS', 'Default Accounts'),
    ('PRI.CURRENT.BALANCE', 'Principal Outstanding amount'),
    ('PRI.SANCTIONED.AMOUNT', 'Total Sanctioned Amount'),
    ('PRI.DISBURSED.AMOUNT', 'Total Disbured Amount'),
]
for slot, (col, heading) in enumerate(pri_panels, start=1):
    plt.subplot(2, 3, slot)
    sns.distplot(train[col], color='violet')
    plt.title(heading)
    plt.xticks(rotation=45)

plt.show()

In [None]:
# Distribution plots of the secondary-account attributes on a 2x3 grid.
plt.rcParams['figure.figsize'] = (20, 14)

sec_panels = [
    ('SEC.NO.OF.ACCTS', 'Total loan taken by customer'),
    ('SEC.ACTIVE.ACCTS', 'Active loan taken by customer'),
    ('SEC.OVERDUE.ACCTS', 'Default Accounts at the time of disbursement'),
    ('SEC.CURRENT.BALANCE', 'Principal Outstanding amount'),
    ('SEC.SANCTIONED.AMOUNT', 'Total Sanctioned Amount'),
    ('SEC.DISBURSED.AMOUNT', 'Total Disbured Amount'),
]
for slot, (col, heading) in enumerate(sec_panels, start=1):
    plt.subplot(2, 3, slot)
    sns.distplot(train[col], color='red')
    plt.title(heading)
    plt.xticks(rotation=45)

plt.show()

In [None]:
# Apply log1p to the secondary-account count and amount columns in both frames.
# SEC.CURRENT.BALANCE is left on its original scale and only has missing
# values filled.
sec_log_cols = [
    'SEC.NO.OF.ACCTS',
    'SEC.ACTIVE.ACCTS',
    'SEC.OVERDUE.ACCTS',
    'SEC.SANCTIONED.AMOUNT',
    'SEC.DISBURSED.AMOUNT',
]

for frame in (train, test):
    for col in sec_log_cols:
        frame[col] = np.log1p(frame[col])

    # Fill missing values in SEC.CURRENT.BALANCE with that frame's own mean.
    # The result is assigned back rather than using fillna(inplace=True) on a
    # column selection, which pandas deprecates.
    frame['SEC.CURRENT.BALANCE'] = frame['SEC.CURRENT.BALANCE'].fillna(frame['SEC.CURRENT.BALANCE'].mean())

In [None]:
# Distribution plots of the secondary-account attributes after the transforms
# in the preceding cell, on a 2x3 grid.
plt.rcParams['figure.figsize'] = (20, 16)

sec_log_panels = [
    ('SEC.NO.OF.ACCTS', 'Total loan taken by customer'),
    ('SEC.ACTIVE.ACCTS', 'Active loan taken by customer'),
    ('SEC.OVERDUE.ACCTS', 'Default Accounts'),
    ('SEC.CURRENT.BALANCE', 'Principal Outstanding amount'),
    ('SEC.SANCTIONED.AMOUNT', 'Total Sanctioned Amount'),
    ('SEC.DISBURSED.AMOUNT', 'Total Disbured Amount'),
]
for slot, (col, heading) in enumerate(sec_log_panels, start=1):
    plt.subplot(2, 3, slot)
    sns.distplot(train[col], color='blue')
    plt.title(heading)
    plt.xticks(rotation=45)

plt.show()

In [None]:
# Distribution of the EMI (instalment) amounts for the secondary and primary plans.
emi_panels = [
    ('SEC.INSTAL.AMT', 'EMI Amount Secondary Plan'),
    ('PRIMARY.INSTAL.AMT', 'EMI Amount Primary Plan'),
]
for slot, (col, heading) in enumerate(emi_panels, start=1):
    plt.subplot(1, 2, slot)
    sns.distplot(train[col])
    plt.title(heading, fontsize=20)
    plt.xticks(rotation=45)

plt.show()

In [None]:
# Apply log1p to both instalment-amount columns in train and test, then plot
# the transformed training distributions.
emi_cols = ['PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT']
for frame in (train, test):
    for col in emi_cols:
        frame[col] = np.log1p(frame[col])

emi_log_panels = [
    ('SEC.INSTAL.AMT', 'EMI Amount Secondary Plan'),
    ('PRIMARY.INSTAL.AMT', 'EMI Amount Primary Plan'),
]
for slot, (col, heading) in enumerate(emi_log_panels, start=1):
    plt.subplot(1, 2, slot)
    sns.distplot(train[col], color='yellow')
    plt.title(heading, fontsize=20)
    plt.xticks(rotation=45)

plt.show()

In [None]:
# Frequency table of NEW.ACCTS.IN.LAST.SIX.MONTHS in the training frame.
train['NEW.ACCTS.IN.LAST.SIX.MONTHS'].value_counts()

In [None]:
# Frequency table of DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS in the training frame.
train['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS'].value_counts()

In [None]:
# Distributions of new and delinquent accounts in the last six months;
# each panel is titled with its column name.
recent_cols = ['NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS']
for slot, col in enumerate(recent_cols, start=1):
    plt.subplot(1, 2, slot)
    sns.distplot(train[col])
    plt.title(col, fontsize=20)
    plt.xticks(rotation=45)

plt.show()

In [None]:
# AVERAGE.ACCT.AGE, i.e. average loan tenure: counts over the first 50 rows only.
# The Series is passed via x=: seaborn's first positional parameter is `data`,
# so a positionally-passed Series is not plotted as the x variable in current
# seaborn releases.
sns.countplot(x=train['AVERAGE.ACCT.AGE'].head(50), palette = 'colorblind')
plt.title('Average Loan Tenure')
plt.xticks(rotation = 45)
plt.show()

In [None]:
# Convert 'CREDIT.HISTORY.LENGTH' and 'AVERAGE.ACCT.AGE' to a number of months
# in both the train and test frames.
import re


def _duration_to_months(text):
    """Strip lowercase letters, then read the two remaining numbers as years and months.

    Returns years * 12 + months as an int.
    NOTE(review): assumes every value holds exactly two whitespace-separated
    numbers once the letters are removed — confirm against the raw data.
    """
    parts = re.sub('[a-z]', '', text).split()
    return int(parts[0]) * 12 + int(parts[1])


for frame in (train, test):
    for col in ('CREDIT.HISTORY.LENGTH', 'AVERAGE.ACCT.AGE'):
        frame[col] = frame[col].apply(_duration_to_months)

In [None]:
# Distribution of the average loan tenure (now expressed in months) over the
# first 50 rows of the training frame.
# The figure size is set before anything is drawn so it applies to this figure,
# and the Series is passed via x= because seaborn's first positional parameter
# is `data`.
plt.rcParams['figure.figsize'] = (18, 5)
sns.countplot(x=train['AVERAGE.ACCT.AGE'].head(50))
plt.title('AVERAGE LOAN TENURE', fontsize = 25)
plt.show()
#train['AVERAGE.ACCT.AGE'].value_counts()

In [None]:
# Credit history length (in months) over the first 50 rows of the training frame.
plt.rcParams['figure.figsize'] = (18, 5)
# The Series is passed via x=: seaborn's first positional parameter is `data`.
sns.countplot(x=train['CREDIT.HISTORY.LENGTH'].head(50))
plt.title('Credit History')
plt.xticks(rotation = 90)
plt.show()
#train['CREDIT.HISTORY.LENGTH'].value_counts()

In [None]:
# Bar chart of the number of inquiries per customer, followed by the raw counts.
# The column is passed via x=/data=: seaborn's first positional parameter is
# `data`, so a positionally-passed Series is not plotted as the x variable in
# current seaborn releases.
sns.countplot(x='NO.OF_INQUIRIES', data=train, palette = 'muted')
plt.title('No. of Inquiries',  fontsize = 30)
plt.show()
train['NO.OF_INQUIRIES'].value_counts()

In [None]:
# Down payment = asset cost minus disbursed amount, on the original amount scale.
# Both columns were overwritten with their log1p values earlier in the notebook,
# so the transform is undone with expm1 before subtracting; subtracting the
# logged columns directly would produce a log-ratio, not an amount.
for frame in (train, test):
    frame['Downpayment'] = np.expm1(frame['asset_cost']) - np.expm1(frame['disbursed_amount'])

In [None]:

# Distribution and box plot of the training Downpayment column, side by side.
downpayment = train['Downpayment']
plt.figure(1)
plt.subplot(121)
sns.distplot(downpayment);
plt.subplot(122)
downpayment.plot.box(figsize=(16, 5))

We can see that the distribution is shifted towards the left, i.e., it is right-skewed. So, let’s take the log transformation to make the distribution closer to normal.



In [None]:
# Natural log of Downpayment for both frames, then the distribution and box
# plot of the transformed training values.
for frame in (train, test):
    frame['Downpayment_log'] = np.log(frame['Downpayment'])

downpayment_log = train['Downpayment_log']
plt.figure(1)
plt.subplot(121)
sns.distplot(downpayment_log);
plt.subplot(122)
downpayment_log.plot.box(figsize=(16, 5));



Now the distribution looks much closer to normal and effect of extreme values has been significantly subsided.

In [None]:
from sklearn.preprocessing import LabelEncoder

# branch_id, manufacturer_id and State_ID are stored as integers but are
# categorical, so cast them to the category dtype in both frames.
categorical_id_cols = ['branch_id', 'manufacturer_id', 'State_ID']
for frame in (train, test):
    for col in categorical_id_cols:
        frame[col] = frame[col].astype('category')

# Re-label the training columns with LabelEncoder. The test columns are left
# as category dtype by this cell.
le = LabelEncoder()
for col in categorical_id_cols:
    train[col] = le.fit_transform(train[col])





In [None]:
# Shapes of both frames before the column drop in the next cell.
train.shape, test.shape

In [None]:
# Drop the columns that are not used as model inputs. Year_of_birth and
# DisbursalMonth were only derived for the training frame, so they are
# dropped from train only.
shared_drop_cols = [
    'supplier_id', 'Current_pincode_ID', 'Date.of.Birth', 'DisbursalDate',
    'Employee_code_ID', 'PRI.DISBURSED.AMOUNT', 'disbursed_amount',
    'PRI.NO.OF.ACCTS', 'SEC.NO.OF.ACCTS', 'SEC.SANCTIONED.AMOUNT',
    'SEC.DISBURSED.AMOUNT', 'VoterID_flag', 'PRI.ACTIVE.ACCTS',
    'branch_id', 'manufacturer_id', 'State_ID',
]
train = train.drop(columns=shared_drop_cols + ['Year_of_birth', 'DisbursalMonth'])
test = test.drop(columns=shared_drop_cols)

In [None]:
# Shapes of both frames after the column drop.
train.shape, test.shape

In [None]:
# Separate the predictors (X) from the target (y).
# The column is named by keyword: the positional-axis form drop(label, 1) is
# no longer accepted by pandas 2.x.
X = train.drop(columns='loan_default')
y = train.loan_default

In [None]:
# Preview the first rows of the predictor frame.
X.head()

In [None]:
# Hold out 30% of the rows as a validation split (x_cv, y_cv); random_state
# fixes the shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
x_train, x_cv, y_train, y_cv = train_test_split(X,y, test_size =0.3, random_state = 0)

In [None]:
# Shapes of the training and validation splits, printed in the order
# x_train, y_train, x_cv, y_cv.
for split_part in (x_train, y_train, x_cv, y_cv):
    print(split_part.shape)

In [None]:
# Logistic regression with class_weight='balanced' to compensate for the
# imbalanced target.
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression(class_weight='balanced')
# Fit on the training split only, so the validation rows (x_cv) stay unseen and
# the printed coefficients belong to the model that is evaluated below.
logreg.fit(x_train, y_train)
print(logreg.coef_)
print(logreg.intercept_)

In [None]:
# Fit the logistic regression on the training split (x_train, y_train).
# Calling fit again replaces any earlier fit of the same estimator.
logreg.fit(x_train, y_train)

In [None]:
# Predict class labels on the validation split (x_cv) and on the training split.
# Note: despite its name, y_log_pred_test holds predictions for x_cv, not for
# the competition test frame.
y_log_pred_test = logreg.predict(x_cv)
y_log_pred_train = logreg.predict(x_train)

In [None]:
# Confusion matrix of the validation labels against the predicted labels
# (first argument: true values, second: predictions).
conf = metrics.confusion_matrix(y_cv, y_log_pred_test)
conf

In [None]:
# Recompute the confusion matrix and unpack its four cells.
# The matrix is indexed [actual, predicted], so flattening it row by row
# yields TN, FP, FN, TP in that order.
confusion = metrics.confusion_matrix(y_cv, y_log_pred_test)
print(confusion)
TN, FP, FN, TP = confusion.ravel()
for tag, count in (("TP", TP), ("TN", TN), ("FN", FN), ("FP", FP)):
    print(tag, count)

In [None]:
# Annotated heatmap of the logistic-regression confusion matrix.
cmap = sns.cubehelix_palette(50, hue=0.05, rot=0, light=0.9, dark=0, as_cmap=True)
predicted_labels = ['predicted_default_NO=0', 'predicted_default_yes=1']
actual_labels = ['actual_default_NO=0', 'actual_default_yes=1']
sns.heatmap(
    conf,
    cmap=cmap,
    xticklabels=predicted_labels,
    yticklabels=actual_labels,
    annot=True,
    fmt="d",
)

In [None]:
# Print the first 15 true and predicted responses of the validation split.
print('True', y_cv.values[0:15])
print('Pred', y_log_pred_test[0:15])

In [None]:
# Accuracy of the predicted labels against the true labels of the validation split.
print('Accuracy_Score:', metrics.accuracy_score(y_cv, y_log_pred_test))

In [None]:
# Classification error on the validation split, computed as 1 - accuracy.

print('Classification Error:',1 - metrics.accuracy_score(y_cv, y_log_pred_test))

In [None]:
# Sensitivity (recall) on the validation split.

print('Sensitivity or Recall:', metrics.recall_score(y_cv, y_log_pred_test))

In [None]:
# Specificity = TN / (TN + FP), using the confusion-matrix cells unpacked earlier.
specificity = TN / (TN + FP)

print(specificity)

In [None]:

# Per-class classification report for the validation split.
from sklearn.metrics import classification_report
print(classification_report(y_cv, y_log_pred_test))

In [None]:
# Show the first 10 predicted class labels for the validation split.
# 1D array (vector) of binary values (0, 1)
logreg.predict(x_cv)[0:10]

In [None]:
# Show the first 10 rows of predicted class-membership probabilities
# (one column per class).
logreg.predict_proba(x_cv)[0:10]

In [None]:
# Show the first 10 predicted probabilities for class 1 (loan_default = 1).
logreg.predict_proba(x_cv)[0:10, 1]

In [None]:
# Store the predicted probabilities for class 1 on the validation split.
y_pred_prob = logreg.predict_proba(x_cv)[:, 1]

In [None]:
# Preview the first 10 stored class-1 probabilities.
y_pred_prob[0:10]

In [None]:
# Histogram of the predicted class-1 probabilities on the validation split.
%matplotlib inline
import matplotlib.pyplot as plt

# 8 bins
plt.hist(y_pred_prob, bins=8)

# x-axis limit from 0 to 1 (probabilities)
plt.xlim(0,1)
plt.title('Histogram of predicted probabilities')
plt.xlabel('Predicted probability of default')
plt.ylabel('Frequency')

The histogram suggests that the predicted probabilities are roughly normally distributed with a tail on the left side, and that most of the probabilities are below 0.5.

<b>Since the threshold probability we use is the same as the default threshold, there is no change in the predicted values.</b>


In [None]:
# ROC curve of the logistic regression on the validation split.
# roc_curve takes the true labels first and the predicted class-1 probabilities
# second; passing hard 0/1 predictions instead would give misleading results
# without raising an error. It returns the false positive rates, the true
# positive rates and the thresholds at which they were computed.
fpr, tpr, thresholds = metrics.roc_curve(y_cv, y_pred_prob)

roc_ax = plt.gca()
roc_ax.plot(fpr, tpr)
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.0])
plt.rcParams['font.size'] = 12
roc_ax.set_title('ROC curve for default classifier')
roc_ax.set_xlabel('False Positive Rate (1 - Specificity)')
roc_ax.set_ylabel('True Positive Rate (Sensitivity)')
roc_ax.grid(True)

In [None]:
# Area under the ROC curve on the validation split.
# IMPORTANT: first argument is true values, second argument is predicted probabilities

print(metrics.roc_auc_score(y_cv, y_pred_prob))

# Random FOREST

In [None]:
# New 70/30 train/validation split for the tree models; a different
# random_state than the logistic-regression split, so the rows differ.
from sklearn.model_selection import train_test_split
x_train, x_cv, y_train, y_cv = train_test_split(X,y, test_size =0.3,random_state =4)

In [None]:
# Handling class imbalance: resample the training split with SMOTETomek.
# Only x_train / y_train are resampled; the validation split is left untouched.
from imblearn.combine import SMOTETomek
# `sampling_strategy` and `fit_resample` are the current imbalanced-learn names
# for the removed `ratio` argument and `fit_sample` method. random_state makes
# the synthetic samples reproducible across runs.
smt = SMOTETomek(sampling_strategy='auto', random_state=42)
X_smt, y_smt = smt.fit_resample(x_train, y_train)

In [None]:
# 10-fold stratified, shuffled cross-validation splitter with a fixed seed;
# reused by the cross-validation cells below.
from sklearn.model_selection import StratifiedKFold
cv =StratifiedKFold(n_splits=10,shuffle=True,random_state=42)

In [None]:
# Random forest: 300 trees, depth capped at 10, fixed seed, single job,
# with progress output enabled (verbose=1).
from sklearn.ensemble import RandomForestClassifier
rf= RandomForestClassifier(max_depth = 10, n_estimators=300,verbose=1, n_jobs=1,random_state=42)

In [None]:
# Fit the forest on the resampled training data. fit returns the estimator
# itself, so `forest` and `rf` refer to the same object.
forest=rf.fit(X_smt,y_smt)

In [None]:
# Accuracy on the resampled training data (a training score, not a validation score).
print(forest.score(X_smt,y_smt))

In [None]:
# Predicted labels for the resampled training data.
pre = forest.predict(X_smt)

In [None]:
# Predicted class probabilities for the resampled training data.
pre_ = forest.predict_proba(X_smt)

In [None]:
# Preview the processed test frame before predicting on it.
test.head()

In [None]:
# Predicted labels for the competition test frame.
pred = forest.predict(test)

In [None]:
# Predicted class probabilities for the competition test frame.
pred_=forest.predict_proba(test)

# Evaluating the Random Forest Model

In [None]:
# Confusion matrix of the forest on the resampled training data
# (y_smt vs. its own training predictions), not on held-out data.
df_confusion_rf = metrics.confusion_matrix(y_smt, pre)
df_confusion_rf

In [None]:
# Annotated heatmap of the random-forest confusion matrix computed above.
cmap = sns.cubehelix_palette(50, hue=0.05, rot=0, light=0.9, dark=0, as_cmap=True)
sns.heatmap(df_confusion_rf, cmap = cmap,xticklabels=['Prediction No','Prediction Yes'],yticklabels=['Actual No','Actual Yes'], annot=True,
            fmt='d')

In [None]:
# Sanity check: the resampled labels and the predictions should have the same shape.
y_smt.shape, pre.shape

In [None]:
# Print the first 15 true and predicted responses of the resampled training data.
print('True', y_smt[0:15])
print('Pred', pre[0:15])

In [None]:
# Accuracy of the forest on the resampled training data (y_smt vs. pre);
# this is a training score, not a test or validation score.
print('Accuracy_Score:', metrics.accuracy_score(y_smt, pre))

In [None]:
# Sensitivity (recall) of the forest on the resampled training data.

print('Sensitivity or Recall:', metrics.recall_score(y_smt, pre))

In [None]:
# Per-class classification report of the forest on the resampled training data.
from sklearn.metrics import classification_report
print(classification_report(y_smt, pre))

In [None]:

# ROC curve and AUC of the random forest on the held-out validation split.
# The curve is built from class-1 probabilities on x_cv / y_cv: hard 0/1
# predictions collapse the curve to a single operating point, and scoring the
# resampled training data would not match the "validation" label in the legend.
from sklearn import metrics
val_prob_rf = forest.predict_proba(x_cv)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_cv, val_prob_rf)
auc = metrics.roc_auc_score(y_cv, val_prob_rf)
plt.figure(figsize=(12,8))
plt.plot(fpr,tpr,label="validation, auc="+str(auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

In [None]:
# Out-of-fold label predictions for the forest, using the 10-fold splitter `cv`
# on the resampled training data.
# NOTE(review): the data was resampled before being split into folds, so
# synthetic samples can appear on both sides of a fold — scores may be optimistic.
from sklearn.model_selection import cross_val_predict
pre_rf = cross_val_predict(rf, cv=cv, X=X_smt,y=y_smt, verbose=1)

In [None]:
# AUC of the out-of-fold predictions. pre_rf holds hard class labels
# (cross_val_predict was called without method='predict_proba'), so this is
# the AUC of a single operating point rather than of a ranking.
from sklearn.metrics import roc_auc_score
print("auc score =\t" ,roc_auc_score(y_smt, pre_rf))

In [None]:
# Display the out-of-fold predictions.
pre_rf

In [None]:

# Refit the forest on the full resampled training data before predicting on test.
rf.fit(X_smt,y_smt)

In [None]:
# Predicted labels for the competition test frame; used for the submission file.
pred_out = rf.predict(test)

In [None]:
# Display the predicted test labels.
pred_out

In [None]:
# Predicted class-1 probabilities for the competition test frame.
prob_out=rf.predict_proba(test)[:,1]

In [None]:
# First 10 test probabilities, plus the shapes of the label and probability arrays.
prob_out[0:10], pred_out.shape, prob_out.shape

In [None]:
# Load the sample submission file from the input directory; its columns are
# overwritten with predictions in the cells below.
submission = pd.read_csv("../input/Submission_LTFS.csv")

In [None]:
submission['loan_default']=pred_out            # fill loan_default with the random-forest predictions
submission['UniqueID']=test['UniqueID'] # fill UniqueID from the test frame (assignment aligns on the index)

In [None]:
# Write the random-forest submission with exactly the two expected columns.
# index=False keeps the row index out of the file, matching how the XGBoost
# submission is written later in the notebook.
pd.DataFrame(submission, columns=['UniqueID','loan_default']).to_csv('submission_rf.csv', index=False)

# XGBoosting

In [None]:
from sklearn import preprocessing

# Label-encode any remaining object columns.
# For a column that is object-typed in both frames, a single encoder is fitted
# on the combined train and test values so that the same category receives the
# same integer code in both frames. Values are cast to str so that columns
# holding mixed types can still be sorted by the encoder.
for f in train.columns:
    if train[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        train_values = list(train[f].astype(str).values)
        shared = f in test.columns and test[f].dtype == 'object'
        test_values = list(test[f].astype(str).values) if shared else []
        lbl.fit(train_values + test_values)
        train[f] = lbl.transform(train_values)
        if shared:
            test[f] = lbl.transform(test_values)

# Object columns still left in test (not object-typed in train, or absent from
# it) are encoded on their own.
for f in test.columns:
    if test[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        test_values = list(test[f].astype(str).values)
        lbl.fit(test_values)
        test[f] = lbl.transform(test_values)

# Replace any remaining missing values with a sentinel.
train.fillna((-999), inplace=True)
test.fillna((-999), inplace=True)

# Convert both frames to float NumPy arrays.
# NOTE: after this cell `train` and `test` are arrays, so column names are no
# longer available on them.
train = np.array(train)
test = np.array(test)
train = train.astype(float)
test = test.astype(float)

 

In [None]:
# Recreate the 70/30 split from X and y; same random_state as the
# random-forest split above.
from sklearn.model_selection import train_test_split
x_train, x_cv, y_train, y_cv = train_test_split(X,y, test_size =0.3,random_state =4)

In [None]:
# Out-of-fold class probabilities for XGBoost, using the 10-fold splitter `cv`
# on the resampled training data.
# Note: `xgb` names the classifier instance (later cells call xgb.fit), not the
# xgboost module, so the module is not imported under that alias here.
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_predict
xgb = XGBClassifier(n_estimators=120, learning_rate=1, n_jobs=-1, random_state=42)
predict = cross_val_predict(xgb, cv=cv, X=X_smt, y=y_smt, verbose=1, method='predict_proba')

In [None]:
# Fit XGBoost on the resampled training data and predict labels for that same data.
boost=xgb.fit(X_smt,y_smt)
pred_xgb = boost.predict(X_smt)


In [None]:
# Predicted labels for the competition test data; used for the XGBoost submission.
pred_xg=boost.predict(test)

In [None]:

# AUC of the fitted XGBoost model on the resampled training data, computed from
# class-1 probabilities rather than hard labels (AUC measures ranking quality).
# This is a training score, not a validation score.
from sklearn.metrics import roc_auc_score
print("auc score =\t", roc_auc_score(y_smt, boost.predict_proba(X_smt)[:, 1]))

In [None]:
# Display the training-data predictions and their shape.
pred_xgb, pred_xgb.shape

In [None]:
# Cross-validated ROC AUC for a fresh XGBoost classifier, using the 10-fold
# splitter `cv` on the resampled training data; prints the mean over folds.
from sklearn.model_selection import cross_val_score
xgb= XGBClassifier(n_estimators=120, learning_rate=1, n_jobs=-1,random_state=42)
scores = cross_val_score(xgb, cv=cv, X=X_smt,y=y_smt, verbose=1,scoring='roc_auc')
print("auc\t=\t",scores.mean())

In [None]:

# Overwrite loan_default with the XGBoost predictions and write the submission
# file without the row index; then preview the first rows.
submission["loan_default"] = pred_xg
submission.to_csv("submission_xgb.csv", index=False)
submission.head()

END