In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score,recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import  RobustScaler

In [None]:
train = pd.read_csv("../input/lt-vehicle-loan-default-prediction/train.csv")
train.head()

In [None]:
train.shape

In [None]:
# Work on an independent copy: later cells mutate `tr` in place (set_index, fillna,
# feature engineering), and a plain alias would silently rewrite `train` as well.
tr = train.copy()

In [None]:
tr.columns

In [None]:
tr.info()

In [None]:
tr.set_index('UniqueID', inplace =True)

In [None]:
tr['loan_default'].value_counts()

In [None]:
test = pd.read_csv('../input/lt-vehicle-loan-default-prediction/test.csv')
test.head()

In [None]:
ts = test.set_index('UniqueID')

In [None]:
ts.head()

In [None]:
ts.columns

In [None]:
tr.isnull().sum()

In [None]:
ts.isnull().sum()

In [None]:
ts.info()

In [None]:
# Bar chart of the train-set target counts (0 = non-default, 1 = default).
default_counts = tr['loan_default'].value_counts()
ax = default_counts.plot.bar()
ax.set_xlabel('Default Proportion')
ax.set_ylabel('customers')
ax.set_title('number of clients')
plt.show()

In [None]:
# Test data: number of customers per employment type.
# The axis label and title now describe the plotted variable; they were previously
# copied from the loan-default chart.
ts['Employment.Type'].value_counts().plot.bar()
plt.xlabel('Employment Type')
plt.ylabel('customers')
plt.title('number of clients by employment type (test data)')
plt.show()

In [None]:
ts['MobileNo_Avl_Flag'].count()

In [None]:
train['Employment.Type'].value_counts()

In [None]:
ts['Employment.Type'].value_counts()

In [None]:
# Fill only the column this analysis treats as having missing values.
# A frame-wide fillna('NAN') would put a string into any numeric column that happened
# to contain a NaN and silently turn that whole column into object dtype.
tr['Employment.Type'] = tr['Employment.Type'].fillna('NAN')
ts['Employment.Type'] = ts['Employment.Type'].fillna('NAN')

In [None]:
tr['Employment.Type'].value_counts(normalize=True)

In [None]:
ts['Employment.Type'].value_counts(normalize=True)

Thus the missing values amount to 3.28% of the train data and 3.06% of the test data for the employment type. The data shows that the income source type has not been recorded for these customers, and these people can be at a high risk of default if they do not have actual employment. Since we do not have any information about the employment type of these people and they constitute only a small part of the dataset, we label the missing values as 'NAN' so that they can be treated as a separate category (or dropped) in the further analysis, and check whether we can still get significant results.

In [None]:
#Creating function for checking the correlation between variables
def correlationplot(data,width):
    """Draw the correlation matrix of the numeric columns of ``data`` as a heat map.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to analyse. Non-numeric columns (strings, dates) are ignored.
    width : float
        Width and height of the square figure, in inches.
    """
    # Restrict to numeric columns explicitly: DataFrame.corr() raises on string
    # columns in pandas >= 2.0 instead of silently dropping them.
    corr = data.select_dtypes(include='number').corr()

    # Draw on the Axes we just created rather than on "figure number 1", which may
    # be a different, already existing figure.
    fig, ax = plt.subplots(figsize=(width, width), dpi=80, facecolor='w', edgecolor='black')
    image = ax.matshow(corr)

    ticks = list(range(len(corr.columns)))
    ax.set_xticks(ticks)
    ax.set_xticklabels(corr.columns, rotation=90)
    ax.set_yticks(ticks)
    ax.set_yticklabels(corr.columns)
    ax.xaxis.tick_bottom()

    fig.colorbar(image, ax=ax)
    ax.set_title('Correlation Matrix')
    plt.show()

In [None]:
# Correlate numeric columns only: DataFrame.corr() raises on string columns in pandas >= 2.0.
train.select_dtypes(include='number').corr()

In [None]:
#Creating function for checking the relation between variables using histogram

def histogramplot(data, no_of_rows):
    """Plot a histogram of every numeric column of ``data`` on one grid figure.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric columns are plotted; non-numeric columns are skipped.
    no_of_rows : int
        Number of rows in the subplot grid. The number of grid columns is derived
        from it so that every numeric column gets its own panel.
    """
    numeric = data.select_dtypes(include='number')
    n_plots = numeric.shape[1]
    if n_plots == 0 or no_of_rows < 1:
        return  # nothing to draw

    n_cols = math.ceil(n_plots / no_of_rows)
    fig, axes = plt.subplots(no_of_rows, n_cols,
                             figsize=(4 * n_cols, 3 * no_of_rows),
                             squeeze=False)
    panels = axes.ravel()

    for ax, col in zip(panels, numeric.columns):
        ax.hist(numeric[col].dropna(), bins=40)
        ax.set_title(col)
        ax.set_ylabel('counts')
        ax.tick_params(axis='x', labelrotation=90)

    # Hide the unused panels of the last grid row(s).
    for ax in panels[n_plots:]:
        ax.set_visible(False)

    fig.tight_layout()
    plt.show()


histogramplot(tr,8)

In [None]:
tr.reset_index(inplace=True)

In [None]:
tr.head()

In [None]:
def print_all_values():
    """Scatter-plot every non-object column of the train frame ``tr`` against ``disbursed_amount``.

    One figure is shown per column. ``disbursed_amount`` itself is skipped, since
    plotting it against itself carries no information.
    """
    x_values = tr['disbursed_amount']
    # Pick the columns by name instead of copying the whole frame with drop().
    y_columns = [col for col in tr.columns
                 if col != 'disbursed_amount' and tr[col].dtypes != 'object']

    for col in y_columns:
        fig, ax = plt.subplots()
        # Pass a 1-D Series (not a one-column DataFrame) to scatter.
        ax.scatter(x_values, tr[col], alpha=1)
        ax.set(title='Comparison of features with disbursed amount',
               xlabel='disbursed_amount', ylabel=col)
        plt.show()


print_all_values()

The above graphs are scatter plots showing how the different features vary with the disbursed amount for the train data. They mainly help to check which ranges of disbursed_amount are more prone to default, and the LTV on the respective loans.

###### Checking the pattern of the different variables w.r.t. UniqueID

In [None]:
def hist_all_values():
    """Show a 40-bin histogram for every non-object column of the train frame ``tr``.

    ``UniqueID`` is skipped: it is a row identifier, not a feature.
    """
    columns = [col for col in tr.columns
               if col != 'UniqueID' and tr[col].dtypes != 'object']

    for col in columns:
        # Create the figure once and draw into it. Calling plt.figure() and then
        # DataFrame.hist() opens two figures, one of which stays empty.
        fig, ax = plt.subplots(figsize=(12, 8))
        tr[col].hist(ax=ax, grid=True, bins=40)
        ax.set_title(col)
        ax.set_ylabel('counts')
        ax.tick_params(axis='x', labelrotation=90)
        fig.tight_layout(pad=1.0, w_pad=1.0, h_pad=1.0)
        plt.show()


hist_all_values()

The histograms above visualize the basic distribution of every feature in the train data, to understand the customers' general background, e.g. whether they are from the same county, have the same LTV, or already have a loan default history.

#### Visualizing the test data

In [None]:
ts.reset_index(inplace=True)

In [None]:
ts.head()

In [None]:
def print_test_values():
    """Scatter-plot every non-object column of the test frame ``ts`` against ``disbursed_amount``.

    One figure is shown per column. ``disbursed_amount`` itself is skipped, since
    plotting it against itself carries no information.
    """
    x_values = ts['disbursed_amount']
    # Pick the columns by name instead of copying the whole frame with drop().
    y_columns = [col for col in ts.columns
                 if col != 'disbursed_amount' and ts[col].dtypes != 'object']

    for col in y_columns:
        fig, ax = plt.subplots()
        # Pass a 1-D Series (not a one-column DataFrame) to scatter.
        ax.scatter(x_values, ts[col], alpha=1)
        ax.set(title='comparision of disbusred amount vs other features',
               xlabel='disbursed_amount', ylabel=col)
        plt.show()


print_test_values()

The above graphs are scatter plots showing how the different features vary with the disbursed amount for the test data. They mainly help to check which ranges of disbursed_amount are more prone to default, and the LTV on the respective loans.

In [None]:
def hist_test_values():
    """Show a 40-bin histogram for every non-object column of the test frame ``ts``.

    ``UniqueID`` is skipped: it is a row identifier, not a feature.
    """
    columns = [col for col in ts.columns
               if col != 'UniqueID' and ts[col].dtypes != 'object']

    for col in columns:
        # Create the figure once and draw into it. Calling plt.figure() and then
        # DataFrame.hist() opens two figures, one of which stays empty.
        fig, ax = plt.subplots(figsize=(12, 8))
        ts[col].hist(ax=ax, grid=True, bins=40)
        ax.set_title(col)
        ax.set_ylabel('counts')
        ax.tick_params(axis='x', labelrotation=90)
        fig.tight_layout(pad=1.0, w_pad=1.0, h_pad=1.0)
        plt.show()


hist_test_values()

The histograms above visualize the basic distribution of every feature in the test data, to understand the customers' general background, e.g. whether they are from the same county, have the same LTV, or already have a loan default history.

From the scatter plots and histograms we can easily see that most of the loans were disbursed around November 2018, and that there are already some delinquent accounts, which means some customers have already defaulted on loans.

The anomaly is that the risk of loan default would be expected to increase with the age of the person, but in the data provided there are loan defaults by people as young as those born in 1993.


There is no way to compare the related quantities directly, as the probability of default does not depend on a single factor; it varies with the many factors that act as features of the customer profile.

In [None]:
correlationplot(tr,8)

In [None]:
correlationplot(ts,8)

Missing values are shown by the white lines.

From the above correlation matrix we can observe that the loan default is highly correlated with number of inquiries and number of overdue accounts.

Another question that can be asked is whether the disbursed amount should also be highly related to the chance of loan default. Although it does affect the chances, the correlation matrix clearly shows that accounts with a previous default history are risky, and loans should be provided to them with more caution.

In [None]:
tr.head()

In [None]:
ts.head()

In [None]:
print(tr.shape)
print(ts.shape)

In [None]:
tr.boxplot(column='disbursed_amount', by='loan_default')

The above boxplot reflects that most loan defaults are reported for disbursed loan amounts under $2,00,000.

In [None]:
tr.boxplot(column='disbursed_amount', by='NO.OF_INQUIRIES')
ts.boxplot(column='disbursed_amount', by='NO.OF_INQUIRIES')

The above boxplots are made to check the number of inquiries of customers for both the train and test datasets.

In [None]:
tr.head()

In [None]:
# creating a function to split the credit risk into risk grade and risk type
def credit_risk(tr):
    """Split score descriptions of the form ``"<grade>-<risk type>"`` into two lists.

    Parameters
    ----------
    tr : iterable of str
        Score descriptions. Values without a hyphen are treated as having no grade.

    Returns
    -------
    tuple of (list, list)
        ``(risk_types, risk_grades)``, both in input order. For ``"A-Very Low Risk"``
        the risk type is ``"Very Low Risk"`` and the grade is ``"A"``. For a value
        without a hyphen the risk type is the whole value and the grade is
        ``'unknown'``.
    """
    risk_types = []
    risk_grades = []
    for description in tr:
        pieces = description.split("-")
        if len(pieces) != 1:
            # Grade letter comes first; only the piece right after it is kept.
            grade, risk_type = pieces[0], pieces[1]
        else:
            grade, risk_type = 'unknown', pieces[0]
        risk_types.append(risk_type)
        risk_grades.append(grade)

    return risk_types, risk_grades

In [None]:
def calc_number_of_ids(row):
    """Return the sum of the five identity-document flags of ``row``.

    ``row`` must support selection by a list of labels (e.g. a pandas Series).
    """
    id_flag_columns = ['Aadhar_flag', 'PAN_flag', 'VoterID_flag', 'Driving_flag',
       'Passport_flag']
    total = 0
    for flag in row[id_flag_columns]:
        total = total + flag
    return total
def check_pri_installment(row):
    """Return the row's ``'PRIMARY.INSTAL.AMT'``, with amounts of 1 or less replaced by 0."""
    amount = row['PRIMARY.INSTAL.AMT']
    return 0 if amount <= 1 else amount

In [None]:
# Map each score description to an ordinal rating: every "no history" / "not scored"
# description maps to -1, and the scored ones run from 0 (Very High Risk) up to
# 4 (Very Low Risk).

risk_map = {'No Bureau History Available':-1, 
              'Not Scored: No Activity seen on the customer (Inactive)':-1,
              'Not Scored: Sufficient History Not Available':-1,
              'Not Scored: No Updates available in last 36 months':-1,
              'Not Scored: Only a Guarantor':-1,
              'Not Scored: More than 50 active Accounts found':-1,
              'Not Scored: Not Enough Info available on the customer':-1,
              'Very Low Risk':4,
              'Low Risk':3,
              'Medium Risk':2, 
              'High Risk':1,
              'Very High Risk':0}

# Map the risk-grade letter to an ordinal value that decreases from A (13) to M (1).
# Per the original author's note, A is the least risky grade and risk increases along
# the alphabet. 'unknown' (assigned by credit_risk when the description has no grade
# prefix) maps to -1.
sub_risk = {'unknown':-1, 'I':5, 'L':2, 'A':13, 'D':10, 'M':1, 'B':12, 'C':11, 'E':9, 'H':6, 'F':8, 'K':3,
       'G':7, 'J':4}

# Encode the employment type as a number; 'NAN' is the placeholder that the fillna
# step writes for a missing employment type.

employment_map = {'Self employed':0, 'Salaried':1, 'NAN':-1}


In [None]:
def features_engineering(df, reference_date=None, target_column='loan_default'):
    """Add the derived modelling features to ``df`` in place and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Loan frame with the raw columns read below ('Date.of.Birth',
        'DisbursalDate', 'AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH',
        'PERFORM_CNS.SCORE.DESCRIPTION', the PRI.* / SEC.* account columns, ...).
    reference_date : datetime-like, optional
        Date against which ages and elapsed months are measured. Defaults to the
        current time (the original behaviour). Pass a fixed date so the features
        are the same on every run.
    target_column : str, optional
        Column excluded from the ``number_of_0`` count when present, so that the
        label cannot leak into the feature.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    now = pd.Timestamp('now') if reference_date is None else pd.Timestamp(reference_date)

    # Average Gregorian year / month length in seconds. Plain arithmetic is used
    # because recent pandas rejects the ambiguous 'Y' and 'M' timedelta units.
    seconds_per_year = 365.2425 * 24 * 3600
    seconds_per_month = seconds_per_year / 12

    def _duration_to_months(text):
        """Convert a '<years>yrs <months>mon' style string to a number of months."""
        numbers = re.findall(r'\d+', text)
        return int(numbers[0]) * 12 + int(numbers[1])

    # --- Age from date of birth -------------------------------------------------
    dob = pd.to_datetime(df['Date.of.Birth'], format="%d-%m-%y")
    # '%y' maps two-digit years 00-68 to 20xx, so e.g. '64' is parsed as 2064.
    # A birth date in the future is impossible, so move those back one century
    # instead of discarding them.
    dob = dob.where(dob <= now, dob - pd.DateOffset(years=100))
    df['Date.of.Birth'] = dob
    df['Age'] = ((now - dob).dt.total_seconds() / seconds_per_year).astype(int)

    # Safety net kept from the original: any remaining non-positive age is
    # replaced by the mean of the valid ages.
    valid_age = df.loc[df['Age'] > 0, 'Age']
    if valid_age.empty:
        df['age'] = df['Age']
    else:
        df['age'] = df['Age'].where(df['Age'] > 0, int(valid_age.mean()))

    # --- Months elapsed since disbursal -----------------------------------------
    df['DisbursalDate'] = pd.to_datetime(df['DisbursalDate'], format="%d-%m-%y")
    df['disbursal_months_passed'] = (
        (now - df['DisbursalDate']).dt.total_seconds() / seconds_per_month
    ).astype(int)

    # --- Duration strings to months ---------------------------------------------
    df['average_act_age_in_months'] = df['AVERAGE.ACCT.AGE'].apply(_duration_to_months)
    df['credit_history_length_in_months'] = df['CREDIT.HISTORY.LENGTH'].apply(_duration_to_months)

    # --- Number of zero-valued cells per row ------------------------------------
    # The target is excluded: it exists only in the training frame, so counting it
    # would leak the label and make the feature inconsistent between train and test.
    zero_candidates = df.drop(columns=[target_column], errors='ignore')
    df['number_of_0'] = (zero_candidates == 0).astype(int).sum(axis=1)

    # --- Split the bureau score description into risk type and grade ------------
    risk_type, risk_grade = credit_risk(df["PERFORM_CNS.SCORE.DESCRIPTION"])
    df['credit_risk'] = risk_type
    df['credit_risk_grade'] = risk_grade

    # --- Loan-to-asset ratio at disbursement ------------------------------------
    df['loan_to_asset_ratio'] = df['disbursed_amount'] / df['asset_cost']

    # --- Primary + secondary account aggregates ---------------------------------
    df['no_of_accts'] = df['PRI.NO.OF.ACCTS'] + df['SEC.NO.OF.ACCTS']
    df['pri_inactive_accts'] = df['PRI.NO.OF.ACCTS'] - df['PRI.ACTIVE.ACCTS']
    df['sec_inactive_accts'] = df['SEC.NO.OF.ACCTS'] - df['SEC.ACTIVE.ACCTS']
    df['tot_inactive_accts'] = df['pri_inactive_accts'] + df['sec_inactive_accts']
    df['tot_overdue_accts'] = df['PRI.OVERDUE.ACCTS'] + df['SEC.OVERDUE.ACCTS']
    df['tot_current_balance'] = df['PRI.CURRENT.BALANCE'] + df['SEC.CURRENT.BALANCE']
    df['tot_sanctioned_amount'] = df['PRI.SANCTIONED.AMOUNT'] + df['SEC.SANCTIONED.AMOUNT']
    df['tot_disbursed_amount'] = df['PRI.DISBURSED.AMOUNT'] + df['SEC.DISBURSED.AMOUNT']
    df['tot_installment'] = df['PRIMARY.INSTAL.AMT'] + df['SEC.INSTAL.AMT']

    # --- Ratios; the "+1" terms avoid division by zero ---------------------------
    df['bal_disburse_ratio'] = np.round((1 + df['tot_disbursed_amount']) / (1 + df['tot_current_balance']), 2)
    df['pri_tenure'] = (df['PRI.DISBURSED.AMOUNT'] / (df['PRIMARY.INSTAL.AMT'] + 1)).astype(int)
    df['sec_tenure'] = (df['SEC.DISBURSED.AMOUNT'] / (df['SEC.INSTAL.AMT'] + 1)).astype(int)
    df['disburse_to_sactioned_ratio'] = np.round((df['tot_disbursed_amount'] + 1) / (1 + df['tot_sanctioned_amount']), 2)
    # NOTE(review): despite its name this divides the TOTAL number of accounts
    # (not the active ones) by the inactive ones; kept unchanged.
    df['active_to_inactive_act_ratio'] = np.round((df['no_of_accts'] + 1) / (1 + df['tot_inactive_accts']), 2)
    return df


In [None]:
# adding features for the credit risk and sub risk for which we have described numbers and grades above  
def label_data(df):
    """Add the numeric ``credit_risk_label`` and ``sub_risk_label`` columns to ``df``.

    Each value of 'credit_risk' is looked up in ``risk_map`` and each value of
    'credit_risk_grade' in ``sub_risk``; a value missing from its map raises
    ``KeyError``. ``df`` is modified in place and returned.
    """
    label_sources = (
        ('credit_risk_label', 'credit_risk', risk_map),
        ('sub_risk_label', 'credit_risk_grade', sub_risk),
    )
    for label_column, source_column, mapping in label_sources:
        df.loc[:, label_column] = df[source_column].apply(mapping.__getitem__)
    return df

In [None]:
def data_correction(df):
    """Clean the raw balance, employment and installment columns of ``df`` in place.

    * Negative 'PRI.CURRENT.BALANCE' / 'SEC.CURRENT.BALANCE' values are set to 0.
    * 'employment_label' is the numeric code of 'Employment.Type' taken from
      ``employment_map``; an unmapped value raises ``KeyError``.
    * 'new_pri_installment' is 'PRIMARY.INSTAL.AMT' with amounts of 1 or less
      replaced by 0 (the same rule as ``check_pri_installment``).

    Returns the same ``df`` object.
    """
    # Negative outstanding balances are treated as "nothing outstanding".
    for balance_column in ('PRI.CURRENT.BALANCE', 'SEC.CURRENT.BALANCE'):
        df[balance_column] = df[balance_column].clip(lower=0)

    df['employment_label'] = df['Employment.Type'].apply(lambda x: employment_map[x])

    # Vectorised form of check_pri_installment. A row-wise df.apply(axis=1) builds a
    # Series for every row and is far slower on a large frame. mask() leaves NaN
    # untouched, exactly like the scalar rule.
    installment = df['PRIMARY.INSTAL.AMT']
    df['new_pri_installment'] = installment.mask(installment <= 1, 0)
    return df

In [None]:
def new_data(df):
    """Run the full preparation pipeline on ``df`` and return the result.

    The steps are applied in order: data correction, feature engineering, labelling.
    """
    pipeline = (data_correction, features_engineering, label_data)
    for step in pipeline:
        df = step(df)
    return df

In [None]:
train_data = new_data(tr)
train_data = train_data[train_data['number_of_0']<=25]
test_data = new_data(ts)


In [None]:
train_data[train_data['number_of_0']>=20]['number_of_0'].value_counts()

In [None]:
train_data.columns


In [None]:
features = ['disbursed_amount', 'asset_cost',
            'Aadhar_flag', 'PAN_flag',
       'PERFORM_CNS.SCORE',
             'PRI.ACTIVE.ACCTS',
       'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT',
       'PRI.DISBURSED.AMOUNT',  'SEC.ACTIVE.ACCTS',
       'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT',
       'SEC.DISBURSED.AMOUNT',  'SEC.INSTAL.AMT',
       'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
            'NO.OF_INQUIRIES','disbursal_months_passed',
       'average_act_age_in_months', 'credit_history_length_in_months',
       'number_of_0','loan_to_asset_ratio', 'no_of_accts', 'pri_inactive_accts',
       'sec_inactive_accts', 'tot_inactive_accts', 'tot_overdue_accts',
       'tot_current_balance', 'tot_sanctioned_amount', 'tot_disbursed_amount',
       'tot_installment', 'bal_disburse_ratio', 'pri_tenure', 'sec_tenure',
       'credit_risk_label',
       'employment_label', 'age', 'new_pri_installment'
           ]

In [None]:
print(train_data.shape)
print(test_data.shape)


In [None]:
# RobustScaler is less prone to outliers.
rob_scaler = RobustScaler()

scaled_training = train_data.copy()
scaled_testing = test_data.copy()

# Learn the scaling parameters from the training data only, then apply the same
# transformation to the test data. Re-fitting on the test set would put the two sets
# on different scales, so a model trained on one could not be applied to the other.
scaled_training[features] = rob_scaler.fit_transform(scaled_training[features])
scaled_testing[features] = rob_scaler.transform(scaled_testing[features])

y = scaled_training.loan_default
X = scaled_training[features]


In [None]:

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27,stratify=y)
print(X_train.shape, y_train.shape)
print(X_test.shape,y_test.shape)


In [None]:
# Random Forest testing.
# A fixed random_state makes the fitted forest, and every metric reported for it
# below, reproducible across kernel restarts.
rfc = RandomForestClassifier(random_state=27)
rfc.fit(X_train, y_train)

In [None]:
rfc_pre= rfc.predict(X_test)

In [None]:
accuracy_score(y_test, rfc_pre)

In [None]:
print(confusion_matrix(y_test, rfc_pre))
print(classification_report(y_test, rfc_pre))

In [None]:
# Re-create the SAME split the random forest was trained on. With a different
# test_size / random_state the new X_test would contain rows the forest has already
# seen, which inflates every later rfc evaluation on X_test (e.g. the ROC curve) and
# makes the comparison between the two models unfair.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=27, stratify=y)
print(X_train.shape, y_train.shape)

In [None]:
# Testing Logistic Regression
logreg= LogisticRegression()

In [None]:
logreg.fit(X_train, y_train)

In [None]:
y_pred = logreg.predict(X_test)

In [None]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

#### Thus, as per the RFC, the accuracy of the model comes to 86%, which is better than the accuracy of 78% obtained from logistic regression.

In [None]:
from sklearn.metrics import roc_curve

# ROC curve of the random forest on the held-out split.
rfc_pre_prob = rfc.predict_proba(X_test)[:,1]
fpr, tpr , thresholds = roc_curve(y_test, rfc_pre_prob)
rfc_auc = roc_auc_score(y_test, rfc_pre_prob)

plt.plot([0,1],[0,1],'k--', label='Chance')
# The curve is the random forest's, so label it as such and show the legend.
plt.plot(fpr,tpr, label=f'Random Forest (AUC = {rfc_auc:.3f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('RFC ROC Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
logreg.predict_proba(X_test)[:,1]

In [None]:
# Trying to use K fold

In [None]:
# Cross-check the RFC result with 5-fold cross-validation (cross_val_score).
# No hyper-parameter grid search is performed here.
from sklearn.model_selection import cross_val_score
cv_results = cross_val_score(rfc,X,y,cv=5)

In [None]:
cv_results

In [None]:
np.mean(cv_results)

In [None]:
#Knn method
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(2)
knn.fit(X_train,y_train)
y_pred_knn= knn.predict(X_test)
print(confusion_matrix(y_test, y_pred_knn))
print(classification_report(y_test,y_pred_knn))

 Thus RFC is the better model for prediction, with an accuracy of 85.27%.