## Predicting Loan Default


Please upvote if you like this kernel.

In [None]:

# General Libraries
import pandas as pd
import numpy as np
from itertools import product
import warnings
from tqdm import tqdm

# Plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocessing, modelling & evaluation
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score,  accuracy_score, precision_recall_fscore_support
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

# Language Processing
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

warnings.filterwarnings('ignore')
sns.set_style('darkgrid')
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
sns.set_style('whitegrid')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import describe,skew

In [None]:
#I will be using only train data for this project
train_data = pd.read_csv('/kaggle/input/lt-vehicle-loan-default-prediction/train.csv')

In [None]:
 train_data.head()

In [None]:
train_data.shape

In [None]:
train_data.columns

In [None]:
train_data.isnull().sum() #missing values only in employment type

In [None]:
#finding unique data
train_data.apply(lambda x: len(x.unique()))

In [None]:
train_data['Employment.Type'].value_counts()

In [None]:
print('Percentage of missing values is {0}%'.format(round(100*train_data['Employment.Type'].isnull().sum()/len(train_data),3)))

In [None]:
#Replacing null values in Employment.Type to 'unknown'
# Only this column is filled: a frame-wide fillna would also turn any NaN that
# later shows up in a numeric column into the string 'unknown'.
train_data['Employment.Type'] = train_data['Employment.Type'].fillna('unknown')
train_data['Employment.Type'].value_counts()

In [None]:
train_data.isnull().sum() #Great!, no missing values

In [None]:
#opening the description of file and reading it
# The context manager closes the file handle as soon as the text has been read.
with open("/kaggle/input/lt-vehicle-loan-default-prediction/data_dictionary.csv", "r") as file:
    print(file.read())

In [None]:
train_data.loan_default.value_counts()

In [None]:
train_data.loan_default.value_counts().plot(kind = 'bar')

In [None]:
#Fraction of loan default
# Share of rows flagged loan_default == 1. Counting the flag directly does not
# depend on defaults being the smaller class, unlike taking the minimum class count.
fraud_frac = (train_data.loan_default == 1).mean()
print("Fraction of loan default: {}".format("%.3f" % fraud_frac))

In [None]:
#Percentage way of representing loan default 
print('Percentage of loan default is {0}%'.format(round(100*train_data.loan_default.value_counts().min()/len(train_data), 3)))

In [None]:
train_data.nunique()

In [None]:
print(train_data['manufacturer_id'].unique())

In [None]:
train_data.dtypes.value_counts()

In [None]:
train_data.info()

In [None]:
train_data.describe().T

In [None]:
print(train_data['PERFORM_CNS.SCORE'].unique())

In [None]:
print(train_data['PERFORM_CNS.SCORE.DESCRIPTION'].unique())

In [None]:
#converting categorical features into numerical features
# Ordinal risk code per bureau description: 0 = no usable score,
# 1 = very low risk ... 5 = very high risk.
bureau_risk_codes = {
    'No Bureau History Available': 0,
    'Not Scored: Not Enough Info available on the customer': 0,
    'Not Scored: No Activity seen on the customer (Inactive)': 0,
    'Not Scored: Sufficient History Not Available': 0,
    'Not Scored: No Updates available in last 36 months': 0,
    'Not Scored: Only a Guarantor': 0,
    'Not Scored: More than 50 active Accounts found': 0,
    'M-Very High Risk': 5,
    'L-Very High Risk': 5,
    'K-High Risk': 4,
    'J-High Risk': 4,
    'I-Medium Risk': 3,
    'H-Medium Risk': 3,
    'G-Low Risk': 2,
    'F-Low Risk': 2,
    'E-Low Risk': 2,
    'D-Very Low Risk': 1,
    'C-Very Low Risk': 1,
    'B-Very Low Risk': 1,
    'A-Very Low Risk': 1,
}
train_data['PERFORM_CNS.SCORE.DESCRIPTION'] = (
    train_data['PERFORM_CNS.SCORE.DESCRIPTION'].replace(bureau_risk_codes)
)



In [None]:
#Renaming the column to make it more clear 
train_data.rename(columns={'PERFORM_CNS.SCORE.DESCRIPTION': 'Bureau_description'}, inplace=True)

In [None]:
print (train_data['Employment.Type'].unique())

In [None]:
train_data.columns

In [None]:
# Setting up time marker

d_marker= '08-11-19'
def days_between(d1, d2, assume_past=True):
    """Return the absolute number of days between two 'dd-mm-yy' date strings.

    ``%y`` maps the two-digit years 00-68 to 2000-2068, so a birth date in
    1964 is parsed as 2064. With ``assume_past`` enabled, ``d2`` is treated as
    the reference date and a ``d1`` that parses later than ``d2`` is moved back
    one century before the difference is taken.

    Parameters
    ----------
    d1, d2 : str
        Dates formatted as day-month-two-digit-year, e.g. '08-11-19'.
    assume_past : bool, default True
        Roll ``d1`` back 100 years when it parses after ``d2``. Pass False to
        get the plain difference of the two parsed dates.

    Returns
    -------
    int
        Non-negative number of days between the two dates.

    Raises
    ------
    ValueError
        If either string does not match the '%d-%m-%y' format.
    """
    # Imported locally so this definition does not depend on an import made in a later cell.
    from datetime import datetime

    d1 = datetime.strptime(d1, "%d-%m-%y")
    d2 = datetime.strptime(d2, "%d-%m-%y")
    if assume_past and d1 > d2:
        try:
            d1 = d1.replace(year=d1.year - 100)
        except ValueError:
            # 29 February has no counterpart in a non-leap target year (2000 -> 1900).
            d1 = d1.replace(year=d1.year - 100, day=28)
    return abs((d2 - d1).days)

In [None]:
import datetime
# The next import rebinds the name `datetime` to the class, which is the object
# days_between calls `.strptime` on.
from datetime import datetime
# Age in years as of d_marker: days elapsed divided by 365 (leap days are ignored).
# NOTE: the column is overwritten in place, so re-running this cell would feed the
# already-converted numbers back into days_between.
train_data['Date.of.Birth'] = train_data['Date.of.Birth'].apply(lambda x:  days_between(str(x),d_marker)/365)
# Time in years between the disbursal date and d_marker (same in-place overwrite).
train_data['DisbursalDate']= train_data['DisbursalDate'].apply(lambda x:  days_between(str(x),d_marker)/365)


In [None]:
train_data.dtypes #AVG Acc age and credit history length is still in 'object'

In [None]:
import re


def _duration_in_months(text):
    """Convert a string holding two integers (years first, months second) into total months."""
    numbers = re.findall(r'\d+', text)
    return int(numbers[0]) * 12 + int(numbers[1])


# Derive a numeric month count from each of the two textual duration columns.
for source_col, target_col in [('AVERAGE.ACCT.AGE', 'average_act_age_in_months'),
                               ('CREDIT.HISTORY.LENGTH', 'credit_history_length_in_months')]:
    train_data[target_col] = train_data[source_col].apply(_duration_in_months)
   

In [None]:
train_data.drop(columns=['AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH'], inplace=True)

In [None]:
#to find correlation
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps the call working on pandas versions where DataFrame.corr() no longer
# drops text columns (such as Employment.Type) on its own.
plt.figure(figsize=(50,50))
sns.heatmap(train_data.select_dtypes(include=['number', 'bool']).corr(), annot=True, linewidths=3, linecolor='yellow', vmin= -1, vmax=1, cmap='bwr')
#not so clear redrawn

In [None]:
# Correlation matrix of the numeric columns only; text columns cannot be correlated
# and make DataFrame.corr() raise on recent pandas versions.
train_data_corr = train_data.select_dtypes(include=['number', 'bool']).corr()
train_data_corr

In [None]:
train_data.drop(columns=['MobileNo_Avl_Flag'], inplace = True)

In [None]:
#plot correlation

# Use a large canvas so the annotated cells of the wide matrix stay legible.
fig, ax = plt.subplots(figsize=(50,30))

# Generate a mask for the upper triangle (taken from seaborn example gallery).
# The builtin `bool` is used because the `np.bool` alias no longer exists in current NumPy.
mask = np.zeros_like(train_data_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Plot the heatmap with seaborn.
# Assign the matplotlib axis the function returns. This will let us resize the labels.
ax = sns.heatmap(train_data_corr, mask=mask, ax=ax, annot= True, cmap='bwr')

# Resize the labels.
ax.set_xticklabels(ax.xaxis.get_ticklabels(), fontsize=20)
ax.set_yticklabels(ax.yaxis.get_ticklabels(), fontsize=20)

# If you put plt.show() at the bottom, it prevents those useless printouts from matplotlib.
plt.show()

In [None]:
train_data.dtypes

**Data Visualisations**

In [None]:
import seaborn as sns
df = train_data[train_data['average_act_age_in_months']<175]
sns.lineplot(x=df['average_act_age_in_months'],y=df['loan_default'])

In [None]:
# plt.subplots returns (figure, axes); unpack both and draw on the axes explicitly.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(x='asset_cost', y='loan_default', data=train_data, alpha = 0.3, ax=ax)

In [None]:
# plt.subplots returns (figure, axes); unpack both and draw on the axes explicitly.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(x='PRI.CURRENT.BALANCE', y='loan_default', data=train_data, alpha = 0.3, ax=ax)


In [None]:
f, ax = plt.subplots(figsize=(10,8))
x = train_data['Date.of.Birth']
plt.xlabel('Age')
ax = sns.distplot(x, bins=10, color='blue')
ax.set_title("Distribution of age variable")

plt.show()

In [None]:
train_data['Bureau_description'].value_counts()

In [None]:
# plotting a donut chart for visualizing 'loan_default','Driving_flag', 'Bureau_description', 'Passport_flag'

columns = ['loan_default','Driving_flag', 'Bureau_description', 'Passport_flag']

# One panel per plotted column, so no empty panel is left over.
fig, axes = plt.subplots(1, len(columns), figsize=(20,20))

for ax, column in zip(axes, columns):
    size = train_data[column].value_counts()
    # Ask the palette for one colour per category so wedges do not share a colour
    # when a column has more categories than a short fixed colour list.
    colors = sns.color_palette('pastel', n_colors=len(size))

    wedges, _, _ = ax.pie(size, colors = colors, shadow = True, autopct = '%.2f%%')
    # A white disc drawn over the centre turns the pie into a donut.
    ax.add_artist(plt.Circle((0, 0), 0.7, color = 'white'))
    ax.set_title('Distribution of {}'.format(column), fontsize = 15)
    # Map each wedge to its category value so every panel can be read on its own.
    ax.legend(wedges, size.index, loc = 'lower center', ncol = len(size))
plt.show()

In [None]:
dummies_Emp_t = pd.get_dummies(train_data['Employment.Type'])

In [None]:
dummies_Emp_t.head()

**Feature Engineering**

By using accounts data (primary or secondary accs), let's do feature engineering

In [None]:
def features_engineering(df, target_col='loan_default'):
    """Add account-level aggregate and ratio features to ``df`` in place.

    Primary and secondary account columns are combined into totals, inactive
    account counts, tenure estimates and ratios. The frame is modified in
    place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the PRI.* / SEC.* account columns, 'PRIMARY.INSTAL.AMT',
        'SEC.INSTAL.AMT', 'disbursed_amount' and 'asset_cost'.
    target_col : str, default 'loan_default'
        Label column left out of the per-row zero count, so the label cannot
        leak into the 'number_of_0' feature. Ignored if not present.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns added.
    """
    print('new_columns')
    df['no_of_accts'] = df['PRI.NO.OF.ACCTS'] + df['SEC.NO.OF.ACCTS']
    df['pri_inactive_accts'] = df['PRI.NO.OF.ACCTS'] - df['PRI.ACTIVE.ACCTS']
    df['sec_inactive_accts'] = df['SEC.NO.OF.ACCTS'] - df['SEC.ACTIVE.ACCTS']
    df['total_inactive_accts'] = df['pri_inactive_accts'] + df['sec_inactive_accts']
    df['total_overdue_accts'] = df['PRI.OVERDUE.ACCTS'] + df['SEC.OVERDUE.ACCTS']
    df['total_current_balance'] = df['PRI.CURRENT.BALANCE'] + df['SEC.CURRENT.BALANCE']
    df['total_sanctioned_amount'] = df['PRI.SANCTIONED.AMOUNT'] + df['SEC.SANCTIONED.AMOUNT']
    df['total_disbursed_amount'] = df['PRI.DISBURSED.AMOUNT'] + df['SEC.DISBURSED.AMOUNT']
    df['total_installment'] = df['PRIMARY.INSTAL.AMT'] + df['SEC.INSTAL.AMT']

    # Count zeros per row over the columns that exist at this point on a first
    # call. The target is excluded (otherwise the label is encoded in the
    # feature), and so are the columns this function creates from here on,
    # so that calling the function again on the same frame gives the same count.
    not_counted = {
        target_col, 'number_of_0', 'loan_to_asset_ratio', 'pri_tenure', 'sec_tenure',
        'disburse_to_sactioned_ratio', 'active_to_inactive_act_ratio', 'bal_disburse_ratio',
    }
    counted_cols = [col for col in df.columns if col not in not_counted]
    df['number_of_0'] = (df[counted_cols] == 0).astype(int).sum(axis=1)

    df['loan_to_asset_ratio'] = df['disbursed_amount'] /df['asset_cost']
    # +1 in the denominators below avoids division by zero when an amount is 0.
    df['pri_tenure'] = (df['PRI.DISBURSED.AMOUNT']/( df['PRIMARY.INSTAL.AMT']+1)).astype(int)
    df['sec_tenure'] = (df['SEC.DISBURSED.AMOUNT']/(df['SEC.INSTAL.AMT']+1)).astype(int)
    df['disburse_to_sactioned_ratio'] =  np.round((df['total_disbursed_amount']+1)/(1+df['total_sanctioned_amount']),2)
    # Despite the name, the numerator is the total number of accounts.
    df['active_to_inactive_act_ratio'] =  np.round((df['no_of_accts']+1)/(1+df['total_inactive_accts']),2)
    df['bal_disburse_ratio'] = np.round((1+df['total_disbursed_amount'])/(1+df['total_current_balance']),2)
    print('done')
    return df


In [None]:
features_engineering(train_data)

In [None]:
features_subset = train_data[['disbursed_amount', 'asset_cost', 'Date.of.Birth', 'Employment.Type', 'DisbursalDate',
                             'Aadhar_flag', 'PAN_flag', 'VoterID_flag', 'Driving_flag',
                             'Passport_flag', 'PERFORM_CNS.SCORE', 'Bureau_description',
                             'PRI.DISBURSED.AMOUNT', 'SEC.DISBURSED.AMOUNT',
                             'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT', 'NEW.ACCTS.IN.LAST.SIX.MONTHS',
                             'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'NO.OF_INQUIRIES',
                             'loan_default', 'average_act_age_in_months',
                             'credit_history_length_in_months', 'no_of_accts', 'pri_inactive_accts',
                             'sec_inactive_accts', 'total_inactive_accts', 'total_overdue_accts',
                             'total_current_balance', 'total_sanctioned_amount',
                             'total_disbursed_amount', 'total_installment',
                             'loan_to_asset_ratio', 'pri_tenure', 'sec_tenure']]
    

In [None]:
X_subset = pd.concat([features_subset, dummies_Emp_t], axis=1).drop('Employment.Type', axis=1)


In [None]:
X_subset.shape


Creating X and y

In [None]:
y = X_subset['loan_default']
X = X_subset.drop('loan_default', axis =1)

In [None]:
#**find columns with low variance!**
# A column is flagged when its most frequent value covers more than 99% of the rows.
low_var_columns = [
    column
    for column in X.columns
    if X[column].value_counts(normalize=True).max() > 0.99
]
low_var_columns

In [None]:
X.columns

In [None]:
#Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Modelling

In [None]:
max(y.mean(), 1 - y.mean())

Baseline -- 78.3%

## Classification Model

## Logistic Regression

In [None]:
lr= LogisticRegression()
print(lr)

In [None]:
lr.fit(X_train, y_train)
## Predict
pred_train_lr = lr.predict(X_train)
pred_test_lr = lr.predict(X_test)

### Train data accuracy
from sklearn.metrics import accuracy_score
print("Accuracy on train is:", accuracy_score(y_train, pred_train_lr))
      
### Test data accuracy
print("Accuracy on test is:", accuracy_score(y_test, pred_test_lr))


## PCA

In [None]:
from sklearn.decomposition import PCA

pca_columns = ['p1','p2','p3','p4','p5','p6','p7']

# Fit the projection on the training split only and reuse that fitted projection
# for the test split. Refitting on the test rows would give components that are
# not comparable with the training ones. X itself is left untouched so the cell
# can be re-run and earlier cells keep seeing the original features.
pca = PCA(n_components=len(pca_columns)).fit(X_train)
train_df = pd.DataFrame(pca.transform(X_train), columns = pca_columns, index = X_train.index)
test_df = pd.DataFrame(pca.transform(X_test), columns = pca_columns, index = X_test.index)

#Plotting the Cumulative Summation of the Explained Variance
# explained_variance_ratio_ holds fractions; multiply by 100 to match the '%' axis label,
# and number the components from 1 so the x axis reads as a component count.
component_numbers = np.arange(1, len(pca_columns) + 1)
plt.figure(figsize=(15,5))
plt.plot(component_numbers, np.cumsum(pca.explained_variance_ratio_) * 100)
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Dataset Explained Variance')
plt.show()

In [None]:
def train_model(model):
    """Fit ``model`` on the training split and print test-set metrics.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Accuracy, recall, F1 and the confusion matrix are computed from the hard
    class predictions. ROC AUC is computed from continuous scores
    (``predict_proba`` for the positive class, else ``decision_function``),
    because AUC measures how well the scores rank the two classes; hard labels
    are used only if the model exposes neither method.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``

    Returns
    -------
    The fitted estimator (whatever ``model.fit`` returns).
    """
    # Checking accuracy
    model = model.fit(X_train, y_train)
    pred = model.predict(X_test)

    if hasattr(model, 'predict_proba'):
        # Second column is the probability of the positive class in a binary problem.
        scores = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        scores = model.decision_function(X_test)
    else:
        scores = pred

    print('accuracy_score',accuracy_score(y_test, pred))
    print('recall_score',recall_score(y_test, pred))
    print('f1_score',f1_score(y_test, pred))
    print('roc_auc_score',roc_auc_score(y_test, scores))
    # confusion matrix
    print('confusion_matrix')
    print(pd.DataFrame(confusion_matrix(y_test, pred)))
    return model

Random Forest Classifier

In [None]:
# train model random forest
rfc = RandomForestClassifier()
rfc = train_model(rfc)

KNN Classifier

In [None]:
#train model KNN
knn = KNeighborsClassifier()
knn = train_model(knn)


Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc = train_model(dtc)

**Naive Bayes**

In [None]:
from sklearn.naive_bayes import GaussianNB

NB = GaussianNB()

NB = train_model(NB)

Adaboost Classifier

In [None]:
# import modules as necessary
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Create adaboost-decision tree classifer object
Adaboost = AdaBoostClassifier()
Adaboost = train_model(Adaboost)

Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
GBM = GradientBoostingClassifier()
GBM = train_model (GBM)

I will use Random Forest model for feature importance

I define a few functions to make analysis more convenient and presentable

In [None]:
# udfs ----

# function for creating a feature importance dataframe
def imp_df(column_names, importances):
    """Build a feature-importance table ordered from most to least important.

    Parameters
    ----------
    column_names : sequence of feature names.
    importances : sequence of importance values, aligned with ``column_names``.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' and 'feature_importance', sorted by importance in
        descending order with a fresh 0..n-1 index.
    """
    table = pd.DataFrame({'feature': column_names,
                          'feature_importance': importances})
    ranked = table.sort_values(by = 'feature_importance', ascending = False)
    return ranked.reset_index(drop = True)

# plotting a feature importance dataframe (horizontal barchart)
def var_imp_plot(imp_df, title):
    """Draw a horizontal bar chart of feature importances.

    Parameters
    ----------
    imp_df : pandas.DataFrame
        Two columns: feature name first, importance second. The column names
        may differ; they are normalised on a copy, so the caller's frame is
        left unchanged.
    title : str
        Title placed above the chart.
    """
    # Rename on a copy: assigning to `.columns` of the argument would silently
    # rename the columns of the frame the caller still holds.
    plot_data = imp_df.copy()
    plot_data.columns = ['feature', 'feature_importance']
    sns.barplot(x = 'feature_importance', y = 'feature', data = plot_data, orient = 'h', color = 'royalblue') \
       .set_title(title, fontsize = 20)

**Benchmark Model**

I train a plain Random Forest model to have a benchmark. I set a random_state to ensure results comparability. I also use bootstrap and set oob_score = True to later use the out-of-bag error.

Briefly, each tree in the random forest is trained on a different dataset, sampled with replacement from the original data. This results in around ~2/3 of distinct observations in each training set. The out-of-bag error is calculated on all the observations, but for calculating each row's error the model only considers trees which have not seen this row during training. This is similar to evaluating the model on a validation(test) set.

In [None]:
rfc_gs = RandomForestClassifier(n_estimators = 100,
                           n_jobs = -1,
                           oob_score = True,
                           bootstrap = True,
                           random_state = 42)
rfc_gs.fit(X_train, y_train)

In [None]:
# A classifier's .score() is mean accuracy, not R^2, so the figures are labelled as accuracy.
print('Training Accuracy: {:.2f} \nOOB Score: {:.2f} \nTesting Accuracy: {:.2f}'.format(rfc_gs.score(X_train, y_train), 
                                                                                    rfc_gs.oob_score_,
                                                                                    rfc_gs.score(X_test, y_test)))

Well, there is no overfitting in the model, as it performs well on OOB sample and on the test set which is good. Let's move forward to feature importances (measured on the training set performance). Some of the approaches can also be used for testing/OOB sets, to gain further interpretability on the unseen data.

**Overall feature importances**

By overall feature importances I mean the ones derived at model level, i.e., saying that in a given model these features are most important in explaining the target variable.

**Default Scikit-learn's feature importances**

Let's start with decision trees to build some intuition. In decision trees, every node is a condition on how to split values in a single feature, so that similar values of the dependent variable end up in the same set after the split. The condition is based on impurity, which in the case of classification problems is Gini impurity / information gain (entropy), while for regression trees it is variance. So when training a tree we can compute how much each feature contributes to decreasing the weighted impurity. feature_importances_ in Scikit-Learn is based on that logic, but in the case of Random Forest we are talking about averaging the decrease in impurity over trees.

Pros:

- fast calculation
- easy to retrieve - one command

Cons:

biased approach, as it has a tendency to inflate the importance of continuous features or high-cardinality categorical variables

In [None]:
base_imp = imp_df(X_train.columns, rfc_gs.feature_importances_)
base_imp

In [None]:
fig = plt.figure(figsize=(15,20))
var_imp_plot(base_imp, 'Default feature importance (scikit-learn)')

In [None]:
pip install rfpimp

In [None]:
from sklearn.metrics import r2_score
from rfpimp import permutation_importances
from rfpimp import plot_corr_heatmap
viz = plot_corr_heatmap(X_train, figsize=(15,10))
viz.view()

**Permutation feature importance**

This approach directly measures feature importance by observing how random re-shuffling (thus preserving the distribution of the variable) of each predictor influences model performance.

The approach can be described in the following steps:

1. Train the baseline model and record the score (accuracy/R^2/any metric of importance) by passing the validation set (or the OOB set in case of Random Forest). This can also be done on the training set, at the cost of sacrificing information about generalisation.
2. Re-shuffle values from one feature in the selected dataset, pass the dataset to the model again to obtain predictions and calculate the metric for this modified dataset. The feature importance is the difference between the benchmark score and the one from the modified (permuted) dataset.
3. Repeat step 2 for all features in the dataset.

Pros:

- applicable to any model
- reasonably efficient
- reliable technique
- no need to retrain the model at each modification of the dataset

Cons:

- more computationally expensive than default feature_importances
- permutation importance overestimates the importance of correlated predictors - Strobl et al (2008)

As for the second problem with this method, I have already plotted the correlation matrix above. However, I will use a function from one of the libraries I use to visualise Spearman's correlations. The difference from standard Pearson's correlation is that this one first transforms variables into ranks and only then runs Pearson's correlation on the ranks.

Spearman's correlation:

- is nonparametric
- does not assume a linear relationship between variables
- looks for monotonic relationships

In [None]:
from rfpimp import plot_corr_heatmap
viz = plot_corr_heatmap(X_train, figsize=(15,10))
viz.view()

In [None]:
def r2(rf, X_train, y_train):
    """Metric callback: R^2 of ``rf``'s predictions on the data it is given.

    The model passed in as ``rf`` is the one scored, rather than a
    notebook-level model, so the callback works for any fitted estimator.

    Parameters
    ----------
    rf : fitted estimator exposing ``predict``.
    X_train : feature matrix to predict on.
    y_train : true target values aligned with ``X_train``.

    Returns
    -------
    The value returned by ``r2_score(y_train, rf.predict(X_train))``.
    """
    return r2_score(y_train, rf.predict(X_train))

perm_imp_rfpimp = permutation_importances(rfc_gs, X_train, y_train, r2)
perm_imp_rfpimp.reset_index(drop = False, inplace = True)

In [None]:
fig = plt.figure(figsize=(15,20))
var_imp_plot(perm_imp_rfpimp, 'Permutation feature importance (rfpimp)')


**Conclusions**

 Top 6 important predictors of loan default are  loan to asset ratio, disbursed amount, disbursal date, date of birth, asset cost, perform CNS score(credit score).