# Import Libraries

In [None]:
#Import libraries
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Dataframe Preparation

In [None]:
#Load the loan dataset and preview the first rows to check the columns
#NOTE(review): absolute Windows path - adjust when running on another machine
loan_csv_path = 'G:\\Financial Analytics\\Assignment 3\\loan_a.csv'
loan = pd.read_csv(loan_csv_path)
loan.head()

In [None]:
#Parse the two date columns (read in as object) into datetime64 values
#NOTE(review): no explicit format is given, so pandas infers it - confirm the
#day/month order and any two-digit years in the CSV are parsed as intended
for date_column in ('Date.of.Birth', 'DisbursalDate'):
    loan[date_column] = pd.to_datetime(loan[date_column])

In [None]:
#Count missing values in every column
loan.isnull().sum()

In [None]:
#Drop rows whose Employment.Type is missing and renumber the index from 0
loan = loan.dropna(subset=['Employment.Type']).reset_index(drop=True)

In [None]:
#Rename columns so that every name follows the same snake_case convention
column_renames = {
    'UniqueID': 'unique_id',
    'Current_pincode_ID': 'current_pincode_id',
    'Date.of.Birth': 'date_of_birth',
    'Employment.Type': 'employment_type',
    'DisbursalDate': 'disbursal_date',
    'State_ID': 'state_id',
}
loan_final = loan.rename(columns=column_renames)
loan_final.columns

In [None]:
#Age of the borrower (in years, 2 decimals) on the day the loan was disbursed
#The day difference is divided by a fixed 365-day year, so leap days are ignored
days_lived_at_disbursal = (loan_final['disbursal_date'] - loan_final['date_of_birth']).dt.days
loan_final['age_at_disbursal'] = (days_lived_at_disbursal / 365).round(2)

# Exploratory Data Analysis

In [None]:
#Plot the distribution of borrower age at disbursal using 20 bins
#NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm
#the installed version still provides it before upgrading
sns.distplot(a=loan_final['age_at_disbursal'], bins=20)

In [None]:
#Scatter plot: age at loan disbursal (x) against asset cost (y)
sns.scatterplot(x='age_at_disbursal', y='asset_cost', data=loan_final);

In [None]:
#Number of loans per loan_default value (non-defaulters vs defaulters)
loan_final.groupby('loan_default')['unique_id'].count()

In [None]:
#Age distribution of non-defaulters (left panel) and defaulters (right panel)
default_flag = loan_final['loan_default']
no_default = loan_final.loc[default_flag == 0].reset_index(drop=True)
default = loan_final.loc[default_flag == 1].reset_index(drop=True)

f, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

#One (subset, title) pair per panel, in left-to-right order
age_panels = (
    (no_default, 'Age Distribution of Non-Defaulters'),
    (default, 'Age Distribution of Defaulters'),
)
for panel_ax, (subset, panel_title) in zip(axes, age_panels):
    age_plot = sns.distplot(subset['age_at_disbursal'], bins=20, ax=panel_ax)
    age_plot.set_title(panel_title)

In [None]:
#Number of borrowers per state_id, largest first, shown as a bar plot
state_id_count = (
    loan_final.groupby('state_id')['unique_id']
    .count()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)
sns.barplot(data=state_id_count, x='state_id', y='count', color='blue');

In [None]:
#Share of defaulters and non-defaulters within each state_id
#Step 1: loans per (state_id, loan_default) pair in long format
loan_default_dist = (
    loan_final.groupby(['state_id', 'loan_default'])['unique_id']
    .count()
    .reset_index()
)
loan_default_dist.columns = ['state_id', 'default_status', 'count']

#Step 2: one row per state, one column per default status; missing pairs become 0
loan_default_pivot = (
    loan_default_dist
    .pivot(index='state_id', columns='default_status', values='count')
    .reset_index()
    .fillna(0.0)
)
loan_default_pivot.columns = ['state_id', 'non_default', 'default']

#Step 3: turn the counts into ratios of the state's total
defaults_in_state = loan_default_pivot['default']
loans_in_state = defaults_in_state + loan_default_pivot['non_default']
loan_default_pivot['ratio_default'] = defaults_in_state / loans_in_state
loan_default_pivot['ratio_no_default'] = 1 - loan_default_pivot['ratio_default']

#Step 4: back to long format so both ratios can be drawn side by side per state
to_plot_def = loan_default_pivot[['state_id', 'ratio_default', 'ratio_no_default']]
to_plot_def_melt = to_plot_def.melt(id_vars='state_id', var_name='status', value_name='default_status_ratio')
sns.barplot(data=to_plot_def_melt, x='state_id', y='default_status_ratio', hue='status');

In [None]:
#Loans per employment_type, split by default status, shown as grouped bars
emp_type_default_dist = (
    loan_final.groupby(['employment_type', 'loan_default'])['unique_id']
    .count()
    .reset_index(name='count')
    .rename(columns={'loan_default': 'default_status'})
)
sns.barplot(data=emp_type_default_dist, x='employment_type', y='count', hue='default_status');

In [None]:
#Distribution of Defaulters by disbursed_amount and asset_cost
#Bucket both amounts into 20 intervals each
for amount_column in ('disbursed_amount', 'asset_cost'):
    loan_final[amount_column + '_range'] = pd.cut(loan_final[amount_column], bins=20)

#Loans per (disbursed amount bucket, default status)
disb_amt_default_range_dist = (
    loan_final.groupby(['disbursed_amount_range', 'loan_default'])['unique_id']
    .count()
    .reset_index()
)
disb_amt_default_range_dist.columns = ['disbursed_amount_range', 'default_status', 'count']

#Loans per (asset cost bucket, default status)
asset_amt_default_range_dist = (
    loan_final.groupby(['asset_cost_range', 'loan_default'])['unique_id']
    .count()
    .reset_index()
)
asset_amt_default_range_dist.columns = ['asset_cost_range', 'default_status', 'count']

#2x2 grid: one row per amount, distribution on the left and bucket counts on the right
f, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 8))
plt.subplots_adjust(wspace=0.5, hspace=1.1)

#Rotate the x tick labels of every panel before anything is drawn on it
for ax in axes.ravel():
    plt.sca(ax)
    plt.xticks(rotation=90)

(disb_hist_ax, disb_bar_ax), (asset_hist_ax, asset_bar_ax) = axes

disb_hist = sns.distplot(loan_final['disbursed_amount'], bins=20, ax=disb_hist_ax)
disb_hist.set_title('Distribution of disbursed_amount_range')

disb_bars = sns.barplot(data=disb_amt_default_range_dist, x='disbursed_amount_range', y='count', hue='default_status', ax=disb_bar_ax)
disb_bars.set_title('Bar Plot of disbursed_amount categories')

asset_hist = sns.distplot(loan_final['asset_cost'], bins=20, ax=asset_hist_ax)
asset_hist.set_title('Distribution of asset_cost range')

asset_bars = sns.barplot(data=asset_amt_default_range_dist, x='asset_cost_range', y='count', hue='default_status', ax=asset_bar_ax)
asset_bars.set_title('Bar Plot of asset_cost categories')

# Generating Feature and Target Dataframes

In [None]:
#Build the feature matrix X and the target vector Y
#NOTE(review): the *_id columns are kept as-is (presumably numeric codes) -
#confirm that treating them as plain numeric features is intended
model_columns = [
    'disbursed_amount',
    'asset_cost',
    'ltv',
    'branch_id',
    'supplier_id',
    'manufacturer_id',
    'current_pincode_id',
    'employment_type',
    'state_id',
    'age_at_disbursal',
    'loan_default',
]
model_frame = loan_final.loc[:, model_columns]

#Mark employment_type as categorical so get_dummies one-hot encodes it
model_frame['employment_type'] = model_frame['employment_type'].astype('category')
model_frame = pd.get_dummies(model_frame)

#Everything except the target is a feature
Y = model_frame['loan_default']
X = model_frame.drop(columns=['loan_default'])

# Model Generation and Validation : KNN Classifier

In [None]:
#Model the dataframe using KNN Classifier
def KNNClassifier(X, Y, random_state, test_size, neighbors):
    """Fit a k-nearest-neighbours classifier on a train/test split and score it.

    Parameters
    ----------
    X : features, forwarded to ``train_test_split``.
    Y : target labels aligned with ``X``.
    random_state : seed forwarded to ``train_test_split``.
    test_size : hold-out size forwarded to ``train_test_split``.
    neighbors : value used for ``n_neighbors`` of ``KNeighborsClassifier``.

    Returns
    -------
    tuple
        ``(train_score, test_score, con_matrix)``: accuracy on the training
        part, accuracy on the hold-out part, and the confusion matrix of the
        hold-out predictions.
    """
    # Imported locally because accuracy_score is not among the names this
    # notebook imports at the top.
    from sklearn.metrics import accuracy_score

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, random_state=random_state, test_size=test_size
    )
    knn = KNeighborsClassifier(n_neighbors=neighbors)
    knn.fit(X_train, Y_train)

    # Predict the hold-out set once and reuse the predictions for both the
    # accuracy and the confusion matrix. knn.score(X_test, Y_test) would run
    # the neighbour search on X_test a second time to produce the same
    # accuracy value.
    Y_pred = knn.predict(X_test)

    train_score = knn.score(X_train, Y_train)
    test_score = accuracy_score(Y_test, Y_pred)
    con_matrix = confusion_matrix(Y_test, Y_pred)

    return train_score, test_score, con_matrix

In [None]:
#KNN Model with random_state=9, test_size=0.2, neighbors=3
tr_score, te_score, conf_mat = KNNClassifier(
    X, Y, random_state=9, test_size=0.2, neighbors=3
)
print("Training Score : {}".format(tr_score))
print("Test Score : {}".format(te_score))
print("Confusion Matrix : {}".format(conf_mat))

In [None]:
#KNN Model with random_state=9, test_size=0.2, neighbors=5
tr_score, te_score, conf_mat = KNNClassifier(
    X, Y, random_state=9, test_size=0.2, neighbors=5
)
print("Training Score : {}".format(tr_score))
print("Test Score : {}".format(te_score))
print("Confusion Matrix : {}".format(conf_mat))

In [None]:
#KNN Model with random_state=9, test_size=0.2, neighbors=7
tr_score, te_score, conf_mat = KNNClassifier(
    X, Y, random_state=9, test_size=0.2, neighbors=7
)
print("Training Score : {}".format(tr_score))
print("Test Score : {}".format(te_score))
print("Confusion Matrix : {}".format(conf_mat))

In [None]:
#KNN Model with random_state=9, test_size=0.2, neighbors=9
tr_score, te_score, conf_mat = KNNClassifier(
    X, Y, random_state=9, test_size=0.2, neighbors=9
)
print("Training Score : {}".format(tr_score))
print("Test Score : {}".format(te_score))
print("Confusion Matrix : {}".format(conf_mat))

In [None]:
#KNN Model with random_state=9, test_size=0.2, neighbors=11
tr_score, te_score, conf_mat = KNNClassifier(
    X, Y, random_state=9, test_size=0.2, neighbors=11
)
print("Training Score : {}".format(tr_score))
print("Test Score : {}".format(te_score))
print("Confusion Matrix : {}".format(conf_mat))

# Model Generation and Validation : Logistic Regression

In [None]:
#Model the dataframe using Logistic Regression
def Logistic_Regression(X, Y, random_state, test_size):
    """Fit a logistic regression on a train/test split and score it.

    Parameters
    ----------
    X : features, forwarded to ``train_test_split``.
    Y : target labels aligned with ``X``.
    random_state : seed forwarded to ``train_test_split``.
    test_size : hold-out size forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(train_score, test_score, con_matrix)``: the model's score on the
        training part, its score on the hold-out part, and the confusion
        matrix of the hold-out predictions.
    """
    split_parts = train_test_split(X, Y, random_state=random_state, test_size=test_size)
    features_train, features_test, target_train, target_test = split_parts

    #The solver is named explicitly to suppress scikit-learn's FutureWarning
    #Ref: https://machinelearningmastery.com/how-to-fix-futurewarning-messages-in-scikit-learn/
    classifier = LogisticRegression(solver='liblinear')
    classifier.fit(features_train, target_train)

    predicted_test = classifier.predict(features_test)

    return (
        classifier.score(features_train, target_train),
        classifier.score(features_test, target_test),
        confusion_matrix(target_test, predicted_test),
    )

In [None]:
#Logistic Regression Model with random_state=9, test_size=0.2
tr_score, te_score, conf_mat = Logistic_Regression(
    X, Y, random_state=9, test_size=0.2
)
print("Training Score : {}".format(tr_score))
print("Test Score : {}".format(te_score))
print("Confusion Matrix : {}".format(conf_mat))