In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import basic libraries for exploratory data analysis, visualization and cleaning.

In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Import the data to a Pandas DataFrame

In [None]:
# Load the raw loan records from the Kaggle input directory into a DataFrame.
loan_csv_path = '/kaggle/input/loan-data/loan_data.csv'
df1 = pd.read_csv(loan_csv_path)

Explore DataFrame

In [None]:
# Dimensions of the raw data as a (rows, columns) tuple.
df1.shape

In [None]:
# Summarise every column: name, non-null count and dtype.
# A non-null count lower than the number of rows indicates missing values.
df1.info()

No apparent missing values. All data types appear appropriate given the variable descriptions. The `purpose` column is categorical, so it must be encoded as k-1 dummy variables (one category dropped as the baseline) to avoid introducing perfect multicollinearity.

In [None]:
# Categorical columns that need one-hot encoding before modelling.
categ_list = ['purpose']
# Encode each categorical as k-1 indicator columns (drop_first=True) so the
# dummies are not perfectly collinear with the model intercept.
# dtype=int keeps the indicators as 0/1 integers on every pandas version:
# pandas >= 2.0 otherwise returns bool columns, which DataFrame.describe()
# leaves out of its default numeric-only summary.
df2 = pd.get_dummies(df1, columns=categ_list, drop_first=True, dtype=int)
# Confirm the dummy columns were created and check their dtypes.
df2.info()

In [None]:
# Summary statistics showing the location and range of values for each variable.
df2.describe()

Explore final dataset before deciding best approach for modelling.

In [None]:
# Pairwise Pearson correlation coefficients (the default method of
# DataFrame.corr) between the columns of df2.
df2.corr()

In [None]:
# Boolean mask used for subsetting: True where the loan was not fully paid.
default = df2['not.fully.paid'].eq(1)

Plot a histogram of each variable, split by whether or not the loan meets the default condition.

In [None]:
#plot installment value according to default
df2[default==True]['installment'].hist(alpha=0.65, color='red', bins=25, label = 'Defaulted')
df2[default==False]['installment'].hist(alpha=0.65, color='green', bins=25, label = 'Performing')
plt.xlabel('Installment')
plt.ylabel('Frequency')
plt.legend()

In [None]:
#plot interest rate according to default
df2[default==True]['int.rate'].hist(alpha=0.65, color='red', bins=25, label = 'Defaulted')
df2[default==False]['int.rate'].hist(alpha=0.65, color='green', bins=25, label = 'Performing')
plt.xlabel('Interest Rate')
plt.ylabel('Frequency')
plt.legend()

In [None]:
#plot log of annual income according to default
df2[default==True]['log.annual.inc'].hist(alpha=0.65, color='red', bins=25, label = 'Defaulted')
df2[default==False]['log.annual.inc'].hist(alpha=0.65, color='green', bins=25, label = 'Performing')
plt.xlabel('Log of Annual Income')
plt.ylabel('Frequency')
plt.legend()

In [None]:
#plot debt-to-income ratio according to default
df2[default==True]['dti'].hist(alpha=0.65, color='red', bins=25, label = 'Defaulted')
df2[default==False]['dti'].hist(alpha=0.65, color='green', bins=25, label = 'Performing')
plt.xlabel('Debt-to-Income Ratio')
plt.ylabel('Frequency')
plt.legend()

In [None]:
#plot FICO score according to default
df2[default==True]['fico'].hist(alpha=0.65, color='red', bins=25, label = 'Defaulted')
df2[default==False]['fico'].hist(alpha=0.65, color='green', bins=25, label = 'Performing')
plt.xlabel('FICO Score')
plt.ylabel('Frequency')
plt.legend()

In [None]:
#plot Days with Credit Line according to default
df2[default==True]['days.with.cr.line'].hist(alpha=0.65, color='red', bins=25, label = 'Defaulted')
df2[default==False]['days.with.cr.line'].hist(alpha=0.65, color='green', bins=25, label = 'Performing')
plt.xlabel('Days with Credit Line')
plt.ylabel('Frequency')
plt.legend()

In [None]:
#plot Revolving Balance Utilisation Rate according to default
df2[default==True]['revol.util'].hist(alpha=0.65, color='red', bins=25, label = 'Defaulted')
df2[default==False]['revol.util'].hist(alpha=0.65, color='green', bins=25, label = 'Performing')
plt.xlabel('Revolving Balance Utilisation Rate')
plt.ylabel('Frequency')
plt.legend()

In [None]:
#plot No. of credit enquires in last 6 months according to default
df2[default==True]['inq.last.6mths'].hist(alpha=0.65, color='red', bins=25, label = 'Defaulted')
df2[default==False]['inq.last.6mths'].hist(alpha=0.65, color='green', bins=25, label = 'Performing')
plt.xlabel('No. of credit enquires in last 6 months')
plt.ylabel('Frequency')
plt.legend()

In [None]:
#plot No. of defaults in past 2 years according to default
df2[default==True]['delinq.2yrs'].hist(alpha=0.65, color='red', bins=5, label = 'Defaulted')
df2[default==False]['delinq.2yrs'].hist(alpha=0.65, color='green', bins=5, label = 'Performing')
plt.xlabel('No. of defaults in past 2 years')
plt.ylabel('Frequency')
plt.legend()

In [None]:
#plot No. of derogatory public records according to default
df2[default==True]['pub.rec'].hist(alpha=0.65, color='red', bins=25, label = 'Defaulted')
df2[default==False]['pub.rec'].hist(alpha=0.65, color='green', bins=25, label = 'Performing')
plt.xlabel('No. of derogatory public records')
plt.ylabel('Frequency')
plt.legend()

In [None]:
# Number of loans per purpose: all loans vs. those not fully paid.
# Both aggregates come from a single groupby pass.
purpose_counts = df1.groupby('purpose')['not.fully.paid'].agg(['count', 'sum'])

fig, ax = plt.subplots()
# Draw the taller "All" bars first so the shorter "Defaulted" bars sit on top
# of them instead of being washed out underneath.
purpose_counts['count'].plot(kind='bar', ax=ax, color='blue', alpha=0.65, label='All')
purpose_counts['sum'].plot(kind='bar', ax=ax, color='red', alpha=0.65, label='Defaulted')
ax.set_xlabel('Loan Purposes')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()

In [None]:
#Import modelling packages
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Separate the target column from the predictor columns.
target_column = "not.fully.paid"
y = df2[target_column]
X = df2.drop(columns=[target_column])

# Hold out 30% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=66,
    stratify=y,
)

In [None]:
# Standardise the features, then fit a logistic regression.
# The predictors are presumably on very different scales (e.g. an interest
# rate vs. a count of days), which makes the solver slow or unable to converge
# within its default 100 iterations. Scaling inside a pipeline fixes that and
# fits the scaler on the training rows only, so nothing leaks from the test set.
# max_iter is raised as extra headroom for convergence.
logreg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

# train/fit the model (scaler and classifier together)
logreg.fit(X_train, y_train)

# generate class predictions for the held-out test set
y_pred = logreg.predict(X_test)

In [None]:
# Evaluate: mean accuracy of the fitted model on the held-out test set.
logreg.score(X_test, y_test)

In [None]:
# Cross-tabulate actual vs. predicted classes (rows: actual, columns: predicted).
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

In [None]:
# Per-class precision, recall and F1 score on the test set.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Take the second column of predict_proba as the score for the positive class
# (presumably not.fully.paid == 1, assuming the labels are 0/1), then compute
# the area under the ROC curve from those scores.
class_probabilities = logreg.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]
roc_auc_score(y_test, y_pred_prob)

In [None]:
# False-positive rate and true-positive rate at each decision threshold,
# computed from the predicted probabilities; used to draw the ROC curve.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

In [None]:
# Plot the ROC curve, with a chance-level diagonal for reference.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="Logistic Regression")
ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="Chance")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("LogReg ROC Curve")
# Without a legend the label passed to plot() is never displayed.
ax.legend()
plt.show()