In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
sns.set_style('whitegrid')
import warnings
import gc
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

In [None]:
# Load the loan dataset. low_memory=False makes pandas parse the file in one pass
# instead of in chunks, which avoids mixed-dtype inference within a column.
df = pd.read_csv('../input/loan-final/loans_full_schema.csv', low_memory=False)

In [None]:
# Preview the first five rows of the loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()  # the author's run reported 55 columns and over 10000 rows

# Exploratory Data Analysis

In [None]:
# Left panel: number of loans per loan status. Right panel: histogram of loan amounts.
f, axes = plt.subplots(1, 2, figsize=(15,5))
sns.countplot(x='loan_status', data=df, ax=axes[0])
# sns.distplot is deprecated (since seaborn 0.11) and scheduled for removal;
# histplot draws the same plain histogram (kde=False keeps the density curve off).
sns.histplot(data=df, x='loan_amount', bins=40, kde=False, ax=axes[1])
sns.despine()
axes[0].set(xlabel='Status', ylabel='')
axes[0].set_title('Count of Loan Status', size=20)
axes[1].set(xlabel='Loan Amount', ylabel='')
axes[1].set_title('Loan Amount Distribution', size=20);

In [None]:
# In the loan amount distribution we can see spikes at even ten-thousand-dollar amounts,
# which indicates that certain amounts are effectively standard loan sizes.

This is an imbalanced-classification problem: we have far more loans that have not been fully paid than loans that have.
We can expect to do very well in terms of accuracy, but precision and recall are the metrics we will actually have to use to evaluate the model.


In [None]:
# Spread of loan_amount within each loan_status category.
sns.boxplot(data=df, y='loan_amount', x='loan_status')
sns.despine()

In [None]:
# Left panel: loans per grade, split by loan status. Right panel: loans per sub grade.
grade_order = sorted(df['grade'].unique())
sub_grade_order = sorted(df['sub_grade'].unique())

f, axes = plt.subplots(1, 2, figsize=(15,5), gridspec_kw={'width_ratios': [1, 2]})
sns.countplot(x='grade', hue='loan_status', data=df, order=grade_order, palette='seismic', ax=axes[0])
sns.countplot(x='sub_grade', data=df, palette='seismic', order=sub_grade_order, ax=axes[1])
sns.despine()
axes[0].set(xlabel='Grade', ylabel='Count')
axes[0].set_title('Count of Loan Status per Grade', size=20)
axes[1].set(xlabel='Sub Grade', ylabel='Count')
# The right panel has no loan_status hue, so its title must not promise a per-status breakdown.
axes[1].set_title('Count of Loans per Sub Grade', size=20)
plt.tight_layout()

Here we can see the counts per grade and per sub grade. The sub grades shown in blue are the good ones, and those shown in red are either late or charged off.

In [None]:
# Number of loans per stated purpose.
# Debt consolidation looks like the main reason for people taking loans.
plt.figure(figsize=(18,7))
sns.countplot(data=df, x="loan_purpose")
plt.show()

In [None]:
# Number of loans per application type.
df['application_type'].value_counts()

In [None]:
# Loan amount per homeownership category, split by application type
# (barplot aggregates with the mean by default).
sns.barplot(
    data=df,
    x='homeownership',
    y='loan_amount',
    hue='application_type',
    color='red',
)

It seems that homeowners with a mortgage have the largest loan amounts, and that joint applications have larger loan amounts than individual applications.

***Missing values***

In [None]:
def null_values(df):
    """Summarise the missing values of a DataFrame, column by column.

    Prints how many columns the frame has and how many of them contain at
    least one missing value, then returns the detailed table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        indexed by column name, with the columns ``'Missing Values'``
        (count) and ``'% of Total Values'`` (share of rows, rounded to one
        decimal), sorted by percentage in descending order.
    """
    n_rows = len(df)
    missing_count = df.isnull().sum()

    # A frame without rows has nothing missing. Dividing by zero here would
    # produce NaN percentages, and NaN != 0 would then flag every column.
    if n_rows:
        missing_percent = 100 * missing_count / n_rows
    else:
        missing_percent = missing_count.astype(float)

    summary = pd.concat(
        [missing_count, missing_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )
    # Keep only the columns that really have missing entries, worst first.
    summary = (
        summary[summary['Missing Values'] != 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    print ("Dataframe has " + str(df.shape[1]) + " columns.\n"
        "There are " + str(summary.shape[0]) +
          " columns that have missing values.")
    return summary

In [None]:
# Per-column missing-value summary of the raw frame.
miss_values = null_values(df) # the author's run showed a total of 10 columns with missing values
miss_values.head(55)

In [None]:
# Drop sparse columns: a column is kept only if it has at least min_count non-null
# values, so columns missing roughly 55% or more of their values are removed.
perc = 55.0 # missing-value percentage at which a column is dropped
min_count =  int(((100-perc)/100)*df.shape[0] + 1)  # minimum number of non-null values needed to keep a column
df = df.dropna( axis=1, 
                thresh=min_count)

In [None]:
# Recompute the missing-value summary on the current frame.
miss_values = null_values(df)
miss_values.head(50)

In [None]:
# Summary statistics, transposed so that each described column is one row.
df.describe().transpose() # the stats of all the columns

In [None]:
# Summary statistics of loan_amount within each loan_status group.
df.groupby('loan_status')['loan_amount'].describe()

In [None]:
plt.figure(figsize=(10,5))  # we still have 5 columns with some null values
# Bar chart of the percentage of null values in every column of the current frame.
((df.isnull().sum())/len(df)*100).plot.bar(title='Percentage of missing values per column', color='green')

In [None]:
# Number of distinct job titles, followed by how often each one occurs.
print(df['emp_title'].nunique()) 
df['emp_title'].value_counts()

Realistically, there are too many unique job titles to try to convert this to a dummy variable feature. Therefore we will drop it.

In [None]:
# Drop the employment columns.
# NOTE(review): emp_length is removed together with emp_title, but the narrative only
# justifies dropping emp_title (too many unique values) — confirm this is intended.
df = df.drop(['emp_title', 'emp_length'], axis = 1)

In [None]:
# Impute missing values with the column mean truncated to an integer.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# df[...] selection is chained assignment, which pandas warns about and which does
# not update df at all once Copy-on-Write is enabled.
df['months_since_last_credit_inquiry'] = df['months_since_last_credit_inquiry'].fillna(
    int(df['months_since_last_credit_inquiry'].mean()))

In [None]:
# Impute missing values with the column mean truncated to an integer.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# df[...] selection is chained assignment, which pandas warns about and which does
# not update df at all once Copy-on-Write is enabled.
df['num_accounts_120d_past_due'] = df['num_accounts_120d_past_due'].fillna(
    int(df['num_accounts_120d_past_due'].mean()))

We fill the null values of months_since_last_credit_inquiry and num_accounts_120d_past_due with their mean values (truncated to integers).

In [None]:
# Drop every row that still contains a null value.
df = df.dropna() # acceptable according to the author because debt to income has very few null values

In [None]:
# Null count per column of the current frame.
df.isnull().sum()

In [None]:
sns.set(style="whitegrid", font_scale=1)

# Pearson correlation is only defined for numeric data. Select the numeric columns
# explicitly: since pandas 2.0, DataFrame.corr() no longer skips object columns
# silently and raises on the string columns (loan_status, grade, ...) still in df here.
numeric_corr = df.select_dtypes(include='number').corr()

plt.figure(figsize=(20,20))
plt.title('Pearson Correlation Matrix',fontsize=25)
sns.heatmap(numeric_corr,linewidths=0.25,vmax=0.7,square=True,cmap="GnBu",linecolor='w',
            annot=True, annot_kws={"size":10}, cbar_kws={"shrink": .7});

We can see strong correlations between loan_amount and installment, between loan_amount and balance, and between paid_principal and paid_total.

***Categorical and dummy variables***

In [None]:
# Number of loans per loan_status value.
df['loan_status'].value_counts()

In [None]:
# Binary label: 1 when loan_status is 'Fully Paid', 0 for every other status.
is_fully_paid = df['loan_status'] == 'Fully Paid'
df['TARGET'] = is_fully_paid.astype('int64')

# Class counts of the new target column.
df['TARGET'].value_counts()

In [None]:
# loan_status is now encoded in TARGET, so the original column is no longer needed.
df = df.drop(columns='loan_status')

In [None]:
# Number of distinct values in each object-dtype (categorical) column.
df.select_dtypes('object').apply(pd.Series.nunique, axis = 0)
# Shows the list of columns with categorical values.

In [None]:
# Drop state and sub_grade: both have a large number of distinct categories.
df = df.drop(columns=['state', 'sub_grade'])

In [None]:
# Loan amount distribution per TARGET class; each violin is split by application type.
fig = plt.figure(figsize=(12,6))
sns.violinplot(
    data=df,
    x="TARGET",
    y="loan_amount",
    hue="application_type",
    split=True,
)
plt.xlabel("TARGET", fontsize=15)
plt.ylabel("Loan Amount", fontsize=15)
plt.title("Disbursement - Loan Amount", fontsize=20);

In [None]:
# Correlation of every numeric column with TARGET, sorted in ascending order.
# Numeric columns are selected explicitly because DataFrame.corr() raises on the
# object columns still present in df when run under pandas >= 2.0.
corr = df.select_dtypes(include='number').corr()['TARGET'].sort_values()

# Display correlations
print('Most Positive Correlations:\n', corr.tail(10))
print('\nMost Negative Correlations:\n', corr.head(10))

Apart from the perfect correlation of the TARGET column with itself, columns like int_rate (the interest rate), paid_principal and paid_total have a high positive correlation with the TARGET column. This is plausible, since the higher the interest rate, the harder it is for a borrower to pay back a loan. However, columns like balance and paid_interest are bound to be higher when a borrower doesn't pay back a loan.

Also, columns like recoveries, total_rev_hi_lim, etc. have negative correlation with the TARGET column as a borrower who has paid back money is more likely to repay the loan.

In [None]:
# Ten numeric columns most positively correlated with debt_to_income. Numeric columns
# are selected explicitly because DataFrame.corr() raises on object columns in pandas >= 2.0.
df.select_dtypes(include='number').corr()['debt_to_income'].sort_values().tail(10)
#It can be seen that the interest rate is also highly positively correlated 
#with the debt to income ratio.

In [None]:
# Number of distinct values in each remaining object-dtype column.
df.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

In [None]:
# One-hot encode grade (the first level is dropped as the baseline) and
# put the dummy columns in place of the original column.
grade_dummies = pd.get_dummies(df['grade'], drop_first=True)
df_without_grade = df.drop(columns='grade')
df = pd.concat([df_without_grade, grade_dummies], axis=1)

In [None]:
# One-hot encode loan_purpose (the first level is dropped as the baseline) and
# put the dummy columns in place of the original column.
loan_purpse_dummies = pd.get_dummies(df['loan_purpose'], drop_first=True)
df_without_purpose = df.drop(columns='loan_purpose')
df = pd.concat([df_without_purpose, loan_purpse_dummies], axis=1)

In [None]:
# Number of distinct values in each remaining object-dtype column.
df.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

In [None]:
# Number of loans per verified_income level.
df['verified_income'].value_counts()
 # we can change the verified income values to verified and not verified

In [None]:
# Merge 'Source Verified' and 'Verified' into the single level 'S_Verified', then
# one-hot encode the column (first level dropped) in place of the original.
verified_levels = ['Source Verified', 'Verified']
df['verified_income'] = df['verified_income'].replace(verified_levels, 'S_Verified')
dummies = pd.get_dummies(df['verified_income'], drop_first=True)
df = pd.concat([df.drop(columns='verified_income'), dummies], axis=1)

In [None]:
# One-hot encode the remaining categorical columns (first level of each dropped)
# and put the dummy columns in place of the originals.
categorical_cols = ['homeownership', 'application_type','initial_listing_status','disbursement_method']
dummies = pd.get_dummies(df[categorical_cols], drop_first=True)
df = pd.concat([df.drop(columns=categorical_cols), dummies], axis=1)

In [None]:
# Number of loans per issue month.
df['issue_month'].value_counts()

This would be data leakage: when using our model we would not know beforehand whether or not a loan will be issued, so in theory we would not have an issue date. We therefore drop this feature.

In [None]:
# Remove issue_month so it cannot be used as a feature.
df = df.drop(columns='issue_month')

In [None]:
# Column names of the current frame.
df.columns

In [None]:
# Class balance of the binary target.
df['TARGET'].value_counts()

# Model

In [None]:
# Features
# paid_total and paid_principal are deliberately NOT used. Judging by their names and
# by their strong correlation with TARGET, they record repayments made after the loan
# was issued, so they would leak the outcome into the model (target leakage) and make
# every score below look far better than it would be on new applications.
feature_cols = ['interest_rate', 'accounts_opened_24m', 'total_credit_lines',
                'total_credit_limit', 'num_mort_accounts', 'inquiries_last_12m']
X = df[feature_cols].values

# Label
y = df['TARGET'].values

# Split
# stratify=y keeps the share of the rare positive class equal in the train and test
# sets, which matters because the classes are heavily imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=101, stratify=y)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# random_state is fixed because the tree permutes the features at random at every
# split; without it, ties between equally good splits are broken differently on each
# run and the reported metrics are not reproducible.
dtree = DecisionTreeClassifier(random_state=101)
dtree.fit(X_train,y_train) # We will train the model in training data

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Decision tree: predict the test split, then print the per-class metrics
# followed by the confusion matrix.
predictions = dtree.predict(X_test)
dtree_report = classification_report(y_test, predictions)
dtree_confusion = confusion_matrix(y_test, predictions)
print(dtree_report)
print('\n')
print(dtree_confusion)

In [None]:
from sklearn.metrics import mean_absolute_error
# Score on the held-out test split only. Predicting on all of X mixes in the rows the
# tree was trained on, which an unpruned tree largely memorises, so the error came out
# misleadingly small. For 0/1 labels the mean absolute error is the misclassification rate.
predicted_loan = dtree.predict(X_test)
mean_absolute_error(y_test, predicted_loan)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 40 trees and a fixed seed, fitted on the training split.
rfc = RandomForestClassifier(random_state=21, n_estimators=40).fit(X_train, y_train)
rfc

In [None]:
# Predict labels for the test split with the random forest.
predictions = rfc.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import sklearn.metrics as metrics

# Random forest: per-class metrics on the test split, followed by the confusion matrix.
rfc_report = metrics.classification_report(y_test, predictions)
rfc_confusion = confusion_matrix(y_test, predictions)
print(rfc_report)
print('\n')
print(rfc_confusion)

In [None]:
# Score on the held-out test split only. Predicting on all of X mixes in the rows the
# forest was trained on, which makes the error look misleadingly small. For 0/1 labels
# the mean absolute error is the misclassification rate.
predicted_l = rfc.predict(X_test)
mean_absolute_error(y_test, predicted_l)