In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'darkgrid' plot style.
sns.set_style('darkgrid')

In [None]:
# Load the loan training data from the Kaggle input directory into `df`.
df = pd.read_csv('/kaggle/input/loan-eligible-dataset/loan-train.csv')

In [None]:
# Quick look at the raw frame: (rows, columns), the column names, then the first rows.
print(df.shape)
print(df.columns)
df.head()

# EDA

We want to check firstly if there is any difference between different groups when it comes to loan approval

In [None]:
# Categorical columns to compare against loan approval (reused by the later plot cells).
groups = ['Gender','Married','Education','Self_Employed','Dependents','Property_Area']

# One panel per categorical column: category counts split by Loan_Status.
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(12, 12))
axs = ax.ravel()
for panel, group in zip(axs, groups):
    sns.countplot(data=df, x=group, hue='Loan_Status', ax=panel)

We can see that people who haven't graduated are far less likely to have their loans approved; there also exists a bias towards married couples over those who haven't married

**Are there disparities in how large a loan different groups take out?**

In [None]:
# One panel per categorical column: LoanAmount density, coloured by that column's categories.
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(12, 12))
axs = ax.ravel()
for panel, group in zip(axs, groups):
    sns.kdeplot(data=df, x='LoanAmount', hue=group, fill=True, ax=panel)

Things to notice:
* Far more men take out loans than women, likewise with education level
* Most distributions are similar; most demographics will take out similar sized loans (between 100 and 200 thousand dollars)
* The exception to the point above is that married couples are more likely to take out higher loans than non-married couples. This is likely due to the phenomenon of married couples settling down and taking out a mortgage

**Disparities in income?**

In [None]:
# One panel per categorical column: ApplicantIncome density, coloured by that column's categories.
fig, ax = plt.subplots(nrows=3, ncols=2, figsize=(12, 12))
axs = ax.ravel()
for panel, group in zip(axs, groups):
    sns.kdeplot(data=df, x='ApplicantIncome', hue=group, fill=True, ax=panel)

Nothing particularly noticeable here

Look at continuous variables. First we will fill na values with the mean of each respective column

In [None]:
# Continuous columns whose distributions are compared between approved and rejected loans.
cont = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount','Loan_Amount_Term']

# One panel per continuous column: density of the column, coloured by Loan_Status.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
axs = ax.ravel()
for panel, continuous_group in zip(axs, cont):
    sns.kdeplot(data=df, x=continuous_group, hue='Loan_Status', fill=True, ax=panel)


Overall differences are quite subtle however there are a couple of things to note:
* Applicant and coapplicant income show slightly fatter left-tails for rejected loans than for accepted loans as expected
* The loans that are rejected have a more concentrated loan amount value than loans that are accepted


# Building and Training Classifiers

In [None]:
# Show the first rows again as a reminder of the raw values before recoding.
df.head()

First we will rename the variables:
* Status: Married : 1, Not Married: 0 
* Education: Graduated : 1, Not Graduated: 0
* Self Employed: Yes : 1, No : 0 
* Loan_Status: Y : 1, N : 0 

For gender we will impute missing values with an 'Unknown/Other' category.

In [None]:
# Heatmap of the boolean missing-value mask: one row per record, one column per field.
fig, ax = plt.subplots(figsize=(8, 8))
missing_mask = df.isna()
sns.heatmap(missing_mask, ax=ax)

**Treating NA Values**

* Gender na can be replaced with unknown/other
* Married can be replaced with the most common answer, which is married (since there aren't many unknowns in this category this is not a huge issue)
* Dependents na can be filled with 0, since people most likely left this blank as a way to say it was not applicable.
* Likewise with self_employed




In [None]:
def dfimputation(df):
    """Recode the categorical loan columns and fill in missing values.

    The work is done on a copy, so the frame passed in is left untouched and
    the function can be applied to the training and test frames alike.

    Recoding (any value not listed becomes NaN before imputation):
      * Married:        'No' -> 0, 'Yes' -> 1
      * Education:      'Not Graduate' -> 0, 'Graduate' -> 1
      * Self_Employed:  'No' -> 0, 'Yes' -> 1
      * Loan_Status:    'N' -> 0, 'Y' -> 1 (only when the column is present)
      * Credit_History: 1 -> 'Y', 0 -> 'N', anything else -> 'Unknown'

    Imputation:
      * Gender            -> 'Unknown/Other'
      * Married           -> most common recoded value (0 if there is none)
      * Dependents        -> '0'
      * Self_Employed     -> 0
      * LoanAmount        -> column mean
      * Loan_Amount_Term  -> column mean

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the recoded and imputed columns.
    """
    # Copy first: column-level fillna(inplace=True) is deprecated in pandas and
    # is a no-op under copy-on-write, and mutating the caller's frame makes
    # notebook cells non-repeatable.
    df = df.copy()

    # Series.map leaves NaN for every value that has no entry in the mapping.
    df['Married'] = df['Married'].map({'No': 0, 'Yes': 1})
    df['Education'] = df['Education'].map({'Not Graduate': 0, 'Graduate': 1})
    df['Self_Employed'] = df['Self_Employed'].map({'No': 0, 'Yes': 1})
    if 'Loan_Status' in df.columns:
        df['Loan_Status'] = df['Loan_Status'].map({'N': 0, 'Y': 1})

    # Credit history becomes a three-level label so that "missing" is its own category.
    credit = df['Credit_History']
    df['Credit_History'] = np.select([credit == 1, credit == 0], ['Y', 'N'], default='Unknown')

    df['Gender'] = df['Gender'].fillna('Unknown/Other')

    # Fill Married with its most common value, as described in the notebook
    # text (previously a hard-coded 0, i.e. "not married").
    married_mode = df['Married'].mode()
    married_fill = married_mode.iloc[0] if not married_mode.empty else 0
    df['Married'] = df['Married'].fillna(married_fill)

    df['Dependents'] = df['Dependents'].fillna('0')
    df['Self_Employed'] = df['Self_Employed'].fillna(0)

    for column in ('LoanAmount', 'Loan_Amount_Term'):
        df[column] = df[column].fillna(df[column].mean())
    return df

In [None]:
# Recode and impute the training frame.
# NOTE: re-running this cell on the already-recoded df is not safe: the 0/1
# values no longer match the 'Yes'/'No' style strings dfimputation looks for,
# so they are treated as missing and re-imputed.
df = dfimputation(df)

We can sense check this by comparing the average marriage status of the 'nan' dependents and '0' dependents: as can be seen, the two groups have a similar marriage rate

In [None]:
# Mean of every numeric column per credit-history group. numeric_only=True is
# needed because df still holds text columns at this point (e.g. Loan_ID), and
# pandas >= 2.0 raises a TypeError on them instead of silently dropping them.
print(df.groupby('Credit_History').mean(numeric_only=True))
# Loan outcome counts within each credit-history group.
sns.countplot(x = 'Credit_History', hue = 'Loan_Status', data=df)

**One Hot Encoding Categorical Variables**

In [None]:
# NOTE: the result of drop() is not assigned, so df itself is unchanged; this
# cell only displays a copy of the data without 'Dependents'. The column is
# still present when it is one-hot encoded via OHEgroups in the next cell.
df.drop(['Dependents'],axis=1)

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Categorical columns that are replaced by one-hot indicator columns.
OHEgroups = ['Property_Area','Gender','Dependents','Credit_History']
def OHEdf(df):
    """One-hot encode the OHEgroups columns of ``df``.

    A fresh OneHotEncoder is fitted on the frame that is passed in, so the
    indicator columns produced depend on the categories present in that frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in OHEgroups.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the OHEgroups columns, followed by one indicator column
        per (column, category) pair, named ``<column>_<category>``.
    """
    enc = OneHotEncoder(handle_unknown = 'ignore')
    encoded = enc.fit_transform(df[OHEgroups]).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for older installations.
    if hasattr(enc, 'get_feature_names_out'):
        feature_names = enc.get_feature_names_out(OHEgroups)
    else:
        feature_names = enc.get_feature_names(OHEgroups)
    # Reuse df's index so the column-wise concat lines rows up even when df
    # does not carry a default 0..n-1 index.
    onehotdf = pd.DataFrame(encoded, columns = feature_names, index = df.index)
    dfnew = pd.concat([df.drop(OHEgroups,axis = 1),onehotdf],axis =1)
    return(dfnew)
# Replace df with its one-hot-encoded version. OHEdf selects and drops the
# OHEgroups columns, so re-running this cell on the already-encoded df raises
# a KeyError.
df = OHEdf(df)
# Display the dtype of every column after encoding.
df.dtypes

**Building classifier**

Now that we have pre-processed the data we will build a range of different classifiers and train and test them

In [None]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

Read in test data set and apply preprocessing steps

In [None]:
# Load the test file and run it through the same imputation and one-hot
# encoding steps as the training frame.
dftest = pd.read_csv('/kaggle/input/loan-eligible-dataset/loan-test.csv')
dftest = OHEdf(dfimputation(dftest))
# Build the test features from dftest (previously this used the training df by
# mistake). Loan_Status is dropped only if present. OHEdf fits a separate
# encoder per frame, so the test frame can have different indicator columns
# from the training frame: reindex to the training feature columns, filling
# indicators the test frame lacks with 0 and discarding any extra ones.
train_feature_columns = df.drop(['Loan_Status','Loan_ID'],axis =1).columns
X_test = (
    dftest
    .drop(['Loan_Status','Loan_ID'], axis = 1, errors = 'ignore')
    .reindex(columns = train_feature_columns, fill_value = 0)
)


In [None]:
# Features: every column except the target and the row identifier.
X_train = df.drop(columns=['Loan_Status','Loan_ID'])
# Target: the recoded loan status column.
y_train = df['Loan_Status']

**Support Vector Machine Classifier**

In [None]:
# Scale the features, then fit a support vector classifier on the full training set.
SVCPipeline = make_pipeline(StandardScaler(), SVC(gamma='auto'))
SVCPipeline.fit(X_train, y_train)

# 3-fold cross-validated scores of the same pipeline on the training data.
scores = cross_val_score(SVCPipeline, X_train, y_train, cv=3)
print('Support Vector Machine Training Set CV Scores = {}. Average = {}'.format(scores, scores.mean()))

# Confusion matrix of the fitted pipeline's predictions on the data it was trained on.
train_confusion = confusion_matrix(y_train, SVCPipeline.predict(X_train))
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(train_confusion, annot=True, fmt='.1f', cmap='coolwarm', ax=ax)
ax.set(
    title='SVM Confusion Matrix',
    ylabel='Actual Loan Statuses',
    xlabel='Predicted Loan Statuses',
)
ax.set_xticklabels(['No','Yes'])
ax.set_yticklabels(['No','Yes'])

Our classifier is far more likely to predict successful loan applications than actually exist

**Random Forest Classifier**

In [None]:
# Scale the features, then fit a depth-limited random forest on the full
# training set. A fixed random_state keeps the forest, and therefore the
# scores and confusion matrix below, reproducible between runs.
RFPipeline = make_pipeline(StandardScaler(), RandomForestClassifier(max_depth =8, random_state = 0))
RFPipeline.fit(X_train, y_train)
# Cross-validate the random forest pipeline itself (this previously referenced
# DTPipeline, a name that is never defined, and raised a NameError).
scores = cross_val_score(RFPipeline, X_train, y_train, cv=3)
print('Random Forest Classifier CV Scores = {}. Average = {}'.format(scores, scores.mean()))
# Confusion matrix of the fitted pipeline's predictions on the data it was trained on.
fig, ax = plt.subplots(figsize =(7,7))
sns.heatmap(confusion_matrix(y_train,RFPipeline.predict(X_train)),cmap = 'coolwarm',annot=True, ax=ax, fmt = '.1f')
ax.set_title('RF Confusion Matrix')
ax.set_ylabel('Actual Loan Statuses')
ax.set_xlabel('Predicted Loan Statuses')
ax.set_xticklabels(['No','Yes'])
ax.set_yticklabels(['No','Yes'])