In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Let's read in the sample dataset from Analytics Vidhya (Loan Prediction course):**
<div></div>
Dataset: https://www.kaggle.com/burak3ergun/loan-data-set
<div></div>
Problem Statement: Taken from the kaggle dataset link
<div></div>
https://courses.analyticsvidhya.com/courses/loan-prediction-practice-problem-using-python?utm_source=practice_problem_Loan_Prediction-III&utm_medium=Datahack

In this course, we are solving a real life case study of Dream Housing Finance. The company deals in all home loans. They have a presence across all urban, semi-urban and rural areas. Customers first apply for a home loan after that company validates the customer's eligibility. The company wants to automate the loan eligibility process (real-time) based on customer detail provided while filling online application form.

By the end of the course, you will have a solid understanding of classification problems and various approaches to solving them.


In [None]:
#Import libraries for descriptive analysis
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [None]:
# Load the loan dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/loan-data-set/loan_data_set.csv')

**1. Let's understand the dataset and attributes**

In [None]:
# Preview the first rows of the table to see the columns and some sample values
df.head()

In [None]:
# Column dtypes and non-null counts (shows which attributes have missing values)
df.info()

**Feature Understanding (defining some less obvious features)**

* Loan_ID: Unique identifier for loan
* Applicant Income & Co-applicant Income: applicable when applying as a family
* Loan amount: requested loan amount
* Loan Amount Term: Requested time period in months
* Credit History: Flag to identify if credit history meets requirements
* Loan Status: Target = Y or N if house loan is approved or rejected

**Moving on to data visualization**

1.a. Total number of records in data (rows) = 614 and exploring the target (Loan Status Y/N) shows that 69% of loans have been approved

In [None]:
# Share of each Loan_Status value (Y = approved, N = rejected) among all records.
# The axes are created explicitly so that `fig` really is the Figure and the
# bar chart is drawn on a known Axes rather than on pyplot's implicit current one.
fig, ax = plt.subplots(figsize=(2, 2))
df['Loan_Status'].value_counts(normalize=True).plot(kind='bar', ax=ax)
# value_counts(normalize=True) yields fractions in [0, 1], so label them as proportions.
ax.set_title('Proportion of loans by status')

1.b. We can explore all categorical values with barplots & subplots
* 80% of loan applicants are male, while only 20% are female - this could introduce historical bias when training a model for loan approvals
* More than 60% of house loan applicants are married
* About 40% have dependents (children)
* Less than 20% are self employed
* Less than 30% of loan applicants are applying from rural areas
* 80% of the loans are given to graduates



In [None]:
# One normalised bar chart per categorical attribute, laid out side by side.
# Each entry is (column in df, panel title).
panels = [
    ('Gender', 'Gender'),
    ('Married', 'Married?'),
    ('Dependents', 'Dependents?'),
    ('Self_Employed', 'Self Employed?'),
    ('Property_Area', 'Property_Area'),
    ('Education', 'Education'),
]
fig = plt.figure(figsize=(18,2))
for position, (column, panel_title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, len(panels), position)
    df[column].value_counts(normalize=True).plot(kind='bar', ax=ax)
    ax.title.set_text(panel_title)
plt.show()

1.c. We want to overlay another dimension to our plots to visualize loan approvals by group and identify potential sources of bias

In [None]:
#Function to plot multiple colour bar charts
def plot_bar(dataframe, group_col, hue_col, count_col):
    """Plot every (group_col, hue_col) combination as a percentage of all rows.

    Counts the non-null ``count_col`` entries for each combination, expresses
    each count as a percentage of ``len(dataframe)``, prints the resulting
    table and draws it as a grouped seaborn bar chart with ``group_col`` on
    the x axis and one bar colour per ``hue_col`` value. Returns None.
    """
    counts = dataframe.groupby([group_col, hue_col])[count_col].count().reset_index()
    # Denominator is the full frame, so rows with a missing key still count towards the total.
    counts['Percentage'] = 100*(counts[count_col]/len(dataframe))
    print(counts)
    sns.catplot(
        data=counts, kind="bar",
        x=group_col, y="Percentage", hue=hue_col,
        height=4, aspect=.7,
    )

def plot_bar_within(dataframe, group_col, hue_col, count_col):
    """Plot, for each ``group_col`` category, the percentage split over ``hue_col``.

    Counts the non-null ``count_col`` entries per (group_col, hue_col) pair.
    For every distinct ``group_col`` value it then normalises that category's
    counts so they sum to 100, opens a new figure, draws a bar chart titled
    with the category, and prints the category's table. The list of
    categories is printed first. Returns None.
    """
    df_grp = dataframe.groupby([group_col, hue_col]).count()[count_col]
    df_grp = df_grp.reset_index()
    # unique() preserves the order of the grouped table, so figures are produced
    # in a reproducible order (iterating a set gave an arbitrary one).
    categories = df_grp[group_col].unique().tolist()
    print(categories)
    for item in categories:
        # Work on an explicit copy: adding a column to a filtered slice of
        # df_grp triggers pandas' SettingWithCopyWarning.
        df_temp = df_grp[df_grp[group_col] == item].copy()
        df_temp['Percentage'] = 100*(df_temp[count_col]/df_temp[count_col].sum())
        plt.figure()
        sns.barplot(x=hue_col, y='Percentage', data=df_temp).set_title(item)
        print(df_temp)
        


**1.c.1 Does gender have an impact on loan applications?**
Looks like 20% of loan applications are from female applicants and the remainder are from male applicants, though within each gender the proportion of loan rejections is ~30% for both males and females. Only about 18% of approved applications (75 of n = 414) are from female applicants, while the remainder are from male applicants. However, within class, rejection rates for males and females are similar, so this is an artifact of having a lower number of female applicants (**historical bias**). Models could potentially learn Gender = Male as a feature for approving a loan application.

In [None]:
# Loan status broken down by gender, as a percentage of all applications.
plot_bar(df, 'Gender', 'Loan_Status', 'Loan_ID')
# NOTE(review): 75 and 339 are hard-coded; presumably the approved counts for
# female and male applicants read off the table printed by plot_bar. Confirm
# against that output if the dataset changes.
approved_applications = 75 + 339
prop_female_approved = 75 / approved_applications
print('No of approved applications:', approved_applications, sep='\n')
print('Proportion of female approved applications', prop_female_approved, sep='\n')

In [None]:
# One chart per Loan_Status value showing the percentage split by Gender within that status
plot_bar_within(df, 'Loan_Status', 'Gender', 'Loan_ID')

In [None]:
#plot_bar_within(df, 'Gender', 'Loan_Status', 'Loan_ID')

In [None]:
# Stacked bars showing the share of each Loan_Status within every Gender value.
Gender = pd.crosstab(df['Gender'], df['Loan_Status'])
gender_totals = Gender.sum(axis=1).astype(float)
Gender.div(gender_totals, axis=0).plot(kind='bar', stacked=True)

**1.c.2. what about marital status?**
Looks like 64% of all loan applications are from married individuals. 68% of all approved loans come from individuals who are married. Within the married group, 70% of loans were approved while this figure was 60% in the non-married group

In [None]:
# Marital status within each loan status, as a percentage of all applications.
plot_bar(df, 'Loan_Status', 'Married', 'Loan_ID')
# NOTE(review): 134 and 285 are hard-coded; presumably the approved counts for
# unmarried and married applicants taken from the table printed by plot_bar.
approved = 134 + 285
approved_married = 285 / approved
print(approved, approved_married, sep='\n')

In [None]:
# One chart per Married value showing the percentage split by Loan_Status within that group
plot_bar_within(df, 'Married', 'Loan_Status', 'Loan_ID')

In [None]:
# One chart per Loan_Status value showing the percentage split by Married within that status
plot_bar_within(df, 'Loan_Status', 'Married', 'Loan_ID')

We can keep doing this exploration for all the variables in the data.

In [None]:
# Count table of Loan_Status for each demographic attribute.
Married = pd.crosstab(df['Married'], df['Loan_Status'])
Dependents = pd.crosstab(df['Dependents'], df['Loan_Status'])
Education = pd.crosstab(df['Education'], df['Loan_Status'])
Self_Employed = pd.crosstab(df['Self_Employed'], df['Loan_Status'])

# Normalise each row to sum to 1 and show it as a stacked bar, one figure per attribute.
for status_counts in (Married, Dependents, Education, Self_Employed):
    row_totals = status_counts.sum(axis=1).astype(float)
    status_counts.div(row_totals, axis=0).plot(kind='bar', stacked=True)
    plt.show()

Explore other attributes like credit history and property type against loan approval rates. Looks like more loans are approved when credit history requirements are met... and also in urban areas and semi urban compared to rural. 

In [None]:
# Count table of Loan_Status by credit-history flag and by property area.
Credit_History = pd.crosstab(df['Credit_History'], df['Loan_Status'])
Property_Area = pd.crosstab(df['Property_Area'], df['Loan_Status'])

# Row-normalised stacked bars, one figure per attribute.
for status_counts in (Credit_History, Property_Area):
    row_totals = status_counts.sum(axis=1).astype(float)
    status_counts.div(row_totals, axis=0).plot(kind='bar', stacked=True)
    plt.show()

1.d. Explore numerical data

In [None]:
# NOTE(review): df2 (index reset, rows with any missing value dropped) is only
# printed here; the three plots below are drawn from the full df.
df2 = df.reset_index().dropna()
print(df2.head())

# Distribution, box plot and income-vs-loan-amount scatter in a single row.
income_fig, (ax_dist, ax_box, ax_scatter) = plt.subplots(1, 3, figsize=(15,5))
sns.distplot(df['ApplicantIncome'], ax=ax_dist)
df['ApplicantIncome'].plot(kind='box', ax=ax_box)
sns.scatterplot(data=df, x="ApplicantIncome", y="LoanAmount", hue='Loan_Status', ax=ax_scatter)

Explore Applicant Income by different demographic variables, given the wide range:
* Average income for males higher than females
* Graduates have a wider range of higher incomes than non-graduates
* Married applicants also have higher income

In [None]:
# Applicant income distribution per Gender.
# The axes are created up front and passed to pandas: with `by=` and no `ax`,
# DataFrame.boxplot opens its own figure, which left the figure from a bare
# plt.figure() call empty.
fig2, ax = plt.subplots()
df.boxplot(column = 'ApplicantIncome', by='Gender', ax=ax)

In [None]:
# Applicant income distribution per Education level.
# The axes are created up front and passed to pandas: with `by=` and no `ax`,
# DataFrame.boxplot opens its own figure, which left the figure from a bare
# plt.figure() call empty.
fig2, ax = plt.subplots()
df.boxplot(column = 'ApplicantIncome', by='Education', ax=ax)

In [None]:
# Applicant income distribution per Married value.
# The axes are created up front and passed to pandas: with `by=` and no `ax`,
# DataFrame.boxplot opens its own figure, which left the figure from a bare
# plt.figure() call empty.
fig2, ax = plt.subplots()
df.boxplot(column = 'ApplicantIncome', by='Married', ax=ax)

In [None]:
# Applicant income distribution per Property_Area.
# The axes are created up front and passed to pandas: with `by=` and no `ax`,
# DataFrame.boxplot opens its own figure, which left the figure from a bare
# plt.figure() call empty.
fig2, ax = plt.subplots()
df.boxplot(column = 'ApplicantIncome', by='Property_Area', ax=ax)

In [None]:
# Applicant income distribution per Loan_Status.
# The axes are created up front and passed to pandas: with `by=` and no `ax`,
# DataFrame.boxplot opens its own figure, which left the figure from a bare
# plt.figure() call empty.
fig2, ax = plt.subplots()
df.boxplot(column = 'ApplicantIncome', by='Loan_Status', ax=ax)

In [None]:
# Bucket applicant income into four labelled ranges (stored on df as IncomeBin)
# and cross-tabulate the buckets against the loan outcome.
bins = [0,2500,4000,6000,81000]
group = ['Low', 'Average', 'High', 'Very High']
df['IncomeBin'] = pd.cut(df['ApplicantIncome'], bins=bins, labels=group)
Income_bin = pd.crosstab(index=df['IncomeBin'], columns=df['Loan_Status'])
Income_bin

In [None]:
# Re-check the schema; df now also carries the IncomeBin column added above
df.info()

In [None]:
### Model building for explainability

Now let's try to build the model - preprocess the data

In [None]:
# Target vector and raw feature frame. Loan_ID is excluded because a unique
# identifier would only add one category per row.
# NOTE(review): df still contains the IncomeBin column created during the
# exploration, so it is carried into the features - confirm this is intended.
y = df['Loan_Status']
features_raw = df.drop(columns=['Loan_Status', 'Loan_ID'])

In [None]:
# Work on an independent copy: a plain assignment only aliases the same
# DataFrame, so every later column assignment on features_proc would also
# rewrite features_raw and the untouched inputs would be lost.
features_proc = features_raw.copy()
features_raw.info()

In [None]:
# Treat zeros as missing only where zero cannot be a real value: a loan amount
# or a loan term of 0.
for column in ('LoanAmount', 'Loan_Amount_Term'):
    features_proc[column] = features_proc[column].replace(0, np.nan)
# Credit_History is deliberately left untouched. It is a flag, so 0 is a
# legitimate value (history does not meet requirements). Turning those zeros
# into NaN and then mean-imputing would leave only 1.0 values in the column and
# erase the feature entirely.

In [None]:
# Transform categorical data: one-hot encode the non-numeric columns with
# pd.get_dummies, then preview the widened frame
features_proc = pd.get_dummies(features_proc)
features_proc.head()

In [None]:
# Now take the training data into a NumPy array and check its (rows, columns) shape
X = features_proc.values
X.shape

In [None]:
# Inspect the first row of the encoded feature frame (before imputation)
features_proc.iloc[0]

In [None]:
# Fill every missing entry of X with the mean of its column.
# NOTE(review): the imputer is fitted on the full matrix, i.e. before the
# train/validation split performed later.
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='mean')
X = imp.fit_transform(X)

In [None]:
# Sanity check after imputation. Only the last expression of a cell is
# displayed, so the bare X[0] below produces no output; X.shape is what shows.
X[0] #All missing values are filled
X.shape

Now create a tree model to see how a default classifier outputs Y/N predictions

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Seeded train/validation split, then a shallow decision tree fitted on the training part.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
tree_params = dict(random_state=0, max_depth=5, min_samples_split=5)
tree_model = DecisionTreeClassifier(**tree_params).fit(train_X, train_y)

In [None]:
# Predict on the validation split and tabulate true labels (first argument)
# against predicted labels (second argument).
y_pred = tree_model.predict(val_X)
from sklearn.metrics import confusion_matrix
confusion_matrix(val_y, y_pred)

In [None]:
# Class balance of the true validation labels, for reading the confusion matrix against
val_y.value_counts()

In [None]:
# Text summary of the validation predictions against the true labels
from sklearn.metrics import classification_report
print(classification_report(val_y, y_pred))

In [None]:
# Validation dataframe: the validation feature matrix with named columns, plus
# the true and predicted labels side by side.
# The column names are derived here from features_proc so the cell runs on a
# fresh kernel (Restart & Run All); relying on a feature_names variable
# assigned further down the notebook raises NameError at this point.
feature_names = list(features_proc.columns)
df_val = pd.DataFrame(data=val_X, columns=feature_names)
# list(...) discards val_y's original index so labels align by position with val_X.
df_val['true_label'] = list(val_y)
df_val['pred_label'] = y_pred

In [None]:
# Check the columns and dtypes of the validation frame
df_val.info()

In [None]:
# Preview validation rows together with their true and predicted labels
df_val.head()

Compare marital status (`Married_Yes`) for approved and not approved loans, using both the true and the predicted labels

In [None]:
# Count tables of the Married_Yes indicator against the true and the predicted labels.
True_vals = pd.crosstab(df_val['Married_Yes'], df_val['true_label'])
Predicted_vals = pd.crosstab(df_val['Married_Yes'], df_val['pred_label'])

# Row-normalised stacked bars: first for the true labels, then for the predictions.
for label_counts in (True_vals, Predicted_vals):
    row_totals = label_counts.sum(axis=1).astype(float)
    label_counts.div(row_totals, axis=0).plot(kind='bar', stacked=True)
    plt.show()

In [None]:
# Raw counts: Married_Yes against the true label
True_vals

In [None]:
# Raw counts: Married_Yes against the predicted label
Predicted_vals

visualise the output and score the model

In [None]:
# Column names of the encoded feature frame, as a plain list.
feature_names = list(features_proc.columns)

In [None]:
from sklearn import tree
import graphviz

# Export the fitted tree to Graphviz DOT text (out_file=None returns it as a
# string), labelling the features with feature_names, and render it inline.
tree_graph = tree.export_graphviz(tree_model, out_file=None, feature_names=feature_names)
graphviz.Source(tree_graph)

In [None]:
# Pick one validation row and ask the tree for its class probabilities.
row_to_show = 7
# Feature-only view of the validation frame (label columns removed).
df_val2 = df_val.drop(columns=['true_label', 'pred_label'])
# A single row is used here; several rows could be selected instead.
data_for_prediction = df_val2.iloc[row_to_show]
# predict_proba is given a 2-D input, so the one row is reshaped to (1, n_features).
data_for_prediction_array = data_for_prediction.to_numpy().reshape(1, -1)

tree_model.predict_proba(data_for_prediction_array)

In [None]:
# Show the selected validation row, including its true and predicted labels
df_val.iloc[row_to_show]

In [None]:
import shap  # package used to calculate Shap values

# Create object that can calculate shap values
explainer = shap.TreeExplainer(tree_model)

# Calculate Shap values for the single validation row selected earlier
# (data_for_prediction is one row taken with .iloc)
shap_values = explainer.shap_values(data_for_prediction)

In [None]:
# Initialise SHAP's JavaScript support, then draw the force plot for the selected row.
# Index [1] picks the second class - presumably 'Y' (approved), assuming the
# classifier orders its classes alphabetically; TODO confirm via tree_model.classes_
shap.initjs()
shap.force_plot(explainer.expected_value[1], shap_values[1], data_for_prediction)

Being Married seems to be an important feature... 

In [None]:
# Raw feature values of the explained row, as the (1, n_features) array passed to predict_proba
data_for_prediction_array

In [None]:
import shap  # package used to calculate Shap values

# Create object that can calculate shap values
# NOTE(review): this import and explainer repeat the ones from the single-row SHAP cell.
explainer = shap.TreeExplainer(tree_model)

# calculate shap values. This is what we will plot.
# Calculate shap_values for all validation rows (df_val2) rather than a single row, to have more data for plot.
shap_values = explainer.shap_values(df_val2)

# Make plot. Index [1] selects the second class's SHAP values - presumably 'Y'
# (approved); TODO confirm against tree_model.classes_
shap.summary_plot(shap_values[1], df_val2)