In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Can we predict whether or not a loan should be granted ?**


<h3><b>This is my first notebook on Kaggle, I hope you'll appreciate it  :-).</b></h3>

## What to expect ?

<h3><ul>
  <li><a href="#exploratory">Exploratory analysis & Data visualization</a></li>
  <li><a href="#outliers">Treatment of outliers</a></li>
  <li><a href="#missing_values">Treatment of missing values</a></li>
  <li><a href="#encoding">Encoding</a></li>
  <li><a href="#features_selection">Features selection</a></li>
  <li><a href="#use_of_models">Use of different models [Decision Tree / Naive Bayes / Logistic Regression]</a></li>
  <li><a href="#metrics_threshold">Model metrics & classification threshold modification</a></li>
  <li><a href="#conclusion">Conclusion</a></li>
</ul></h3>

<h1 id="exploratory">Exploratory analysis & Data visualization</h1>

In [None]:
# Import libraries 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing


In [None]:
# Load the loan-prediction training set into a DataFrame.
# The relative path points at Kaggle's read-only "../input/" directory.

train_data = pd.read_csv('../input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv')

In [None]:
# Display the first five rows to get a feel for the columns and their values.

train_data.head()

In [None]:
# Check the number of entries and the dtype of each column.
# Columns whose non-null count is below the number of entries contain missing values.

train_data.info() 

In [None]:
# Collect the names of the numerical columns.
# Loan_ID is only an identifier and Credit_History is a 0/1 flag, so both are left out.

num_cols = train_data.dtypes.ne('object')
exclude_id = train_data.columns != 'Loan_ID'
exclude_credit_history = train_data.columns != 'Credit_History'

# Combine the three boolean masks, then pick the matching column labels.
numeric_mask = (num_cols & exclude_id & exclude_credit_history).to_numpy()
num_columns = num_cols.index[numeric_mask]
num_columns

In [None]:
# One boxplot per numerical column, laid out on a 2x2 grid, to see each
# column's range and spot potential outliers.

plt.figure(figsize=(18,10))
sns.set_theme(style="darkgrid")

for i, col in enumerate(num_columns, start=1):
    plt.subplot(2, 2, i)
    sns.boxplot(x=train_data[col], palette='viridis')

plt.show()

In [None]:
# Strip plot of every numerical column split by Loan_Status, so outliers
# can be compared between accepted and refused loans.

for col in num_columns:
    sns.catplot(data=train_data, x='Loan_Status', y=col, kind="strip", palette='viridis')
plt.show()

In [None]:
# Collect the categorical (object-typed) feature columns and print the
# frequency of each of their values. Loan_ID (identifier) and Loan_Status
# (the target) are excluded.

obj_cols = train_data.dtypes.eq('object')
exclude_loan_id = train_data.columns != 'Loan_ID'
exclude_loan_status = train_data.columns != 'Loan_Status'

categorical_mask = (obj_cols & exclude_loan_id & exclude_loan_status).to_numpy()
categorical_columns = obj_cols.index[categorical_mask]

for col in categorical_columns:
    print(train_data[col].value_counts())

In [None]:
# Count plot of every categorical column, coloured by Loan_Status, on a 3x3 grid.

plt.figure(figsize=(20,15))
for i, catcol in enumerate(categorical_columns, start=1):
    plt.subplot(3, 3, i)
    sns.countplot(data=train_data, x=catcol, hue='Loan_Status', palette='viridis')
    plt.xlabel(catcol, fontsize=14)
    plt.ylabel(' ')
plt.show()

In [None]:
# Now that we have a better idea of the data set, we need to treat the outliers and missing values.

<h1 id="outliers">Treatment of outliers</h1>

In [None]:
# Work on a copy so the raw train_data stays untouched while outliers are removed.
# The shape is displayed as the "before" reference.

df_train_no_outliers = train_data.copy()
df_train_no_outliers.shape

In [None]:
# Drop rows whose ApplicantIncome is above 30000 or below 1000.
condAI = df_train_no_outliers['ApplicantIncome'].gt(30000) | df_train_no_outliers['ApplicantIncome'].lt(1000)
df_train_no_outliers.drop(index=df_train_no_outliers.index[condAI], inplace=True)

In [None]:
# Drop rows whose CoapplicantIncome is above 15000.
condCAI = df_train_no_outliers['CoapplicantIncome'].gt(15000)
df_train_no_outliers.drop(index=df_train_no_outliers.index[condCAI], inplace=True)

In [None]:
# Drop rows whose LoanAmount is above 400 or below 40.
# Comparisons against NaN are False, so rows with a missing LoanAmount are kept.
condLA = df_train_no_outliers['LoanAmount'].gt(400) | df_train_no_outliers['LoanAmount'].lt(40)
df_train_no_outliers.drop(index=df_train_no_outliers.index[condLA], inplace=True)

In [None]:
# Drop rows whose Loan_Amount_Term is above 400 or below 100.
# Comparisons against NaN are False, so rows with a missing term are kept.
condLAT = df_train_no_outliers['Loan_Amount_Term'].gt(400) | df_train_no_outliers['Loan_Amount_Term'].lt(100)
df_train_no_outliers.drop(index=df_train_no_outliers.index[condLAT], inplace=True)

In [None]:
# Dropping rows left gaps in the index; rebuild a contiguous 0..n-1 index
# and show the resulting shape.

df_train_no_outliers = df_train_no_outliers.reset_index(drop=True)
df_train_no_outliers.shape

In [None]:
# Number of rows removed as outliers.

len(train_data) - len(df_train_no_outliers)

In [None]:
# Column summary (entries, non-null counts, dtypes) after deleting all the outliers.

df_train_no_outliers.info()

<h1 id="missing_values">Treatment of missing values</h1>

In [None]:
# Copy the outlier-free frame into a new one, so each step of the process
# keeps its own DataFrame and can be revisited.

df_no_omv = df_train_no_outliers.copy() # "no_omv" = no outliers, (soon) no missing values.

In [None]:
# Count the missing values per column and keep only the columns that have some.

col_missing_values = df_no_omv.isna().sum().loc[lambda counts: counts > 0]

print(col_missing_values)

In [None]:
# How important are those missing values compared with the total number of cells?
# The missing counts come from df_no_omv (outliers already removed), so the
# denominator must be the cell count of that same frame, not of the raw train_data.

total_cells = df_no_omv.size  # rows * columns
total_missing = col_missing_values.sum()

print('Total cells :', total_cells)
print('Total missing values :', total_missing)
print('Percentage of missing cells (missing/total values) :' , (total_missing/total_cells) * 100, "%")

In [None]:
# Treating missing values for numerical values LoanAmount & Loan_Amount_Term with their mean.

In [None]:
# Summary statistics of LoanAmount (the mean is used below to fill its missing values).
df_no_omv.LoanAmount.describe()

In [None]:
# Summary statistics of Loan_Amount_Term (the mean is used below to fill its missing values).
df_no_omv.Loan_Amount_Term.describe()

In [None]:
# Fill the missing LoanAmount / Loan_Amount_Term values with the column mean.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on df[col] is chained assignment, which acts on a temporary object under
# pandas Copy-on-Write and may leave df_no_omv unchanged.
for numeric_col in ['LoanAmount', 'Loan_Amount_Term']:
    df_no_omv[numeric_col] = df_no_omv[numeric_col].fillna(df_no_omv[numeric_col].mean())

In [None]:
# Credit_History has only two possible values: 0 and 1.

# Its missing values will be filled with the most frequent one (1);
# the summary below shows how the values are distributed.

df_no_omv['Credit_History'].describe()

In [None]:
# Fill missing Credit_History with its most frequent value.
# mode() states that intent directly; the first quartile only equals the most
# frequent value while more than 75% of the rows share it.
# Assigning back avoids chained in-place fillna, which is unreliable under
# pandas Copy-on-Write.
most_frequent_credit_history = df_no_omv['Credit_History'].mode().iloc[0]
df_no_omv['Credit_History'] = df_no_omv['Credit_History'].fillna(most_frequent_credit_history)

In [None]:
# Recount the missing values: only categorical columns should still appear.

missing_per_column = df_no_omv.isna().sum()
col_missing_values = missing_per_column[missing_per_column.gt(0)]

print(col_missing_values)

In [None]:
# Fill the remaining missing values with each column's most frequent value
# (value_counts() sorts by frequency, so its first index entry is that value).

col_miss = ['Gender','Married','Dependents','Self_Employed']

for col in col_miss:
    most_frequent = df_no_omv[col].value_counts().index[0]
    # Assign back instead of fillna(..., inplace=True) on df[col]: the chained
    # in-place form is unreliable under pandas Copy-on-Write.
    df_no_omv[col] = df_no_omv[col].fillna(most_frequent)

In [None]:
# Sanity check: every column should now report 0 missing values.

df_no_omv.isna().sum()

<h1 id="encoding">Encoding</h1>

In [None]:
# Now we are going to encode our categorical columns as numerical values (0/1) so that our dataframe only contains numerical values

In [None]:
# Reminder of the categorical columns that are about to be one-hot encoded.
categorical_columns

In [None]:
# One-hot encode the categorical columns: each category becomes its own 0/1 column.
encoder = OneHotEncoder(handle_unknown='ignore')

# The encoder returns a SciPy sparse matrix by default. Densifying it with
# .toarray() works on every scikit-learn version, unlike the `sparse=` argument,
# which was renamed `sparse_output` and then removed.
encoded_values = encoder.fit_transform(df_no_omv[categorical_columns]).toarray()

# The encoder drops column names. get_feature_names() was replaced by
# get_feature_names_out() in newer releases, so use whichever one exists.
feature_namer = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
encoded_features = pd.DataFrame(
    encoded_values,
    columns=feature_namer(list(categorical_columns)),
    index=df_no_omv.index,  # explicit row alignment for the concat below
)

# Join the remaining columns with the encoded ones. df_no_omv itself is not
# modified, so this cell can be re-run.
my_df_encoded = pd.concat([df_no_omv.drop(columns=categorical_columns), encoded_features], axis=1)


In [None]:
my_df_encoded.head() # Each value of a categorical column now has its own column holding 0 or 1.

In [None]:
my_df_encoded.info() # The encoded columns are now numeric instead of object.

In [None]:
# Encode the target column Loan_Status (the column we want to predict) as 0/1.
# LabelBinarizer is the encoder intended for labels rather than features.

lb = preprocessing.LabelBinarizer()
loan_status_binary = lb.fit_transform(my_df_encoded['Loan_Status'])
target_encoded = pd.DataFrame(loan_status_binary, columns=['Loan_Status'])

In [None]:
# Swap the original Loan_Status column for its 0/1 encoding, placed as the first column.
features_without_target = my_df_encoded.drop(columns='Loan_Status')

my_df_encoded = pd.concat([target_encoded, features_without_target], axis=1)

In [None]:
my_df_encoded.head() # The target Loan_Status is now encoded as 0/1 and sits in the first column.

In [None]:
# Distribution of the processed data (outliers removed, missing values filled,
# categories encoded): one count plot per 0/1 feature, split by Loan_Status.

col_features = [
    'Credit_History',
    'Gender_Female', 'Gender_Male',
    'Married_No', 'Married_Yes',
    'Dependents_0', 'Dependents_1', 'Dependents_2', 'Dependents_3+',
    'Education_Graduate', 'Education_Not Graduate',
    'Self_Employed_No', 'Self_Employed_Yes',
    'Property_Area_Rural', 'Property_Area_Semiurban', 'Property_Area_Urban',
]

plt.figure(figsize=(20,25))
for i, feature in enumerate(col_features, start=1):
    plt.subplot(4, 4, i)  # 16 features on a 4x4 grid
    sns.countplot(data=my_df_encoded, x=feature, hue='Loan_Status', palette='viridis')
    plt.xlabel(feature, fontsize=14)
    plt.ylabel(' ')
plt.show()

<h1 id="features_selection">Features selection</h1>

In [None]:
# Distribution of the target.
# Always predicting the majority class (loan approved) already gets about 70%,
# so a useful model has to score above that.

ax = sns.countplot(x = 'Loan_Status', data = my_df_encoded,  palette='viridis')

# Annotate each bar with its share of the rows. The total comes from the data
# rather than hard-coded counts, so the labels stay right if the
# outlier/cleaning rules change the number of rows.
total = len(my_df_encoded)
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2.,
            height + 3,
            '{:1.1f}'.format(height / total),
            ha="center")

In [None]:
# First feature selection: Credit_History shows the strongest correlation with Loan_Status.

# Only numeric columns can be correlated. my_df_encoded still holds the string
# Loan_ID column, which makes DataFrame.corr() raise on pandas >= 2.0.
correlations = my_df_encoded.select_dtypes(include='number').corr()

# Hide the upper triangle (diagonal included) so each pair appears once.
# The mask is built from the shape, not the values, so a correlation that is
# exactly 0 is hidden too.
matrix = np.triu(np.ones_like(correlations, dtype=bool))

plt.figure(figsize=(15,9))
sns.heatmap(correlations, mask=matrix, cmap="GnBu",annot=True, cbar=False)

<h1 id="use_of_models">Use of different models [Decision Tree / Naive Bayes / Logistic Regression]</h1>

In [None]:
# It's a classification problem so we are going to use :

#- Logistic regression  
#- Decision Tree
#- Random Forest
#- Naive Bayes

# Then we'll get deeper into the metrics of the logistic regression

In [None]:
# We import all the good stuff from sklearn :-)

from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

from sklearn.preprocessing import binarize

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from math import sqrt



In [None]:
# Start with every feature in X and Loan_Status as the target.
# Columns are selected by label rather than by position: a positional slice
# would silently pull the Loan_ID string or the target itself into X if the
# column order ever changed.

X = my_df_encoded.drop(columns=['Loan_Status', 'Loan_ID'])
y = my_df_encoded['Loan_Status']

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Train a logistic regression on the training split, then predict the held-out rows.
lgr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = lgr.predict(X_test)

# Accuracy on both splits: a large gap between them would indicate overfitting.
train_accuracy = lgr.score(X_train, y_train)
test_accuracy = accuracy_score(y_test, y_pred)
print('Train set score : ', train_accuracy)
print('Test set score : ', test_accuracy)

In [None]:
# Good result. No overfitting (variance between result you get on the train set and the test set, when your model generalize poorly).

# What are the best features to choose ? 

In [None]:
# Maybe we can optimize those results using less features.

# We can do this with RFECV = Recursive Feature Elimination and Cross-Validated selection of the best number of features.


# In KFolds, each test set should not overlap, even with shuffle. 
# With KFolds and shuffle, the data is shuffled once at the start, 
# and then divided into the number of desired splits. 
# The test data is always one of the splits, the train data is the rest.

# 10-fold stratified cross-validation.
skf = StratifiedKFold(n_splits=10)

# Logistic regression is the estimator used to rank the features.
estimator = LogisticRegression(
    C=10,
    max_iter=1000,
    penalty='l2',
    solver='lbfgs',
)

# Recursive feature elimination, removing one feature per step and scoring
# each candidate subset by cross-validated accuracy; fit on the full data.
selector = RFECV(estimator, step=1, cv=skf, scoring="accuracy").fit(X, y)

In [None]:
# Ranking of the features (1 = selected) and the boolean mask of selected features.
# Both arrays follow the column order of the X passed to selector.fit.

print(selector.ranking_)
print(selector.support_)

In [None]:
# Credit_History is the best feature, which matches the heatmap. Other features look useful as well.
# The column names below are shown to read the ranking arrays against.

my_df_encoded.iloc[:, 2:].columns

In [None]:
# Try again with the single best feature, Credit_History.
# The double brackets keep X a one-column DataFrame rather than a Series.

X = my_df_encoded[['Credit_History']]
y = my_df_encoded.iloc[:, 0]

In [None]:
# Same 80/20 split and seed as before, now on the single-feature X.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Refit the logistic regression on Credit_History only and score both splits.
lgr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = lgr.predict(X_test)

train_accuracy = lgr.score(X_train, y_train)
test_accuracy = accuracy_score(y_test, y_pred)
print('Train set score : ', train_accuracy)
print('Test set score : ', test_accuracy)

In [None]:
# We get the same score..

In [None]:
# The Credit_History is the column that has the most influences on the results, 

# but we are going to test with the top 5 features so we can make our model more stable

In [None]:
# Hand-written list of the five features kept for the final model.
# NOTE(review): presumably chosen from the RFECV ranking and the heatmap above;
# it is not derived programmatically here, so confirm it after any data change.
top_5_features = ['Credit_History', 'Property_Area_Semiurban', 'Education_Graduate', 'Dependents_2', 'Married_Yes']
print('Top 5 features : ', top_5_features)

In [None]:
# Restrict X to the top 5 features; the target is still the first column.
X = my_df_encoded[top_5_features]
y = my_df_encoded.iloc[:, 0]

In [None]:
# Same 80/20 split and seed as before, now on the top-5 feature X.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Refit the logistic regression on the top 5 features and score both splits.
lgr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = lgr.predict(X_test)

train_accuracy = lgr.score(X_train, y_train)
test_accuracy = accuracy_score(y_test, y_pred)
print('Train set score : ', train_accuracy)
print('Test set score : ', test_accuracy)

In [None]:
# We still get our score. It seems to be a good idea to keep those columns.
# It seems risky to me to base our prediction only on one column. With more data, maybe the situation would change so it's better to back it up with other features.

In [None]:
# Go back to all the features to compare the other classifiers.
# Selection is by label, not position, so a change in column order cannot
# leak the Loan_ID string or the target into X.
X = my_df_encoded.drop(columns=['Loan_Status', 'Loan_ID'])
y = my_df_encoded['Loan_Status']

In [None]:
# Same 80/20 split and seed as before, on the full feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

## **Decision Tree**

In [None]:
# Decision tree on all the features.
# random_state is fixed (same seed as the train/test split) so the scores are
# reproducible: an unseeded tree can give different results from run to run.
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(X_train, y_train)
y_pred_dtc = dtc.predict(X_test)
score = accuracy_score(y_test, y_pred_dtc)  # computed once and reused below

print('Train set score : ', dtc.score(X_train, y_train))
print('Test set score : ', score)

## **Random Forest**

In [None]:
# Random forest on all the features.
# The forest is randomised (bootstrap samples, feature subsampling); fixing
# random_state makes the reported scores reproducible between runs.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)

print('Train set score : ', rfc.score(X_train, y_train))
print('Test set score : ', accuracy_score(y_test,y_pred_rfc))

## **Naive Bayes**

In [None]:
def _fit_and_report(title, model):
    """Fit `model` on the training split, print its train/test accuracy under
    a banner and `title`, and return its predictions for X_test."""
    print('==============================')
    print(title)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print('Train set score : ', model.score(X_train, y_train))
    print('Test set score : ', accuracy_score(y_test, predictions))
    return predictions


# Compare the three naive Bayes variants on the same split.
gnb = GaussianNB()
y_pred1 = _fit_and_report('GAUSSIAN :', gnb)

mnb = MultinomialNB()
y_pred2 = _fit_and_report('MULTINOMIAL :', mnb)

bnm = BernoulliNB()
y_pred3 = _fit_and_report('BERNOUILLI :', bnm)

In [None]:
# Our Logisitic Regression does as good as those others classifiers if we select the top 5 features but has overfitting or poor results when we select all columns.

In [None]:
# Baseline: a dummy classifier that always predicts the most frequent class.
# Most of our models are more accurate than this.

from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)

y_pred_dummy = dummy_clf.predict(X_test)
scr = dummy_clf.score(X_train, y_train)       # accuracy on the training split
score = accuracy_score(y_test, y_pred_dummy)  # accuracy on the held-out test split
print('Score on train set : ', scr)
# `score` is the test-set accuracy; the label used to say "training set".
print('Score on test set : ', score)

<h1 id="metrics_threshold">Model metrics & classification threshold modification</h1>


In [None]:
# Rebuild the selected model: logistic regression on the top 5 features.

X = my_df_encoded[top_5_features]
y = my_df_encoded.iloc[:, 0]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

lgr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = lgr.predict(X_test)

train_accuracy = lgr.score(X_train, y_train)
test_accuracy = accuracy_score(y_test, y_pred)
print('Train set score : ', train_accuracy)
print('Test set score : ', test_accuracy)

In [None]:
# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall are reported under each other's name.
print(classification_report(y_test, y_pred))

In [None]:
# Plot the row-normalised confusion matrix for the test set, then for the train set.

for features, labels, title in (
    (X_test, y_test, 'Confusion matrix test set'),
    (X_train, y_train, 'Confusion matrix train set'),
):
    plot_confusion_matrix(lgr, features, labels, cmap=plt.cm.Blues, display_labels=['No', 'Yes'], normalize='true')
    plt.grid(False)  # the darkgrid theme would draw lines across the cells
    plt.title(title)
    plt.show()

What do we see? The predictions are heavily skewed toward Yes: the true positive rate is 0.99. As we saw, our data are already biased toward Yes (70%).

Our model does poorly when it comes to predicting No: it predicts Yes instead for 0.56 of the actual No cases. That's the false positive rate, and it isn't good.
The choice of the metric depends on the business goals and risks.
We can imagine that, for a bank, wrongly accepting a loan can be very costly.

So it would be interesting to change the probability threshold.
Indeed, scikit-learn uses a threshold of P > 0.5 for binary classification.
The bar for accepting a loan should be higher, i.e. we should require a higher predicted probability before answering Yes.

Let's try to do that!!

In [None]:
# Predicted class probabilities for the test set: one row per sample,
# one column per class (in the order of lgr.classes_). The first 10 rows are shown.

y_pred_proba = lgr.predict_proba(X_test)
y_pred_proba[:10] 

In [None]:
# Distribution of the predicted probabilities of No and Yes over the test set.
# predict_proba has one column per class in lgr.classes_ order (0 = No, 1 = Yes).
# The two series are labelled, otherwise the overlaid bars cannot be told apart.

plt.figure(figsize=(13,7))
plt.hist(y_pred_proba, label=['P(No)', 'P(Yes)'])
plt.xlim(0,1)
plt.xlabel('Pred proba of Yes/No')
plt.ylabel('Frequency')
plt.title('Histogram of predicted probabilities')
plt.legend()

In [None]:
# Raising the threshold trades one kind of error for the other: it is not as if
# we could simply move it to 0.55 and get better results on both sides.

# The process is shown for learning purposes; it will not really help us here.

# Classify as Yes only when P(Yes) (second column of y_pred_proba) exceeds 0.75.
# `threshold` must be passed by keyword: scikit-learn >= 1.0 rejects it as a
# positional argument.
y_pred_class = binarize(y_pred_proba, threshold=0.75)[:,1]

In [None]:
y_pred_class[0:5] # First five predictions obtained with the 0.75 threshold.

In [None]:
y_pred_proba[0:5] # The predictions above follow the right-hand column: a probability of 0.68 becomes 0 because it is below 0.75.

In [None]:
print(confusion_matrix(y_test, y_pred)) # Confusion matrix for the default threshold of 0.5 (rows = actual, columns = predicted).

In [None]:
print(confusion_matrix(y_test, y_pred_class)) # Confusion matrix for the 0.75 threshold (rows = actual, columns = predicted).

In [None]:
# When we compare our confusion matrix we can see that the false positive 16 become 6. We have lowered them. 
# But it's at the cost of getting a lower true positive (77 become 58)
# So it may not really be interesting.

# We can find the best balance of both with the ROC curve

In [None]:
# ROC curve: true/false positive rates as the decision threshold sweeps over the
# predicted probability of Yes (second column of predict_proba).
# roc_curve needs these continuous scores. Fed with the hard 0/1 predictions it
# can only produce one operating point joined to the corners by straight lines.
yes_probability = lgr.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, yes_probability)

plt.figure(figsize=(12,7))
plt.plot(fpr, tpr)
plt.title('ROC curve for logistic regression')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])

In [None]:
# If we want a false positive rate at 0.2 we have to accept a true positive rate at 0.4 ...

# The curve doesn't show us the threshold value. We have to calculate it from those datas.

# We can see that for our 0.97 of true positive rate, we got around 0.5 of false positive rate. That's what our confusion matrix showed us.

<h1 id="conclusion">Conclusion</h1>

### The false positive rate (loans wrongly granted) can be improved, but this comes at the cost of our true positive rate (loans correctly granted). The choice depends on the business goal. We don't have enough data to draw a strong conclusion.

### Our best model still get an accuracy of :
    
## **==> 83.9% <==**


### If there is any way I can improve my work, please feel free to make a suggestion.
### Thank you! :)