In [None]:
!pip install seaborn --upgrade

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [None]:
# Load the loan-prediction training CSV into a DataFrame (path is relative to the notebook's working directory).
data = pd.read_csv('../input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv')

# EDA

In [None]:
# Preview the first rows to see the available columns and sample values.
data.head()

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()

In [None]:
# Column dtypes and non-null counts; a first look at where values are missing.
data.info()

In [None]:
# Number of rows per Gender category.
sb.countplot(data=data, x='Gender')

In [None]:
# Number of rows per Married category.
sb.countplot(data=data, x='Married')

In [None]:
# Number of rows per Dependents category.
sb.countplot(data=data, x='Dependents')

In [None]:
# Number of rows per Self_Employed category.
sb.countplot(data=data, x='Self_Employed')

In [None]:
# Distribution of ApplicantIncome, binned in steps of 2000.
sb.displot(data=data, x='ApplicantIncome', binwidth=2000)

In [None]:
# Distribution of LoanAmount, binned in steps of 20.
sb.displot(data=data, x='LoanAmount', binwidth=20)

In [None]:
# Loan_Amount_Term is shown as counts per distinct value rather than as a binned histogram.
sb.countplot(data=data, x='Loan_Amount_Term')

In [None]:
# Number of rows per Credit_History value.
sb.countplot(data=data, x='Credit_History')

# Preprocessing

## Cleaning

In [None]:
# Remove the Loan_ID column (presumably a per-row identifier with no predictive value).
# Non-mutating drop with errors='ignore' keeps this cell re-runnable: executing it a
# second time is a no-op instead of raising KeyError on the already-missing column.
data = data.drop(columns='Loan_ID', errors='ignore')

## Handling missing values

In [None]:
# Count missing values per column, most-affected columns first.
data.isna().sum().sort_values(ascending=False)

In [None]:
# Impute missing values column by column: the most frequent value (mode) for the
# categorical / discrete columns and the median for LoanAmount.
# Every column is filled from its OWN statistic. Self_Employed previously used the
# mode of Dependents, which injected a Dependents category value into Self_Employed.
data = data.fillna({
    'Credit_History': data.Credit_History.mode()[0],
    'Self_Employed': data.Self_Employed.mode()[0],
    'LoanAmount': data.LoanAmount.median(),
    'Dependents': data.Dependents.mode()[0],
    'Loan_Amount_Term': data.Loan_Amount_Term.mode()[0],
    'Gender': data.Gender.mode()[0],
    'Married': data.Married.mode()[0]
})

## Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# One independent LabelEncoder per categorical column (including the target), so each
# fitted mapping stays available afterwards, e.g. for inverse_transform.
encoders = {
    column: LabelEncoder()
    for column in (
        'Gender',
        'Married',
        'Dependents',
        'Education',
        'Self_Employed',
        'Property_Area',
        'Loan_Status',
    )
}

# Replace each column with its integer codes.
for column, encoder in encoders.items():
    data[column] = encoder.fit_transform(data[column])

# Expose every fitted encoder under its own name.
gender_le = encoders['Gender']
married_le = encoders['Married']
dependents_le = encoders['Dependents']
education_le = encoders['Education']
self_employed_le = encoders['Self_Employed']
property_area_le = encoders['Property_Area']
loan_status_le = encoders['Loan_Status']

# Modeling

In [None]:
from sklearn.model_selection import train_test_split

# Target is Loan_Status; every remaining column is a feature.
y = data['Loan_Status']
X = data.drop(columns='Loan_Status')

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline model: a single entropy-based tree with all other hyperparameters left at
# their defaults, seeded for reproducibility.
dt_cls = DecisionTreeClassifier(random_state=42, criterion='entropy')
dt_cls.fit(X=X_train, y=y_train)

In [None]:
# Report how large the fitted baseline tree grew.
tree_depth = dt_cls.get_depth()
leaf_count = dt_cls.get_n_leaves()
print("Decision tree depth :", tree_depth)
print("Decision tree number of leaves :", leaf_count)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Predict on both splits so test scores can be compared with train scores.
y_train_pred, y_test_pred = (dt_cls.predict(split) for split in (X_train, X_test))

# Test-set scores are kept under model-specific names for the final summary table.
dt_acc = accuracy_score(y_test, y_test_pred)
dt_f1 = f1_score(y_test, y_test_pred)

# Train-set scores are only needed for the printed comparison.
dt_train_acc = accuracy_score(y_train, y_train_pred)
dt_train_f1 = f1_score(y_train, y_train_pred)

print("Decision tree cls accuracy on test set : {} (on train set : {})".format(dt_acc, dt_train_acc))
print("Decision tree cls F1 score on test set : {} (on train set : {})".format(dt_f1, dt_train_f1))

#### Results

We notice a perfect fit on the training set with a pretty big drop in performance on the test set indicating we are probably overfitting the training set.  
This is a common issue with decision trees; it is usually addressed by tuning the tree's hyperparameters (e.g. limiting its depth) or by switching to a random forest.

### Decision tree : hyperparameters tuning with CV

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: both impurity criteria, both split strategies, depths 1 through 9.
grid_search_params = dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=range(1, 10),
)

# Seeded base estimator; scoring and CV folds are left at GridSearchCV's defaults.
dt_estimator = DecisionTreeClassifier(random_state=42)
cv_dt_model = GridSearchCV(dt_estimator, grid_search_params)
cv_dt_model.fit(X_train, y_train)

In [None]:
# Inspect the tree that the grid search selected.
best_tree = cv_dt_model.best_estimator_
print("Best depth :", best_tree.get_depth())
print("Best number of leaves :", best_tree.get_n_leaves())
print("Best params :", best_tree.get_params())

In [None]:
# Predict on both splits with the tuned model.
y_train_pred, y_test_pred = (cv_dt_model.predict(split) for split in (X_train, X_test))

# Test-set scores are kept under model-specific names for the final summary table.
cv_dt_acc = accuracy_score(y_test, y_test_pred)
cv_dt_f1 = f1_score(y_test, y_test_pred)

# Train-set scores are only needed for the printed comparison.
cv_dt_train_acc = accuracy_score(y_train, y_train_pred)
cv_dt_train_f1 = f1_score(y_train, y_train_pred)

print("Optimized decision tree cls accuracy on test set : {} (on train set : {})".format(cv_dt_acc, cv_dt_train_acc))
print("Optimized decision tree cls F1 score on test set : {} (on train set : {})".format(cv_dt_f1, cv_dt_train_f1))

#### Results

So our optimized decision tree does indeed perform better on the test set than our previous decision tree indicating we may have reduced overfitting.  
**But cross-validation also led to a surprising discovery: our best tree has a depth of only 1 and just 2 leaves!**

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Ensemble counterpart of the baseline tree: entropy criterion, other hyperparameters
# left at their defaults, seeded for reproducibility.
rf_cls = RandomForestClassifier(random_state=42, criterion='entropy')
rf_cls.fit(X=X_train, y=y_train)

In [None]:
# Predict on both splits with the random forest.
y_train_pred, y_test_pred = (rf_cls.predict(split) for split in (X_train, X_test))

# Test-set scores are kept under model-specific names for the final summary table.
rf_acc = accuracy_score(y_test, y_test_pred)
rf_f1 = f1_score(y_test, y_test_pred)

# Train-set scores are only needed for the printed comparison.
rf_train_acc = accuracy_score(y_train, y_train_pred)
rf_train_f1 = f1_score(y_train, y_train_pred)

print("Random Forest cls accuracy on test set : {} (on train set : {})".format(rf_acc, rf_train_acc))
print("Random Forest cls F1 score on test set : {} (on train set : {})".format(rf_f1, rf_train_f1))

#### Results

We're once again fitting the training set perfectly but our evaluation metrics improved compared to our simple decision tree.

### Explainability : feature importances

Let's try to understand how our models made their predictions, especially our optimized decision tree with only 2 leaves.

In [None]:
# Wide table: one row per feature, one importance column per fitted model.
feature_importances = pd.DataFrame({
    'feature': X_train.columns,
    'decision tree': dt_cls.feature_importances_,
    'optimized decision tree': cv_dt_model.best_estimator_.feature_importances_,
    'random forest': rf_cls.feature_importances_
})

# Every column except the leading 'feature' column holds one model's importances.
model_columns = feature_importances.columns.values[1:]

# Reshape to long form (feature, variable=model, value=importance) and order by
# importance so the largest bars are drawn first.
feature_importances_melted = (
    feature_importances
    .melt(id_vars='feature', value_vars=model_columns)
    .sort_values(by='value', ascending=False)
)

sb.barplot(x='value', y='feature', hue='variable', data=feature_importances_melted)

As we could have guessed, *that* is the reason why our optimized decision tree has only 2 leaves: it bases all of its predictions on **a single feature**!\
Interestingly enough, this feature (the credit history of the applicant) is also the most important one for our random forest, but not for our simple decision tree.

### Correlation matrix

Correlation matrices allow us to check for correlation between our features and help us foresee such outcomes.  
Inspecting correlation matrices is usually part of EDA and it was my mistake not to do it in the first place.

In [None]:
# Pairwise correlations between all (now numeric) columns, drawn as an annotated heatmap.
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sb.heatmap(corr_matrix, ax=ax, annot=True, fmt=".2f", linewidths=0.1)

As expected, our target variable Loan_Status has a high (>.5) positive Pearson correlation coefficient with Credit_History.

# Conclusion

#### Final results (on test set)

In [None]:
from tabulate import tabulate

# One row per model: test-set accuracy and F1 score rounded to two decimals.
results_rows = [
    [model_label, round(test_acc, 2), round(test_f1, 2)]
    for model_label, test_acc, test_f1 in (
        ('Decision tree', dt_acc, dt_f1),
        ('Optimized DT', cv_dt_acc, cv_dt_f1),
        ('Random forest', rf_acc, rf_f1),
    )
]

print(tabulate(results_rows, headers=['Model', 'Accuracy', 'F1 score']))

Our optimized decision tree revolving solely around the credit history feature ends up achieving the best performance.