# Loan approval prediction

### Data Exploration 

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

from ipywidgets import widgets
import matplotlib.pyplot as plt
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
# Load the training CSV from the Kaggle input directory and preview the first rows.
data = pd.read_csv("/kaggle/input/loan-predication/train_u6lujuX_CVtuZ9i (1).csv")
data.head()

- Loan_ID: Unique Loan ID
- Gender: Male/ Female
- Married: Applicant married (Y/N)
- Dependents: Number of dependents
- Education: Applicant Education (Graduate/ Under Graduate)
- Self_Employed: Self employed (Y/N)
- ApplicantIncome: Applicant income
- CoapplicantIncome: Coapplicant income
- LoanAmount: Loan amount in thousands
- Loan_Amount_Term:	Term of loan in months
- Credit_History: credit history meets guidelines
- Property_Area: Urban/ Semi Urban/ Rural
- Loan_Status: Loan approved (Y/N)

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.describe()

##### Feature engineering

In [None]:
def missing_values(data):
    """Print the count and percentage of missing values for each incomplete column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The report is written to standard output, one line per column that
        has at least one missing value. Complete columns are skipped.
    """
    null_counts = data.isnull().sum()
    n_rows = len(data)
    # `> 0` (not `> 1`) so a column with exactly one missing value is reported too.
    for feature, n_missing in null_counts[null_counts > 0].items():
        # Scale the fraction to a real percentage so it matches the "%" suffix.
        percentage = np.round(100 * n_missing / n_rows, 2)
        print(feature, ": Number of missing values ==> ", n_missing,
              "  --- missing data percentage ==> ", percentage, "%")
missing_values(data)

In [None]:
# Drop the unique loan identifier column in place; it is not used as a feature.
data.drop('Loan_ID', axis = 1, inplace=True)

In [None]:
data.columns

In [None]:
# Column groups used by the interactive plots below: `categoricalValues` are drawn
# as count plots, `numericalValues` as distribution plots.
categoricalValues = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed','Loan_Amount_Term',
                     'Credit_History', 'Property_Area']
numericalValues = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

In [None]:
# Class balance of the target. The column is passed by keyword (`x=`): seaborn
# treats the first positional argument as `data`, not as the variable to count.
ax = sns.countplot(x=data['Loan_Status'])
# The title does not depend on the bars, so it is set once instead of per patch.
plt.title('loan approvement status', fontsize = 16)
for p in ax.patches:
    # Write each bar's count just above it, roughly centred horizontally.
    ax.annotate(p.get_height(), (p.get_x()+p.get_width()/2.5, p.get_height()), fontsize=12)

In [None]:
def plotFunction(column):
    """Draw a count plot of `column`, split by loan status, with each bar's count annotated."""
    axes = sns.countplot(data=data, x=column, hue='Loan_Status')
    plt.ylabel('Count')
    for bar in axes.patches:
        height = bar.get_height()
        # Place the label slightly left of the bar centre, at the top of the bar.
        x_text = bar.get_x() + bar.get_width() / 2.6
        axes.annotate(height, (x_text, height), fontsize=12)

# Dropdown entries map each column name to itself (label -> value).
dropdown_menu = dict(zip(categoricalValues, categoricalValues))

widgets.interact(plotFunction, column=dropdown_menu);

In [None]:
def plotFunction(column):
    """Plot the distribution of a numerical column as a histogram with a KDE overlay.

    Parameters
    ----------
    column : str
        Name of the column of the global `data` frame to plot.
    """
    # `sns.distplot` is deprecated; `histplot(..., kde=True)` is its replacement
    # and plots counts, which is what the y-axis label below states.
    sns.histplot(data[column], kde=True)
    plt.xlabel(str(column))
    plt.ylabel('Count')

# Dropdown entries map each numerical column name to itself (label -> value).
dropdown_menu = {i:i for i in numericalValues}

widgets.interact(plotFunction, column=dropdown_menu);

In [None]:
# Box plot of the third numerical column (numericalValues[2], i.e. 'LoanAmount')
# for each loan status.
sns.boxplot(y=numericalValues[2], data=data, x = 'Loan_Status')
plt.show()

#### Imputing Missing Values

In [None]:
# Categorical variables are filled with their most frequent class.
# The result is assigned back to the column: `data[col].fillna(..., inplace=True)`
# operates on an intermediate Series, which pandas warns about and which does
# not update `data` under Copy-on-Write.
for column in ['Gender', 'Married', 'Dependents', 'Self_Employed',
               'Loan_Amount_Term', 'Credit_History']:
    most_frequent = data[column].value_counts().idxmax()
    data[column] = data[column].fillna(most_frequent)

# Missing numerical values are filled with the column median.
data["LoanAmount"] = data["LoanAmount"].fillna(data["LoanAmount"].median(skipna=True))

#### Encode categorical variables using weight of evidence (WOE)

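WOE = ln (p(1) / p(0)), computed separately for each category of a variable.

p(1) is the proportion of observations in that category whose target is 1.<br>
p(0) is the proportion of observations in that category whose target is 0.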

**Advantages of WOE Encoding:**
- Creates a monotonic relationship between the target and the variables.
- Orders the categories on a logistic scale, which is natural for logistic regression.
- Determine which variable is more predictive.

In [None]:
# Encode the target as an integer: 'N' -> 0, 'Y' -> 1.
data['Loan_Status'] = data['Loan_Status'].map({'N': 0, 'Y': 1})

In [None]:
# Correlation matrix (only numerical variables).
# Non-numeric columns are filtered out explicitly: recent pandas versions raise
# in `DataFrame.corr()` on string columns instead of silently dropping them.
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(),annot=True,linewidths=0.2,annot_kws={'size':12})
fig=plt.gcf()
fig.set_size_inches(12,8)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split with a fixed seed. Every column except the last one is
# used as a feature (assumes Loan_Status is the last column of `data`).
features = data.iloc[:, :-1]
target = data['Loan_Status']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42, stratify=target
)

In [None]:
# Work on copies of the splits and attach the labels as a 'target' column.
train = X_train.copy()
test = X_test.copy()
train['target'] = y_train
test['target'] = y_test

In [None]:
# Encode each categorical column by the log-odds of the target inside each
# category. The mapping is estimated on the training split only and then
# applied to both splits.
eps = 1e-6  # keeps the log finite when a category contains a single class
# Log-odds of the whole training target: fallback for values without a mapping.
base_rate = np.clip(train['target'].mean(), eps, 1 - eps)
default_woe = np.log(base_rate / (1 - base_rate))
for variable in ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed','Credit_History', 'Property_Area']:
    # Clipping only changes rates that are exactly 0 or 1 (which would give +/-inf).
    target_rate = train.groupby([variable])['target'].mean().clip(eps, 1 - eps)
    woe = np.log(target_rate / (1 - target_rate))
    ratio_mapping = woe.to_dict()
    # Values absent from the mapping (e.g. a category seen only in the test
    # split) would otherwise become NaN and break the models downstream.
    train[variable] = train[variable].map(ratio_mapping).fillna(default_woe)
    test[variable] = test[variable].map(ratio_mapping).fillna(default_woe)

In [None]:
# Correlation heatmap of the encoded training frame (features and 'target').
corr_matrix = train.corr()
sns.heatmap(corr_matrix, annot=True, linewidths=0.2, annot_kws={'size': 12})
fig = plt.gcf()
fig.set_size_inches(12, 8)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=14)
plt.show()

==> The credit history and the applicant income are the most positively correlated variables with our target

## Modelling

In [None]:
# Separate the encoded features (all columns but the last, 'target') from the labels.
X_train, X_test = train.iloc[:,:-1], test.iloc[:,:-1]
y_train, y_test = train['target'], test['target']

In [None]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler

In [None]:
# Fit the scaler on the training split only, then apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier
import graphviz
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score, roc_curve

We are dealing with **binary** classification where  $y_j$ can take only 0 or 1. 

$$ y_j = f(X_j). $$

Firstly, we are going to use the logistic regression model, which is the classification counterpart of linear regression. It is a probabilistic model: it predicts probabilities that can then be used to assign class labels. <br>
$$ p_j = \sigma \left( \sum_{i} X_{ji}\beta_i  + \beta_0 \right), $$
where
$$ \sigma(x) = \frac{1}{1 + e^{-x}} $$

**Performance Metrics**

|                     | Positive Observation     | Negative Observation    |
|---------------------|:------------------------:|:-----------------------:|
| Positive Prediction |     True Positive (TP)   | False Positive (FP)     |
| Negative Prediction | False Negative (FN)      |     True Negative (TN)  |

$$ \text{accuracy} = \frac{\text{number of correct predictions}}{\text{number of observations}}.$$

$$ \text{precision} = \frac{\text{TP}}{TP + FP}$$

$$ \text{recall} = \frac{\text{TP}}{TP + FN}. $$

$$ \text{F1-Score} = \frac{2 \cdot \text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}. $$

In [None]:
# Logistic regression (liblinear solver) trained and evaluated on the standardized features.
lr = LogisticRegression(max_iter=1000, solver='liblinear')
lr.fit(X_train_std, y_train)
y_pred_lr = lr.predict(X_test_std)  
print("Accuracy Score: ", accuracy_score(y_test, y_pred_lr),"****   F1_score: ", f1_score(y_test, y_pred_lr))

In [None]:
# Logistic regression (default solver) on the unscaled features.
# NOTE: this rebinds `lr` and `y_pred_lr`, so the cells that follow use this
# unscaled model, not the standardized one fitted just before.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test) 
print("Accuracy Score: ", accuracy_score(y_test, y_pred_lr),"****   F1_score: ", f1_score(y_test, y_pred_lr))

In [None]:
# Predicted probability of the positive class (column 1) for every test row.
lr_probs = lr.predict_proba(X_test)[:, 1]
lr_auc = roc_auc_score(y_test, lr_probs)
print('Logistic: ROC AUC= ' ,lr_auc)
# calculate roc curves

# The thresholds returned by roc_curve are not needed for the plot, hence `_`.
fpr, tpr, _ = roc_curve(y_test, lr_probs)
plt.plot(fpr, tpr, label='Logistic')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

plt.legend()
plt.show()

In [None]:
# Changing threshold: predict class 1 only when its predicted probability
# exceeds 0.75, then recompute the metrics for that stricter rule.
y_pred_lr = (lr.predict_proba(X_test)[:,1]>0.75).astype(np.int32)
print("Accuracy Score: ", accuracy_score(y_test, y_pred_lr),"****   F1_score: ", f1_score(y_test, y_pred_lr))

In [None]:
from sklearn.dummy import DummyClassifier
# DummyClassifier is a baseline classifier that makes predictions using simple rule
# we will use it as benchmark to evaluate our model performance 

In [None]:
# Baseline that always predicts the most frequent training class, scored with
# the same metrics as the real models.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
y_pred_dummy = dummy_clf.predict(X_test) 
print("Accuracy Score: ", accuracy_score(y_test, y_pred_dummy),"****   F1_score: ", f1_score(y_test, y_pred_dummy))

#### XGBoost classifier

In [None]:
# xgboost_parameters = {
#     'max_depth' : [3,5,7,9,12,15,17,25],
#     'learning_rate' : [0.01,0.015,0.025,0.05,0.1],
#     'gamma' : [0.05,0.1,0.3,0.5,0.7,0.9,1],
#     'min_child_weight' : [1,3,5,7], 
#     'subsample' : [0.6,0.7,0.8,0.9,1],
#     'colsample_bytree' : [0.6,0.7,0.8,0.9,1],
#     'reg_alpha' : [0.01,0.1,1],
#     'reg_lambda' : [0.1,0.5,1],
# }

In [None]:
# xgbmodel = XGBClassifier()
# gs = GridSearchCV(xgbmodel, xgboost_parameters)
# gs.fit(X_train, y_train)

gs.best_params_<br>
{'colsample_bytree': 0.6,
 'gamma': 0.1,
 'learning_rate': 0.05,
 'max_depth': 7,
 'min_child_weight': 5,
 'reg_alpha': 0.01,
 'reg_lambda': 0.5,
 'subsample': 1}

| hyperparameter | description |
| --- | --- |
| learning_rate | step size shrinkage used to prevent overfitting. Range is [0,1] |
| max_depth | determines how deeply each tree is allowed to grow during any boosting round |
| subsample | percentage of samples used per tree. Low value can lead to underfitting. |
| colsample_bytree | percentage of features used per tree. High value can lead to overfitting. |
| n_estimators | number of trees you want to build. |
| objective | determines the loss function to be used, e.g. reg:squarederror (formerly reg:linear) for regression problems, reg:logistic for logistic regression, binary:logistic for binary classification problems with probability output. |
| gamma | controls whether a given node will split based on the expected reduction in loss after the split. A higher value leads to fewer splits. Supported only for tree-based learners. |
| alpha | L1 regularization on leaf weights. A large value leads to more regularization. |
| lambda | L2 regularization on leaf weights and is smoother than L1 regularization. |

In [None]:
import os
os.environ["PATH"] += os.pathsep + 'C:/Users/youssef.amdouni/Anaconda3/Library/bin/graphviz'
import xgboost as xgb
import graphviz

In [None]:
# XGBoost classifier configured with the hyper-parameters recorded as
# `gs.best_params_`, trained and evaluated on the unscaled features.
xgbmodel = XGBClassifier(colsample_bytree = 0.6,
                        gamma= 0.1,
                        learning_rate = 0.05,
                        max_depth = 7,
                        min_child_weight = 5,
                        reg_alpha = 0.01,
                        reg_lambda = 0.5,
                        subsample = 1)
xgbmodel.fit(X_train, y_train)
y_pred_xgb = xgbmodel.predict(X_test) 
print("Accuracy Score: ", accuracy_score(y_test, y_pred_xgb),"****   F1_score: ", f1_score(y_test, y_pred_xgb))

In [None]:
# Draw the first boosted tree on an explicitly sized figure. Setting
# rcParams['figure.figsize'] after the plot call only affects figures created
# later, so it never resized this one.
fig, ax = plt.subplots(figsize=(16, 10))
xgb.plot_tree(xgbmodel, num_trees=0, ax=ax)
plt.show()

In [None]:
# Feature-importance chart on an explicitly sized figure. Setting
# rcParams['figure.figsize'] after the plot call does not resize the figure
# that has already been drawn.
fig, ax = plt.subplots(figsize=(16, 6))
xgb.plot_importance(xgbmodel, ax=ax)
plt.show()

#### Classification using neural network 

Good tutorial to build neural network with pytorch [Tutorial link.](https://uvadlc-notebooks.readthedocs.io/en/latest/index.html)

In [None]:
# Wrap the scaled arrays back into DataFrames that carry the original feature names.
X_train_std = pd.DataFrame(X_train_std, columns=X_train.columns) 
X_test_std = pd.DataFrame(X_test_std, columns=X_train.columns)
print(X_train_std.shape, X_test_std.shape, data.shape)

In [None]:
# Seed every random number generator used by the neural-network section.
# Seeding NumPy alone does not make PyTorch's weight initialisation and
# dropout reproducible, so the torch generator is seeded as well.
import torch

np.random.seed(250)
torch.manual_seed(250)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, TensorDataset, DataLoader

In [None]:
class loadData(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    `X_data` and `y_data` are stored as given and indexed with the same
    position, so they must support indexing and be aligned with each other.
    """

    def __init__(self, X_data, y_data):
        self.X_data, self.y_data = X_data, y_data

    def __len__ (self):
        # The number of samples is taken from the feature container.
        return len(self.X_data)

    def __getitem__(self, index):
        features = self.X_data[index]
        label = self.y_data[index]
        return features, label

In [None]:
# Convert the scaled features and the labels to float32 tensors and wrap each
# split in a dataset.
X_train_tensor = torch.tensor(X_train_std.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
train_data = loadData(X_train_tensor, y_train_tensor)

X_test_tensor = torch.tensor(X_test_std.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)
test_data = loadData(X_test_tensor, y_test_tensor)

In [None]:
# Mini-batches of 64 for training; the test set is served one sample at a time.
# NOTE(review): no `shuffle=True` is passed for the training loader, so batches
# come in the same order every epoch - consider shuffling.
train_loader = DataLoader(dataset=train_data, batch_size=64)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [None]:
class binaryClassification(nn.Module):
    """Feed-forward network with two hidden layers that outputs raw logits.

    Architecture: Linear -> ReLU -> BatchNorm -> Linear -> ReLU -> BatchNorm
    -> Dropout(p=0.1) -> Linear. No sigmoid is applied in `forward`, so the
    output is meant for a logits-based loss or an explicit `torch.sigmoid`.

    Parameters
    ----------
    num_inputs : int
        Number of input features.
    num_hidden : int
        Width of both hidden layers (and of their batch-norm layers).
    num_outputs : int
        Number of output units.
    """

    def __init__(self, num_inputs, num_hidden, num_outputs):
        super().__init__()

        self.layer_1 = nn.Linear(num_inputs, num_hidden)
        self.layer_2 = nn.Linear(num_hidden, num_hidden)
        self.layer_out = nn.Linear(num_hidden, num_outputs)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        # The batch-norm width must follow `num_hidden`; it used to be
        # hard-coded to 64, which failed for any other hidden size.
        self.batchnorm1 = nn.BatchNorm1d(num_hidden)
        self.batchnorm2 = nn.BatchNorm1d(num_hidden)

    def forward(self, inputs):
        """Return the logits for a batch of inputs of shape (batch, num_inputs)."""
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)

        return x

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Derive the input width from the feature matrix instead of hard-coding 11, so
# the network keeps matching the data if a column is added or removed.
model = binaryClassification(num_inputs=X_train_std.shape[1], num_hidden=64, num_outputs=1)
model.to(device)

In [None]:
# Binary cross-entropy computed on the raw logits returned by the model, and an
# Adam optimizer over all model parameters.
loss_module = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
from tqdm.notebook import tqdm
def train_model(model, optimizer, data_loader, loss_module, num_epochs=80):
    """Train `model` in place for `num_epochs` passes over `data_loader`.

    Parameters
    ----------
    model : torch.nn.Module
        Network to train; its output is squeezed on dim 1 before the loss.
    optimizer : torch.optim.Optimizer
        Optimizer already bound to the model parameters.
    data_loader : iterable
        Yields `(inputs, labels)` batches of tensors.
    loss_module : callable
        Loss taking `(predictions, float_labels)`.
    num_epochs : int, default 80
        Number of full passes over `data_loader`.
    """
    model.train()

    # Run on the device the model already lives on instead of relying on a
    # global `device` variable being defined by another cell.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Training loop
    for epoch in tqdm(range(num_epochs)):
        for data_inputs, data_labels in data_loader:

            data_inputs, data_labels = data_inputs.to(target_device), data_labels.to(target_device)

            ## Run the model on the input data
            preds = model(data_inputs)
            preds = preds.squeeze(dim=1)

            ## loss
            loss = loss_module(preds, data_labels.float())

            ## Backpropagation (gradients are cleared first so they do not accumulate)
            optimizer.zero_grad()
            loss.backward()

            ## Update parameters
            optimizer.step()

In [None]:
train_model(model, optimizer, train_loader, loss_module)

In [None]:
state_dict = model.state_dict()
#print(state_dict)

In [None]:
def eval_model(model, data_loader):
    """Print and return the accuracy of `model` over `data_loader`.

    The model output is treated as logits: it is squeezed on dim 1, passed
    through a sigmoid and thresholded at 0.5 to obtain the predicted labels.

    Parameters
    ----------
    model : torch.nn.Module
        Network to evaluate; it is switched to evaluation mode.
    data_loader : iterable
        Yields `(inputs, labels)` batches of tensors.

    Returns
    -------
    float
        Fraction of correctly classified samples, in [0, 1].

    Raises
    ------
    ValueError
        If `data_loader` yields no samples.
    """
    model.eval()

    # Run on the device the model already lives on instead of relying on a
    # global `device` variable being defined by another cell.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    true_preds, num_preds = 0, 0

    with torch.no_grad():
        for data_inputs, data_labels in data_loader:

            # Determine prediction of model on dev set
            data_inputs, data_labels = data_inputs.to(target_device), data_labels.to(target_device)
            preds = model(data_inputs)
            preds = preds.squeeze(dim=1)
            preds = torch.sigmoid(preds)
            pred_labels = (preds >= 0.5).long()

            # `.item()` keeps the running count a plain Python int rather than
            # a tensor that may live on the GPU.
            true_preds += (pred_labels == data_labels).sum().item()
            num_preds += data_labels.shape[0]

    if num_preds == 0:
        raise ValueError("data_loader yielded no samples")

    acc = true_preds / num_preds
    print("Accuracy of the model: %4.2f%%" % (100.0*acc))
    return acc

In [None]:
eval_model(model, test_loader)