In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

# Load & Explore Data

In [None]:
# Read the loan-application dataset from the Kaggle input directory.
DATA_PATH = "/kaggle/input/esigning-of-loan-based-on-financial-history/financial_data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# List the column names of the raw data.
df.columns

In [None]:
# Print pandas' summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics for the columns of the raw data.
df.describe()

# Data Cleaning

In [None]:
# Flag, column by column, whether at least one value is missing.
df.isna().any()

In [None]:
# Build the frame used for exploration: leave out the row identifier, the
# target (e_signed) and pay_schedule. `df` itself is left unchanged.
excluded_columns = ["entry_id", "e_signed", "pay_schedule"]
df_new = df.drop(columns=excluded_columns)

In [None]:
# Preview the reduced frame so the effect of the column drop is visible
# (`df` still contains every original column).
df_new.head()

# Visualization

In [None]:
# Histogram for every single column.
# The subplot grid is sized from the number of columns instead of a fixed
# 6x3 layout, which raises as soon as there are more than 18 columns.
hist_cols = 3
n_features = df_new.shape[1]
hist_rows = max(1, -(-n_features // hist_cols))  # ceiling division
fig, axes = plt.subplots(hist_rows, hist_cols, figsize=(15, 2 * hist_rows), squeeze=False)
panels = axes.ravel()
for ax, (column, values) in zip(panels, df_new.items()):
    ax.set_title(column)
    # One bin per distinct value, capped at 100 so that continuous
    # columns stay readable.
    bins = min(len(values.unique()), 100)
    ax.hist(values, bins=bins)
# Hide the panels left over when the column count is not a multiple of hist_cols.
for ax in panels[n_features:]:
    ax.set_visible(False)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

# Correlation Histogram

Correlation is a very important factor: it shows how your variables are connected to each other. A high magnitude means a strong correlation. A + sign means that if one variable increases, the other also increases. A - sign means they are inversely related: if one increases, the other decreases. A negative correlation is still a relationship; only its direction differs.
Points to remember:
<ul>
    <li>High magnitude - high correlation</li>
    <li>+ sign means directly proportional.</li>
    <li>- sign means inversely proportional</li>
    </ul>

In [None]:
# Correlation of every column in df_new with the target column e_signed.
df_new.corrwith(df.e_signed)

In [None]:
# Bar chart of each feature's correlation with the e_signed target.
target_corr = df_new.corrwith(df.e_signed)
target_corr.plot.bar(
    title="Correlation with e-signed",
    figsize=(16, 13),
    fontsize=15,
    rot=60,
    grid=True,
)

In [None]:
# Annotated heatmap of the pairwise correlations between the features.
feature_corr = df_new.corr()
plt.figure(figsize=(15, 12))
sns.heatmap(data=feature_corr, annot=True)

In [None]:
# Preview the first rows of the exploration frame.
df_new.head()

# Feature Engineering

Feature engineering is a broad term. It means transforming your features, often to reduce the number of dimensions. It may include extracting features, deleting a feature, or creating a new feature from a combination of two or more existing ones. In our case, personal_account_m and personal_account_y describe the same quantity: one holds the months part and the other the years part. So we can combine both into a single column holding the total number of months (personal_account_month) and drop the two original columns. This reduces our dimensionality by one column. It is not a big difference, but something is better than nothing.

In [None]:
# Merge the months part and the years part of the personal-account age into
# a single column expressed in months.
account_months = df["personal_account_m"]
account_years = df["personal_account_y"]
df["personal_account_month"] = account_months + 12 * account_years

In [None]:
# Keep the identifier and the target aside before removing them from the
# feature frame.
users = df["entry_id"]
response = df["e_signed"]

# The two personal_account_* columns are replaced by personal_account_month.
# NOTE(review): months_employed is dropped as well without being folded into
# another feature; confirm that this is intended.
columns_to_drop = [
    "entry_id",
    "months_employed",
    "e_signed",
    "personal_account_m",
    "personal_account_y",
]
df = df.drop(columns=columns_to_drop)

In [None]:
# Preview the first rows of the current feature frame.
df.head()

# Handling Categorical Features

Dummy variables are used to handle categorical variables. When creating dummy variables, always use drop_first=True; otherwise you have to drop one dummy column manually to avoid the dummy variable trap (you can search for this term online). In short, never include all the dummy variables of a category; always leave one out. For example, if a feature has n categories, keep only n-1 dummy variables.

In [None]:
# One-hot encode the categorical columns; the first level of each category is
# dropped to avoid the dummy variable trap.
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
# List the column names after one-hot encoding.
df.columns

In [None]:
# Preview the first rows of the encoded feature frame.
df.head()

In [None]:
# Feature matrix and target vector as NumPy arrays.
# The explicit float cast matters: recent pandas versions emit boolean dummy
# columns, and mixing bool with numeric columns makes `df.values` an
# object-dtype array, which Keras cannot convert to a tensor.
X = df.values.astype("float64")
y = response.values

In [None]:
# Display the target array.
y

# Data Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
split_options = {"test_size": 0.3, "random_state": 0}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline random forest: 10 entropy-based trees, each split considering
# 10 features, with a fixed seed.
baseline_settings = dict(
    n_estimators=10,
    max_features=10,
    criterion='entropy',
    random_state=0,
)
clf = RandomForestClassifier(**baseline_settings)
clf.fit(X_train, y_train)

In [None]:
# Predict the labels of the held-out test rows with the fitted classifier.
y_predict = clf.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix of the test predictions (rows: true class, columns:
# predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_predict)
cm

In [None]:
# Accuracy of the baseline model on the test set.
clf.score(X_test, y_test) # The score is poor, so let's apply GridSearchCV to fine-tune the model.

# Fine Tuning using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Search over the forest size and the split criterion; the remaining settings
# are taken from `clf`. Every candidate is scored with 10-fold cross-validation.
params = {
    "n_estimators": [10, 100, 200],
    "criterion": ["entropy", "gini"],
}
gs = GridSearchCV(clf, params, cv=10)

In [None]:
# Run the grid search on the training data.
result = gs.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
result.best_params_

In [None]:
# Best score found by the grid search.
result.best_score_

In [None]:
# The grid search reported a best cross-validated accuracy of roughly 63% in
# the author's run. Given this data that is acceptable, and the case study is
# not sensitive enough to require more.
#
# Use the estimator selected by the search as the final model. GridSearchCV
# refits the best parameter combination on the full training set by default,
# so `best_estimator_` is exactly the model whose score was reported: same
# max_features and random_state as the searched estimator, and no hard-coded
# copy of the winning parameters that can go stale.
clf = result.best_estimator_
y_predict = clf.predict(X_test)
cm = confusion_matrix(y_test, y_predict)
cm

In [None]:
# classification_report returns a multi-line string; print it so the table is
# rendered instead of a repr full of literal "\n" escapes.
print(classification_report(y_test, y_predict))

# Deep Learning Model

In [None]:
import keras
from keras.layers import Dense, Dropout
from keras.models import Sequential

In [None]:
# Small fully connected binary classifier.
# Uses the current Keras argument names (`units`, `kernel_initializer`); the
# Keras 1 spellings `output_dim` / `init` are rejected by current Keras.
# The input width is read from the training matrix instead of being
# hard-coded, so the model follows any change in the feature set.
n_inputs = X_train.shape[1]
clf_n = Sequential([
    Dense(units=10, activation="relu", kernel_initializer="random_uniform", input_shape=(n_inputs,)),
    Dense(units=10, activation="relu", kernel_initializer="random_uniform"),
    Dropout(0.5),
    Dense(units=10, activation="relu", kernel_initializer="random_uniform"),
    # Single sigmoid unit: outputs a value in (0, 1) for the positive class.
    Dense(units=1, activation="sigmoid", kernel_initializer="random_uniform")
])

In [None]:
# Print the network's layer-by-layer summary.
clf_n.summary()

In [None]:
# Binary classification setup: cross-entropy loss, Adam optimizer, and
# accuracy reported during training.
clf_n.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Sanity-check the dimensions of the training features and labels.
for dims in (X_train.shape, np.array(y_train.shape)):
    print(dims)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features before training. The raw columns live on very
# different scales, which makes gradient-based training of the network
# ineffective. The scaler also yields a float array, which Keras requires.
# Reuse `scaler.transform(...)` on any data passed to `clf_n` later
# (e.g. X_test) so it sees inputs on the same scale.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
clf_n.fit(X_train_scaled, y_train, batch_size=20, epochs=50)

In [None]:
# Here we can see that the accuracy of our deep learning network is very low. This may be because of the data, as deep learning requires large datasets. So for our case we will settle for the
# random forest classifier. You can also try different models and fine-tune them, and let us know which one you feel suits our case study best.
# Thanks... UPVOTE IF YOU LIKE THE KERNEL