# Loan Prediction Based on Customer Behaviour

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import pickle

## 1. Problem definition

* Classify the Potential Bank Loan Defaulters

## 2. Data
There are 2 main datasets:

* `Training Data.csv` is the training set; it contains data on previous bank customers, including those who defaulted.
* `Test Data.csv` is the test set, which will be used to test our model's prediction accuracy.


In [1]:
# Read the training CSV into a DataFrame and preview its first rows.
train_df = pd.read_csv('../input/loan-prediction-based-on-customer-behavior/Training Data.csv')
train_df.head()

In [1]:
# Column dtypes and non-null counts of the raw training data.
train_df.info()

In [1]:
# How likely are married vs. single people to be loan defaulters?
# Counts per Married/Single value, split by Risk_Flag.
sns.countplot(x = train_df['Married/Single'] , hue = train_df['Risk_Flag']);

In [1]:
# Distribution of customers by their working experience.
sns.histplot(train_df['Experience'])

In [1]:
# How much experience do loan defaulters usually have?
# The column is passed as x= because seaborn >= 0.12 binds the first positional
# argument to `data`, not `x`; this also matches the Married/Single plot.
sns.countplot(x=train_df['Experience'], hue=train_df['Risk_Flag']);

In [1]:
# How likely are house owners, renters and people without a house to default on a loan?
# The column is passed as x= because seaborn >= 0.12 binds the first positional
# argument to `data`, not `x`; this also matches the Married/Single plot.
sns.countplot(x=train_df['House_Ownership'], hue=train_df['Risk_Flag']);

In [1]:
# How many loan defaulters are there in the dataset?
# Bar chart of the number of rows per Risk_Flag value.
train_df["Risk_Flag"].value_counts().plot.bar(figsize=(6,6))

In [1]:
# List the distinct values of the Profession column.
train_df.Profession.unique()

In [1]:
# Work on an independent copy of the training data without the
# Id, CITY, STATE and Profession columns.
df_copy = train_df.drop(columns=['Id', 'CITY', 'STATE', 'Profession']).copy()
df_copy.head()

In [1]:
# Pairwise correlations of the numeric columns only. The text columns are still
# strings at this point, and pandas >= 2.0 raises on them instead of silently
# dropping them, so they are filtered out explicitly.
df_copy.select_dtypes(include="number").corr()

In [1]:
# Summary statistics of df_copy (pandas' describe() defaults).
df_copy.describe()

# Data Processing

In [1]:
# Cast every string-typed column of df_copy to an ordered pandas categorical.
string_columns = [
    name for name, values in df_copy.items()
    if pd.api.types.is_string_dtype(values)
]
for name in string_columns:
    df_copy[name] = df_copy[name].astype("category").cat.as_ordered()

In [1]:
# Check the column dtypes after the string -> category conversion.
df_copy.info()

In [1]:
# Replace each remaining non-numeric column with integer category codes.
# The codes are shifted by one, so pandas' -1 code for missing values becomes 0.
non_numeric_columns = [
    col for col in df_copy.columns
    if not pd.api.types.is_numeric_dtype(df_copy[col])
]
for col in non_numeric_columns:
    df_copy[col] = pd.Categorical(df_copy[col]).codes + 1

In [1]:
# Row counts per encoded Married/Single value.
df_copy['Married/Single'].value_counts()

In [1]:
# Row counts per encoded House_Ownership value.
df_copy.House_Ownership.value_counts()

In [1]:
# Summary statistics again, now that every column is numeric.
df_copy.describe()

In [1]:
# Wide canvas so the annotated cells stay legible.
fig, ax = plt.subplots(figsize=(16, 6))
# Pin the colour scale to the full correlation range [-1, 1] and print each
# coefficient inside its cell; keep the returned Axes to add the title.
heatmap = sns.heatmap(df_copy.corr(), vmin=-1, vmax=1, annot=True, ax=ax)
# pad is the distance between the title and the top of the heatmap.
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

From the correlation matrix, we can see that no single feature is strongly correlated with `Risk_Flag`.

## Splitting the dataset into train and test set

In [1]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_copy.drop(columns="Risk_Flag"),
    df_copy["Risk_Flag"],
    train_size=0.8,
    random_state=42,
)

In [1]:
# Shapes of the training features and training target.
X_train.shape, y_train.shape

# Fitting the Classification models on the train dataset 

### Using Logistic Regression

In [1]:
%%time
# Baseline: logistic regression with scikit-learn's default settings,
# fitted on the features as they are (no scaling step precedes this cell).
logistic_model = LogisticRegression()
logistic_model.fit(X_train,y_train)

In [1]:
def evaluation_metrics(y_test,X_test, model):
    """
    Print accuracy, ROC-AUC and F1 for a fitted classifier and plot its confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels of the evaluation rows.
    X_test : array-like or DataFrame
        Features of the evaluation rows, laid out as the model was fitted.
    model : fitted classifier
        Must provide ``predict`` and ``predict_proba``.

    Returns
    -------
    None
        The metrics are printed and the confusion matrix is drawn on a new figure.
    """
    pred = model.predict(X_test)
    # ROC-AUC is computed from scores rather than hard labels: use the second
    # probability column, i.e. the class with the greater label.
    positive_proba = model.predict_proba(X_test)[:, 1]

    acc_score = accuracy_score(y_test, pred)
    roc_score = roc_auc_score(y_test, positive_proba)
    score = f1_score(y_test, pred)
    cm = confusion_matrix(y_test, pred)

    print("Accuracy Score of the classification model is:", acc_score)
    print("ROC_AUC Score of the classification model is:", roc_score)
    print("F1 Score of the classification model is:", score)

    plt.figure(figsize=(9,9))
    # The matrix holds integer sample counts, so annotate with an integer format
    # (".3f" rendered every count with a meaningless ".000" suffix).
    sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    all_sample_title = 'Accuracy Score: {0}'.format(acc_score)
    plt.title(all_sample_title, size=15)
    

In [1]:
# Score the logistic regression baseline on the held-out split.
evaluation_metrics(y_test, X_test, logistic_model)

### Using Random Forest Classifier

In [1]:
%%time
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

In [1]:
# Score the random forest on the held-out split.
evaluation_metrics(y_test, X_test, rfc)

### Using Decision Tree Classifier

In [1]:
%%time
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

In [1]:
# Score the decision tree on the held-out split.
evaluation_metrics(y_test, X_test, dtc)

### Using Bagging Classifier

In [1]:
%%time
bgc = BaggingClassifier()
bgc.fit(X_train, y_train)

In [1]:
# Score the bagging classifier on the held-out split.
evaluation_metrics(y_test, X_test, bgc)

### Using Voting Classifier

In [1]:
# Soft-voting ensemble built from the bagging, random forest and decision tree
# estimators created in the cells above, then fitted on the training split.
vc = VotingClassifier(estimators=[('bagging classifier', bgc), ('random forest', rfc), ('decision tree classifier', dtc)],voting='soft')
vc.fit(X_train, y_train)

In [1]:
# Score the voting ensemble on the held-out split.
evaluation_metrics(y_test, X_test, vc)

## Checking Feature Importance for this task

In [1]:
# Raw importance values from the fitted random forest, one per feature column.
rfc.feature_importances_

In [1]:
def plot_features(columns, importances, n=8):
    """
    Draw a horizontal bar chart of the ``n`` largest feature importances.

    ``columns`` and ``importances`` are parallel sequences (feature name and its
    importance). The bars are sorted so the most important feature is on top.
    """
    ranked = (
        pd.DataFrame({"features": columns, "features_importances": importances})
        .sort_values("features_importances", ascending=False)
        .reset_index(drop=True)
    )
    top_names = ranked["features"][:n]
    top_scores = ranked["features_importances"][:n]

    # Plot the ranked importances.
    fig, ax = plt.subplots()
    ax.barh(top_names, top_scores)
    ax.set_ylabel("Features")
    ax.set_xlabel("Feature Importance")
    # barh places the first row at the bottom; flip so the largest bar is on top.
    ax.invert_yaxis()

In [1]:
# Plot the random forest's importances against the training feature names.
plot_features(X_train.columns,rfc.feature_importances_)

## Saving Machine Learning Model

In [1]:
filename = 'Random_Forest_Classifier.sav'
# Serialise the fitted random forest. The context manager guarantees the file
# is flushed and closed even if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(rfc, model_file)

In [1]:
# Load the model back from disk and sanity-check it on the held-out split.
# NOTE: pickle.load can execute arbitrary code, so only load files you created yourself.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.score(X_test, y_test)
print(result)

# Make Predictions on Test Dataset
I am going to use the Random Forest Classifier, as it gives the best F1 and accuracy scores.

In [1]:
# Import the test data
test_df = pd.read_csv("../input/loan-prediction-based-on-customer-behavior/Test Data.csv")

test_df.head()

In [1]:
def preprocess_data(df, categories=None):
    """
    Perform data preprocessing for model prediction.

    Drops the ``ID``, ``Profession``, ``CITY`` and ``STATE`` columns and replaces
    every non-numeric column with integer category codes shifted by one, so that
    missing or unknown values become 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the four columns listed above. It is not modified;
        a new frame is returned.
    categories : dict, optional
        Maps a column name to the ordered list of category values to encode it
        with. Pass the categories seen during training so that the codes cannot
        shift when a value is absent from ``df``. Columns without an entry (or
        ``categories=None``) fall back to the categories inferred from the values
        present in ``df``, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The remaining columns, all numeric.
    """
    category_lookup = categories or {}

    df = df.drop(["ID","Profession", "CITY", "STATE"], axis=1)

    # Turn categories into numbers; code -1 (missing/unknown) is shifted to 0.
    for labels, content in df.items():
        if not pd.api.types.is_numeric_dtype(content):
            encoded = pd.Categorical(content, categories=category_lookup.get(labels))
            df[labels] = encoded.codes + 1

    return df

In [1]:
# Apply the preprocessing to the test data and preview the result.
processed_test_df = preprocess_data(test_df)
processed_test_df.head()

In [1]:
# Predict the risk flag for every test row with the fitted random forest.
test_preds = rfc.predict(processed_test_df)

In [1]:
# Pair each test ID with its predicted risk flag.
df_preds = pd.DataFrame({
    "id": test_df["ID"],
    "risk_flag": test_preds,
})

df_preds

In [1]:
# Number of test rows predicted for each risk flag value.
df_preds.risk_flag.value_counts()

In [1]:
#Export predictions data
df_preds.to_csv("Submission.csv", index=False)