In [None]:
#importing helping hands
import os
import time
from collections import Counter

import pandas as pd
import numpy as np

pd.set_option('display.float_format', lambda x: '%.2f' % x)

from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt


import warnings
warnings.filterwarnings('ignore')

import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix, f1_score

In [None]:
# Relative path to the loan CSV that this notebook analyses.
train_path = '../input/credit-risk-dataset/loan/loan.csv'
# Load the whole file once; later sections start from a copy of `data`.
data = pd.read_csv(train_path)

In [None]:
# Preview the first rows of the raw frame to check that it loaded as expected.
data.head()

In [None]:
# List the distinct loan_status values; these are grouped into categories further down.
data['loan_status'].unique()

#### Loan Status
Before getting into feature engineering, let's understand the loan statuses. The statuses fall into the 4 groups described below; for modelling, `Issued` is merged into Category 1, which leaves 3 categories.

1. **Issued**
    * Recently issued a fresh loan to borrower
2. **Issued, Current, Fully Paid, Does not meet the credit policy. Status:Fully Paid (Category 1)**
    * Current - All outstanding payments are paid up to date 
    * Fully Paid - All outstanding payments are paid up and loan is finished
3. **In Grace Period, Late (16-30 days), Late (31-120 days) (Category 2)**
    * In Grace Period - 1-15 days passed after the due date
    * Late (16-30 days) - 16-30 days passed after the due date
    * Late (31-120 days) - 31-120 days passed after the due date
4. **Charged Off , Does not meet the credit policy. Status:Charged Off, Default (Category 3)**
    * Default - The borrower is not able to make outstanding payments for an extended period of time
    * Charged Off - A charge-off usually occurs when the creditor has deemed an outstanding debt uncollectible
    
To get an idea of the different loan statuses, please visit https://bit.ly/35Vv3W3

## Normal Random Forest

In [None]:
#copying the dataframe
# Work on a copy so the raw `data` frame stays untouched for the later sections.
df = data.copy()

In [None]:
# Drop every column in which at least 97% of the values are missing.
nan_percent = df.isna().sum() / len(df) * 100
columns = nan_percent[nan_percent >= 97].index.tolist()
df = df.drop(columns=columns)

In [None]:
# Print a summary of the columns that remain after the sparse-column drop.
df.info()

In [None]:
# Replace every remaining missing value with 0 (applied to all columns alike).
df = df.fillna(0)

In [None]:
# Group the loan statuses into 3 target categories:
#   1 = issued / current / fully paid, 2 = grace period or late, 3 = charged off / default.
status_groups = {
    1: ['Issued', 'Current', 'Does not meet the credit policy. Status:Fully Paid', 'Fully Paid'],
    2: ['In Grace Period', 'Late (16-30 days)', 'Late (31-120 days)'],
    3: ['Charged Off', 'Does not meet the credit policy. Status:Charged Off', 'Default'],
}
for category, statuses in status_groups.items():
    df.loc[df['loan_status'].isin(statuses), 'loan_category'] = category

# The raw status is now encoded in loan_category; drop it so it cannot leak into the features.
df = df.drop(columns=['loan_status'])

# Names (not data) of the object-typed columns: the encoder expects column names.
object_columns = df.select_dtypes('object').columns.tolist()

# Separate the features from the labels.
features = df.drop(columns=['loan_category'])
labels = df['loan_category']

# Stratified 75/25 split with a fixed seed so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.25, random_state=43, stratify=labels
)

In [None]:
def category_encoding(columns, X_train, X_test):
    """Ordinal-encode the given columns of the train and test frames.

    The encoder is fitted on ``X_train`` only and then applied to both
    frames, so the test frame does not influence the learned mapping.

    Parameters
    ----------
    columns : list-like of column names, or pandas.DataFrame
        Columns to encode. A DataFrame (for example the result of
        ``df.select_dtypes('object')``) is also accepted; only its
        column names are used.
    X_train : pandas.DataFrame
        Training features; used to fit the encoder.
    X_test : pandas.DataFrame
        Test features; transformed with the fitted encoder.

    Returns
    -------
    tuple
        ``(X_train_encoded, X_test_encoded)``.
    """
    # Callers in this notebook pass a whole DataFrame; hand the encoder plain names.
    if isinstance(columns, pd.DataFrame):
        columns = columns.columns.tolist()

    encoder = ce.OrdinalEncoder(cols = columns)
    encoder.fit(X_train)

    return encoder.transform(X_train), encoder.transform(X_test)

In [None]:
def training_predicting(X_train, X_test, y_train, n, imp_features = False, random_state = None):
    """Fit a random forest on the training data and predict the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test
        Features passed to ``predict``.
    n : int
        Number of trees (``n_estimators``).
    imp_features : bool, default False
        When truthy, also return the fitted model's ``feature_importances_``.
    random_state : int or None, default None
        Seed forwarded to the forest. ``None`` keeps the previous unseeded
        behaviour; pass an integer for reproducible results.

    Returns
    -------
    y_predict, or ``(y_predict, feature_importances_)`` when ``imp_features``
    is truthy.
    """
    # verbose=1 is kept from the original call so training progress is logged.
    model = RandomForestClassifier(n_estimators = n, verbose = 1, random_state = random_state)
    model.fit(X_train, y_train)

    y_predict = model.predict(X_test)

    if imp_features:
        return y_predict, model.feature_importances_

    return y_predict

In [None]:
# Ordinal-encode the object-typed columns; the encoder is fitted on the training split.
X_train_encoded, X_test_encoded = category_encoding(
    columns=object_columns, X_train=X_train, X_test=X_test
)

# Fit a 20-tree random forest; keep the test predictions and the feature importances.
y_predict, imp_features = training_predicting(
    X_train_encoded, X_test_encoded, y_train, n=20, imp_features=True
)

In [None]:
# Confusion matrix over categories 1, 2, 3, drawn as an annotated heatmap.
cf_matrix = confusion_matrix(y_test, y_predict, labels=[1, 2, 3])
fig, ax = plt.subplots()
sns.heatmap(cf_matrix, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

In [None]:
# Rank the features by importance and show the ten strongest ones.
importance_table = pd.DataFrame(imp_features, index=X_train.columns, columns=['importance'])
imp_features = importance_table.sort_values(by='importance', ascending=False)
imp_features.head(n=10)

## Feature Creation

In [None]:
#copying the dataframe
# Start again from the raw `data`, so this section does not depend on the preprocessing above.
df = data.copy()

In [None]:
# Drop every column in which at least 97% of the values are missing.
nan_percent = df.isna().sum() / len(df) * 100
columns = nan_percent[nan_percent >= 97].index.tolist()
df = df.drop(columns=columns)

In [None]:
# Split the last payment date on "-" into a month part (first piece) and a year part (last piece).
# Presumably the values look like "Dec-2015" - TODO confirm against the data.
# Missing dates become the text 'nan' in both parts.
last_pymnt_parts = df['last_pymnt_d'].map(lambda value: str(value).split("-"))
df['last_pymnt_d_month'] = last_pymnt_parts.map(lambda parts: parts[0])
df['last_pymnt_d_year'] = last_pymnt_parts.map(lambda parts: parts[-1])
# Despite the "_nan" suffix the flag is 1 when a date IS present and 0 when it is missing.
df['last_pymnt_d_nan'] = df['last_pymnt_d'].notna().astype('int64')

In [None]:
# Split the next payment date on "-" into a month part (first piece) and a year part (last piece).
# Missing dates become the text 'nan' in both parts.
next_pymnt_parts = df['next_pymnt_d'].map(lambda value: str(value).split("-"))
df['next_pymnt_d_month'] = next_pymnt_parts.map(lambda parts: parts[0])
df['next_pymnt_d_year'] = next_pymnt_parts.map(lambda parts: parts[-1])
# Despite the "_nan" suffix the flag is 1 when a date IS present and 0 when it is missing.
df['next_pymnt_d_nan'] = df['next_pymnt_d'].notna().astype('int64')

In [None]:
# Split the loan issue date on "-" into a month part (first piece) and a year part (last piece).
# Missing dates become the text 'nan' in both parts.
issue_parts = df['issue_d'].map(lambda value: str(value).split("-"))
df['issue_d_month'] = issue_parts.map(lambda parts: parts[0])
df['issue_d_year'] = issue_parts.map(lambda parts: parts[-1])
# Despite the "_nan" suffix the flag is 1 when a date IS present and 0 when it is missing.
df['issue_d_nan'] = df['issue_d'].notna().astype('int64')

In [None]:
# Percentage features derived from existing columns.
funded = df['funded_amnt']
df['rec_prncp_per'] = df['total_rec_prncp'].div(funded).mul(100)   # principal received so far
df['rem_prncp_per'] = df['out_prncp'].div(funded).mul(100)         # principal still outstanding
# NOTE: this column name contains a space; it is kept so the feature name does not change.
df['installment _perc'] = df['installment'].div(funded).mul(100)
df['open_loc_perc'] = df['open_acc'].div(df['total_acc']).mul(100)  # open accounts out of total accounts

In [None]:
# Encode the three month columns with ONE shared LabelEncoder so that the same
# month text maps to the same integer in every column.
# The encoder is fitted on the union of all three columns: fitting on a single
# column would make transform() fail for any label that only occurs in another one.
month_columns = ['last_pymnt_d_month', 'next_pymnt_d_month', 'issue_d_month']

le = LabelEncoder()
le.fit(pd.unique(df[month_columns].values.ravel()))

# Codes follow the encoder's sorted label order, not calendar order.
for column in month_columns:
    df[column] = le.transform(df[column])

In [None]:
# Replace every remaining missing value with 0 (applied to all columns alike).
df = df.fillna(0)

In [None]:
# Group the loan statuses into 3 target categories:
#   1 = issued / current / fully paid, 2 = grace period or late, 3 = charged off / default.
status_groups = {
    1: ['Issued', 'Current', 'Does not meet the credit policy. Status:Fully Paid', 'Fully Paid'],
    2: ['In Grace Period', 'Late (16-30 days)', 'Late (31-120 days)'],
    3: ['Charged Off', 'Does not meet the credit policy. Status:Charged Off', 'Default'],
}
for category, statuses in status_groups.items():
    df.loc[df['loan_status'].isin(statuses), 'loan_category'] = category

# The raw status is now encoded in loan_category; drop it so it cannot leak into the features.
df = df.drop(columns=['loan_status'])

# Names (not data) of the object-typed columns: the encoder expects column names.
object_columns = df.select_dtypes('object').columns.tolist()

# Separate the features from the labels.
features = df.drop(columns=['loan_category'])
labels = df['loan_category']

In [None]:
# Stratified 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    stratify=labels,
    random_state=43,
)

In [None]:
# Ordinal-encode the object-typed columns; the encoder is fitted on the training split.
X_train_encoded, X_test_encoded = category_encoding(
    columns=object_columns, X_train=X_train, X_test=X_test
)

In [None]:
# Fit a 20-tree random forest; keep the test predictions and the feature importances.
y_predict, imp_features = training_predicting(
    X_train_encoded, X_test_encoded, y_train, n=20, imp_features=True
)

In [None]:
# Confusion matrix over categories 1, 2, 3, drawn as an annotated heatmap.
cf_matrix = confusion_matrix(y_test, y_predict, labels=[1, 2, 3])
fig, ax = plt.subplots()
sns.heatmap(cf_matrix, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

In [None]:
# Rank the features by importance and show the ten strongest ones.
importance_table = pd.DataFrame(imp_features, index=X_train.columns, columns=['importance'])
imp_features = importance_table.sort_values(by='importance', ascending=False)
imp_features.head(n=10)

## SMOTE for balancing the data

In [None]:
# Count the training labels per category before any oversampling is applied.
Counter(y_train)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the encoded training split with SMOTE. The test split is left untouched.
# A fixed seed makes the synthetic rows (and the results below) reproducible.
# NOTE: this cell rebinds X_train / y_train, so re-run the split and encoding cells
# before running it a second time.
oversample = SMOTE(random_state=43)
X_train, y_train = oversample.fit_resample(X_train_encoded, y_train)

In [None]:
# Fit a 20-tree random forest on the oversampled training data; predict the original test split.
y_predict, imp_features = training_predicting(
    X_train, X_test_encoded, y_train, n=20, imp_features=True
)

In [None]:
# Confusion matrix over categories 1, 2, 3, drawn as an annotated heatmap.
cf_matrix = confusion_matrix(y_test, y_predict, labels=[1, 2, 3])
fig, ax = plt.subplots()
sns.heatmap(cf_matrix, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

#### I would like your thoughts on how to improve this model's False Negative problem