<h1>Lending Club: Loan Default Prediction</h1>

### This notebook will focus on further processing of the cleaned data

In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [105]:
pd.set_option('display.max_columns', 1000)

In [106]:
df = pd.read_csv('cleaned_and_reduced.csv')

In [107]:
df.shape

(1262917, 16)

In [108]:
df['loan_repaid'].value_counts()

loan_repaid
1    1007811
0     255106
Name: count, dtype: int64

In [109]:
df.head()

Unnamed: 0,sub_grade,verification_status,int_rate,mort_acc,term,pct_tl_nvr_dlq,home_ownership,num_actv_rev_tl,num_bc_sats,purpose,loan_amnt,dti,open_acc,id,loan_repaid,region
0,C4,Not Verified,13.99,1,36 months,76.9,MORTGAGE,4,2.0,debt_consolidation,3600.0,5.91,7.0,68407277,1,Northeast
1,C1,Not Verified,11.99,3,36 months,97.4,MORTGAGE,10,13.0,small_business,24700.0,16.06,22.0,68355089,1,Midwest
2,B4,Not Verified,10.78,1,60 months,100.0,MORTGAGE,3,2.0,home_improvement,20000.0,10.78,6.0,68341763,1,Midwest
3,F1,Source Verified,22.45,2,60 months,96.6,MORTGAGE,6,5.0,major_purchase,10400.0,25.37,12.0,68476807,1,Northeast
4,C3,Source Verified,13.44,0,36 months,100.0,RENT,3,2.0,debt_consolidation,11950.0,10.2,5.0,68426831,1,South


Here we can see that there are several categorical features which need to be encoded before training and for this we will use one hot encoding.

In [110]:
non_categorical_features = df.select_dtypes(['number']).columns.to_list()
non_categorical_features.remove('mort_acc')
non_categorical_features.remove('id')
non_categorical_features.remove('loan_repaid')

categorical_features = list(set(df.columns) - set(non_categorical_features))
categorical_features.remove('id')
categorical_features.remove('loan_repaid')
categorical_features.append('mort_acc')

### Creating dummy variables.

In [111]:
dummies = pd.get_dummies(df[categorical_features], drop_first=True)
df = pd.concat([df.drop(axis=1, columns=categorical_features), dummies], axis=1)
df.head()

Unnamed: 0,int_rate,pct_tl_nvr_dlq,num_actv_rev_tl,num_bc_sats,loan_amnt,dti,open_acc,id,loan_repaid,mort_acc,mort_acc.1,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,region_Northeast,region_South,region_West,verification_status_Source Verified,verification_status_Verified,sub_grade_A2,sub_grade_A3,sub_grade_A4,sub_grade_A5,sub_grade_B1,sub_grade_B2,sub_grade_B3,sub_grade_B4,sub_grade_B5,sub_grade_C1,sub_grade_C2,sub_grade_C3,sub_grade_C4,sub_grade_C5,sub_grade_D1,sub_grade_D2,sub_grade_D3,sub_grade_D4,sub_grade_D5,sub_grade_E1,sub_grade_E2,sub_grade_E3,sub_grade_E4,sub_grade_E5,sub_grade_F1,sub_grade_F2,sub_grade_F3,sub_grade_F4,sub_grade_F5,sub_grade_G1,sub_grade_G2,sub_grade_G3,sub_grade_G4,sub_grade_G5,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,term_ 60 months
0,13.99,76.9,4,2.0,3600.0,5.91,7.0,68407277,1,1,1,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,11.99,97.4,10,13.0,24700.0,16.06,22.0,68355089,1,3,3,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,10.78,100.0,3,2.0,20000.0,10.78,6.0,68341763,1,1,1,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
3,22.45,96.6,6,5.0,10400.0,25.37,12.0,68476807,1,2,2,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True
4,13.44,100.0,3,2.0,11950.0,10.2,5.0,68426831,1,0,0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False


In [112]:
df.replace({True : 1, False : 0}, inplace=True)

  df.replace({True : 1, False : 0}, inplace=True)


In [113]:
df.head()

Unnamed: 0,int_rate,pct_tl_nvr_dlq,num_actv_rev_tl,num_bc_sats,loan_amnt,dti,open_acc,id,loan_repaid,mort_acc,mort_acc.1,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,region_Northeast,region_South,region_West,verification_status_Source Verified,verification_status_Verified,sub_grade_A2,sub_grade_A3,sub_grade_A4,sub_grade_A5,sub_grade_B1,sub_grade_B2,sub_grade_B3,sub_grade_B4,sub_grade_B5,sub_grade_C1,sub_grade_C2,sub_grade_C3,sub_grade_C4,sub_grade_C5,sub_grade_D1,sub_grade_D2,sub_grade_D3,sub_grade_D4,sub_grade_D5,sub_grade_E1,sub_grade_E2,sub_grade_E3,sub_grade_E4,sub_grade_E5,sub_grade_F1,sub_grade_F2,sub_grade_F3,sub_grade_F4,sub_grade_F5,sub_grade_G1,sub_grade_G2,sub_grade_G3,sub_grade_G4,sub_grade_G5,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,term_ 60 months
0,13.99,76.9,4,2.0,3600.0,5.91,7.0,68407277,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,11.99,97.4,10,13.0,24700.0,16.06,22.0,68355089,1,3,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,10.78,100.0,3,2.0,20000.0,10.78,6.0,68341763,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,22.45,96.6,6,5.0,10400.0,25.37,12.0,68476807,1,2,2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
4,13.44,100.0,3,2.0,11950.0,10.2,5.0,68426831,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


___
#### Even though we are predicting probability, in the end we are going to classify datapoints into two labels. 

#### As we can see that the dataset is not well balanced and a not well balanced dataset can lead to poor performance on the minority class and overall imbalanced performance on the dataset, performance metrics such as accuracy can be misleading.

#### We have a large dataset and the size of the minority class is also large enough to perform <b>undersampling.</b>

In [114]:
from imblearn.under_sampling import RandomUnderSampler

X = df.drop(axis=1, columns=['loan_repaid'])
y = df['loan_repaid']

rus = RandomUnderSampler(random_state=0)
rus.fit(X, y)
X_resampled, y_resampled = rus.fit_resample(X, y)

In [115]:
y_resampled.value_counts()

loan_repaid
0    255106
1    255106
Name: count, dtype: int64

___
train_test_split

In [116]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X_resampled.drop(axis=1, columns=['id']), y_resampled, test_size=0.3, random_state=42)

In [117]:
x_train.shape

(357148, 65)

___


In [118]:
x_train.to_csv('x_train.csv', index=False)
x_test.to_csv('x_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

___