### Neural Networks
#### We will apply neural network algorithms on the lending club cleaned and filtered dataset, that we have already cleaned and filtered considering our persona Rick who is a risk averse. We will use the dataset to predict the interest rate.
#### We will be performing
##### 1. MLPRegressor and check MAPE score
##### 2. MLPRegressor with learning rate tuning
##### 3. MLPRegressor with optimizer tuning
##### 4. MLPRegressor with epoch tuning

In [2]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

#### Reading the Data from the cleaned CSV

In [3]:
dataset = pd.read_csv('../Data/finaldataset.csv')

In [4]:
dataset.pop('Unnamed: 0')

0              0
1             18
2             21
3             25
4             33
5             51
6             59
7             64
8             69
9             76
10            80
11            88
12            90
13            93
14            98
15           110
16           115
17           123
18           128
19           137
20           152
21           156
22           166
23           170
24           171
25           174
26           178
27           184
28           185
29           196
           ...  
157902    886618
157903    886621
157904    886623
157905    886625
157906    886626
157907    886628
157908    886634
157909    886636
157910    886639
157911    886648
157912    886649
157913    886660
157914    886663
157915    886664
157916    886666
157917    886667
157918    886669
157919    886674
157920    886677
157921    886679
157922    886680
157923    886685
157924    886686
157925    886694
157926    886699
157927    886701
157928    886703
157929    8867

In [5]:
loan_status=dataset.pop('loan_status')

#### Checking the Info, Describe and head on the dataset

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157932 entries, 0 to 157931
Data columns (total 85 columns):
dti                           157932 non-null float64
revol_bal                     157932 non-null float64
total_pymnt                   157932 non-null float64
loan_amnt                     157932 non-null float64
int_rate                      157932 non-null float64
sub_grade                     157932 non-null int64
annual_inc                    157932 non-null float64
acc_now_delinq                157932 non-null float64
delinq_2yrs                   157932 non-null float64
pub_rec                       157932 non-null float64
open_acc                      157932 non-null float64
inq_last_6mths                157932 non-null float64
revol_util                    157932 non-null float64
emp_length                    157932 non-null int64
addr_state_AL                 157932 non-null int64
addr_state_AR                 157932 non-null int64
addr_state_AZ                 157

In [7]:
dataset.describe()

Unnamed: 0,dti,revol_bal,total_pymnt,loan_amnt,int_rate,sub_grade,annual_inc,acc_now_delinq,delinq_2yrs,pub_rec,...,purpose_small_business,purpose_vacation,purpose_wedding,term_ 60 months,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,loan_income_ratio
count,157932.0,157932.0,157932.0,157932.0,157932.0,157932.0,157932.0,157932.0,157932.0,157932.0,...,157932.0,157932.0,157932.0,157932.0,157932.0,157932.0,157932.0,157932.0,157932.0,157932.0
mean,18.046043,19732.37,8465.719783,14397.327489,12.436624,10.422302,87664.37,0.0063,0.314559,0.225971,...,0.013594,0.006484,0.002653,0.21344,0.552713,6.3e-05,0.000266,0.085803,0.361155,17.730147
std,8.678728,31227.2,8446.508531,8783.155774,3.154233,4.382727,65928.17,0.08842,0.857169,0.664907,...,0.1158,0.080261,0.051439,0.409737,0.497215,0.007957,0.016305,0.280073,0.480337,7.26319
min,0.0,0.0,0.0,1000.0,5.32,1.0,4000.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21097
25%,11.3,6502.0,1872.71,7500.0,10.0,7.0,50000.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.048193
50%,17.8,12470.5,5813.35,12000.0,12.69,11.0,74500.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,18.056891
75%,24.37,23696.0,12312.44,20000.0,14.65,14.0,108000.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,23.809524
max,69.35,2904836.0,51236.151003,35000.0,19.2,18.0,6100000.0,6.0,26.0,86.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,30.0


In [8]:
dataset.head()

Unnamed: 0,dti,revol_bal,total_pymnt,loan_amnt,int_rate,sub_grade,annual_inc,acc_now_delinq,delinq_2yrs,pub_rec,...,purpose_small_business,purpose_vacation,purpose_wedding,term_ 60 months,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,loan_income_ratio
0,27.65,13648.0,5861.071414,5000.0,10.65,7,24000.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,20.833333
1,18.44,0.0,7164.499852,6000.0,11.71,8,84000.0,0.0,2.0,0.0,...,0,0,0,0,1,0,0,0,0,7.142857
2,13.22,32135.0,14025.4,21000.0,12.42,9,105000.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,20.0
3,29.44,13707.0,15823.48,15000.0,9.91,6,92000.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,16.304348
4,11.93,2224.0,5714.592129,5000.0,8.9,5,24044.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,20.795209


In [9]:
Y = dataset.pop('int_rate').values
Y

array([10.65, 11.71, 12.42, ...,  9.49, 11.99, 11.99])

In [10]:
X = dataset.values
X

array([[2.76500000e+01, 1.36480000e+04, 5.86107141e+03, ...,
        0.00000000e+00, 1.00000000e+00, 2.08333333e+01],
       [1.84400000e+01, 0.00000000e+00, 7.16449985e+03, ...,
        0.00000000e+00, 0.00000000e+00, 7.14285714e+00],
       [1.32200000e+01, 3.21350000e+04, 1.40254000e+04, ...,
        0.00000000e+00, 1.00000000e+00, 2.00000000e+01],
       ...,
       [2.56300000e+01, 6.36100000e+03, 2.87473000e+03, ...,
        0.00000000e+00, 1.00000000e+00, 1.87500000e+01],
       [2.36900000e+01, 2.11100000e+04, 4.76625000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 1.90476190e+01],
       [1.08300000e+01, 3.32660000e+04, 7.94376000e+03, ...,
        0.00000000e+00, 1.00000000e+00, 2.00000000e+01]])

#### Function for training, evaluating a model and calculating the error metrics

In [11]:
def get_result(model, X_train, X_test, Y_train, Y_test):
    sc = StandardScaler()  
    X_train = sc.fit_transform(X_train)  
    X_test = sc.transform(X_test) 
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)
    RMSE = np.sqrt(metrics.mean_squared_error(Y_test, y_pred))
    MAPE_test = np.mean(np.abs((Y_test - y_pred) / Y_test)) * 100
    MAPE_train = np.mean(np.abs((Y_train - y_train_pred) / Y_train)) * 100
    return RMSE, MAPE_test,MAPE_train, y_pred,y_train_pred, model

#### Test Train Split conventional way

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3)

#### Feed the model into the function and get the MAPE score

In [13]:
mlp= MLPRegressor()
rmse, mape_test, mape_train, y_pred, y_train_pred, model = get_result(mlp,X_train, X_test, Y_train, Y_test)
mape_test, mape_train

(3.2140860788596974, 3.0340240342252667)

In [19]:
model.score

<bound method RegressorMixin.score of MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)>

In [15]:
dataset_copy=dataset
columnnames= dataset_copy.columns.values
x = dataset_copy[columnnames]
y = Y
x.shape

(157932, 84)

In [16]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x= scaler.fit_transform(x)
x_df = pd.DataFrame(x,columns=columnnames)
x_df

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,dti,revol_bal,total_pymnt,loan_amnt,sub_grade,annual_inc,acc_now_delinq,delinq_2yrs,pub_rec,open_acc,...,purpose_small_business,purpose_vacation,purpose_wedding,term_ 60 months,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,loan_income_ratio
0,1.106612,-0.194843,-0.308371,-1.069929,-0.780864,-0.965666,-0.071253,-0.366976,-0.339854,-1.613554,...,-0.117396,-0.080784,-0.051576,-0.520921,-1.111620,-0.007958,-0.01631,-0.306359,1.329997,0.427250
1,0.045394,-0.631899,-0.154055,-0.956075,-0.552695,-0.055581,-0.071253,1.966294,-0.339854,-1.433309,...,-0.117396,-0.080784,-0.051576,-0.520921,0.899588,-0.007958,-0.01631,-0.306359,-0.751882,-1.457669
2,-0.556079,0.397175,0.658224,0.751745,-0.324525,0.262948,-0.071253,-0.366976,-0.339854,-0.892572,...,-0.117396,-0.080784,-0.051576,-0.520921,-1.111620,-0.007958,-0.01631,-0.306359,1.329997,0.312516
3,1.312865,-0.192953,0.871104,0.068617,-1.009033,0.065763,-0.071253,-0.366976,-0.339854,-0.712326,...,-0.117396,-0.080784,-0.051576,-0.520921,0.899588,-0.007958,-0.01631,-0.306359,-0.751882,-0.196306
4,-0.704719,-0.560679,-0.325713,-1.069929,-1.237202,-0.964998,-0.071253,-0.366976,-0.339854,-0.712326,...,-0.117396,-0.080784,-0.051576,-0.520921,-1.111620,-0.007958,-0.01631,-0.306359,1.329997,0.422001
5,-0.269170,0.203529,2.513461,1.275476,-1.009033,0.338788,-0.071253,-0.366976,-0.339854,-0.171589,...,-0.117396,-0.080784,-0.051576,-0.520921,-1.111620,-0.007958,-0.01631,-0.306359,1.329997,0.763108
6,0.686043,-0.439086,0.072647,-0.830835,1.500828,-0.829153,-0.071253,-0.366976,-0.339854,-1.433309,...,-0.117396,-0.080784,-0.051576,-0.520921,0.899588,-0.007958,-0.01631,-0.306359,-0.751882,0.521123
7,0.424483,-0.274197,0.036369,-0.500656,-0.780864,-0.632029,-0.071253,-0.366976,-0.339854,-0.892572,...,-0.117396,-0.080784,-0.051576,1.919677,-1.111620,-0.007958,-0.01631,-0.306359,1.329997,0.552221
8,-0.710480,-0.010740,1.195371,0.068617,0.588151,-0.404447,-0.071253,-0.366976,-0.339854,-0.712326,...,-0.117396,-0.080784,-0.051576,-0.520921,-1.111620,-0.007958,-0.01631,3.264144,-0.751882,0.944494
9,0.434853,-0.267408,0.585746,-0.386802,0.131813,-0.601632,-0.071253,-0.366976,-0.339854,0.549394,...,-0.117396,-0.080784,-0.051576,-0.520921,0.899588,-0.007958,-0.01631,-0.306359,-0.751882,0.714085


In [20]:
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(x_df[['total_pymnt', 'loan_amnt', 'sub_grade', 'revol_util',
       'term_ 60 months', 'loan_income_ratio']], Y, test_size = 0.3, random_state = 0)
mlp1 = MLPRegressor()
rmse1, mape1,mape_train1, y_pred1,y_train_pred1, model1 = get_result(mlp1,X_train1, X_test1, Y_train1, Y_test1)
print("MAPE on Test : ", mape1," mape on train data: ",mape_train1)

MAPE on Test :  2.798292454306365  mape on train data:  2.796209912232274


#### Test Train Split K-Fold

In [20]:
kf = KFold(n_splits=5)
kf.get_n_splits(X)


MAPE_TEST = []
MAPE_TRAIN = []
X_test_array=[]
y_pred_array=[]
Y_test_ret_array = []
Y_train_ret_array = []
y_train_pred_array=[]
mlp= MLPRegressor() # Model without any regularization
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]
    rmse, mape_test, mape_train, y_pred, y_train_pred, model = get_result(mlp,X_train, X_test, Y_train, Y_test)
    MAPE_TEST.append(mape_test)
    MAPE_TRAIN.append(mape_train)
    X_test_array.append(X_test)
    y_pred_array.append(y_pred)
    Y_train_ret_array.append(Y_train)
    y_train_pred_array.append(y_train_pred)
print("MAPE on test data is :", MAPE_TEST)
print("MAPE on train data is :", MAPE_TRAIN)

TRAIN: [ 31587  31588  31589 ... 157929 157930 157931] TEST: [    0     1     2 ... 31584 31585 31586]
TRAIN: [     0      1      2 ... 157929 157930 157931] TEST: [31587 31588 31589 ... 63171 63172 63173]
TRAIN: [     0      1      2 ... 157929 157930 157931] TEST: [63174 63175 63176 ... 94757 94758 94759]
TRAIN: [     0      1      2 ... 157929 157930 157931] TEST: [ 94760  94761  94762 ... 126343 126344 126345]
TRAIN: [     0      1      2 ... 126343 126344 126345] TEST: [126346 126347 126348 ... 157929 157930 157931]
MAPE on test data is : [5.323755969383304, 4.760789509534943, 3.1453909848929804, 2.596816981509556, 4.5719399701031325]
MAPE on train data is : [2.4272302708199867, 2.832378053543635, 3.046167167083844, 3.2996064872691075, 3.0570236885810784]


In [21]:
avg_MAPE_Test = sum(MAPE_TEST)/len(MAPE_TEST)
avg_MAPE_Test

4.079738683084782

In [22]:
avg_MAPE_Train = sum(MAPE_TRAIN)/len(MAPE_TRAIN)
avg_MAPE_Train

2.93248113345953

#### Regularization of  MLPRegressor

In [23]:
mlp

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [24]:
# mlp= MLPRegressor(hidden_layer_sizes=(10,4), max_iter=300, learning_rate='constant',solver='adam', tol=-1)

# rmse, mape, y_pred, y_train_pred, model = get_result(mlp,X_train, X_test, Y_train, Y_test)
# mape

In [29]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
kf = KFold(n_splits=5)
kf.get_n_splits(X)

print(kf)
mlp_regressor = MLPRegressor()
param_grid = {'learning_rate': ['constant','adaptive','invscaling'],
              'solver': ['adam','lbfgs'],
    'activation' : ['identity', 'tanh', 'relu']}

# run grid search
grid_search = GridSearchCV(mlp_regressor, scoring="neg_mean_absolute_error", param_grid=param_grid, cv=5)
grid_search.fit(X, Y)

print(grid_search.score)

KFold(n_splits=5, random_state=None, shuffle=False)
<bound method BaseSearchCV.score of GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': ['constant', 'adaptive', 'invscaling'], 'solver': ['adam', 'lbfgs'], 'activation': ['identity', 'tanh', 'relu']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=0)>


In [31]:
grid_search.best_params_

{'activation': 'identity', 'learning_rate': 'adaptive', 'solver': 'lbfgs'}

In [32]:
grid_search.best_estimator_

MLPRegressor(activation='identity', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [33]:
grid_search.best_index_

3

In [41]:
grid_search.best_score_

-1.0603901814613617