# Multi-layer Perceptron

<ol>
    <li> Components: Input layer, Hidden layer(s) and Output layer. </li>
    <li> Fully connected </li>
</ol>


### Steps:

For every epoch,

<ol>
    <li> Training data is propagated to the MLP through input layers. It passes through the hidden layers, if any forwarding outputs of activation functions to the next layer. Finally the output is generated at the output layer by applying activation functions. </li>
    <li> The predicted output will be compared with actual output and hence error will be calculated. </li>
    <li> If error>0, apply backpropagation methodology to modify weights starting from output layer moving towards input layer. </li>
    <li> Check accuracy score. If satisfied, stop. Else, go to step 1. </li>
</ol>


### Import dataset

In [1]:
import pandas as pd
import numpy as np

bnotes = pd.read_csv('2019_S2_predictors.csv')
bnotes = bnotes.drop(['Unnamed: 0', 'class2', 'class5', 'class6', 'class9', 'class10', 'class13', 'class14', 'class17', 'class18', 'class21', 'class22', 'y_coord','x_coord'], axis=1)

print(bnotes.head())
print(bnotes['class'].unique())

   class  jan_ca   jan_b  jan_g  jan_r  jan_r1  jan_r2  jan_r3    jan_n  \
0      1    2.00   56.50  122.5  211.0  444.00  1253.5  1462.0  1678.00   
1      1  303.00  368.25  357.5  374.0  717.00  1101.0  1308.0  1699.00   
2      1  511.25  252.50  254.5  388.0  643.25  1104.0  1309.0  1688.00   
3      1   53.50   51.00  102.0  117.0  358.00  1221.0  1525.0  1795.50   
4      1  279.00  180.00  258.0  274.5  507.00  1258.0  1524.0  1668.25   

   jan_nn  ...  dec_TCG_GSO   dec_TCG   dec_LAI  dec_SAVI  dec_MSAVI  \
0  1815.5  ...    -0.059647  0.045820  0.717313  0.186339   0.153891   
1  1700.0  ...    -0.109955  0.047009  0.809818  0.187611   0.164786   
2  1639.0  ...    -0.025636  0.084585  0.827439  0.256545   0.216626   
3  1813.0  ...    -0.438328  0.039137  1.154730  0.222647   0.216470   
4  1761.0  ...    -0.029993  0.134382  1.425100  0.365924   0.339766   

    dec_BUI  dec_NDBI  dec_NDMI  dec_BAEI   dec_BSI  
0 -0.582977 -0.203604  0.203604  2.284158 -0.151742  
1 -0.448

In [2]:
bnotes.shape

(12797, 313)

In [3]:
bnotes.describe(include = 'all')

Unnamed: 0,class,jan_ca,jan_b,jan_g,jan_r,jan_r1,jan_r2,jan_r3,jan_n,jan_nn,...,dec_TCG_GSO,dec_TCG,dec_LAI,dec_SAVI,dec_MSAVI,dec_BUI,dec_NDBI,dec_NDMI,dec_BAEI,dec_BSI
count,12797.0,12797.0,12797.0,12797.0,12797.0,12797.0,12797.0,12797.0,12797.0,12797.0,...,12797.0,12797.0,12797.0,12797.0,12797.0,12797.0,12797.0,12797.0,12797.0,12797.0
mean,2.803704,822.118407,638.117274,631.781863,637.62458,921.765101,1437.895679,1634.711729,1833.712569,1863.929671,...,-0.101086,0.052865,0.774201,0.205182,0.177617,-0.617333,-0.170129,0.170129,2.375074,-0.13493
std,1.999588,699.870301,583.899705,511.859025,534.747996,577.787031,605.650197,643.713982,682.038767,703.942369,...,0.128884,0.045418,0.397603,0.084729,0.078238,0.348811,0.162565,0.162565,1.821511,0.15092
min,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,...,-1.657939,-0.518141,-2.33138,-0.137408,-0.139201,-1.61784,-0.876543,-0.433603,0.581521,-0.704497
25%,0.0,354.0,275.0,340.0,336.0,603.0,1136.0,1319.0,1497.0,1520.0,...,-0.127264,0.023448,0.545062,0.144795,0.122939,-0.850075,-0.261681,0.05478,1.449853,-0.21586
50%,4.0,642.0,506.0,527.5,548.0,834.5,1353.0,1550.0,1763.5,1791.0,...,-0.073674,0.056684,0.756764,0.208185,0.177115,-0.559696,-0.127275,0.127275,1.923391,-0.092744
75%,4.0,1071.0,822.0,786.0,799.25,1110.0,1619.5,1837.0,2084.0,2116.0,...,-0.029332,0.081996,0.97776,0.262614,0.228072,-0.350964,-0.05478,0.261681,2.663507,-0.029306
max,6.0,7964.0,10696.0,12132.0,14788.0,16075.0,16062.0,15971.0,15856.0,15811.0,...,0.07343,0.28209,11.103901,0.553132,0.571734,0.415682,0.433603,0.876543,44.132355,0.359759


In [4]:
training_data = '2019_S2_predictors.csv'
df = pd.read_table(training_data, sep=',')
# clear residuals
# for non balanced classification
df = df.drop(['Unnamed: 0', 'class2', 'class5', 'class6', 'class9', 'class10', 'class13', 'class14', 'class17', 'class18', 'class21', 'class22', 'y_coord','x_coord'], axis=1)
# for balanced classification
#df = df.drop(['Unnamed: 0', 'class2', 'class5', 'class6', 'class9', 'class10', 'class13', 'class14', 'class17', 'class18', 'class21', 'class22'], axis=1)
#df = df.drop('fid', axis=1)

model_variables = df.drop('class', axis=1).columns.values.tolist()
column_names = model_variables
y = df['class']
X = df.drop('class', axis=1)

In [5]:
X

Unnamed: 0,jan_ca,jan_b,jan_g,jan_r,jan_r1,jan_r2,jan_r3,jan_n,jan_nn,jan_wv,...,dec_TCG_GSO,dec_TCG,dec_LAI,dec_SAVI,dec_MSAVI,dec_BUI,dec_NDBI,dec_NDMI,dec_BAEI,dec_BSI
0,2.00,56.50,122.5,211.0,444.00,1253.5,1462.0,1678.00,1815.5,2021.0,...,-0.059647,0.045820,0.717313,0.186339,0.153891,-0.582977,-0.203604,0.203604,2.284158,-0.151742
1,303.00,368.25,357.5,374.0,717.00,1101.0,1308.0,1699.00,1700.0,1649.0,...,-0.109955,0.047009,0.809818,0.187611,0.164786,-0.448827,-0.140714,0.140714,1.457509,-0.098871
2,511.25,252.50,254.5,388.0,643.25,1104.0,1309.0,1688.00,1639.0,2230.0,...,-0.025636,0.084585,0.827439,0.256545,0.216626,-0.869244,-0.245951,0.245951,2.616980,-0.185399
3,53.50,51.00,102.0,117.0,358.00,1221.0,1525.0,1795.50,1813.0,1894.5,...,-0.438328,0.039137,1.154730,0.222647,0.216470,-0.236880,-0.018406,0.018406,0.789884,0.025922
4,279.00,180.00,258.0,274.5,507.00,1258.0,1524.0,1668.25,1761.0,1874.0,...,-0.029993,0.134382,1.425100,0.365924,0.339766,-0.952528,-0.305394,0.305394,1.919225,-0.248522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12792,140.00,198.00,321.0,284.0,613.00,1376.0,1570.0,1852.00,1853.0,1800.0,...,-0.010064,0.096308,0.958723,0.296436,0.253684,-1.041837,-0.296599,0.296599,2.827416,-0.244999
12793,575.50,608.50,446.0,448.0,635.50,1220.5,1306.0,1720.00,1463.0,1430.0,...,-0.148923,0.027291,0.797694,0.161353,0.143223,-0.415802,-0.162687,0.162687,1.385545,-0.109597
12794,64.00,433.00,580.0,752.0,995.00,1294.0,1299.0,1504.00,1508.0,1672.0,...,-0.058570,0.033750,0.406939,0.144411,0.113540,-0.475634,-0.040483,0.040483,2.348404,-0.034799
12795,465.00,477.00,554.0,705.0,836.00,1073.0,1147.0,1322.00,1388.0,1961.0,...,-0.091083,0.032056,0.408460,0.139940,0.111585,-0.304417,0.062159,-0.062159,2.008159,0.080313


### Splitting to training and testing


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3)
print(X_train.shape)
print(y_test.shape)

(8957, 312)
(3840,)


Normalized input X train

## Train the model

Import the MLP classifier model from sklearn

In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier


In [14]:
mlp = MLPClassifier(hidden_layer_sizes=(200,150,100,50),max_iter=500, 
                    activation='relu',random_state=42, solver='adam')
mlp

In [19]:
mlp = RandomForestClassifier(n_estimators=1000,n_jobs=30,random_state=42,class_weight='balanced')
mlp

### About parameters 

1. hidden_layer_sizes : tuple, length = n_layers - 2, default (100,)

The ith element represents the number of neurons in the ith hidden layer.

2. activation : {‘identity’, ‘logistic’, ‘tanh’, ‘relu’}, default ‘relu’

Activation function for the hidden layer.

‘identity’, no-op activation, useful to implement linear bottleneck, returns f(x) = x
‘logistic’, the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)).
‘tanh’, the hyperbolic tan function, returns f(x) = tanh(x).
‘relu’, the rectified linear unit function, returns f(x) = max(0, x)

3. learning_rate : {‘constant’, ‘invscaling’, ‘adaptive’}, default ‘constant’

4. learning_rate_init : double, optional, default 0.001

5. max_iter : int, optional, default 200

Maximum number of iterations. The solver iterates until convergence (determined by ‘tol’) or this number of iterations. For stochastic solvers (‘sgd’, ‘adam’), note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps.

6. shuffle : bool, optional, default True

Whether to shuffle samples in each iteration. Only used when solver=’sgd’ or ‘adam’.

7. momentum : float, default 0.9

Momentum for gradient descent update. Should be between 0 and 1. Only used when solver=’sgd’.

8. early_stopping : bool, default False

Whether to use early stopping to terminate training when validation score is not improving. If set to true, it will automatically keep 10% of training data as validation and terminate training when validation score is not improving by at least tol for two consecutive epochs. Only effective when solver=’sgd’ or ‘adam’


### Training

In [15]:
%%time
mlp.fit(X_train,y_train)

CPU times: user 9h 59min 1s, sys: 9h 35min 59s, total: 19h 35min
Wall time: 29min 25s


In [20]:
%%time
mlp.fit(X_train,y_train)

CPU times: user 3min 56s, sys: 0 ns, total: 3min 56s
Wall time: 9.21 s


### Testing

In [21]:
pred = mlp.predict(X_test)
pred

array([4, 4, 0, ..., 4, 4, 4])

## Evaluation metrics- Confusion matrix and F1 score

In [22]:
from sklearn.metrics import classification_report,confusion_matrix

confusion_matrix(y_test,pred)

array([[1152,    0,    5,    0,   73,    1,    1],
       [  11,    0,    0,    0,   19,    0,    0],
       [  60,    0,   15,    0,    8,    0,    0],
       [  15,    0,    0,    5,    8,    0,    0],
       [ 130,    0,    0,    1, 1819,   31,    2],
       [  12,    0,    0,    0,  146,  229,    0],
       [  14,    0,    0,    0,   51,    4,   28]])

In [18]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84      1232
           1       0.00      0.00      0.00        30
           2       0.14      0.12      0.13        83
           3       0.00      0.00      0.00        28
           4       0.80      0.89      0.84      1983
           5       0.60      0.32      0.41       387
           6       0.29      0.02      0.04        97

    accuracy                           0.77      3840
   macro avg       0.38      0.32      0.32      3840
weighted avg       0.74      0.77      0.75      3840



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88      1232
           1       0.00      0.00      0.00        30
           2       0.75      0.18      0.29        83
           3       0.83      0.18      0.29        28
           4       0.86      0.92      0.89      1983
           5       0.86      0.59      0.70       387
           6       0.90      0.29      0.44        97

    accuracy                           0.85      3840
   macro avg       0.72      0.44      0.50      3840
weighted avg       0.84      0.85      0.83      3840



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
