# Model training

The purpose of this notebook is to take advantage of the data exploration process to train a performant model.

In addition, we will iterate on some hyperparameters to better fit the data.

In [1]:
import numpy as np
import pandas as pd

import os
import sys
import inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)

from utils.model import FTMultilayerPerceptron
from utils.data_processing.normalizers import FTStandardScaler
from utils.data_processing.transform_labels import get_labels, labels_to_numbers
from utils.data_processing.one_hot import one_hot_encoder, one_hot_decoder
from utils.data_processing.selection import train_dev_split, KFold
from utils.metrics import *

## Load dataset

In [2]:
# Read the raw dataset from the parent directory. header=None tells pandas the
# file has no header row, so columns get integer labels (0, 1, 2, ...).
df_orig = pd.read_csv('../data.csv', header=None)

In [3]:
# Preview the first rows to check how the file was parsed.
df_orig.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Set a seed

Here we set a seed so that these 'randomized' experiments can be reproduced.

In [4]:
# Seed forwarded to both KFold and FTMultilayerPerceptron in the training cells below.
random_state = 0

## Quick and dirty

We are first going to use all the features of the dataset and a simple implementation of the neural network (with two hidden layers, since the subject requires a minimum of 2 layers).

### Prepare data

In [22]:
# Drop column 0 into a new frame (df_orig is left untouched). In the preview
# that column holds large integers that look like record identifiers.
df = df_orig.drop(columns=[0])

In [23]:
# Every column after the first one is used as an input feature. The transpose
# puts features on axis 0 and samples (the rows of df) on axis 1.
X = np.array(df.iloc[:, 1:]).T

In [24]:
# The first remaining column of df holds the class label ('M', ... in the preview).
# `labels` is whatever get_labels extracts from that column; it is then used to
# turn the column into numbers, which are one-hot encoded with len(labels) classes.
labels = get_labels(df.iloc[:, :1])
y = one_hot_encoder(
    labels_to_numbers(df.iloc[:, :1], labels),
    len(labels),
)

### Set the parameters of the neural network

In [25]:
# Layer sizes handed to FTMultilayerPerceptron: the input size is the number of
# feature rows in X, followed by two hidden layers of 2 units each, and an
# output size of y.shape[0] (presumably one unit per class after one-hot
# encoding; confirm against one_hot_encoder).
nn_dimensions = [X.shape[0], 2, 2, y.shape[0]]

### Training

In [26]:
# Number of folds passed to KFold; also the divisor used to average per-fold metrics.
k = 10

In [27]:
# Baseline experiment: evaluate the network described by `nn_dimensions` over
# the folds produced by KFold, using the model's default hidden activation.
# Per-fold metrics are collected in lists and averaged over k at the end.
costs_train, costs_dev = [], []
accuracies_train, accuracies_dev = [], []

folds = KFold(X, y, k, random_state=random_state)
for i, (X_train, X_dev, y_train, y_dev) in enumerate(folds):

    # The scaler is fitted on the training fold and only applied to the dev fold.
    scaler = FTStandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_dev = scaler.transform(X_dev)

    # batch_size equals X_train.shape[1], presumably the number of training
    # samples (i.e. full-batch training); confirm against FTMultilayerPerceptron.
    batch_size = X_train.shape[1]
    model = FTMultilayerPerceptron(
        nn_dimensions,
        batch_size=batch_size,
        random_state=random_state,
        early_stopping=True,
        verbose=10000,
        max_epoch=30000,
    )
    model.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)

    # Cross-entropy computed from the predicted probabilities.
    cost_train = cross_entropy_cost(y_train, model.predict_probas(X_train))
    cost_dev = cross_entropy_cost(y_dev, model.predict_probas(X_dev))
    costs_train.append(cost_train)
    costs_dev.append(cost_dev)

    # Accuracy compares predicted classes with the decoded one-hot targets.
    y_pred_train = model.predict(X_train)
    y_pred_dev = model.predict(X_dev)
    y_truth_train = one_hot_decoder(y_train)
    y_truth_dev = one_hot_decoder(y_dev)
    accuracy_train = accuracy(y_truth_train, y_pred_train)
    accuracy_dev = accuracy(y_truth_dev, y_pred_dev)
    accuracies_train.append(accuracy_train)
    accuracies_dev.append(accuracy_dev)

    # Per-fold report (the empty strings produce the blank separator lines).
    print('\n'.join([
        '',
        'Fold number ' + str(i),
        'Cost for training set = ' + str(cost_train),
        'Cost for      dev set = ' + str(cost_dev),
        '',
        'Accuracy for training set = ' + str(accuracy_train),
        'Accuracy for      dev set = ' + str(accuracy_dev),
        '',
    ]))

# Average each metric over the k folds.
mean_cost_train = sum(costs_train) / k
mean_cost_dev = sum(costs_dev) / k
mean_accuracy_train = sum(accuracies_train) / k
mean_accuracy_dev = sum(accuracies_dev) / k

print('\n'.join([
    '',
    '',
    'Mean Cost for training set = ' + str(mean_cost_train),
    'Mean Cost for      dev set = ' + str(mean_cost_dev),
    '',
    'Mean Accuracy for training set = ' + str(mean_accuracy_train),
    'Mean Accuracy for      dev set = ' + str(mean_accuracy_dev),
    '\n',
]))

epoch 0/30000 - loss: 0.6931471605599453 - val_loss: 0.6928702229794984
End of training:
epoch 407/30000 - loss: 0.6598207111772613 - val_loss: 0.6700575532770661

Fold number 0
Cost for training set = 0.6598144153081701
Cost for      dev set = 0.6700575532770661

Accuracy for training set = 0.6296296296296297
Accuracy for      dev set = 0.6071428571428571

epoch 0/30000 - loss: 0.6931471605599453 - val_loss: 0.6929572973889001
End of training:
epoch 203/30000 - loss: 0.6620339396912712 - val_loss: 0.6832804450061875

Fold number 1
Cost for training set = 0.6619862898853159
Cost for      dev set = 0.6832804450061875

Accuracy for training set = 0.6335282651072125
Accuracy for      dev set = 0.5714285714285714

epoch 0/30000 - loss: 0.6931471605599453 - val_loss: 0.6926769100769491
End of training:
epoch 5947/30000 - loss: 0.6641213346366042 - val_loss: 0.626686604092358

Fold number 2
Cost for training set = 0.6641213346366042
Cost for      dev set = 0.626686604092358

Accuracy for tra

Hmmmm... I didn't work much on the data, but such awful results look suspicious: the loss starts at ln 2 ≈ 0.693 and barely decreases. Is this a dying ReLU problem?

Let's try again, this time with leaky ReLU, to avoid this problem.

In [None]:
# Same k-fold evaluation as the baseline, but the hidden layers use the
# 'lrelu' activation. Per-fold metrics are collected in lists and averaged
# over k at the end.
costs_train, costs_dev = [], []
accuracies_train, accuracies_dev = [], []

folds = KFold(X, y, k, random_state=random_state)
for i, (X_train, X_dev, y_train, y_dev) in enumerate(folds):

    # The scaler is fitted on the training fold and only applied to the dev fold.
    scaler = FTStandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_dev = scaler.transform(X_dev)

    # batch_size equals X_train.shape[1], presumably the number of training
    # samples (i.e. full-batch training); confirm against FTMultilayerPerceptron.
    batch_size = X_train.shape[1]
    model = FTMultilayerPerceptron(
        nn_dimensions,
        hidden_activation='lrelu',
        batch_size=batch_size,
        random_state=random_state,
        early_stopping=True,
        verbose=10000,
        max_epoch=30000,
    )
    model.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)

    # Cross-entropy computed from the predicted probabilities.
    cost_train = cross_entropy_cost(y_train, model.predict_probas(X_train))
    cost_dev = cross_entropy_cost(y_dev, model.predict_probas(X_dev))
    costs_train.append(cost_train)
    costs_dev.append(cost_dev)

    # Accuracy compares predicted classes with the decoded one-hot targets.
    y_pred_train = model.predict(X_train)
    y_pred_dev = model.predict(X_dev)
    y_truth_train = one_hot_decoder(y_train)
    y_truth_dev = one_hot_decoder(y_dev)
    accuracy_train = accuracy(y_truth_train, y_pred_train)
    accuracy_dev = accuracy(y_truth_dev, y_pred_dev)
    accuracies_train.append(accuracy_train)
    accuracies_dev.append(accuracy_dev)

    # Per-fold report (the empty strings produce the blank separator lines).
    print('\n'.join([
        '',
        'Fold number ' + str(i),
        'Cost for training set = ' + str(cost_train),
        'Cost for      dev set = ' + str(cost_dev),
        '',
        'Accuracy for training set = ' + str(accuracy_train),
        'Accuracy for      dev set = ' + str(accuracy_dev),
        '',
    ]))

# Average each metric over the k folds.
mean_cost_train = sum(costs_train) / k
mean_cost_dev = sum(costs_dev) / k
mean_accuracy_train = sum(accuracies_train) / k
mean_accuracy_dev = sum(accuracies_dev) / k

print('\n'.join([
    '',
    '',
    'Mean Cost for training set = ' + str(mean_cost_train),
    'Mean Cost for      dev set = ' + str(mean_cost_dev),
    '',
    'Mean Accuracy for training set = ' + str(mean_accuracy_train),
    'Mean Accuracy for      dev set = ' + str(mean_accuracy_dev),
    '\n',
]))

epoch 0/30000 - loss: 0.7252764759887506 - val_loss: 0.7289629701478995
End of training:
epoch 6556/30000 - loss: 0.05014150718065007 - val_loss: 0.01753943862347959

Fold number 0
Cost for training set = 0.05013834561482558
Cost for      dev set = 0.01753943862347959

Accuracy for training set = 0.9902534113060428
Accuracy for      dev set = 1.0

epoch 0/30000 - loss: 0.7252231768283781 - val_loss: 0.7275137596021468
End of training:
epoch 2883/30000 - loss: 0.06810429862105259 - val_loss: 0.08779325484643037

Fold number 1
Cost for training set = 0.06809727464234808
Cost for      dev set = 0.08779325484643037

Accuracy for training set = 0.9785575048732943
Accuracy for      dev set = 0.9642857142857143

epoch 0/30000 - loss: 0.72655029150947 - val_loss: 0.7215084128094712
End of training:
epoch 3179/30000 - loss: 0.0680357012962475 - val_loss: 0.06723710959446409

Fold number 2
Cost for training set = 0.0680300977353123
Cost for      dev set = 0.06723710959446409

Accuracy for traini

It's better :) With a fairly shallow neural network and no work on the data (except normalization), the accuracy is pretty decent! But let's try to do better!

## Decrease the bias

### Increase hidden units

In [12]:
# Wider and deeper network: three hidden layers of 20 units each.
# The input/output sizes are read from the full X / y arrays (as in the first
# nn_dimensions cell) instead of X_train / y_train, which only exist as
# leftovers of the previous cell's fold loop. This assumes the folds keep
# axis 0 of X and y unchanged, since they are split from those arrays.
nn_dimensions = [X.shape[0], 20, 20, 20, y.shape[0]]

### Training

In [14]:
# k-fold evaluation of the wider network described by the current
# `nn_dimensions`, with a larger epoch budget. Per-fold metrics are collected
# in lists and averaged over k at the end.
costs_train, costs_dev = [], []
accuracies_train, accuracies_dev = [], []

folds = KFold(X, y, k, random_state=random_state)
for i, (X_train, X_dev, y_train, y_dev) in enumerate(folds):

    # The scaler is fitted on the training fold and only applied to the dev fold.
    scaler = FTStandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_dev = scaler.transform(X_dev)

    # batch_size equals X_train.shape[1], presumably the number of training
    # samples (i.e. full-batch training); confirm against FTMultilayerPerceptron.
    batch_size = X_train.shape[1]
    # Note: the l2_reg=True / lambd=1.5 options are left disabled for this run.
    model = FTMultilayerPerceptron(
        nn_dimensions,
        batch_size=batch_size,
        random_state=random_state,
        early_stopping=True,
        max_epoch=100000,
        verbose=10000,
    )
    model.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)

    # Cross-entropy computed from the predicted probabilities.
    cost_train = cross_entropy_cost(y_train, model.predict_probas(X_train))
    cost_dev = cross_entropy_cost(y_dev, model.predict_probas(X_dev))
    costs_train.append(cost_train)
    costs_dev.append(cost_dev)

    # Accuracy compares predicted classes with the decoded one-hot targets.
    y_pred_train = model.predict(X_train)
    y_pred_dev = model.predict(X_dev)
    y_truth_train = one_hot_decoder(y_train)
    y_truth_dev = one_hot_decoder(y_dev)
    accuracy_train = accuracy(y_truth_train, y_pred_train)
    accuracy_dev = accuracy(y_truth_dev, y_pred_dev)
    accuracies_train.append(accuracy_train)
    accuracies_dev.append(accuracy_dev)

    # Per-fold report (the empty strings produce the blank separator lines).
    print('\n'.join([
        '',
        'Fold number ' + str(i),
        'Cost for training set = ' + str(cost_train),
        'Cost for      dev set = ' + str(cost_dev),
        '',
        'Accuracy for training set = ' + str(accuracy_train),
        'Accuracy for      dev set = ' + str(accuracy_dev),
        '',
    ]))

# Average each metric over the k folds.
mean_cost_train = sum(costs_train) / k
mean_cost_dev = sum(costs_dev) / k
mean_accuracy_train = sum(accuracies_train) / k
mean_accuracy_dev = sum(accuracies_dev) / k

print('\n'.join([
    '',
    '',
    'Mean Cost for training set = ' + str(mean_cost_train),
    'Mean Cost for      dev set = ' + str(mean_cost_dev),
    '',
    'Mean Accuracy for training set = ' + str(mean_accuracy_train),
    'Mean Accuracy for      dev set = ' + str(mean_accuracy_dev),
    '\n',
]))

epoch 0/100000 - loss: 0.9351147094054832 - val_loss: 0.8931816892990122
End of training:
epoch 3083/100000 - loss: 0.0254270387935981 - val_loss: 0.03462181734367178

Fold number 0
Cost for training set = 0.0254184445426214
Cost for      dev set = 0.03462181734367178

Accuracy for training set = 0.9902534113060428
Accuracy for      dev set = 0.9821428571428571

epoch 0/100000 - loss: 0.9310020225468737 - val_loss: 0.9308833501332077
End of training:
epoch 3665/100000 - loss: 0.02168985200509469 - val_loss: 0.07447561441921886

Fold number 1
Cost for training set = 0.021684509691213688
Cost for      dev set = 0.07447561441921886

Accuracy for training set = 0.9922027290448343
Accuracy for      dev set = 0.9464285714285714

epoch 0/100000 - loss: 0.9406947156723722 - val_loss: 0.8725817169754615
End of training:
epoch 2643/100000 - loss: 0.029979643703953833 - val_loss: 0.03783544243830451

Fold number 2
Cost for training set = 0.029969189465917118
Cost for      dev set = 0.037835442438

## With another optimization function

### Training