# Model training

The purpose of this notebook is to take advantage of the data exploration process to train a performant model.

In addition, we will iterate on some hyperparameters to better fit the data.

In [1]:
import numpy as np
import pandas as pd

import os
import sys
import inspect
# Make the parent of the notebook's folder importable so the `utils` package
# imported below resolves.
# The kernel's working directory is used instead of
# inspect.getfile(inspect.currentframe()): for a notebook cell that call returns
# a pseudo/temporary file name (a temp-dir path on recent ipykernel versions),
# so its parent directory is not reliably the repository root. The notebook
# already assumes the working directory is its own folder ('../data.csv').
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

from utils.model import FTMultilayerPerceptron
from utils.data_processing.normalizers import FTStandardScaler
from utils.data_processing.transform_labels import get_labels, labels_to_numbers
from utils.data_processing.one_hot import one_hot_encoder, one_hot_decoder
from utils.data_processing.selection import train_dev_split, KFold
from utils.metrics import *

## Load the dataset

In [2]:
# Read the raw CSV; header=None because the file has no header row, so pandas
# numbers the columns 0, 1, 2, ...
data_path = '../data.csv'
df_orig = pd.read_csv(data_path, header=None)

In [3]:
# Display the first rows of the raw frame as a quick sanity check of the load.
df_orig.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Set a seed

Here we set a seed to be able to reproduce these 'randomized' experiments.

In [4]:
# Seed handed to KFold and FTMultilayerPerceptron (as `random_state`) in the
# training cells below.
random_state = 0

## Quick and dirty

We are first going to use all the features of the dataset and a simple implementation of the neural network (with two hidden layers, since the subject requires a minimum of 2 layers).

### Prepare data

In [5]:
# Work on a copy of the raw frame without column 0 (it appears to hold a record
# identifier -- confirm against the dataset description).
df = df_orig.drop(labels=[0], axis='columns')

In [6]:
# Every column after the first one of `df` is a feature; the array is transposed
# so that each column of X is one example.
feature_frame = df.iloc[:, 1:]
X = np.transpose(np.array(feature_frame))

In [7]:
# The first column of `df` holds the class label of each example.
label_column = df.iloc[:, :1]

# Collect the labels, map each row to a number, then one-hot encode the numbers
# with one slot per label.
labels = get_labels(label_column)
y_numeric = labels_to_numbers(label_column, labels)
y = one_hot_encoder(y_numeric, len(labels))

### Set the parameters of the neural network

In [8]:
# Layer sizes: input width taken from X, two hidden layers of 2 units each,
# output width taken from the one-hot encoded y.
n_inputs = X.shape[0]
n_outputs = y.shape[0]
nn_dimensions = [n_inputs, 2, 2, n_outputs]

### Training

In [9]:
# Number of folds passed to KFold; also the divisor used to average the
# per-fold metrics in the training cells.
k = 10

In [10]:
# Baseline experiment: k-fold cross-validation of the network with its default
# hidden activation. Per-fold metrics are summed in the mean_* accumulators and
# divided by k once every fold has been processed.
mean_cost_train = mean_cost_dev = 0
mean_accuracy_train = mean_accuracy_dev = 0

folds = KFold(X, y, k, random_state=random_state)
for i, (X_train, X_dev, y_train, y_dev) in enumerate(folds):

    # The scaler is fitted on the training fold only; the dev fold reuses it.
    scaler = FTStandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_dev = scaler.transform(X_dev)

    # One batch spans X_train.shape[1] columns (presumably every training example).
    batch_size = X_train.shape[1]
    model_options = {
        'batch_size': batch_size,
        'random_state': random_state,
        'early_stopping': True,
        'patience': 5,
        'verbose': 10000,
        'max_epoch': 30000,
    }
    model = FTMultilayerPerceptron(nn_dimensions, **model_options)
    model.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)

    # Cross-entropy of the predicted probabilities on each fold.
    cost_train = cross_entropy_cost(y_train, model.predict_probas(X_train))
    cost_dev = cross_entropy_cost(y_dev, model.predict_probas(X_dev))

    # Accuracy compares the predictions with the decoded one-hot targets.
    y_pred_train = model.predict(X_train)
    y_pred_dev = model.predict(X_dev)
    y_truth_train = one_hot_decoder(y_train)
    y_truth_dev = one_hot_decoder(y_dev)
    accuracy_train = accuracy(y_truth_train, y_pred_train)
    accuracy_dev = accuracy(y_truth_dev, y_pred_dev)

    mean_cost_train += cost_train
    mean_cost_dev += cost_dev
    mean_accuracy_train += accuracy_train
    mean_accuracy_dev += accuracy_dev

    # Per-fold report (empty strings produce the blank separator lines).
    fold_report = [
        '',
        'Fold number ' + str(i),
        'Cost for training set = ' + str(cost_train),
        'Cost for      dev set = ' + str(cost_dev),
        '',
        'Accuracy for training set = ' + str(accuracy_train),
        'Accuracy for      dev set = ' + str(accuracy_dev),
        '',
    ]
    print('\n'.join(fold_report))

# Turn the accumulated sums into means over the k folds.
mean_cost_train, mean_cost_dev = mean_cost_train / k, mean_cost_dev / k
mean_accuracy_train, mean_accuracy_dev = mean_accuracy_train / k, mean_accuracy_dev / k

summary_report = [
    '',
    '',
    'Mean Cost for training set = ' + str(mean_cost_train),
    'Mean Cost for      dev set = ' + str(mean_cost_dev),
    '',
    'Mean Accuracy for training set = ' + str(mean_accuracy_train),
    'Mean Accuracy for      dev set = ' + str(mean_accuracy_dev),
    '\n',
]
print('\n'.join(summary_report))

epoch 0/30000 - loss: 0.6931471605599453 - val_loss: 0.6928702229794984
End of training:
epoch 362/30000 - loss: 0.6601762028127721 - val_loss: 0.6700096531609975

Fold number 0
Cost for training set = 0.6601665311871561
Cost for      dev set = 0.6700096531609975

Accuracy for training set = 0.6296296296296297
Accuracy for      dev set = 0.6071428571428571

epoch 0/30000 - loss: 0.6931471605599453 - val_loss: 0.6929572973889001
End of training:
epoch 158/30000 - loss: 0.6647396126798509 - val_loss: 0.6829127194904402

Fold number 1
Cost for training set = 0.6646656387539777
Cost for      dev set = 0.6829127194904402

Accuracy for training set = 0.6335282651072125
Accuracy for      dev set = 0.5714285714285714

epoch 0/30000 - loss: 0.6931471605599453 - val_loss: 0.6926769100769491
End of training:
epoch 6266/30000 - loss: 0.6641213346366043 - val_loss: 0.6266866040923408

Fold number 2
Cost for training set = 0.6641213346366043
Cost for      dev set = 0.6266866040923408

Accuracy for t

Hmmmm... I haven't worked much on the data, but such awful results look suspicious... Is this a dying ReLU problem?

Let's try this time with tanh to avoid this problem

In [11]:
# Same k-fold experiment as the baseline, but with hidden_activation='tanh'.
# Per-fold metrics are summed in the mean_* accumulators and divided by k once
# every fold has been processed.
mean_cost_train = mean_cost_dev = 0
mean_accuracy_train = mean_accuracy_dev = 0

folds = KFold(X, y, k, random_state=random_state)
for i, (X_train, X_dev, y_train, y_dev) in enumerate(folds):

    # The scaler is fitted on the training fold only; the dev fold reuses it.
    scaler = FTStandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_dev = scaler.transform(X_dev)

    # One batch spans X_train.shape[1] columns (presumably every training example).
    batch_size = X_train.shape[1]
    model_options = {
        'hidden_activation': 'tanh',
        'batch_size': batch_size,
        'random_state': random_state,
        'early_stopping': True,
        'patience': 5,
        'verbose': 10000,
        'max_epoch': 30000,
    }
    model = FTMultilayerPerceptron(nn_dimensions, **model_options)
    model.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)

    # Cross-entropy of the predicted probabilities on each fold.
    cost_train = cross_entropy_cost(y_train, model.predict_probas(X_train))
    cost_dev = cross_entropy_cost(y_dev, model.predict_probas(X_dev))

    # Accuracy compares the predictions with the decoded one-hot targets.
    y_pred_train = model.predict(X_train)
    y_pred_dev = model.predict(X_dev)
    y_truth_train = one_hot_decoder(y_train)
    y_truth_dev = one_hot_decoder(y_dev)
    accuracy_train = accuracy(y_truth_train, y_pred_train)
    accuracy_dev = accuracy(y_truth_dev, y_pred_dev)

    mean_cost_train += cost_train
    mean_cost_dev += cost_dev
    mean_accuracy_train += accuracy_train
    mean_accuracy_dev += accuracy_dev

    # Per-fold report (empty strings produce the blank separator lines).
    fold_report = [
        '',
        'Fold number ' + str(i),
        'Cost for training set = ' + str(cost_train),
        'Cost for      dev set = ' + str(cost_dev),
        '',
        'Accuracy for training set = ' + str(accuracy_train),
        'Accuracy for      dev set = ' + str(accuracy_dev),
        '',
    ]
    print('\n'.join(fold_report))

# Turn the accumulated sums into means over the k folds.
mean_cost_train, mean_cost_dev = mean_cost_train / k, mean_cost_dev / k
mean_accuracy_train, mean_accuracy_dev = mean_accuracy_train / k, mean_accuracy_dev / k

summary_report = [
    '',
    '',
    'Mean Cost for training set = ' + str(mean_cost_train),
    'Mean Cost for      dev set = ' + str(mean_cost_dev),
    '',
    'Mean Accuracy for training set = ' + str(mean_accuracy_train),
    'Mean Accuracy for      dev set = ' + str(mean_accuracy_dev),
    '\n',
]
print('\n'.join(summary_report))

epoch 0/30000 - loss: 1.0397859541150594 - val_loss: 1.0703188862491215
End of training:
epoch 6728/30000 - loss: 0.04155336658051457 - val_loss: 0.014590099648048157

Fold number 0
Cost for training set = 0.04155122370556702
Cost for      dev set = 0.014590099648048157

Accuracy for training set = 0.9922027290448343
Accuracy for      dev set = 1.0

epoch 0/30000 - loss: 1.0417325249590372 - val_loss: 0.9911456989764061
End of training:
epoch 1406/30000 - loss: 0.06558954125833209 - val_loss: 0.09379850086242773

Fold number 1
Cost for training set = 0.06556844812227397
Cost for      dev set = 0.09379850086242773

Accuracy for training set = 0.9844054580896686
Accuracy for      dev set = 0.9642857142857143

epoch 0/30000 - loss: 1.0466539062125946 - val_loss: 1.0067427543816392
End of training:
epoch 1366/30000 - loss: 0.06681464547220436 - val_loss: 0.11039817795382167

Fold number 2
Cost for training set = 0.06679016809196853
Cost for      dev set = 0.11039817795382167

Accuracy for 

It's better :) With a fairly shallow neural network and no work on the data (except normalization), the accuracy is pretty decent! But let's try to do better!

## Decrease the bias

### Increase hidden units

In [12]:
# Layer sizes: input width, two hidden layers of 7 units each, output width.
# The widths are read from the full arrays X and y rather than from
# X_train / y_train: those names only exist as leftovers of a previously run
# training loop, so relying on them breaks on a fresh kernel. The first
# nn_dimensions cell already sizes the same folds from X.shape[0] and y.shape[0].
nn_dimensions = [X.shape[0], 7, 7, y.shape[0]]

### Training

In [13]:
# k-fold cross-validation of the tanh network with the current nn_dimensions,
# a patience of 10 and up to 100000 epochs. Per-fold metrics are summed in the
# mean_* accumulators and divided by k once every fold has been processed.
mean_cost_train = mean_cost_dev = 0
mean_accuracy_train = mean_accuracy_dev = 0

folds = KFold(X, y, k, random_state=random_state)
for i, (X_train, X_dev, y_train, y_dev) in enumerate(folds):

    # The scaler is fitted on the training fold only; the dev fold reuses it.
    scaler = FTStandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_dev = scaler.transform(X_dev)

    # One batch spans X_train.shape[1] columns (presumably every training example).
    batch_size = X_train.shape[1]
    model_options = {
        'batch_size': batch_size,
        'random_state': random_state,
        'hidden_activation': 'tanh',
        'early_stopping': True,
        'patience': 10,
        'max_epoch': 100000,
        'verbose': 10000,
    }
    model = FTMultilayerPerceptron(nn_dimensions, **model_options)
    model.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)

    # Cross-entropy of the predicted probabilities on each fold.
    cost_train = cross_entropy_cost(y_train, model.predict_probas(X_train))
    cost_dev = cross_entropy_cost(y_dev, model.predict_probas(X_dev))

    # Accuracy compares the predictions with the decoded one-hot targets.
    y_pred_train = model.predict(X_train)
    y_pred_dev = model.predict(X_dev)
    y_truth_train = one_hot_decoder(y_train)
    y_truth_dev = one_hot_decoder(y_dev)
    accuracy_train = accuracy(y_truth_train, y_pred_train)
    accuracy_dev = accuracy(y_truth_dev, y_pred_dev)

    mean_cost_train += cost_train
    mean_cost_dev += cost_dev
    mean_accuracy_train += accuracy_train
    mean_accuracy_dev += accuracy_dev

    # Per-fold report (empty strings produce the blank separator lines).
    fold_report = [
        '',
        'Fold number ' + str(i),
        'Cost for training set = ' + str(cost_train),
        'Cost for      dev set = ' + str(cost_dev),
        '',
        'Accuracy for training set = ' + str(accuracy_train),
        'Accuracy for      dev set = ' + str(accuracy_dev),
        '',
    ]
    print('\n'.join(fold_report))

# Turn the accumulated sums into means over the k folds.
mean_cost_train, mean_cost_dev = mean_cost_train / k, mean_cost_dev / k
mean_accuracy_train, mean_accuracy_dev = mean_accuracy_train / k, mean_accuracy_dev / k

summary_report = [
    '',
    '',
    'Mean Cost for training set = ' + str(mean_cost_train),
    'Mean Cost for      dev set = ' + str(mean_cost_dev),
    '',
    'Mean Accuracy for training set = ' + str(mean_accuracy_train),
    'Mean Accuracy for      dev set = ' + str(mean_accuracy_dev),
    '\n',
]
print('\n'.join(summary_report))

epoch 0/100000 - loss: 0.99740596344597 - val_loss: 0.9874425480493414
epoch 10000/100000 - loss: 0.03306305109808842 - val_loss: 0.011264777917208946
epoch 20000/100000 - loss: 0.015409171971058651 - val_loss: 0.008873685199254258
epoch 30000/100000 - loss: 0.006575954933323627 - val_loss: 0.006347753614584037
End of training:
epoch 31088/100000 - loss: 0.005987313424912897 - val_loss: 0.006304914635190286

Fold number 0
Cost for training set = 0.005986792670496652
Cost for      dev set = 0.006304914635190286

Accuracy for training set = 0.9980506822612085
Accuracy for      dev set = 1.0

epoch 0/100000 - loss: 0.99520423704617 - val_loss: 1.0002616336176215
End of training:
epoch 2504/100000 - loss: 0.05585143732219983 - val_loss: 0.11630170845898197

Fold number 1
Cost for training set = 0.05584304270000636
Cost for      dev set = 0.11630170845898197

Accuracy for training set = 0.98635477582846
Accuracy for      dev set = 0.9464285714285714

epoch 0/100000 - loss: 1.000447719023107

### Increase hidden units again

In [17]:
# Layer sizes: input width, three hidden layers of 20 units each, output width.
# The widths are read from the full arrays X and y rather than from
# X_train / y_train: those names only exist as leftovers of a previously run
# training loop, so relying on them breaks on a fresh kernel. The first
# nn_dimensions cell already sizes the same folds from X.shape[0] and y.shape[0].
nn_dimensions = [X.shape[0], 20, 20, 20, y.shape[0]]

### Training

In [19]:
# k-fold cross-validation of the tanh network with the current nn_dimensions,
# a patience of 10 and up to 100000 epochs. Per-fold metrics are summed in the
# mean_* accumulators and divided by k once every fold has been processed.
mean_cost_train = mean_cost_dev = 0
mean_accuracy_train = mean_accuracy_dev = 0

folds = KFold(X, y, k, random_state=random_state)
for i, (X_train, X_dev, y_train, y_dev) in enumerate(folds):

    # The scaler is fitted on the training fold only; the dev fold reuses it.
    scaler = FTStandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_dev = scaler.transform(X_dev)

    # One batch spans X_train.shape[1] columns (presumably every training example).
    batch_size = X_train.shape[1]
    model_options = {
        'batch_size': batch_size,
        'random_state': random_state,
        'hidden_activation': 'tanh',
        'early_stopping': True,
        'patience': 10,
        'max_epoch': 100000,
        'verbose': 10000,
    }
    model = FTMultilayerPerceptron(nn_dimensions, **model_options)
    model.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)

    # Cross-entropy of the predicted probabilities on each fold.
    cost_train = cross_entropy_cost(y_train, model.predict_probas(X_train))
    cost_dev = cross_entropy_cost(y_dev, model.predict_probas(X_dev))

    # Accuracy compares the predictions with the decoded one-hot targets.
    y_pred_train = model.predict(X_train)
    y_pred_dev = model.predict(X_dev)
    y_truth_train = one_hot_decoder(y_train)
    y_truth_dev = one_hot_decoder(y_dev)
    accuracy_train = accuracy(y_truth_train, y_pred_train)
    accuracy_dev = accuracy(y_truth_dev, y_pred_dev)

    mean_cost_train += cost_train
    mean_cost_dev += cost_dev
    mean_accuracy_train += accuracy_train
    mean_accuracy_dev += accuracy_dev

    # Per-fold report (empty strings produce the blank separator lines).
    fold_report = [
        '',
        'Fold number ' + str(i),
        'Cost for training set = ' + str(cost_train),
        'Cost for      dev set = ' + str(cost_dev),
        '',
        'Accuracy for training set = ' + str(accuracy_train),
        'Accuracy for      dev set = ' + str(accuracy_dev),
        '',
    ]
    print('\n'.join(fold_report))

# Turn the accumulated sums into means over the k folds.
mean_cost_train, mean_cost_dev = mean_cost_train / k, mean_cost_dev / k
mean_accuracy_train, mean_accuracy_dev = mean_accuracy_train / k, mean_accuracy_dev / k

summary_report = [
    '',
    '',
    'Mean Cost for training set = ' + str(mean_cost_train),
    'Mean Cost for      dev set = ' + str(mean_cost_dev),
    '',
    'Mean Accuracy for training set = ' + str(mean_accuracy_train),
    'Mean Accuracy for      dev set = ' + str(mean_accuracy_dev),
    '\n',
]
print('\n'.join(summary_report))

epoch 0/100000 - loss: 1.3087060935571988 - val_loss: 1.3250277327849596
End of training:
epoch 7082/100000 - loss: 0.020565901405728185 - val_loss: 0.013994870723874828

Fold number 0
Cost for training set = 0.020561929058845584
Cost for      dev set = 0.013994870723874828

Accuracy for training set = 0.9941520467836257
Accuracy for      dev set = 1.0

epoch 0/100000 - loss: 1.3189109578000702 - val_loss: 1.1716282396410513
End of training:
epoch 1810/100000 - loss: 0.04707681283898881 - val_loss: 0.08519697145340348

Fold number 1
Cost for training set = 0.04706773893579337
Cost for      dev set = 0.08519697145340348

Accuracy for training set = 0.9902534113060428
Accuracy for      dev set = 0.9821428571428571

epoch 0/100000 - loss: 1.3162084476832179 - val_loss: 1.2108374028498368
End of training:
epoch 669/100000 - loss: 0.06513391387040354 - val_loss: 0.07276304302681981

Fold number 2
Cost for training set = 0.06510060802424608
Cost for      dev set = 0.07276304302681981

Accura

## Decrease the variance

### Training

In [None]:
# k-fold cross-validation of the tanh network with l2_reg=True and lambd=1.5
# (presumably an L2 weight penalty and its strength -- confirm in
# FTMultilayerPerceptron). Per-fold metrics are summed in the mean_*
# accumulators and divided by k once every fold has been processed.
mean_cost_train = mean_cost_dev = 0
mean_accuracy_train = mean_accuracy_dev = 0

folds = KFold(X, y, k, random_state=random_state)
for i, (X_train, X_dev, y_train, y_dev) in enumerate(folds):

    # The scaler is fitted on the training fold only; the dev fold reuses it.
    scaler = FTStandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_dev = scaler.transform(X_dev)

    # One batch spans X_train.shape[1] columns (presumably every training example).
    batch_size = X_train.shape[1]
    model_options = {
        'batch_size': batch_size,
        'random_state': random_state,
        'hidden_activation': 'tanh',
        'early_stopping': True,
        'l2_reg': True,
        'lambd': 1.5,
        'patience': 10,
        'max_epoch': 100000,
        'verbose': 10000,
    }
    model = FTMultilayerPerceptron(nn_dimensions, **model_options)
    model.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)

    # Cross-entropy of the predicted probabilities on each fold.
    cost_train = cross_entropy_cost(y_train, model.predict_probas(X_train))
    cost_dev = cross_entropy_cost(y_dev, model.predict_probas(X_dev))

    # Accuracy compares the predictions with the decoded one-hot targets.
    y_pred_train = model.predict(X_train)
    y_pred_dev = model.predict(X_dev)
    y_truth_train = one_hot_decoder(y_train)
    y_truth_dev = one_hot_decoder(y_dev)
    accuracy_train = accuracy(y_truth_train, y_pred_train)
    accuracy_dev = accuracy(y_truth_dev, y_pred_dev)

    mean_cost_train += cost_train
    mean_cost_dev += cost_dev
    mean_accuracy_train += accuracy_train
    mean_accuracy_dev += accuracy_dev

    # Per-fold report (empty strings produce the blank separator lines).
    fold_report = [
        '',
        'Fold number ' + str(i),
        'Cost for training set = ' + str(cost_train),
        'Cost for      dev set = ' + str(cost_dev),
        '',
        'Accuracy for training set = ' + str(accuracy_train),
        'Accuracy for      dev set = ' + str(accuracy_dev),
        '',
    ]
    print('\n'.join(fold_report))

# Turn the accumulated sums into means over the k folds.
mean_cost_train, mean_cost_dev = mean_cost_train / k, mean_cost_dev / k
mean_accuracy_train, mean_accuracy_dev = mean_accuracy_train / k, mean_accuracy_dev / k

summary_report = [
    '',
    '',
    'Mean Cost for training set = ' + str(mean_cost_train),
    'Mean Cost for      dev set = ' + str(mean_cost_dev),
    '',
    'Mean Accuracy for training set = ' + str(mean_accuracy_train),
    'Mean Accuracy for      dev set = ' + str(mean_accuracy_dev),
    '\n',
]
print('\n'.join(summary_report))

epoch 0/100000 - loss: 1.4823778927256268 - val_loss: 2.915591106307399
epoch 10000/100000 - loss: 0.1280554374755036 - val_loss: 1.0394564777252044
epoch 20000/100000 - loss: 0.08202224691048102 - val_loss: 0.7675514429519823
epoch 30000/100000 - loss: 0.060003219165110346 - val_loss: 0.5836052474810315
epoch 40000/100000 - loss: 0.0480565646002832 - val_loss: 0.4708006304279736
epoch 50000/100000 - loss: 0.041467649520011605 - val_loss: 0.403730996332112
epoch 60000/100000 - loss: 0.0377881127938186 - val_loss: 0.3639659038362708
epoch 70000/100000 - loss: 0.03570308137539638 - val_loss: 0.340831747302833
epoch 80000/100000 - loss: 0.034496490935978355 - val_loss: 0.3276319850277055
epoch 90000/100000 - loss: 0.03377616268746275 - val_loss: 0.32015365024608544
End of training:
epoch 100000/100000 - loss: 0.033326473566232674 - val_loss: 0.31572503239113653

Fold number 0
Cost for training set = 0.0052161816581970605
Cost for      dev set = 0.0582150090564561

Accuracy for training se

## Other optimizer

### Training

In [None]:
# Layer sizes: input width, three hidden layers of 20 units each, output width.
# The widths are read from the full arrays X and y rather than from
# X_train / y_train: those names only exist as leftovers of a previously run
# training loop, so relying on them breaks on a fresh kernel. The first
# nn_dimensions cell already sizes the same folds from X.shape[0] and y.shape[0].
nn_dimensions = [X.shape[0], 20, 20, 20, y.shape[0]]

In [None]:
# k-fold cross-validation of the tanh network trained with optimizer='adam'.
# Per-fold metrics are summed in the mean_* accumulators and divided by k once
# every fold has been processed.
mean_cost_train = mean_cost_dev = 0
mean_accuracy_train = mean_accuracy_dev = 0

folds = KFold(X, y, k, random_state=random_state)
for i, (X_train, X_dev, y_train, y_dev) in enumerate(folds):

    # The scaler is fitted on the training fold only; the dev fold reuses it.
    scaler = FTStandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_dev = scaler.transform(X_dev)

    # One batch spans X_train.shape[1] columns (presumably every training example).
    batch_size = X_train.shape[1]
    model_options = {
        'batch_size': batch_size,
        'random_state': random_state,
        'hidden_activation': 'tanh',
        'optimizer': 'adam',
        'early_stopping': True,
        'patience': 10,
        'max_epoch': 100000,
        'verbose': 10000,
    }
    model = FTMultilayerPerceptron(nn_dimensions, **model_options)
    model.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)

    # Cross-entropy of the predicted probabilities on each fold.
    cost_train = cross_entropy_cost(y_train, model.predict_probas(X_train))
    cost_dev = cross_entropy_cost(y_dev, model.predict_probas(X_dev))

    # Accuracy compares the predictions with the decoded one-hot targets.
    y_pred_train = model.predict(X_train)
    y_pred_dev = model.predict(X_dev)
    y_truth_train = one_hot_decoder(y_train)
    y_truth_dev = one_hot_decoder(y_dev)
    accuracy_train = accuracy(y_truth_train, y_pred_train)
    accuracy_dev = accuracy(y_truth_dev, y_pred_dev)

    mean_cost_train += cost_train
    mean_cost_dev += cost_dev
    mean_accuracy_train += accuracy_train
    mean_accuracy_dev += accuracy_dev

    # Per-fold report (empty strings produce the blank separator lines).
    fold_report = [
        '',
        'Fold number ' + str(i),
        'Cost for training set = ' + str(cost_train),
        'Cost for      dev set = ' + str(cost_dev),
        '',
        'Accuracy for training set = ' + str(accuracy_train),
        'Accuracy for      dev set = ' + str(accuracy_dev),
        '',
    ]
    print('\n'.join(fold_report))

# Turn the accumulated sums into means over the k folds.
mean_cost_train, mean_cost_dev = mean_cost_train / k, mean_cost_dev / k
mean_accuracy_train, mean_accuracy_dev = mean_accuracy_train / k, mean_accuracy_dev / k

summary_report = [
    '',
    '',
    'Mean Cost for training set = ' + str(mean_cost_train),
    'Mean Cost for      dev set = ' + str(mean_cost_dev),
    '',
    'Mean Accuracy for training set = ' + str(mean_accuracy_train),
    'Mean Accuracy for      dev set = ' + str(mean_accuracy_dev),
    '\n',
]
print('\n'.join(summary_report))