In [92]:
import keras
import numpy as np
from numpy import load
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout, BatchNormalization

In [117]:
#unpack data from source

data = np.load('train_and_test.npz')  #what kind of file is this? zipped np.array file

X = [data[key] for key in data]  #??? why does this work? "data" is not a dictionary data type but a NpzFile type

xtrain0 = X[0]
ytrain0 = X[1]  #ytrain0 contains values between 0-42; so 43 categories
xtest0 = X[2]   #this is test data for final kaggle submission

x_tr, x_tst, y_tr, y_tst = train_test_split(xtrain0, ytrain0, test_size = 0.2,shuffle = True)  #results overfit if data is not shuffled!!!

In [118]:
#build model -  this is a function to build the model where activation type and batch
#normalization can be specified as parameters so models can be changed without retyping

def build_model(activation, batch_normalization = False):
    model = Sequential()
    model.add(Conv2D(8, (3,3), activation = activation, padding = 'same', name = 'conv1', 
                    input_shape = (32,32,3))) 
    if batch_normalization: 
        model.add(BatchNormalization())
    model.add(MaxPooling2D((2,2), name = 'maxpool1')) 
    model.add(Conv2D(16, (3,3), activation = activation, padding = 'same', name = 'conv2' ))
    if batch_normalization: 
        model.add(BatchNormalization())
    model.add(MaxPooling2D((2,2), name = 'maxpool2'))
    model.add(Flatten())
    model.add(Dense(64, activation=activation, name='dense_1'))
    model.add(Dense(43, activation='softmax', name='output'))  #there are 43 labels
    return model

# Optimizing Hyperparameters

The remaining of this notebook will be dedicated to testing for the optimal values of our hyperparameters within our model (specifically our learning rate and our momentum). For our own sanity, we will be setting our number of epochs to 20 for our initial tests to limit processing time. Once a proper learning rate has been selected, we will shift our focus to finding the optimal momentum rate. Finally, we will look for the proper number of epochs to run based off our selected learning rate and momentum.

In [20]:
learning_rates = []
momentums = []

rate = 1
m = 0.99
for i in range(5):
    if rate > 0.125:
        rate = rate/2
    elif rate == 0.125:
        rate = 0.01
    else:
        rate = rate/10
        
    momentums.append(round(m,2))
    m -= 0.02
    learning_rates.append(rate)
    
momentums += [0.5, 0.25, 0.1, 0]

test = [0.1,0.25,0.9,0.95,0.99]
    
print(learning_rates)
print(momentums)

[0.5, 0.25, 0.125, 0.01, 0.001]
[0.99, 0.97, 0.95, 0.93, 0.91, 0.5, 0.25, 0.1, 0]


In [21]:
for i in range(len(learning_rates)):
    print('learning rate:', learning_rates[i])
    print('momentum:', test[i])
    model = build_model(activation = 'sigmoid', batch_normalization = False)  # try 'relu' too
    opt = tf.keras.optimizers.SGD(learning_rate = learning_rates[i], momentum = test[i], nesterov=False)
    model.compile(loss = 'categorical_crossentropy', optimizer = opt, metrics = ['accuracy'])
    model.fit(x_tr, y_tr, epochs = 20, validation_data= (x_tst, y_tst))
    print('========================================================================================================')

learning rate: 0.5
momentum: 0.1
Train on 26270 samples, validate on 12939 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
learning rate: 0.25
momentum: 0.25
Train on 26270 samples, validate on 12939 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
learning rate: 0.125
momentum: 0.9
Train on 26270 samples, validate on 12939 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
learning rate: 0.01
momentum: 0.95
Train on 26270 samples, validate on 12939 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
learning rate: 0.001
momentum: 0.99
Train on 26270 samples, validate on 12939 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


The optimal learning rate seems to be around 0.125 (which is what similar to our original value that we were using to begin with). With that found, we will try to find the best matching momentum value and number of epochs. 

In [8]:
loss, accuracy = model.evaluate(x_tst, y_tst)
print('Test accuracy: %.4f' % (accuracy))

Test accuracy: 0.0531


In [12]:
for m in momentums:
    print('momentum:', learning_rates[i])
    model = build_model(activation = 'sigmoid', batch_normalization = False)  # try 'relu' too
    opt = tf.keras.optimizers.SGD(learning_rate = 0.125, momentum = m, nesterov=False)
    model.compile(loss = 'categorical_crossentropy', optimizer = opt, metrics = ['accuracy'])
    model.fit(x_tr, y_tr, epochs = 20, validation_data= (x_tst, y_tst))
    print('========================================================================================================')

momentum: 0.5
Train on 26270 samples, validate on 12939 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
momentum: 0.25
Train on 26270 samples, validate on 12939 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
momentum: 0.125
Train on 26270 samples, validate on 12939 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20


Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
momentum: 0.0625
Train on 26270 samples, validate on 12939 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
momentum: 0.03125
Train on 26270 samples, validate on 12939 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20

KeyboardInterrupt: 

In [136]:
model = build_model(activation = 'sigmoid', batch_normalization = True)  # try 'relu' too
opt = tf.keras.optimizers.SGD(learning_rate = 0.1, momentum = 0, nesterov=True)
model.compile(loss = 'categorical_crossentropy', optimizer = opt, metrics = ['accuracy'])
model.fit(x_tr, y_tr, epochs = 20, validation_data= (x_tst, y_tst))

Train on 31367 samples, validate on 7842 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x1be0a5e3e48>

In [137]:
yhat = model.predict(xtest0, verbose = False)

print(yhat)
print(len(yhat))


y_TST = [np.argmax(yhat[i]) for i in range(len(yhat))]

sub = [[0 for j in range(44)] for i in range(len(y_TST))]

for i in range(len(y_TST)):
    j = y_TST[i]+1
    sub[i][0] = i
    sub[i][j] = 1
# print(min(y_TST))
# print(max(y_TST))

print(y_TST[2])

[[7.3597480e-06 1.1880704e-06 4.0130180e-07 ... 9.2217283e-07
  4.8400634e-06 8.4510185e-07]
 [1.4789744e-04 9.7446352e-01 1.5995530e-03 ... 7.1707946e-06
  3.6577674e-08 4.9049589e-07]
 [1.3829646e-06 1.9243575e-07 3.6817747e-07 ... 7.9551297e-05
  1.6468884e-05 1.5074387e-06]
 ...
 [3.5664780e-04 1.0475375e-03 1.5668098e-03 ... 8.8161137e-04
  2.4159157e-04 4.5088156e-05]
 [1.3376473e-03 3.8526282e-03 5.0244057e-03 ... 5.3995554e-03
  1.0639826e-05 3.9452375e-04]
 [5.7629490e-04 9.8339608e-03 1.2270998e-03 ... 2.1267186e-03
  3.5112803e-03 2.1784692e-03]]
12630
38


In [149]:
model = build_model(activation = 'sigmoid', batch_normalization = True)  # try 'relu' too
opt = tf.keras.optimizers.SGD(learning_rate = 1e-5, momentum = 0.99, nesterov=True)
model.compile(loss = 'categorical_crossentropy', optimizer = opt, metrics = ['accuracy'])
model.fit(x_tr, y_tr, epochs = 100, validation_data= (x_tst, y_tst))

Train on 31367 samples, validate on 7842 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100


Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100

KeyboardInterrupt: 

In [138]:
import pandas as pd

cols = []
for i in range(43):
    cols.append('oh_'+str(i))
cols = ['id'] + cols

df = pd.DataFrame(data=sub, columns=cols)
df = df.set_index('id')

df.to_csv('results.csv', sep=',', encoding='utf-8')
df

Unnamed: 0_level_0,oh_0,oh_1,oh_2,oh_3,oh_4,oh_5,oh_6,oh_7,oh_8,oh_9,...,oh_33,oh_34,oh_35,oh_36,oh_37,oh_38,oh_39,oh_40,oh_41,oh_42
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12625,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12626,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
12627,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12628,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# Results


learning rate: 0.0001

momentum: 0.99

number epochs: 200

Although the number of epochs is a bit high, having such a large epoch range allows our model to slowly increase accuracy as the epoch step increases. If we limit our number of epochs to 20, our model settles around 87% accuracy. If we increase epochs to 50, our model accuracy incrases to 90% or so. Increasing it further to 100 brings us close to 95% accuracy. Finally, 200 epochs brings us close to 97% accuracy.

Overall, these results are similar to previous hyperparamter tuning we've performed in the past. By setting a low learning rate (with decent momentum to keep our model "learning" through each epoch step) and allowing our model to run through a high number of epochs, our model can steadily reduce its error until its performance plateaus.