## Data Download

## Imports and Setup

In [1]:
import idx2numpy # https://pypi.org/project/idx2numpy/
import numpy as np

np.random.seed(0)
np.set_printoptions(linewidth=200)

import matplotlib.pyplot as plt
%matplotlib notebook

from tqdm import tqdm

from mpl_toolkits.mplot3d import Axes3D
from scipy.interpolate import griddata

from glob import glob

PATH = 'fashion-mnist/data/fashion/'
DICT = {
    0: 't-shirt',
    1: 'trouser',
    2: 'pullover',
    3: 'dress',
    4: 'coat',
    5: 'sandal',
    6: 'shirt',
    7: 'sneaker',
    8: 'bag',
    9: 'ankle boot'
}

## Data preparation

In [2]:
data = {}
for file, label in zip(sorted(glob(PATH+'*')),
                ['test_images', 'test_labels',
                 'train_images', 'train_labels']):
    data[label] = idx2numpy.convert_from_file(file)

## 1. Data exploration

In [3]:
for key, value in data.items():
    print(f'{key} shape: {value.shape}')

test_images shape: (10000, 28, 28)
test_labels shape: (10000,)
train_images shape: (60000, 28, 28)
train_labels shape: (60000,)


In [4]:
print(DICT[data['train_labels'][0]])
plt.imshow(data['train_images'][0], cmap='gray')
plt.show()

ankle boot


<IPython.core.display.Javascript object>

## 3. Data preprocessing

In [5]:
# define training and testing arrays
Xtr = data['train_images']
Ytr = data['train_labels']
Xte = data['test_images']
Yte = data['test_labels']


# preprocessing lambda functions
keys = np.random.permutation(len(Xtr))
shuffle = lambda X, keys: X[keys]
scale = lambda X: (X.astype(np.float32) - 127.5) / 127.5
flat = lambda X: X.reshape(X.shape[0], -1)


# preprocess arrays
# shuffle image dataset
Xtr = shuffle(Xtr, keys)
Ytr = shuffle(Ytr, keys)

# scale pixel values between -1 and 1
Xtr = scale(Xtr)
Xte = scale(Xte)

# flatten image arrays from 28 x 28 to 784 x 1
Xtr = flat(Xtr)
Xte = flat(Xte)

# make images zero-mean
Xtr -= np.mean(Xtr)
Xte -= np.mean(Xte)

In [6]:
print(DICT[Ytr[0]])
plt.imshow((Xtr[0].reshape(28, 28)), cmap='gray')
plt.show()

t-shirt


<IPython.core.display.Javascript object>

## 2. Fully Connected Neural Network

In [7]:
print(f"""
We will use a fully connected neural network with {Xtr.shape[1]} input neurons and {len(DICT)} output neurons. Classification of an image dataset is a large-enough problem to warrant a neural network; however, we will experiment with different hidden layer sizes and numbers. 
""")


We will use a fully connected neural network with 784 input neurons and 10 output neurons. Classification of an image dataset is a large-enough problem to warrant a neural network; however, we will experiment with different hidden layer sizes and numbers. 



In [8]:
from nn import *

Try different activation too (relu and sigmoid)
and optimizers (sgd and adam)

In [9]:
nodes  = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1028]
layers = [1, 2, 3, 4]

losses = []

with tqdm(total=len(layers) * len(nodes)) as pbar:
    for layer in layers:
        for node in nodes:
            # initialize model
            model = Model()

            # add layers
            model.add(Layer_Dense(Xtr.shape[1], node))
            model.add(Activation_ReLU())
            for _ in range(layer):
                model.add(Layer_Dense(node, node))
                model.add(Activation_ReLU())
            model.add(Layer_Dense(node, 10))
            model.add(Activation_Softmax())

            # set model parameters
            model.set(
                loss=Loss_CategoricalCrossentropy(),
                optimizer=Optimizer_Adam(decay=1e-3),
                accuracy=Accuracy_Categorical()
            )

            # finalize model
            model.finalize()

            # train model
            model.train(Xtr, Ytr, validation_data=(Xte, Yte),
                        epochs=5, batch_size=128, p=False) # increase to 10 epochs before submission

            losses.append((layer, node, model.evaluate(Xte, Yte, p=False)))
            pbar.update(1)

100%|██████████| 40/40 [26:31<00:00, 39.79s/it] 


## 4. Performance validation

In [10]:
x, y, z = zip(*losses)
z = list(map(float, z))
grid_x, grid_y = np.mgrid[min(x):max(x):100j, min(y):max(y):100j]
grid_z = griddata((x, y), z, (grid_x, grid_y), method='cubic')

fig = plt.figure()
ax = fig.gca(projection='3d')
ax.plot_surface(grid_x, grid_y, grid_z, cmap=plt.cm.Spectral)
plt.show()

<IPython.core.display.Javascript object>

In [11]:
# 1 layer larger than the input image works well
losses

[(1, 2, 2.302586844343015),
 (1, 4, 0.7007382336248197),
 (1, 8, 0.6029430780557105),
 (1, 16, 0.5078750395106498),
 (1, 32, 0.47972879559713355),
 (1, 64, 0.41724176979812183),
 (1, 128, 0.3689192354986687),
 (1, 256, 0.34068021436533147),
 (1, 512, 0.3245194735661972),
 (1, 1028, 0.3325341849563749),
 (2, 2, 2.3025868443430895),
 (2, 4, 1.0003741570432478),
 (2, 8, 0.6835245240206015),
 (2, 16, 0.6759450665588541),
 (2, 32, 0.57697856673716),
 (2, 64, 0.4516169964002416),
 (2, 128, 0.3915463296159219),
 (2, 256, 0.3477302789075241),
 (2, 512, 0.33100272178715545),
 (2, 1028, 0.34100293551578287),
 (3, 2, 2.3025868443430895),
 (3, 4, 1.1347173522740484),
 (3, 8, 0.9472933376668793),
 (3, 16, 0.9766224218352884),
 (3, 32, 0.7625718596934135),
 (3, 64, 0.5821557620963044),
 (3, 128, 0.4788522405840489),
 (3, 256, 0.35978111058860407),
 (3, 512, 0.3365322753669588),
 (3, 1028, 0.3432860807748086),
 (4, 2, 2.3025868443430895),
 (4, 4, 2.3025868445228546),
 (4, 8, 2.3025868446163402),
 (4,

In [15]:
# initialize model
model = Model()

# add layers
model.add(Layer_Dense(Xtr.shape[1], 1024))
model.add(Activation_ReLU())
model.add(Layer_Dense(1024, 256))
model.add(Activation_ReLU())
model.add(Layer_Dense(256, 64))
model.add(Activation_ReLU())
model.add(Layer_Dense(64, 10))
model.add(Activation_Softmax())

# set model parameters
model.set(
    loss=Loss_CategoricalCrossentropy(),
    optimizer=Optimizer_Adam(decay=1e-3),
    accuracy=Accuracy_Categorical()
)

# finalize model
model.finalize()

# train model
model.train(Xtr, Ytr, validation_data=(Xte, Yte),
            epochs=20, batch_size=128, print_every=200)

epoch: 1
step: 0, acc: 0.117, loss: 2.303 (data_loss: 2.303, reg_loss: 0.000), lr: 0.001
step: 200, acc: 0.805, loss: 0.574 (data_loss: 0.574, reg_loss: 0.000), lr: 0.0008333333333333334
step: 400, acc: 0.820, loss: 0.424 (data_loss: 0.424, reg_loss: 0.000), lr: 0.0007142857142857143
step: 468, acc: 0.875, loss: 0.362 (data_loss: 0.362, reg_loss: 0.000), lr: 0.000681198910081744
training, acc: 0.768, loss: 0.622 (data_loss: 0.622, reg_loss: 0.000), lr: 0.000681198910081744
validation, acc: 0.833, loss: 0.457
epoch: 2
step: 0, acc: 0.875, loss: 0.386 (data_loss: 0.386, reg_loss: 0.000), lr: 0.0006807351940095304
step: 200, acc: 0.844, loss: 0.435 (data_loss: 0.435, reg_loss: 0.000), lr: 0.0005991611743559018
step: 400, acc: 0.859, loss: 0.339 (data_loss: 0.339, reg_loss: 0.000), lr: 0.0005350454788657037
step: 468, acc: 0.885, loss: 0.318 (data_loss: 0.318, reg_loss: 0.000), lr: 0.0005162622612287042
training, acc: 0.857, loss: 0.391 (data_loss: 0.391, reg_loss: 0.000), lr: 0.0005162622

step: 400, acc: 0.953, loss: 0.135 (data_loss: 0.135, reg_loss: 0.000), lr: 0.00011855364552459988
step: 468, acc: 0.979, loss: 0.079 (data_loss: 0.079, reg_loss: 0.000), lr: 0.00011760555098200635
training, acc: 0.939, loss: 0.170 (data_loss: 0.170, reg_loss: 0.000), lr: 0.00011760555098200635
validation, acc: 0.889, loss: 0.330
epoch: 17
step: 0, acc: 0.953, loss: 0.115 (data_loss: 0.115, reg_loss: 0.000), lr: 0.00011759172154280338
step: 200, acc: 0.961, loss: 0.151 (data_loss: 0.151, reg_loss: 0.000), lr: 0.00011488970588235294
step: 400, acc: 0.953, loss: 0.131 (data_loss: 0.131, reg_loss: 0.000), lr: 0.00011230907457322552
step: 468, acc: 0.979, loss: 0.075 (data_loss: 0.075, reg_loss: 0.000), lr: 0.00011145786892554613
training, acc: 0.942, loss: 0.164 (data_loss: 0.164, reg_loss: 0.000), lr: 0.00011145786892554613
validation, acc: 0.890, loss: 0.333
epoch: 18
step: 0, acc: 0.953, loss: 0.109 (data_loss: 0.109, reg_loss: 0.000), lr: 0.00011144544745347155
step: 200, acc: 0.969, 

In [14]:
model.plot()

<IPython.core.display.Javascript object>