In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.ndimage import convolve
from sklearn import linear_model, datasets, metrics
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import time
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn import ensemble

In [2]:
def nudge_dataset(X, Y):
    
    direction_vectors = [
        [[0, 1, 0],
         [0, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [1, 0, 0],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 1],
         [0, 0, 0]],

        [[0, 0, 0],
         [0, 0, 0],
         [0, 1, 0]]]

    def shift(x, w):
        return convolve(x.reshape((28, 28)), mode='constant', weights=w).ravel()

    X = np.concatenate([X] +
                       [np.apply_along_axis(shift, 1, X, vector)
                        for vector in direction_vectors])
    Y = np.concatenate([Y for _ in range(5)], axis=0)
    return X, Y

In [3]:
df_train = pd.read_csv('fashion-mnist_train.csv')
df_test = pd.read_csv('fashion-mnist_test.csv')
X_train = df_train.drop('label', axis = 1)
Y_train = df_train['label']
X_train, Y_train = nudge_dataset(X_train, Y_train)
X_train = (X_train - np.min(X_train, 0)) / (np.max(X_train, 0) + 0.0001)
X_test = df_test.drop('label', axis = 1)
Y_test = df_test['label']
X_test = (X_test - np.min(X_test, 0)) / (np.max(X_test, 0) + 0.0001)

In [4]:
#define random forest model
rfc = ensemble.RandomForestClassifier()
rfc.fit(X_train, Y_train)
print(rfc.score(X_train, Y_train))
print(rfc.score(X_test, Y_test))



0.9958466666666667
0.8715


While the random forest model performs well on the training set, it has a huge problem with overfitting and does not perform nearly as well on the training set

In [5]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=100)

In [None]:
parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

clf = GridSearchCV(mlp, parameter_space, n_jobs=1, cv=3)
clf.fit(X_train, Y_train)
# Best paramete set
print('Best parameters found:\n', clf.best_params_)

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [6]:
#single 100 perceptron layer
mlp = MLPClassifier(activation = 'tanh', alpha = 0.05, hidden_layer_sizes = (100), learning_rate = 'constant', 
                    solver = 'adam')
mlp.fit(X_train, Y_train)

MLPClassifier(activation='tanh', alpha=0.05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=100, learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [7]:
print(mlp.score(X_train, Y_train))
print(mlp.score(X_test, Y_test))

0.90183
0.8973


In [8]:
# 100 perceptron, 10 perceptron layers
mlp = MLPClassifier(activation = 'tanh', alpha = 0.05, hidden_layer_sizes = (100, 10), learning_rate = 'constant', 
                    solver = 'adam')
mlp.fit(X_train, Y_train)

MLPClassifier(activation='tanh', alpha=0.05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100, 10), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [9]:
print(mlp.score(X_train, Y_train))
print(mlp.score(X_test, Y_test))

0.9087333333333333
0.8984


In [10]:
#10 perceptron, 100 perceptron layers
mlp = MLPClassifier(activation = 'tanh', alpha = 0.05, hidden_layer_sizes = (10, 100), learning_rate = 'constant', 
                    solver = 'adam')
mlp.fit(X_train, Y_train)

MLPClassifier(activation='tanh', alpha=0.05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(10, 100), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [11]:
print(mlp.score(X_train, Y_train))
print(mlp.score(X_test, Y_test))

0.86731
0.8661


In [12]:
#two 100 perceptron layers
mlp = MLPClassifier(activation = 'tanh', alpha = 0.05, hidden_layer_sizes = (100, 100), learning_rate = 'constant', 
                    solver = 'adam')
mlp.fit(X_train, Y_train)

MLPClassifier(activation='tanh', alpha=0.05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100, 100), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [13]:
print(mlp.score(X_train, Y_train))
print(mlp.score(X_test, Y_test))

0.9201033333333334
0.8977


In [14]:
#one 200 perceptron layer 
mlp = MLPClassifier(activation = 'tanh', alpha = 0.05, hidden_layer_sizes = (200,), learning_rate = 'constant', 
                    solver = 'adam')
mlp.fit(X_train, Y_train)

MLPClassifier(activation='tanh', alpha=0.05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(200,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [15]:
print(mlp.score(X_train, Y_train))
print(mlp.score(X_test, Y_test))

0.90392
0.8973


In [16]:
#one 300 perceptron layer
mlp = MLPClassifier(activation = 'tanh', alpha = 0.05, hidden_layer_sizes = (300,), learning_rate = 'constant', 
                    solver = 'adam')
mlp.fit(X_train, Y_train)

MLPClassifier(activation='tanh', alpha=0.05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(300,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [17]:
print(mlp.score(X_train, Y_train))
print(mlp.score(X_test, Y_test))

0.90235
0.8949


In [18]:
#200 perceptron layer, 10 perceptron layer
mlp = MLPClassifier(activation = 'tanh', alpha = 0.05, hidden_layer_sizes = (200, 10), learning_rate = 'constant', 
                    solver = 'adam')
mlp.fit(X_train, Y_train)

MLPClassifier(activation='tanh', alpha=0.05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(200, 10), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [19]:
print(mlp.score(X_train, Y_train))
print(mlp.score(X_test, Y_test))

0.90865
0.8919


In [20]:
#one 200 perceptron layer, one 100 perceptron layer
mlp = MLPClassifier(activation = 'tanh', alpha = 0.05, hidden_layer_sizes = (200, 100), learning_rate = 'constant', 
                    solver = 'adam')
mlp.fit(X_train, Y_train)

MLPClassifier(activation='tanh', alpha=0.05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(200, 100), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [21]:
print(mlp.score(X_train, Y_train))
print(mlp.score(X_test, Y_test))

0.9204533333333333
0.902


In [22]:
#200 perceptron layer with larger alpha
mlp = MLPClassifier(activation = 'tanh', alpha = 0.10, hidden_layer_sizes = (200, ), learning_rate = 'constant', 
                    solver = 'adam')
mlp.fit(X_train, Y_train)

MLPClassifier(activation='tanh', alpha=0.1, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(200,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [23]:
print(mlp.score(X_train, Y_train))
print(mlp.score(X_test, Y_test))

0.8950333333333333
0.8911


In [24]:
#200 perceptron layer with smaller alpha
mlp = MLPClassifier(activation = 'tanh', alpha = 0.03, hidden_layer_sizes = (200, ), learning_rate = 'constant', 
                    solver = 'adam')
mlp.fit(X_train, Y_train)

MLPClassifier(activation='tanh', alpha=0.03, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(200,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [25]:
print(mlp.score(X_train, Y_train))
print(mlp.score(X_test, Y_test))

0.9191733333333333
0.9023


In [26]:
#200 perceptron layer with adaptive learning rate
mlp = MLPClassifier(activation = 'tanh', alpha = 0.05, hidden_layer_sizes = (200,), learning_rate = 'adaptive', 
                    solver = 'adam')
mlp.fit(X_train, Y_train)

MLPClassifier(activation='tanh', alpha=0.05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(200,), learning_rate='adaptive',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [27]:
print(mlp.score(X_train, Y_train))
print(mlp.score(X_test, Y_test))

0.9052733333333334
0.8949


After completing a grid search to find a starting point for the best hyperparameters for the neural network, I started with the hyperparameters activation = 'tanh', alpha = 0.05, hidden_layer_sizes = (100), learning_rate = 'constant', and solver = 'adam'. I then modified each of the parameters somewhat to see if the performance improved. I ultimately found the best performance with activation = 'tanh', alpha = 0.05, hidden_layer_sizes = (200, 100), learning_rate = 'constant', and solver = 'adam'. This model did not perform as well on the training set as the random forest model, but it did perfom better on the test set. Going forward, this would be a better model to use than the random forest model because it appears to have less of an issue with overfitting. It is possible that further modification of the parameters could further improve the neural network without the same overfit issues as the random forest model.