# Neural Networks for MNIST dataset

## Loading data

In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.utils import np_utils
from keras.datasets import mnist

Using Theano backend.


In [2]:
seed = 7
np.random.seed(seed)

In [3]:
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()

In [4]:
X_train.shape

(60000, 28, 28)

In [5]:
%matplotlib inline

In [6]:
# flatten 28*28 images to a 784 vector for each image
num_pixels = X_train.shape[1] * X_train.shape[2]
X_train = X_train.reshape(X_train.shape[0], num_pixels).astype('float32')
X_test = X_test.reshape(X_test.shape[0], num_pixels).astype('float32')

In [7]:
# normalize inputs from 0-255 to 0-1
X_train = X_train / 255
X_test = X_test / 255

In [8]:
# one hot encode outputs
Y_train = np_utils.to_categorical(Y_train)
Y_test = np_utils.to_categorical(Y_test)
num_classes = Y_test.shape[1]

## Part 2.1 - Tuning Learning Rates

In [9]:
from keras.models import Sequential
from keras.layers import Dense, Activation

In [25]:
M = 300

learning_rates = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]

for index, learning_rate in enumerate(learning_rates):
    model1 = Sequential()
    model1.add(Dense(M, input_dim=num_pixels, init='normal', activation='relu'))
    model1.add(Dense(10, activation='softmax'))
    model1.compile(optimizer='adam',
               loss='categorical_crossentropy',
               metrics=['accuracy'])
    
    model1.optimizer.lr.set_value(learning_rate)
    
    model1.fit(X_train, Y_train, validation_data=(X_test, Y_test), nb_epoch=10, batch_size=200, verbose=0)
    
    scores = model1.evaluate(X_test, Y_test, verbose=0)
    
    print "Learning Rate:", learning_rate, ", Validation Accuracy:", scores[1]

Learning Rate: 1 , Validation Accuracy: 0.0892
Learning Rate: 0.1 , Validation Accuracy: 0.2134
Learning Rate: 0.01 , Validation Accuracy: 0.9692
Learning Rate: 0.001 , Validation Accuracy: 0.9798
Learning Rate: 0.0001 , Validation Accuracy: 0.9555
Learning Rate: 1e-05 , Validation Accuracy: 0.8966


### 0.01 and 0.001 were the best learning rates. We will interpolate between these values and find the best learning rate to use:

In [29]:
new_learning_rates = np.arange(0.001, 0.01, 0.001).astype('float32')

for index, learning_rate in enumerate(new_learning_rates):
    model1 = Sequential()
    model1.add(Dense(M, input_dim=num_pixels, init='normal', activation='relu'))
    model1.add(Dense(10, activation='softmax'))
    model1.compile(optimizer='adam',
               loss='categorical_crossentropy',
               metrics=['accuracy'])
    
    model1.optimizer.lr.set_value(learning_rate)
    
    model1.fit(X_train, Y_train, validation_data=(X_test, Y_test), nb_epoch=10, batch_size=200, verbose=0)
    
    scores = model1.evaluate(X_test, Y_test, verbose=0)
    
    print "Learning Rate:", learning_rate, ", Validation Accuracy:", scores[1]

Learning Rate: 0.001 , Validation Accuracy: 0.979
Learning Rate: 0.002 , Validation Accuracy: 0.9784
Learning Rate: 0.003 , Validation Accuracy: 0.9762
Learning Rate: 0.004 , Validation Accuracy: 0.977
Learning Rate: 0.005 , Validation Accuracy: 0.9789
Learning Rate: 0.006 , Validation Accuracy: 0.9764
Learning Rate: 0.007 , Validation Accuracy: 0.9721
Learning Rate: 0.008 , Validation Accuracy: 0.9787
Learning Rate: 0.009 , Validation Accuracy: 0.9752


### Since the validation accuracy drops off from learning rate = 0.001 onwards, we will interpolate between 0.0001 and 0.001 instead:

In [30]:
new_learning_rates = np.arange(0.0001, 0.001, 0.0001).astype('float32')

for index, learning_rate in enumerate(new_learning_rates):
    model1 = Sequential()
    model1.add(Dense(M, input_dim=num_pixels, init='normal', activation='relu'))
    model1.add(Dense(10, activation='softmax'))
    model1.compile(optimizer='adam',
               loss='categorical_crossentropy',
               metrics=['accuracy'])
    
    model1.optimizer.lr.set_value(learning_rate)
    
    model1.fit(X_train, Y_train, validation_data=(X_test, Y_test), nb_epoch=10, batch_size=200, verbose=0)
    
    scores = model1.evaluate(X_test, Y_test, verbose=0)
    
    print "Learning Rate:", learning_rate, ", Validation Accuracy:", scores[1]

Learning Rate: 0.0001 , Validation Accuracy: 0.9557
Learning Rate: 0.0002 , Validation Accuracy: 0.9692
Learning Rate: 0.0003 , Validation Accuracy: 0.9751
Learning Rate: 0.0004 , Validation Accuracy: 0.9765
Learning Rate: 0.0005 , Validation Accuracy: 0.978
Learning Rate: 0.0006 , Validation Accuracy: 0.9774
Learning Rate: 0.0007 , Validation Accuracy: 0.9802
Learning Rate: 0.0008 , Validation Accuracy: 0.98
Learning Rate: 0.0009 , Validation Accuracy: 0.9787


## Part 2.2 - Tuning Hidden Layer Size

In [33]:
def get_model(lr=0.001, M=300):
    model = Sequential()
    model.add(Dense(M, input_dim=num_pixels, init='normal', activation='relu'))
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam',
               loss='categorical_crossentropy',
               metrics=['accuracy'])
    model.optimizer.lr.set_value(lr)
    return model

In [34]:
hidden_sizes = [10, 50, 100, 300, 1000, 2000]

In [37]:
for hidden_layer_size in hidden_sizes:
    
    model = get_model(lr=0.01, M = hidden_layer_size)
    model.fit(X_train, Y_train, validation_data=(X_test, Y_test), nb_epoch=10, batch_size=200, verbose=2)
    
    scores = model.evaluate(X_test, Y_test, verbose=0)
    
    print "Hidden Layer Size:", hidden_layer_size, ", Validation Accuracy:", scores[1]
    

Train on 60000 samples, validate on 10000 samples
Epoch 1/10
0s - loss: 0.5084 - acc: 0.8476 - val_loss: 0.3366 - val_acc: 0.9024
Epoch 2/10
0s - loss: 0.3274 - acc: 0.9075 - val_loss: 0.3168 - val_acc: 0.9115
Epoch 3/10
0s - loss: 0.3090 - acc: 0.9129 - val_loss: 0.3168 - val_acc: 0.9098
Epoch 4/10
0s - loss: 0.3014 - acc: 0.9148 - val_loss: 0.3057 - val_acc: 0.9126
Epoch 5/10
0s - loss: 0.2954 - acc: 0.9168 - val_loss: 0.3051 - val_acc: 0.9087
Epoch 6/10
0s - loss: 0.2890 - acc: 0.9180 - val_loss: 0.3122 - val_acc: 0.9097
Epoch 7/10
0s - loss: 0.2860 - acc: 0.9175 - val_loss: 0.3090 - val_acc: 0.9106
Epoch 8/10
0s - loss: 0.2814 - acc: 0.9197 - val_loss: 0.3143 - val_acc: 0.9096
Epoch 9/10
0s - loss: 0.2794 - acc: 0.9201 - val_loss: 0.3132 - val_acc: 0.9069
Epoch 10/10
0s - loss: 0.2766 - acc: 0.9205 - val_loss: 0.3016 - val_acc: 0.9139
Hidden Layer Size: 10 , Validation Accuracy: 0.9139
Train on 60000 samples, validate on 10000 samples
Epoch 1/10
0s - loss: 0.2652 - acc: 0.9211 - va

## Part 2.3 - L2 Weight Decay

In [38]:
from keras.regularizers import l2

def get_reg_model(lr=0.001, M=300, w=0.1):
    model = Sequential()
    model.add(Dense(M, input_dim=num_pixels, init='normal', activation='relu', W_regularizer=l2(w)))
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam',
               loss='categorical_crossentropy',
               metrics=['accuracy'])
    model.optimizer.lr.set_value(lr)
    return model

In [39]:
models = [get_model(lr = 0.01, M = 300), get_model(lr = 0.001, M = 300), get_reg_model(lr=0.001, M = 300, w = 0.002)]

## Model 1 - Learning Rate = 0.01, M = 300 

In [41]:
    model = models[0]
    model.fit(X_train, Y_train, validation_data=(X_test, Y_test), nb_epoch=10, batch_size=200, verbose=2)
    
    scores = model.evaluate(X_test, Y_test, verbose=0)
    
    print  "Validation Accuracy:", scores[1]

Train on 60000 samples, validate on 10000 samples
Epoch 1/10
1s - loss: 0.2199 - acc: 0.9328 - val_loss: 0.1084 - val_acc: 0.9675
Epoch 2/10
1s - loss: 0.0975 - acc: 0.9700 - val_loss: 0.0971 - val_acc: 0.9691
Epoch 3/10
1s - loss: 0.0723 - acc: 0.9778 - val_loss: 0.1060 - val_acc: 0.9708
Epoch 4/10
2s - loss: 0.0617 - acc: 0.9809 - val_loss: 0.1101 - val_acc: 0.9696
Epoch 5/10
2s - loss: 0.0563 - acc: 0.9821 - val_loss: 0.1200 - val_acc: 0.9691
Epoch 6/10
2s - loss: 0.0470 - acc: 0.9855 - val_loss: 0.1303 - val_acc: 0.9703
Epoch 7/10
2s - loss: 0.0522 - acc: 0.9848 - val_loss: 0.1537 - val_acc: 0.9680
Epoch 8/10
2s - loss: 0.0524 - acc: 0.9846 - val_loss: 0.1307 - val_acc: 0.9718
Epoch 9/10
2s - loss: 0.0421 - acc: 0.9880 - val_loss: 0.1585 - val_acc: 0.9707
Epoch 10/10
2s - loss: 0.0496 - acc: 0.9870 - val_loss: 0.1745 - val_acc: 0.9693
Validation Accuracy: 0.9693


## Model 2 - Learning Rate = 0.001, M = 300 

In [42]:
    model = models[1]
    model.fit(X_train, Y_train, validation_data=(X_test, Y_test), nb_epoch=10, batch_size=200, verbose=2)
    
    scores = model.evaluate(X_test, Y_test, verbose=0)
    
    print  "Validation Accuracy:", scores[1]

Train on 60000 samples, validate on 10000 samples
Epoch 1/10
1s - loss: 0.3422 - acc: 0.9045 - val_loss: 0.1766 - val_acc: 0.9482
Epoch 2/10
1s - loss: 0.1468 - acc: 0.9588 - val_loss: 0.1219 - val_acc: 0.9653
Epoch 3/10
1s - loss: 0.1016 - acc: 0.9709 - val_loss: 0.0974 - val_acc: 0.9700
Epoch 4/10
1s - loss: 0.0772 - acc: 0.9774 - val_loss: 0.0828 - val_acc: 0.9745
Epoch 5/10
1s - loss: 0.0586 - acc: 0.9833 - val_loss: 0.0779 - val_acc: 0.9768
Epoch 6/10
1s - loss: 0.0472 - acc: 0.9867 - val_loss: 0.0695 - val_acc: 0.9779
Epoch 7/10
1s - loss: 0.0382 - acc: 0.9897 - val_loss: 0.0662 - val_acc: 0.9804
Epoch 8/10
1s - loss: 0.0311 - acc: 0.9917 - val_loss: 0.0673 - val_acc: 0.9804
Epoch 9/10
1s - loss: 0.0252 - acc: 0.9934 - val_loss: 0.0649 - val_acc: 0.9806
Epoch 10/10
1s - loss: 0.0201 - acc: 0.9950 - val_loss: 0.0635 - val_acc: 0.9801
Validation Accuracy: 0.9801


## Model 3 - Learning Rate = 0.001, M = 300, w = 0.002, and epochs = 30



In [44]:
    model = models[2]
    model.fit(X_train, Y_train, validation_data=(X_test, Y_test), nb_epoch=30, batch_size=200, verbose=2)
    
    scores = model.evaluate(X_test, Y_test, verbose=0)
    
    print  "Validation Accuracy:", scores[1]

Train on 60000 samples, validate on 10000 samples
Epoch 1/30
1s - loss: 0.7699 - acc: 0.9021 - val_loss: 0.3846 - val_acc: 0.9438
Epoch 2/30
1s - loss: 0.3231 - acc: 0.9479 - val_loss: 0.2632 - val_acc: 0.9548
Epoch 3/30
1s - loss: 0.2548 - acc: 0.9564 - val_loss: 0.2336 - val_acc: 0.9605
Epoch 4/30
1s - loss: 0.2233 - acc: 0.9624 - val_loss: 0.2141 - val_acc: 0.9637
Epoch 5/30
1s - loss: 0.2092 - acc: 0.9653 - val_loss: 0.1968 - val_acc: 0.9688
Epoch 6/30
2s - loss: 0.1971 - acc: 0.9674 - val_loss: 0.1859 - val_acc: 0.9700
Epoch 7/30
2s - loss: 0.1889 - acc: 0.9699 - val_loss: 0.1881 - val_acc: 0.9667
Epoch 8/30
2s - loss: 0.1809 - acc: 0.9707 - val_loss: 0.1754 - val_acc: 0.9722
Epoch 9/30
2s - loss: 0.1762 - acc: 0.9710 - val_loss: 0.1743 - val_acc: 0.9704
Epoch 10/30
2s - loss: 0.1700 - acc: 0.9725 - val_loss: 0.1649 - val_acc: 0.9727
Epoch 11/30
2s - loss: 0.1643 - acc: 0.9741 - val_loss: 0.1664 - val_acc: 0.9736
Epoch 12/30
2s - loss: 0.1618 - acc: 0.9745 - val_loss: 0.1590 - val

## Part 2.4 - Models with Dropout

In [63]:
def get_dropout_model(lr=0.001, M=300, w=0.2):
    model = Sequential()
    model.add(Dense(M, input_dim=num_pixels, init='normal', activation='relu'))
    model.add(Dropout(w))
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam',
               loss='categorical_crossentropy',
               metrics=['accuracy'])
    model.optimizer.lr.set_value(lr)
    return model

In [69]:
dropouts = [ i/10.0 for i in range(1,11,1)]

In [70]:
dropouts

[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

In [71]:
for dropout in dropouts:
    model = get_dropout_model(lr = 0.001, M = 300, w = dropout)

    model.fit(X_train, Y_train, validation_data=(X_test, Y_test), nb_epoch=30, batch_size=200, verbose=2)
    scores = model.evaluate(X_test, Y_test, verbose=0)
    
    print "Dropout:", dropout, ", Validation Accuracy:", scores[1]
    

Train on 60000 samples, validate on 10000 samples
Epoch 1/30
1s - loss: 0.3544 - acc: 0.8999 - val_loss: 0.1779 - val_acc: 0.9485
Epoch 2/30
1s - loss: 0.1535 - acc: 0.9563 - val_loss: 0.1222 - val_acc: 0.9632
Epoch 3/30
1s - loss: 0.1080 - acc: 0.9690 - val_loss: 0.0963 - val_acc: 0.9695
Epoch 4/30
1s - loss: 0.0828 - acc: 0.9759 - val_loss: 0.0875 - val_acc: 0.9728
Epoch 5/30
1s - loss: 0.0669 - acc: 0.9813 - val_loss: 0.0782 - val_acc: 0.9765
Epoch 6/30
1s - loss: 0.0558 - acc: 0.9835 - val_loss: 0.0684 - val_acc: 0.9780
Epoch 7/30
1s - loss: 0.0460 - acc: 0.9870 - val_loss: 0.0680 - val_acc: 0.9782
Epoch 8/30
1s - loss: 0.0395 - acc: 0.9884 - val_loss: 0.0657 - val_acc: 0.9798
Epoch 9/30
1s - loss: 0.0333 - acc: 0.9903 - val_loss: 0.0627 - val_acc: 0.9805
Epoch 10/30
1s - loss: 0.0275 - acc: 0.9925 - val_loss: 0.0625 - val_acc: 0.9806
Epoch 11/30
1s - loss: 0.0251 - acc: 0.9929 - val_loss: 0.0605 - val_acc: 0.9813
Epoch 12/30
1s - loss: 0.0217 - acc: 0.9940 - val_loss: 0.0634 - val

## Part 2.5 - 3-layer Network

In [17]:
hl_1 = [300, 500]
hl_2 = [150, 300]
lrs = [0.001]
dropout_1 = [0.0, 0.1, 0.2]
dropout_2 = [0.0, 0.1, 0.2]

In [18]:
def get_three_layer_model(lr=0.001, M=500, N =150, w1=0.2, w2=0.2):
    model = Sequential()
    model.add(Dense(M, input_dim=num_pixels, init='normal', activation='relu'))
    model.add(Dropout(w1))
    
    model.add(Dense(N, input_dim=M, init='normal', activation='relu'))
    model.add(Dropout(w2))
    
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam',
               loss='categorical_crossentropy',
               metrics=['accuracy'])
    model.optimizer.lr.set_value(lr)
    return model

In [20]:
for M in hl_1:
    for N in hl_2:
        for lr in lrs:
            for w1 in dropout_1:
                for w2 in dropout_2:
                    
                    model = get_three_layer_model(lr=lr, M=M, N =N, w1=w1, w2=w2)
                    
                    model.fit(X_train, Y_train, validation_data=(X_test, Y_test), nb_epoch=30, batch_size=200, verbose=0)
                    scores = model.evaluate(X_test, Y_test, verbose=0)
    
                    print "Hidden Layer 1:", M, ", Hidden Layer 2:", N, ", Learning Rate:", lr, ", Dropout 1:", w1, ", Dropout 2:", w2, ", Validation Accuracy:", scores[1] 

                    
                    
                    
                    

Hidden Layer 1: 300 , Hidden Layer 2: 150 , Learning Rate: 0.001 , Dropout 1: 0.0 , Dropout 2: 0.0 , Validation Accuracy: 0.9839
Hidden Layer 1: 300 , Hidden Layer 2: 150 , Learning Rate: 0.001 , Dropout 1: 0.0 , Dropout 2: 0.1 , Validation Accuracy: 0.9826
Hidden Layer 1: 300 , Hidden Layer 2: 150 , Learning Rate: 0.001 , Dropout 1: 0.0 , Dropout 2: 0.2 , Validation Accuracy: 0.9806
Hidden Layer 1: 300 , Hidden Layer 2: 150 , Learning Rate: 0.001 , Dropout 1: 0.1 , Dropout 2: 0.0 , Validation Accuracy: 0.9829
Hidden Layer 1: 300 , Hidden Layer 2: 150 , Learning Rate: 0.001 , Dropout 1: 0.1 , Dropout 2: 0.1 , Validation Accuracy: 0.983
Hidden Layer 1: 300 , Hidden Layer 2: 150 , Learning Rate: 0.001 , Dropout 1: 0.1 , Dropout 2: 0.2 , Validation Accuracy: 0.9856
Hidden Layer 1: 300 , Hidden Layer 2: 150 , Learning Rate: 0.001 , Dropout 1: 0.2 , Dropout 2: 0.0 , Validation Accuracy: 0.9835
Hidden Layer 1: 300 , Hidden Layer 2: 150 , Learning Rate: 0.001 , Dropout 1: 0.2 , Dropout 2: 0.1