# Introduction to Keras with MNIST
Import various modules that we need for this notebook.

In [1]:
%pylab inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD, RMSprop
from keras.utils import np_utils
from keras.regularizers import l2

Using Theano backend.


Populating the interactive namespace from numpy and matplotlib


Load the MNIST dataset, flatten the images, convert the class labels, and scale the data.

In [2]:
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(60000, 28**2).astype('float32') / 255
X_test = X_test.reshape(10000, 28**2).astype('float32') / 255
Y_train = np_utils.to_categorical(y_train, 10)
Y_test = np_utils.to_categorical(y_test, 10)

### I. Basic example 
Build and compile a basic model.

In [3]:
model = Sequential()
model.add(Dense(512, input_shape=(28 * 28,)))
model.add(Activation("sigmoid"))
model.add(Dense(10))
          
sgd = SGD(lr = 0.01, momentum = 0.9, nesterov = True)
model.compile(loss='mse', optimizer=sgd)

Fit the model over 25 epochs.

In [4]:
model.fit(X_train, Y_train, batch_size=32, nb_epoch=10,
          verbose=1, show_accuracy=True, validation_split=0.1)

Train on 54000 samples, validate on 6000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x104402e48>

Evaluate model on the test set

In [6]:
print("Test classification rate %0.05f" % model.evaluate(X_test, Y_test, show_accuracy=True)[1])

Test classification rate 0.87740


Predict classes on the test set.

In [7]:
y_hat = model.predict_classes(X_test)
pd.crosstab(y_hat, y_test)



col_0,0,1,2,3,4,5,6,7,8,9
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,953,0,23,6,0,21,16,5,12,20
1,0,1099,25,7,11,9,4,31,13,8
2,1,3,802,13,4,4,4,13,5,1
3,1,2,24,889,0,83,1,7,20,11
4,0,1,17,3,881,16,9,12,9,52
5,4,0,0,6,1,616,12,0,16,0
6,12,5,39,10,16,25,901,2,16,1
7,1,0,23,21,1,17,0,911,5,43
8,7,24,72,41,16,79,11,8,866,17
9,1,1,7,14,52,22,0,39,12,856


### II. Deeper model with dropout and cross entropy

Let's now build a deeper model, with three hidden dense layers and dropout layers. I'll use rectified linear units as they tend to perform better on deep models. I also initilize the nodes using "glorot_normal", which uses Gaussian noise scaled by the sum of the inputs plus outputs from the node. Notice that we do not need to give an input shape to any layers other than the first.

In [8]:
model = Sequential()

model.add(Dense(512, input_shape=(28 * 28,), init="glorot_normal"))
model.add(Activation("relu"))
model.add(Dropout(0.5))

model.add(Dense(512, init="glorot_normal"))
model.add(Activation("relu"))
model.add(Dropout(0.5))

model.add(Dense(512, init="glorot_normal"))
model.add(Activation("relu"))
model.add(Dropout(0.5))

model.add(Dense(512, init="glorot_normal"))
model.add(Activation("relu"))
model.add(Dropout(0.5))

model.add(Dense(10))
model.add(Activation('softmax'))

In [9]:
sgd = SGD(lr = 0.01, momentum = 0.9, nesterov = True)
model.compile(loss='categorical_crossentropy', optimizer=sgd)

model.fit(X_train, Y_train, batch_size=32, nb_epoch=10,
          verbose=1, show_accuracy=True, validation_split=0.1)

Train on 54000 samples, validate on 6000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x104402a90>

In [10]:
print("Test classification rate %0.05f" % model.evaluate(X_test, Y_test, show_accuracy=True)[1])
fy_hat = model.predict_classes(X_test)
pd.crosstab(y_hat, y_test)

Test classification rate 0.97730


col_0,0,1,2,3,4,5,6,7,8,9
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,953,0,23,6,0,21,16,5,12,20
1,0,1099,25,7,11,9,4,31,13,8
2,1,3,802,13,4,4,4,13,5,1
3,1,2,24,889,0,83,1,7,20,11
4,0,1,17,3,881,16,9,12,9,52
5,4,0,0,6,1,616,12,0,16,0
6,12,5,39,10,16,25,901,2,16,1
7,1,0,23,21,1,17,0,911,5,43
8,7,24,72,41,16,79,11,8,866,17
9,1,1,7,14,52,22,0,39,12,856


In [None]:
test_wrong = [im for im in zip(X_test,y_hat,y_test) if im[1] != im[2]]

plt.figure(figsize=(15, 15))
for ind, val in enumerate(test_wrong[:100]):
    plt.subplot(10, 10, ind + 1)
    im = 1 - val[0].reshape((28,28))
    axis("off")
    plt.imshow(im, cmap='gray')

### III. Small model: Visualizing weights
Now, I want to make a model that has only a small number of hidden nodes in each layer. We may then have a chance of actually visualizing the weights.

In [None]:
model = Sequential()

model.add(Dense(16, input_shape=(28 * 28,), init="glorot_normal"))
model.add(Activation("relu"))
model.add(Dropout(0.5))

model.add(Dense(16, init="glorot_normal"))
model.add(Activation("relu"))
model.add(Dropout(0.5))

model.add(Dense(10))
model.add(Activation('softmax'))

rms = RMSprop()
model.compile(loss='categorical_crossentropy', optimizer=rms)

model.fit(X_train, Y_train, batch_size=32, nb_epoch=10,
          verbose=1, show_accuracy=True, validation_split=0.1)

The classification rate on the validation set is not nearly as predictive, but it is still not too bad overall. A model object contains a list of its layers. The weights are easy to pull out.

In [None]:
print(model.layers) # list of the layers
print(model.layers[0].get_weights()[0].shape) # the weights

The first set of weights will be given as weights the same size as the input space. Notice how 

In [None]:
W1 = model.layers[0].get_weights()[0]

for ind, val in enumerate(W1.T):
    plt.figure(figsize=(3, 3), frameon=False)
    im = val.reshape((28,28))
    plt.axis("off")
    plt.imshow(im, cmap='seismic')

The second layer of weights will be given as a single 16x16 matrix of weights.

In [None]:
W2 = model.layers[3].get_weights()[0]

plt.figure(figsize=(3, 3))
im = W2.reshape((16,16))
plt.axis("off")
plt.imshow(im, cmap='seismic')

### IV. Further tweaks: weights and alternative optimizers
Just to show off a few more tweaks, we'll run one final model. Here we use weights and an alternative to vanillia stochastic gradient descent. 

In [None]:
model = Sequential()

model.add(Dense(128, input_shape=(28 * 28,), init="glorot_normal"))
model.add(Activation("relu"))
model.add(Dropout(0.5))

model.add(Dense(512, init="glorot_normal",W_regularizer=l2(0.1)))
model.add(Activation("relu"))
model.add(Dropout(0.2))

model.add(Dense(512, init="glorot_normal",W_regularizer=l2(0.1)))
model.add(Activation("relu"))
model.add(Dropout(0.2))

model.add(Dense(10))
model.add(Activation('softmax'))

In [None]:
rms = RMSprop()
model.compile(loss='categorical_crossentropy', optimizer=rms)

model.fit(X_train, Y_train, batch_size=32, nb_epoch=5,
          verbose=1, show_accuracy=True, validation_split=0.1)