In [41]:
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import mnist
from keras.utils import np_utils
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.layers import Convolution2D, MaxPooling2D
from keras.layers import Dense, Dropout, Activation, Flatten
from IPython.display import HTML, display

# Number of target classes; used as the width of the one-hot label vectors.
nb_classes = 10
# Seed NumPy's global RNG.
# NOTE(review): this may not be enough to make Keras training deterministic if
# the backend keeps its own random state — confirm for the backend in use.
np.random.seed(2017) # for reproducibility

## Pre-processing the data
The neural network takes a single vector for each training example, so each 28x28 image is reshaped into a single 784-dimensional vector. The inputs are scaled to the range [0, 1] rather than [0, 255] so that all features of the input vector are given the same importance.

In [38]:
def dataPrep(conv):
    """Load MNIST and turn it into network-ready arrays.

    Parameters
    ----------
    conv : bool
        If truthy, every image keeps its 2-D layout and gets a singleton
        axis inserted after the sample axis, giving (n_samples, 1, 28, 28)
        for MNIST. Otherwise every image is flattened into one row, giving
        (n_samples, 784) for MNIST.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test, Y_test)``. The X arrays are float32
        scaled to [0, 1]; the Y arrays are one-hot encoded with
        ``nb_classes`` columns.
    """
    # the data, shuffled and split between train and test sets
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    print("X_train original shape", X_train.shape)
    print("y_train original shape", y_train.shape)

    # Sizes are derived from the arrays themselves instead of hard-coding the
    # MNIST sample counts (60000 / 10000) and feature size (784).
    if conv:
        X_train = X_train.reshape((X_train.shape[0], 1) + X_train.shape[1:])
        X_test = X_test.reshape((X_test.shape[0], 1) + X_test.shape[1:])
    else:
        X_train = X_train.reshape(X_train.shape[0], -1)
        X_test = X_test.reshape(X_test.shape[0], -1)

    # Pixel values arrive in 0-255; convert to float32 and rescale to [0, 1].
    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
    X_train /= 255
    X_test /= 255

    # One-hot encoding the output, i.e. 0-->(1,0,0,0,0,0,0,0,0,0), 1-->(0,1,0,0,0,0,0,0,0,0), ...
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)

    print("Training matrix shape", X_train.shape)
    print("Training target matrix shape", Y_train.shape)
    print("Testing matrix shape", X_test.shape)
    print("Testing target matrix shape", Y_test.shape)

    return X_train,Y_train,X_test,Y_test

# Build the neural network
A simple 3 layer fully connected network.
<img src="plots/figure.png" />

## Compile the model
The loss function here is **categorical crossentropy**, a loss well-suited to comparing two probability distributions.

Here our predictions are probability distributions across the ten different digits (e.g. "we're 80% confident this image is a 3, 10% sure it's an 8, 5% it's a 2, etc."), and the target is a probability distribution with 100% for the correct category, and 0 for everything else. The cross-entropy is a measure of how different your predicted distribution is from the target distribution.

In [40]:
def train(X_train,Y_train,X_test,Y_test,drop_prob,activ_fun,num_epochs,callback):
    """Build, fit and evaluate a fully connected classifier with two hidden layers.

    Parameters
    ----------
    X_train, Y_train : arrays
        Training inputs, one flat feature vector per row, and their one-hot
        targets. The last 15% is held out by Keras as validation data
        (``validation_split=0.15``).
    X_test, Y_test : arrays
        Held-out data used only for the final evaluation.
    drop_prob : float
        Dropout rate applied after each hidden layer.
    activ_fun : str
        Name of the activation used in both hidden layers.
    num_epochs : int
        Maximum number of training epochs.
    callback : bool
        If truthy, training uses an ``EarlyStopping`` callback.

    Returns
    -------
    tuple
        ``(model_info, score)``: the value returned by ``model.fit`` and the
        value returned by ``model.evaluate`` on the test data. With the
        loss/metrics compiled here, callers read ``score[0]`` as the test loss
        and ``score[1]`` as the test accuracy.
    """
    # Layer widths at the network boundary follow the data instead of being
    # hard-coded to 784 inputs / 10 outputs.
    n_features = X_train.shape[1]
    n_outputs = Y_train.shape[1]

    model = Sequential()
    model.add(Dense(512, input_shape=(n_features,)))  # Hidden layer_1 with 512 neurons
    model.add(Activation(activ_fun))
    model.add(Dropout(drop_prob))                     # Dropout regularization to avoid overfitting
    model.add(Dense(512))                             # Hidden layer_2 with 512 neurons
    model.add(Activation(activ_fun))
    model.add(Dropout(drop_prob))
    model.add(Dense(n_outputs))                       # Output layer, one neuron per class
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

    # A single fit() call serves both modes; only the callback list differs.
    callbacks = None
    if callback:
        # NOTE(review): this monitors the *training* loss, as described in the
        # notebook text. Monitoring 'val_loss' is the more common criterion
        # when a validation split is available — confirm the intent.
        callbacks = [EarlyStopping(monitor='loss', min_delta=0.0001, patience=5,
                                   verbose=0, mode='auto')]

    model_info = model.fit(X_train, Y_train, callbacks=callbacks, batch_size=128,
                           epochs=num_epochs, verbose=2, validation_split=0.15)
    score = model.evaluate(X_test, Y_test,verbose=0)
    return model_info,score

In [22]:
# Settings for a single training run.
conv = False          # False -> flattened inputs for the dense network
callback = False      # True  -> train() uses early stopping
drop_prob = 0.2
activ_fun = 'relu'
num_epochs = 4

X_train,Y_train,X_test,Y_test = dataPrep(conv)
if conv:
    # train() builds a dense network on flat vectors, so there is nothing to
    # fit for the convolution-shaped data. The print() call form is used here
    # (rather than the Python 2 print statement) so the cell parses on
    # Python 3 as well.
    print('No model')
else:
    model_info,score = train(X_train,Y_train,X_test,Y_test,drop_prob,activ_fun,num_epochs,callback)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])

('X_train original shape', (60000, 28, 28))
('y_train original shape', (60000,))
('Training matrix shape', (60000, 784))
('Training target matrix shape', (60000, 10))
('Testing matrix shape', (10000, 784))
('Testing target matrix shape', (10000, 10))
Train on 51000 samples, validate on 9000 samples
Epoch 1/4
 - 12s - loss: 0.2695 - acc: 0.9173 - val_loss: 0.1132 - val_acc: 0.9677
Epoch 2/4
 - 11s - loss: 0.1085 - acc: 0.9667 - val_loss: 0.0824 - val_acc: 0.9756
Epoch 3/4
 - 11s - loss: 0.0782 - acc: 0.9757 - val_loss: 0.0741 - val_acc: 0.9786
Epoch 4/4
 - 11s - loss: 0.0592 - acc: 0.9814 - val_loss: 0.0822 - val_acc: 0.9761
('Test score:', 0.079624776807846506)
('Test accuracy:', 0.97489999999999999)


# Experiments

## Varying dropout -- [0.1,0.2,0.4]  

with *relu* as Activation function in all the instances
<table>
    <tr>
        <td><img src='plots/drop_0.1.png'></td>
        <td><img src='plots/drop_0.2.png'></td>
        <td><img src='plots/drop_0.4.png'></td>
    </tr>
</table>


| DropOut Ratio | train_loss*| test_accuracy(%)
|:---|:---:  |---: |
|no| 0.0102|97.78|
|0.1 | 0.0197|98.09|
|0.2 | 0.0258|98.20|
|0.4 | 0.0466|98.03|

<center><sup>* - after training for 10 epochs</sup></center>

### Inferences:
* The train_loss increases as the dropout ratio increases because of the information lost, i.e. as more neurons are dropped, their contribution to reducing the error is temporarily removed on the backward pass and no weight updates are applied to those neurons.
* Test accuracy increases up to a dropout ratio of 0.2 and drops at 0.4.
* When you increase dropout beyond a certain threshold, it results in the model not being able to fit properly. 
* Dropout is like all other forms of regularization in that it reduces model capacity. If you reduce the capacity too much, it is sure that you will get bad results.




## Activation functions -- [sigmoid,relu,tanh]
Drop_ratio = 0.2
<table>
    <tr>
        <td><img src='plots/sigmoid.png'></td>
        <td><img src='plots/relu.png'></td>
        <td><img src='plots/tanh.png'></td>
    </tr>
</table>

| Activation function | train_loss* | test_accuracy(%)
|:---|:---:  |---: |
|sigmoid | 0.0598|97.76|
|relu | 0.0253|98.12|
|tanh | 0.0380|97.82|
<center><sup>* - after training for 10 epochs</sup></center>


### Inferences:
* *relu* gives the lowest train_loss (0.0253) and the highest test accuracy (98.12%) of the three activation functions.
* *sigmoid* gives the highest train_loss (0.0598) and the lowest test accuracy (97.76%); *tanh* falls in between (0.0380, 97.82%).
* The dropout ratio is fixed at 0.2 in all three runs, so the differences above are attributable to the choice of activation function (up to run-to-run randomness).



## No.of epochs
<center>Drop_Ratio = 0.2, Activation function is *relu* in both the instances </center>

<center>Training for 100 epochs</center>
<table>
        <tr><td><img src="plots/epochs_100.png" style="height: 250px;"></td></tr>
</table>

<center>Training stopped at 35 epochs with Early Stopping</center>
<table>
    <tr><td><img src="plots/earlystopping.png" style="height: 250px;"></td></tr>
</table>

| No.of epochs | Early Stopping | test_accuracy(%)
|:---|:---:|---: |
|100|No |98.18|
|35|Yes |98.39|

### Inferences:
* When trained for 100 epochs the test accuracy is 98.18%, whereas if training is stopped at 35 epochs by early stopping (with the criterion that training stops when train_loss no longer changes), the accuracy increases to 98.39%. The lower accuracy of the first model may be due to overfitting, as the validation loss increases (1st plot) after a certain number of epochs.



In [None]:
# lenet model