# <font color="TURQUOISE">Dropout Exercise</font>

In [8]:
import mxnet as mx
import numpy as np
import matplotlib.pyplot as plt
from mxnet import nd, autograd, gluon

# Seed MXNet's random number generator before any random arrays are created.
mx.random.seed(1)
# Device context passed to the array constructors and to as_in_context() in later cells.
ctx = mx.cpu()

### MNIST dataset

In [9]:
# Call MXNet's test-utility MNIST helper.
# NOTE(review): `mnist` is not referenced again in the visible notebook; the
# DataLoaders below read gluon.data.vision.MNIST instead — confirm it is still needed.
mnist = mx.test_utils.get_mnist()
# Number of samples per mini-batch, shared by the train and test loaders.
batch_size = 64
def transform(data, label):
    """Cast one sample to float32 and divide the data by 255.

    Returns a ``(data, label)`` tuple; the label is only cast, never rescaled.
    """
    scaled_data = data.astype(np.float32) / 255
    float_label = label.astype(np.float32)
    return scaled_data, float_label
# Training loader: shuffled mini-batches of transformed MNIST training samples.
train_data = gluon.data.DataLoader(
    gluon.data.vision.MNIST(train=True, transform=transform),
    batch_size=batch_size,
    shuffle=True,
)
# Test loader: same batch size, iterated in dataset order.
test_data = gluon.data.DataLoader(
    gluon.data.vision.MNIST(train=False, transform=transform),
    batch_size=batch_size,
    shuffle=False,
)

In [10]:
# Parameters of a 784 -> 256 -> 128 -> 10 multilayer perceptron.
# Every tensor is a normal draw multiplied by 0.01; the draw order
# (W1, b1, W2, b2, W3, b3) is kept so the seeded values stay the same.

# First hidden layer
W1 = .01 * nd.random_normal(shape=(784, 256), ctx=ctx)
b1 = .01 * nd.random_normal(shape=256, ctx=ctx)

# Second hidden layer
W2 = .01 * nd.random_normal(shape=(256, 128), ctx=ctx)
b2 = .01 * nd.random_normal(shape=128, ctx=ctx)

# Output layer
W3 = .01 * nd.random_normal(shape=(128, 10), ctx=ctx)
b3 = .01 * nd.random_normal(shape=10, ctx=ctx)

# Flat list used for gradient allocation and for the SGD update.
params = [W1, b1, W2, b2, W3, b3]

### Allocate space for gradients.

In [11]:
# Attach a gradient buffer to every parameter; the SGD step later reads it
# through `param.grad` after loss.backward() has run.
for param in params:
    param.attach_grad()

### ReLU Activation function

In [12]:
def relu(X):
    """Rectified linear unit: the element-wise maximum of ``X`` and 0."""
    rectified = nd.maximum(X, 0)
    return rectified

# <font color="DODGERBLUE">Dropout Function definition</font>

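### 1. Build a random mask: draw a uniform random number for every element and keep the element when that number is < keep probability
### 2. Multiply the mask element-wise with the layer output, so dropped elements become 0
### 3. Scale the result by 1 / keep probability so that its expected value matches the input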

In [13]:
def dropout(X, drop_probability):
    """Randomly zero elements of ``X`` and rescale the survivors.

    Each element is kept when a uniform random draw is below
    ``1 - drop_probability``; kept elements are multiplied by
    ``1 / (1 - drop_probability)`` and dropped elements become 0.

    Parameters
    ----------
    X : NDArray
        Activations to mask. The mask is sampled with ``X.shape`` on ``X.context``.
    drop_probability : float
        Probability of zeroing an element; must lie in ``[0, 1]``.

    Returns
    -------
    NDArray
        ``mask * X * scale``.

    Raises
    ------
    ValueError
        If ``drop_probability`` is outside ``[0, 1]`` (NaN included).
    """
    # Reject out-of-range values up front: 1.5 would silently zero every
    # element, and -0.5 would silently shrink every element by 1 / 1.5.
    if not 0.0 <= drop_probability <= 1.0:
        raise ValueError(
            "drop_probability must be in [0, 1], got %s" % (drop_probability,)
        )

    keep_probability = 1 - drop_probability
    # Mask entry is 1 where the uniform draw falls below the keep probability.
    mask = nd.random_uniform(0, 1.0, X.shape, ctx=X.context) < keep_probability
    # With keep_probability == 0 nothing survives, so use scale 0 rather than
    # dividing by zero.
    if keep_probability > 0.0:
        scale = 1 / keep_probability
    else:
        scale = 0.0
    # Zero the dropped activations and rescale the surviving ones.
    return mask * X * scale

# <font color="LAWNGREEN">Dropout of 0 keeps all values in</font>

In [14]:
# 5x4 matrix holding the values 0..19, used to sanity-check dropout.
A = nd.arange(20).reshape((5, 4))
# Drop probability 0: every entry should come back unchanged.
dropout(A, drop_probability=0.0)


[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]
 [12. 13. 14. 15.]
 [16. 17. 18. 19.]]
<NDArray 5x4 @cpu(0)>

# <font color="GOLD">Dropout of 0.5 keeps each value with probability 0.5 (kept values are doubled)</font>

In [15]:
# Drop probability 0.5: dropped entries become 0, surviving entries are scaled by 1 / 0.5 = 2.
dropout(A, drop_probability=0.5)


[[ 0.  2.  4.  6.]
 [ 8.  0. 12. 14.]
 [16. 18. 20. 22.]
 [24.  0. 28. 30.]
 [32. 34.  0. 38.]]
<NDArray 5x4 @cpu(0)>

# <font color="ORANGERED">Dropout of 1 drops all values</font>

In [16]:
# Drop probability 1: the keep probability is 0, so the scale is 0 and every entry is zeroed.
dropout(A, drop_probability=1.0)


[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
<NDArray 5x4 @cpu(0)>

### Softmax output

In [17]:
def softmax(y_linear):
    """Exponentiate the raw scores and normalise them per entry along axis 0.

    The largest value of the whole array is subtracted before exponentiating,
    and each exponentiated entry is divided by the sum over all axes except axis 0.
    """
    # Shift by the global maximum before calling exp().
    shifted = y_linear - nd.max(y_linear)
    exp_scores = nd.exp(shifted)
    # Sum over every axis except axis 0, then reshape to a column for broadcasting.
    totals = nd.nansum(exp_scores, axis=0, exclude=True)
    return exp_scores / totals.reshape((-1, 1))

### The softmax cross-entropy loss function


In [18]:
def softmax_cross_entropy(yhat_linear, y):
    """Cross-entropy between ``y`` and the softmax of the raw outputs ``yhat_linear``.

    ``y`` is multiplied element-wise with ``log_softmax(yhat_linear)`` and the
    product is summed over every axis except axis 0; the negated sum is returned.
    The training loop in this notebook passes one-hot labels as ``y``.
    """
    log_probabilities = nd.log_softmax(yhat_linear)
    weighted = y * log_probabilities
    return - nd.nansum(weighted, axis=0, exclude=True)

### Define the model


In [19]:
def net(X, drop_prob=0.0):
    """Forward pass of the three-layer perceptron, returning raw output scores.

    Both hidden layers apply an affine map, ReLU and then dropout with
    ``drop_prob``. The output layer is affine only: softmax is left to
    ``softmax_cross_entropy``. The default ``drop_prob=0.0`` makes dropout a no-op.
    """
    hidden = X
    # The two hidden layers share the same affine -> relu -> dropout structure.
    for weight, bias in ((W1, b1), (W2, b2)):
        hidden = dropout(relu(nd.dot(hidden, weight) + bias), drop_prob)

    # Output layer: linear scores only.
    return nd.dot(hidden, W3) + b3

### Optimizer: Stochastic gradient descent 

In [20]:
def SGD(params, learning_rate):
    """Apply one gradient-descent step in place to every parameter.

    Each parameter is overwritten with ``param - learning_rate * param.grad``.
    """
    for weight in params:
        step = learning_rate * weight.grad
        # Slice assignment writes into the existing array instead of rebinding the name.
        weight[:] = weight - step

### Evaluation metric


In [21]:
def evaluate_accuracy(data_iterator, net):
    """Return the fraction of samples in ``data_iterator`` that ``net`` classifies correctly.

    Each batch is moved to ``ctx`` and flattened to 784 columns; the predicted
    class is the argmax of the network output along axis 1. ``net`` is called
    with the data only, so any extra arguments take their defaults.
    """
    correct = 0.
    seen = 0.
    for batch, target in data_iterator:
        batch = batch.as_in_context(ctx).reshape((-1,784))
        target = target.as_in_context(ctx)
        predicted = nd.argmax(net(batch), axis=1)
        correct += nd.sum(predicted == target)
        seen += batch.shape[0]
    return (correct / seen).asscalar()

### Execute the model

In [22]:
# NOTE(review): train() takes its own `epochs` argument, so this module-level
# value is not read by the visible code.
epochs = 10
# NOTE(review): train() assigns a local `moving_loss`; it never updates this global.
moving_loss = 0.
# Step size; read as a global by train() when it calls SGD.
learning_rate = .001

def train(net, drop_prob, epochs):
    """Train ``net`` on ``train_data`` for ``epochs`` passes and print per-epoch metrics.

    Dropout with ``drop_prob`` is applied only in the recorded forward pass;
    the accuracy evaluation calls ``net`` without it. The module-level
    ``params`` are updated in place using the global ``learning_rate``.
    """
    for epoch in range(epochs):
        for batch_idx, (data, label) in enumerate(train_data):
            features = data.as_in_context(ctx).reshape((-1,784))
            targets = nd.one_hot(label.as_in_context(ctx), 10)

            # Record the forward pass (with dropout) so autograd can differentiate it.
            with autograd.record():
                loss = softmax_cross_entropy(net(features, drop_prob), targets)
            loss.backward()
            SGD(params, learning_rate)

            # Exponential moving average of the batch loss, restarted at the
            # first batch of every epoch.
            batch_loss = nd.mean(loss).asscalar()
            if batch_idx == 0:
                running_loss = batch_loss
            else:
                running_loss = .99 * running_loss + .01 * batch_loss

        test_accuracy = evaluate_accuracy(test_data, net)
        train_accuracy = evaluate_accuracy(train_data, net)
        print("Epoch %s. Loss: %s,    Train_acc %s,    Test_acc %s" % (epoch, running_loss, train_accuracy, test_accuracy))


# <font color="ORANGERED">High dropout used</font>

In [None]:
# High dropout (0.8) for 10 epochs; the first training call in the notebook.
train(net, drop_prob=0.8, epochs=10)

Epoch 0. Loss: 1.0959734010353634,    Train_acc 0.7847,    Test_acc 0.7947
Epoch 1. Loss: 0.6911433717565789,    Train_acc 0.9026,    Test_acc 0.9054
Epoch 2. Loss: 0.5690453843627099,    Train_acc 0.9235333,    Test_acc 0.9235
Epoch 3. Loss: 0.5147457278797349,    Train_acc 0.93505,    Test_acc 0.936
Epoch 4. Loss: 0.4951578293524433,    Train_acc 0.93995,    Test_acc 0.9407
Epoch 5. Loss: 0.4433863431392614,    Train_acc 0.9443667,    Test_acc 0.9424
Epoch 6. Loss: 0.43610409423565655,    Train_acc 0.9493333,    Test_acc 0.949
Epoch 7. Loss: 0.42300142204891145,    Train_acc 0.95195,    Test_acc 0.9505


# <font color="GOLD">Dropout of half used</font>

In [0]:
# NOTE: `params` is not re-initialised between runs, so when the cells run in
# order this continues from the weights left by the previous train() call.
train(net, drop_prob=0.5, epochs=10)

Epoch 0. Loss: 0.05844953087192618,    Train_acc 0.99675,    Test_acc 0.9805
Epoch 1. Loss: 0.05593816106680461,    Train_acc 0.99703336,    Test_acc 0.9817
Epoch 2. Loss: 0.05417170336255669,    Train_acc 0.99675,    Test_acc 0.9805
Epoch 3. Loss: 0.053125804223949635,    Train_acc 0.99721664,    Test_acc 0.9818
Epoch 4. Loss: 0.05272426012394333,    Train_acc 0.9971333,    Test_acc 0.981
Epoch 5. Loss: 0.0517162056513185,    Train_acc 0.99698335,    Test_acc 0.9808
Epoch 6. Loss: 0.05335260935578101,    Train_acc 0.99766666,    Test_acc 0.9818
Epoch 7. Loss: 0.05498305921293919,    Train_acc 0.9978333,    Test_acc 0.9809
Epoch 8. Loss: 0.05104374116619116,    Train_acc 0.99715,    Test_acc 0.9815
Epoch 9. Loss: 0.05246860679119453,    Train_acc 0.99733335,    Test_acc 0.9811


# <font color="LAWNGREEN">Dropout of 0 used</font>

In [0]:
# No dropout for 10 epochs.
# NOTE: `params` is not re-initialised between runs, so when the cells run in
# order this continues from the weights left by the previous train() call.
train(net, drop_prob=0.0, epochs=10)

Epoch 0. Loss: 0.007049042951271868,    Train_acc 0.9992833,    Test_acc 0.9831
Epoch 1. Loss: 0.005208811431898914,    Train_acc 0.99951667,    Test_acc 0.9829
Epoch 2. Loss: 0.0030589178185008122,    Train_acc 0.99971664,    Test_acc 0.983
Epoch 3. Loss: 0.003068211641881961,    Train_acc 0.99985,    Test_acc 0.9829
Epoch 4. Loss: 0.002408713933403444,    Train_acc 0.99981666,    Test_acc 0.9832
Epoch 5. Loss: 0.0022319511442104173,    Train_acc 0.99995,    Test_acc 0.9829
Epoch 6. Loss: 0.001691517848392048,    Train_acc 0.99995,    Test_acc 0.9834
Epoch 7. Loss: 0.0015038096210987341,    Train_acc 0.99995,    Test_acc 0.9831
Epoch 8. Loss: 0.0011653935781116094,    Train_acc 0.9999667,    Test_acc 0.9832
Epoch 9. Loss: 0.0009982405403038616,    Train_acc 0.9999833,    Test_acc 0.9833


# <font color="ORANGERED">High dropout of 0.8 used</font>

In [0]:
# High dropout (0.8) again, now for 100 epochs.
# NOTE: `params` is not re-initialised between runs, so when the cells run in
# order this continues from the weights left by the previous train() call.
train(net, drop_prob=0.8, epochs=100)

Epoch 0. Loss: 0.5086358352008217,    Train_acc 0.97995,    Test_acc 0.9691
Epoch 1. Loss: 0.4614239824096406,    Train_acc 0.97723335,    Test_acc 0.969
Epoch 2. Loss: 0.48122709564350097,    Train_acc 0.97618335,    Test_acc 0.9699
Epoch 3. Loss: 0.4152667391015395,    Train_acc 0.9749,    Test_acc 0.9674
Epoch 4. Loss: 0.4276696791554884,    Train_acc 0.976,    Test_acc 0.9669
Epoch 5. Loss: 0.4127018304774338,    Train_acc 0.9751,    Test_acc 0.9671
Epoch 6. Loss: 0.39011731180992815,    Train_acc 0.97478336,    Test_acc 0.9678
Epoch 7. Loss: 0.38137675653435726,    Train_acc 0.97496665,    Test_acc 0.9693
Epoch 8. Loss: 0.3936920239281544,    Train_acc 0.97433335,    Test_acc 0.9671
Epoch 9. Loss: 0.38101606456571346,    Train_acc 0.97505,    Test_acc 0.9672
Epoch 10. Loss: 0.3633826508275447,    Train_acc 0.97426665,    Test_acc 0.9665
Epoch 11. Loss: 0.37900348845824233,    Train_acc 0.97545,    Test_acc 0.9691
Epoch 12. Loss: 0.3398188990244026,    Train_acc 0.97578335,    Test

# <font color="GOLD">Dropout of 0.5 used</font>

In [0]:
# Dropout 0.5 for 100 epochs.
# NOTE: `params` is not re-initialised between runs, so when the cells run in
# order this continues from the weights left by the previous train() call.
train(net, drop_prob=0.5, epochs=100)

Epoch 0. Loss: 0.0725543986700366,    Train_acc 0.98753333,    Test_acc 0.9737
Epoch 1. Loss: 0.06075809338807031,    Train_acc 0.98931664,    Test_acc 0.9745
Epoch 2. Loss: 0.05570570510003326,    Train_acc 0.99018335,    Test_acc 0.9752
Epoch 3. Loss: 0.06358640744756218,    Train_acc 0.99116665,    Test_acc 0.9754
Epoch 4. Loss: 0.05429037356104726,    Train_acc 0.99165,    Test_acc 0.9756
Epoch 5. Loss: 0.04900720837717807,    Train_acc 0.9925,    Test_acc 0.9757
Epoch 6. Loss: 0.04591402025927379,    Train_acc 0.99273336,    Test_acc 0.9756
Epoch 7. Loss: 0.047352310537359436,    Train_acc 0.9936,    Test_acc 0.9771
Epoch 8. Loss: 0.04973145411168545,    Train_acc 0.99336666,    Test_acc 0.9766
Epoch 9. Loss: 0.04583497905452421,    Train_acc 0.994,    Test_acc 0.9765
Epoch 10. Loss: 0.051114642762894534,    Train_acc 0.9946,    Test_acc 0.9769
Epoch 11. Loss: 0.04653329936474795,    Train_acc 0.99476665,    Test_acc 0.9765
Epoch 12. Loss: 0.04400529388862444,    Train_acc 0.99453

# <font color="LAWNGREEN">NO Dropout used</font>

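### The model overfits the data: training accuracy reaches ~100% while test accuracy (~97.9%) is lower than in the earlier no-dropout run (~98.3%). Note that the weights are not re-initialized between `train` calls, so each run continues from the weights of the previous one.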

In [0]:
# No dropout for 100 epochs.
# NOTE: `params` is not re-initialised between runs, so when the cells run in
# order this continues from the weights left by the previous train() call.
train(net, drop_prob=0.0, epochs=100)

Epoch 0. Loss: 0.0016375640792185607,    Train_acc 0.99986666,    Test_acc 0.9786
Epoch 1. Loss: 0.0014725788864934607,    Train_acc 0.99988335,    Test_acc 0.9789
Epoch 2. Loss: 0.0010551783543087992,    Train_acc 0.9999,    Test_acc 0.9788
Epoch 3. Loss: 0.0008233518454906051,    Train_acc 0.9999167,    Test_acc 0.9788
Epoch 4. Loss: 0.0008872226114330016,    Train_acc 0.9999167,    Test_acc 0.9789
Epoch 5. Loss: 0.0007882358503075904,    Train_acc 0.9999167,    Test_acc 0.9788
Epoch 6. Loss: 0.0008159225270600517,    Train_acc 0.99993336,    Test_acc 0.979
Epoch 7. Loss: 0.0010220072928986237,    Train_acc 0.99995,    Test_acc 0.9791
Epoch 8. Loss: 0.0009894596513810158,    Train_acc 0.99995,    Test_acc 0.979
Epoch 9. Loss: 0.0005782774989442238,    Train_acc 0.9999667,    Test_acc 0.9792
Epoch 10. Loss: 0.0007660663426596448,    Train_acc 0.9999667,    Test_acc 0.9791
Epoch 11. Loss: 0.000737330754276713,    Train_acc 0.9999667,    Test_acc 0.9791
Epoch 12. Loss: 0.000567672446317

In [0]:
#REF:https://gluon.mxnet.io/chapter03_deep-neural-networks/mlp-dropout-scratch.html