# Digit Recognizer - Feed Forward Net w/ Dropout in Theano

**Kyu Cho**  
**11/1/2016**

# Data
- Each image is 28 pixels in height and 28 pixels in width, for a total of 784 pixels. 
- Each pixel has a single pixel-value associated with it, indicating the lightness or darkness of that pixel, with higher numbers meaning darker. This pixel-value is an integer between 0 and 255, inclusive.  
  
- The training data set, (train.csv), has 785 columns. 
- The first column, called "label", is the digit that was drawn by the user. 
- The rest of the columns contain the pixel-values of the associated image.  
  
- Each pixel column in the training set has a name like pixelx, where x is an integer between 0 and 783, inclusive. 
- To locate this pixel on the image, suppose that we have decomposed x as x = i * 28 + j, where i and j are integers between 0 and 27, inclusive. 
- Then pixelx is located on row i and column j of a 28 x 28 matrix, (indexing by zero).
    + For example, pixel31 indicates the pixel that is in the fourth column from the left, and the second row from the top, as in the ascii-diagram below.

000 001 002 003 ... 026 027  
028 029 030 031 ... 054 055  
056 057 058 059 ... 082 083  
 |   |   |   |  ...  |   |  
728 729 730 731 ... 754 755  
756 757 758 759 ... 782 783   

# Techniques Used

1. Feed Forward
2. Dropout
3. Nesterov and RMSprop
4. Regularization
5. Learning Rate
6. Batch Gradient
7. Normalization or PCA
8. Dynamic number of nodes for each layer
9. Dynamic number of hidden layers

# Table of Contents
1. Load Data
2. Split Data
3. Function initialization
4. Build Model

## Load Data

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
from util import get_normalized_data
from sklearn.utils import shuffle

def get_normalized_data(path='../large_files/train.csv'):
    """Load the digit CSV, shuffle its rows and standardize each feature column.

    Args:
        path: Location of the CSV file to read. The first column is taken as
            the label and every remaining column as a feature. Defaults to
            the location that was previously hard-coded.

    Returns:
        A tuple ``(X, Y)`` of float32 arrays. ``X`` holds the features, one
        row per sample, with each column shifted by its mean and divided by
        its standard deviation. ``Y`` holds the labels in the same
        (shuffled) row order.
    """
    print("Reading in and transforming data...")
    df = pd.read_csv(path)

    # ``.values`` is used instead of ``DataFrame.as_matrix()``, which recent
    # pandas releases no longer provide; both expose the frame as a 2-D array.
    data = df.values.astype(np.float32)
    np.random.shuffle(data)  # in place, reorders the rows

    Y = data[:, 0]
    X = data[:, 1:]

    mu = X.mean(axis=0)
    std = X.std(axis=0)
    # Columns that never vary have zero spread; divide those by 1 instead
    # so the normalization below does not divide by zero.
    np.place(std, std == 0, 1)
    X = (X - mu) / std

    return X, Y

In [2]:
class HiddenLayer(object):
    """A fully connected layer followed by a ReLU, stored as Theano shared variables."""

    def __init__(self, M1, M2, an_id):
        """Create the weight matrix and bias vector of the layer.

        Args:
            M1: Number of rows of the weight matrix (size of the layer input).
            M2: Number of columns of the weight matrix and length of the bias.
            an_id: Identifier appended to the names of the shared variables.
        """
        self.id, self.M1, self.M2 = an_id, M1, M2

        # Weights are standard-normal draws divided by sqrt(M1 + M2);
        # biases start at zero.
        scale = np.sqrt(M1 + M2)
        weights_init = np.random.randn(M1, M2) / scale
        bias_init = np.zeros(M2)

        self.W = theano.shared(weights_init, 'W_%s' % self.id)
        self.b = theano.shared(bias_init, 'b_%s' % self.id)

        # Trainable parameters of this layer, weights first.
        self.params = [self.W, self.b]

    def forward(self, X):
        """Return the symbolic expression ``relu(X.dot(W) + b)``."""
        pre_activation = X.dot(self.W) + self.b
        return T.nnet.relu(pre_activation)

In [3]:
class ANN(object):
    """Feed-forward classifier with dropout, trained in Theano.

    The network is a stack of ``HiddenLayer`` objects followed by a softmax
    output layer. Training uses mini-batches with an RMSprop-scaled gradient
    step combined with momentum.

    Note that ``dropout_rates`` stores *keep* probabilities: each value is
    used as the probability of a 1 in the binomial mask that multiplies the
    input of the corresponding layer.
    """

    def __init__(self, hidden_layer_sizes, p_keep):
        """Store the architecture and the per-layer keep probabilities.

        Args:
            hidden_layer_sizes: Number of units of each hidden layer, in order.
            p_keep: Keep probabilities, one for the input of every hidden
                layer plus a final one for the input of the output layer.

        Raises:
            ValueError: If ``p_keep`` does not have exactly one more entry
                than ``hidden_layer_sizes``.
        """
        # The forward passes pair layers with probabilities through zip(),
        # which would silently truncate on a length mismatch; fail early.
        if len(p_keep) != len(hidden_layer_sizes) + 1:
            raise ValueError(
                "p_keep needs %d entries (one per hidden layer plus one for "
                "the output layer), got %d"
                % (len(hidden_layer_sizes) + 1, len(p_keep))
            )
        self.hidden_layer_sizes = hidden_layer_sizes
        self.dropout_rates = p_keep  # list of keep probabilities, one per layer input

    def fit(self, X, Y, learning_rate=10e-7, mu=0.99, decay=0.999, epochs=10, batch_sz=100, show_fig=False):
        """Build the network for the given data and train it.

        The data is shuffled, the last 1000 samples are held out for
        validation, and the rest is consumed in mini-batches. Every 20
        batches the validation cost and error rate are printed.

        Args:
            X: 2-D array of features, one row per sample.
            Y: 1-D array of class labels (cast to int32 here).
            learning_rate: Step size of the gradient term.
            mu: Momentum coefficient.
            decay: Decay factor of the RMSprop running average.
            epochs: Number of passes over the training portion.
            batch_sz: Number of samples per mini-batch.
            show_fig: If true, plot the recorded validation costs at the end.
        """
        X, Y = shuffle(X, Y)
        X = X.astype(np.float32)  # original comment: for GPU
        Y = Y.astype(np.int32)  # int32 labels feed the ivector input below

        # Hold out the last 1000 samples for validation.
        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        X, Y = X[:-1000], Y[:-1000]

        # Random stream that produces the dropout masks.
        self.rng = RandomStreams()

        # Hidden layers: each one takes the previous layer's width as input.
        N, D = X.shape
        M1 = D
        self.hidden_layers = []
        for count, M2 in enumerate(self.hidden_layer_sizes):
            h = HiddenLayer(M1, M2, count)
            self.hidden_layers.append(h)
            M1 = M2

        # Output layer: one column per distinct label in the training portion.
        K = len(set(Y))
        W = np.random.randn(M1, K) / np.sqrt(M1 + K)
        b = np.zeros(K)
        self.W = theano.shared(W, 'W_logreg')
        self.b = theano.shared(b, 'b_logreg')

        # All trainable parameters: output layer first, then each hidden
        # layer's [W, b] in order.
        self.params = [self.W, self.b]
        for h in self.hidden_layers:
            self.params += h.params

        # Symbolic inputs.
        thX = T.matrix('X')
        thY = T.ivector('Y')

        # Training graph (with dropout masks).
        pY_train = self.forward_train(thX)

        # Mean negative log-probability of the correct class: the index pair
        # (arange(n), thY) picks one entry per row.
        cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)

        # Per-parameter state, initialised to zeros of the parameter's shape:
        # momentum terms and RMSprop running averages of squared gradients.
        dparams = [theano.shared(np.zeros(p.get_value().shape)) for p in self.params]
        cache = [theano.shared(np.zeros(p.get_value().shape)) for p in self.params]

        # NOTE(review): Theano evaluates update expressions against the
        # pre-update shared values, so the step is scaled by the cache as it
        # was before this batch's gradient is folded in - confirm intended.
        updates = []
        for p, c, dp, g in zip(self.params, cache, dparams, grads):
            # Momentum term minus the RMSprop-scaled gradient; this same
            # value is added to the parameter and stored as the new momentum.
            step = mu * dp - learning_rate * g / T.sqrt(c + 10e-10)
            updates.append((c, decay * c + (1 - decay) * g * g))
            updates.append((p, p + step))
            updates.append((dp, step))

        train_op = theano.function(
            inputs=[thX, thY],
            updates=updates
        )

        # Evaluation graph (no masks; activations are scaled instead).
        pY_predict = self.forward_predict(thX)
        cost_predict = -T.mean(T.log(pY_predict[T.arange(thY.shape[0]), thY]))
        prediction = self.predict(thX)
        cost_predict_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost_predict, prediction]
        )

        # Floor division keeps the batch count an integer on Python 2 and 3;
        # a trailing partial batch is not used.
        n_batches = N // batch_sz
        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)

            for j in range(n_batches):
                start = j * batch_sz
                Xbatch = X[start:start + batch_sz]
                Ybatch = Y[start:start + batch_sz]

                train_op(Xbatch, Ybatch)

                if j % 20 == 0:
                    c, p = cost_predict_op(Xvalid, Yvalid)
                    costs.append(c)
                    e = error_rate(Yvalid, p)
                    # Single formatted string: same text as the original
                    # space-separated print on both Python 2 and 3.
                    print("i: %s j: %s nb: %s cost: %s error rate: %s" % (i, j, n_batches, c, e))

        if show_fig:
            plt.plot(costs)
            plt.show()

    def forward_train(self, X):
        """Return the symbolic class probabilities with dropout applied.

        The input of every hidden layer, and of the output layer, is
        multiplied by a random 0/1 mask whose probability of 1 is the
        corresponding entry of ``dropout_rates``.
        """
        Z = X
        for h, p in zip(self.hidden_layers, self.dropout_rates[:-1]):
            # n=1 makes each mask entry either 0 or 1.
            mask = self.rng.binomial(n=1, p=p, size=Z.shape)
            Z = mask * Z
            Z = h.forward(Z)

        # The last probability applies to the input of the output layer.
        mask = self.rng.binomial(n=1, p=self.dropout_rates[-1], size=Z.shape)
        Z = mask * Z
        return T.nnet.softmax(Z.dot(self.W) + self.b)

    def forward_predict(self, X):
        """Return the symbolic class probabilities without random masks.

        Each layer input is multiplied by its keep probability instead of
        being masked.
        """
        Z = X
        for h, p in zip(self.hidden_layers, self.dropout_rates[:-1]):
            Z = h.forward(p * Z)
        return T.nnet.softmax((self.dropout_rates[-1] * Z).dot(self.W) + self.b)

    def predict(self, X):
        """Return the symbolic index of the most probable class for each row."""
        pY = self.forward_predict(X)
        return T.argmax(pY, axis=1)


In [7]:
def error_rate(p, t):
    """Return the fraction of positions at which ``p`` and ``t`` differ."""
    mismatches = p != t
    return np.mean(mismatches)

def relu(a):
    """Rectify ``a`` by multiplying it with the indicator of ``a > 0``."""
    is_positive = a > 0
    return a * is_positive

def main():
    """Load the normalized training data and print its dimensions and labels."""
    # Load the data here so the function does not rely on X and Y already
    # existing as globals (running it as a script raised NameError otherwise).
    X, Y = get_normalized_data()
    # Single-argument prints give the same text on Python 2 and Python 3.
    print('Dimention of X %s' % (X.shape,))
    print('Dimention of Y %s' % (Y.shape,))
    print(Y)
    # Training stays disabled, as before. To run it
    # (hidden layer sizes, then keep probabilities):
    #     ann = ANN([500, 300], [0.8, 0.5, 0.5])
    #     ann.fit(X, Y, show_fig=True)


if __name__ == '__main__':
    main()

Dimention of X (42000, 784)
Dimention of Y (42000,)
[ 5.  6.  2. ...,  3.  7.  4.]
