# Problem 1

In [1]:
import numpy as np

data = np.load('mnist.npz')
x_train, y_train = data['x_train'], data['y_train']
x_test, y_test = data['x_test'], data['y_test']

In [2]:
# reshape to vectors, change dtype from int8 to float32 and normalize [0,255] -> [0,1]
x_train = x_train.reshape(x_train.shape[0], -1).astype(np.float32) / 255
x_test = x_test.reshape(x_test.shape[0], -1).astype(np.float32) / 255

In [3]:
import math

class Classifier(object):
    def __init__(self,hidden_dims=(500,100),n_hidden=2,mode='train',datapath=None,model_path=None):
        #weights and bias structure
        self.weights, self.bias = [], []
        #layer 1
        self.weights.append(np.empty((784, hidden_dims[0])))
        self.bias.append(np.zeros((hidden_dims[0])))
        #hidden layer
        for i in range(n_hidden - 1):
            self.weights.append(np.empty((hidden_dims[i], hidden_dims[i+1])))
            self.bias.append(np.zeros((hidden_dims[i+1])))
        #output layer
        self.weights.append(np.empty((hidden_dims[-1], 10)))
        self.bias.append(np.zeros((10)))
    
    def initialize_weights(self,method='glorot'):
        for i, w in enumerate(self.weights):
            if method is 'glorot':
                d = math.sqrt(6/(w.shape[0]+w.shape[1]))
                self.weights[i]=np.random.uniform(low=-d, high=d, size=w.shape)
            if method is 'normal':
                self.weights[i]=np.random.normal(loc=0, scale=1, size=w.shape)
            if method is 'zero':
                self.weights[i]=np.zeros(shape=w.shape)
    
    def forward(self,input):
        self.cache = [input]
        for w, b in zip(self.weights, self.bias):
            self.cache.append(self.activation(self.cache[-1] @ w + b))
        return self.softmax(self.cache.pop())
    
    def activation(self,input):
        return np.maximum(0, input)
    
    def loss(self, prediction, label):
        return -math.log(prediction[label])

    def softmax(self, input):
        return np.exp(input)/np.sum(np.exp(input))
    
    def backward(self, output, label):
        grad_pre_activation = np.asarray([o-1 if o==label else o for o in output])
        self.grad_w, self.grad_b = [], []
        # we go from the last layer to the first one
        for i, (w, b) in enumerate(zip(reversed(self.weights), reversed(self.bias))):
            gw = np.asarray(list(reversed(self.cache))[i]).reshape(-1,1) @ grad_pre_activation.reshape(1,-1)
            self.grad_w.insert(0, gw)
            self.grad_b.insert(0, grad_pre_activation)
            grad_previous_hidden_layer = w @ grad_pre_activation
            grad_pre_activation = grad_previous_hidden_layer * [1 if x>0 else 0 for x in list(reversed(self.cache))[i]]
            
    def update(self):
        lr = 0.01
        for i, (gw, gb) in enumerate(zip(self.grad_w, self.grad_b)):
            self.weights[i] = self.weights[i] - lr * gw
            self.bias[i] = self.bias[i] - lr * gb

    def train(self, inputs, labels, epochs):
        for epoch in range(epochs):
            print("epoch {}".format(epoch))
            for i, (x, y) in enumerate(zip(inputs, labels),1):
                pred = clf.forward(x)
                if i%10 == 0:
                    print("\t{}: {:.3f}".format(i, clf.loss(pred, y)))
                if math.isnan(clf.loss(pred, y)):
                    break
                clf.backward(pred, y)
                clf.update()

    def test(self):
        pass

In [4]:
clf = Classifier()
clf.initialize_weights('glorot')

In [5]:
clf.backward(clf.forward(x_train[0]), y_train[0])

In [6]:
clf.train(x_train, y_train, epochs=1)

epoch 0
	10: 2.310
	20: 2.303
	30: 2.303
	40: 2.303
	50: 2.303
	60: 2.303
	70: 2.303
	80: 2.303
	90: 2.303
	100: 2.303


