In [4]:
import matplotlib.pyplot as plt
import numpy as np
from keras.datasets import fashion_mnist
import torch
(xtrain, ytrain), (xtest, ytest) = fashion_mnist.load_data()
ytrain, ytest = ytrain.copy(), ytest.copy()
ytrain[ytrain != 8] = 0
ytrain[ytrain == 8] = 1
ytest[ytest != 8] = 0
ytest[ytest == 8] = 1

In [3]:
xtrain = xtrain.reshape(60000, 784)
xtest = xtest.reshape(10000, 784)

In [44]:
xtrain.shape

(60000, 784)

Old implementation

In [63]:
N, D = xtrain.shape
Wtotal = np.zeros(D)
b = 0
c = 0
epoch = 0
W = np.zeros(D)
btotal = 0

for epoch in range(100):
    nerr = 0
    for i in range(N):
        yhat = int(W.dot(xtrain[i]) + b >= 0)
        if yhat == ytrain[i]:
            c += 1
        else:
            Wtotal += c*W
            btotal += c*b
            W = W + (ytrain[i] - yhat) * xtrain[i]
            b += (ytrain[i] - yhat)
            c = 0
            nerr += 1
    btotal /= Wtotal.sum()
    Wtotal /= Wtotal.sum()
    
    if epoch % 10 == 0:
        print(f"{epoch}: {nerr/N}")
    

0: 0.032266666666666666
10: 0.0245
20: 0.024016666666666665
30: 0.024016666666666665
40: 0.023283333333333333
50: 0.023533333333333333
60: 0.023683333333333334
70: 0.02295
80: 0.0228
90: 0.023133333333333332


In [57]:
(ytrain[i] - yhat)

0

In [138]:
def sigmoid(x):
    return 1/(1 + np.exp(-x))

def sigmoid_prime(x):
    return sigmoid(x) * (1 - sigmoid(x))

In [300]:
class MLP:
    def __init__(self, nnodes, num_classes):
        self.num_classes = num_classes
        self.w0 = np.random.normal(0, 0.01, (nnodes[0], 784))
        self.w1 = np.random.normal(0, 0.01, (nnodes[1], nnodes[0]))
        self.w2 = np.random.normal(0, 0.01, (nnodes[2], nnodes[1]))
        self.b0 = np.random.normal(0, 0.01, (nnodes[0], 1))
        self.b1 = np.random.normal(0, 0.01, (nnodes[1], 1))        
        self.b2 = np.random.normal(0, 0.01, (nnodes[2], 1))
    
    def forward(self, x):
        lenx = 1 if len(x.shape) == 1 else x.shape[0]
        self.a0 = (self.w0.dot(x.reshape((x.shape[-1], lenx))) + self.b0).reshape(self.w0.shape[0], 1)
        self.h0 = sigmoid(self.a0)
        print(self.w1.shape, self.h0.shape, self.b1.shape)
        self.a1 = (self.w1.dot(self.h0) + self.b1).reshape(self.w1.shape[0], 1)
        self.h1 = sigmoid(self.a1)
        self.a2 = self.w2.dot(self.h1) + self.b2
        self.h2 = sigmoid(self.a2)
        return self.h2
    
    def gradient(self, x, y):
        print(y, 1/self.h2, -1*y/self.h2)
        E_h2 = -1*y/self.h2 + (1-y)/(1-self.h2)
        h2_w2 = self.h1 * self.h2 * (1 - self.h2)
        E_w2 = h2_w2 * E_h2
        # 1
        h1_w1 = self.h0.dot(self.h1 * (1-self.h1))
        h2_h1 = (self.h2 * (1-self.h2)).dot(self.w2.T)
        E_h1 = E_h2 * h2_h1
        return E_h1
        

In [301]:
mlp = MLP([31, 17, 1], 10)
mlp.w0.shape

(31, 784)

In [302]:
mlp.forward(xtrain[0])
mlp.gradient(xtrain[0], ytrain[0])

(17, 31) (31, 1) (17, 1)
1 [[2.03288515]] [[-2.03288515]]


ValueError: shapes (31,1) and (17,1) not aligned: 1 (dim 1) != 17 (dim 0)

In [274]:
xtrain[0].shape

(784,)

In [227]:
eps = 1e-8
l1 = np.zeros(17)
l2 = np.zeros(17)
for i in range(17):
    mlp.w2[0,i] += eps
    l1[i] = -np.log(mlp.forward(xtrain[0]))
    mlp.w2[0,i] -= (2 * eps)
    l2[i] = -np.log(mlp.forward(xtrain[0]))
    mlp.w2[0,i] += eps
grad = (l1-l2) / (2*eps)
print(grad, mlp.gradient(xtrain[0], ytrain[0]))

1 [2.01368948] [-2.01368948]
[-0.25254974 -0.2491898  -0.25124734 -0.25278761 -0.25355198 -0.25481376
 -0.24923622 -0.25922887 -0.25019262 -0.25563198 -0.25302872 -0.24597462
 -0.25172313 -0.24437084 -0.25540867 -0.25896087 -0.24480481] [-0.25254975 -0.2491898  -0.25124733 -0.2527876  -0.25355199 -0.25481375
 -0.24923622 -0.25922888 -0.25019261 -0.25563199 -0.25302872 -0.24597462
 -0.25172313 -0.24437084 -0.25540868 -0.25896087 -0.24480482]


In [81]:
def losstotal(x, y, w0, w1, w2, b0, b1, b2):
    output = forward(x, y, w0, w1, w2, b0, b1, b2)[-1]
    return loss(output, y).sum()/x.shape[0]
def loss(h2, y):
    return -((1 - y) * np.log((1 - h2)) + y * np.log(h2))
def gradients(x, y, w0, w1, w2, b0, b1, b2):
    """provides the gradients for a single sample"""
    x = x.reshape(1, 784)
    a0, a1, a2, h0, h1, h2 = forward(x, y, w0, w1, w2, b0, b1, b2)
    # Variable Notation: E_w1 is E diff wrt w1
    # w2
    h2_w2 = h1 * h2*(1-h2)
    E_h2 = (-y/h2 + (1-y)/(1-h2))
    E_w2 = h2_w2 * E_h2
    #w1
    h1_w1 = h0.T.dot((h1*(1-h1)))
    h2_h1 = ((h2*(1-h2))).dot(w2.T)
    E_h1 = E_h2 * h2_h1
    E_w1 = E_h1 * h1_w1
    #w0
    h0_w0 = x.T.dot(h0*(1-h0))
    h1_h0 = (h1*(1-h1)).dot(w1.T)
    h1_h0 = w1 * ((h1*(1-h1)))
    E_w0 = h0_w0.dot(h1_h0.dot(E_h2*h2_h1.T))
    E_w0 = E_h1.dot(h1_h0.T) * h0_w0
    # bias terms
    h2_b2 = h2*(1-h2)
    E_b2 = h2_b2*E_h2
    h1_b1 = h1*(1-h1)
    E_b1 = E_h2*h2_h1.T*h1_b1
    h0_b0 = h0*(1-h0)
    E_b0 = E_h2*h2_h1.T*h1_h0.T*h0_b0
    E_b0 = E_h1.dot(h1_h0.T)*h0_b0
    return E_w0, E_w1, E_w2.T, E_b0, E_b1, E_b2
def forward(x, y, w0, w1, w2, b0, b1, b2):
    a0 = x.dot(w0) + b0
    h0 = logistic(a0)
    a1 = h0.dot(w1) + b1
    h1 = logistic(a1)
    a2 = h1.dot(w2) + b2
    h2 = logistic(a2)
    return a0, a1, a2, h0, h1, h2
def logistic(x):
    return 1 / (1 + np.exp(-x))
def lprime(x):
    return logistic(x) * (1 - logistic(x))
def update(xtrain, ytrain, w0, w1, w2, b0, b1, b2, learning_rate = 0.01):
    """update function which was originally
            written for task 3.4 optimization """
    xtrain = xtrain
    n = xtrain.shape[0]
    w0gradtotal = np.zeros_like(w0)
    w1gradtotal = np.zeros_like(w1)
    w2gradtotal = np.zeros_like(w2)
    b0gradtotal = np.zeros_like(b0)
    b1gradtotal = np.zeros_like(b1)
    b2gradtotal = np.zeros_like(b2)
    for i in range(n):
        w0grad, w1grad, w2grad, b0grad, b1grad, b2grad = gradients(xtrain[i], ytrain[i], w0, w1, w2, b0, b1, b2)
        w0gradtotal += w0grad
        w1gradtotal += w1grad
        w2gradtotal += w2grad
        b1gradtotal += b1grad.sum()
        b0gradtotal += b0grad
        b2gradtotal += b2grad
    # update parameters
    w0 -= learning_rate * w0gradtotal
    w1 -= learning_rate * w1gradtotal
    w2 -= learning_rate * w2gradtotal
    b0 -= learning_rate * b0gradtotal
    b1 -= learning_rate * b1gradtotal
    b2 -= learning_rate * b2gradtotal
    return w0, w1, w2, b0, b1, b2



In [88]:
w0

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [1]:
w0 = np.random.normal(0, 0.1, (784, 32))
b0 = np.random.normal(0, 0.1, (1, 32))
w1 = np.random.normal(0, 0.1, (32, 16))
b1 = np.random.normal(0, 0.1, (1, 16))
w2 = np.random.normal(0, 0.1, (16, 1))
b2 = np.random.normal(0, 0.1, (1, 1))
epochs = 500
accs = np.zeros(epochs)
losses = np.zeros(epochs)
devacc = np.zeros(epochs)
x = xtrain[:100]
y = ytrain[:100]
js = np.array([i for i in range(100)])
for i in range(epochs):
    np.random.shuffle(js) #shuffle order to use for training
    w0, w1, w2, b0, b1, b2 = update(x[js], y[js], w0, w1, w2, b0, b1, b2, learning_rate = 0.1)
    yhat = forward(x, y, w0, w1, w2, b0, b1, b2)[-1].round(0)
    accs[i] = 1 - np.mean(np.abs(y.flatten()-yhat.flatten()))
    l = (losstotal(x, y, w0, w1, w2, b0, b1, b2))
    losses[i] = l
#     devhat = forward(devimgs, devlabels, w0, w1, w2, b0, b1, b2)[-1].round(0)
#     devacc[i] = 1 - np.mean(np.abs(devlabels - devhat))
    if i%1==0:
        print(f"{i}: {accs[i]:.5f}, {l:.6f}, {devacc[i]:.3f}")

NameError: name 'np' is not defined