In [43]:
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import math
import matplotlib.pyplot as plt
from data_utils import *
import time
import autograd.numpy as np
from autograd import value_and_grad

def forward_pass(W1, W2, W3, b1, b2, b3, x):
    """
    Evaluate a fully connected network with 2 ReLU hidden layers of M neurons
    and return the log-softmax of its linear output layer.
    Inputs:
        W1 : (M, 784) weights of first (hidden) layer
        W2 : (M, M) weights of second (hidden) layer
        W3 : (10, M) weights of third (output) layer
        b1 : (M, 1) biases of first (hidden) layer
        b2 : (M, 1) biases of second (hidden) layer
        b3 : (10, 1) biases of third (output) layer
        x : (N, 784) inputs
    Outputs:
        (N, 10) log class probabilities: every row is the output layer minus
        its log-sum-exp, so exponentiating a row gives values that sum to 1
    """
    # hidden layers: affine map followed by ReLU, each of shape (N, M)
    hidden1 = np.maximum(0, np.dot(x, W1.T) + b1.T)
    hidden2 = np.maximum(0, np.dot(hidden1, W2.T) + b2.T)

    # output layer with linear activation, shape (N, 10)
    logits = np.dot(hidden2, W3.T) + b3.T

    # subtract the row-wise maximum before exponentiating so exp() cannot overflow
    shifted = logits - np.amax(logits, axis=1, keepdims=True)
    log_normalizer = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

    return shifted - log_normalizer


def negative_log_likelihood(W1, W2, W3, b1, b2, b3, x, y):
    """
    Total negative log likelihood of the responses `y` under the model
    `forward_pass`.
    Inputs:
        W1, W2, W3, b1, b2, b3, x : same as `forward_pass`
        y : (N, 10) training responses (presumably one-hot rows; each entry
            weights the matching log-probability)
    Outputs:
        nll : scalar, summed over every row and column (not averaged)
    """
    log_probs = forward_pass(W1, W2, W3, b1, b2, b3, x)
    return -np.sum(log_probs * y)
    

# `nll_gradients` returns the output of `negative_log_likelihood` as well as the
# gradient of that output with respect to all weights and biases (positional
# arguments 0 through 5).
#   Inputs:
#       same as negative_log_likelihood (W1, W2, W3, b1, b2, b3, x, y)
#   Outputs: (nll, (W1_grad, W2_grad, W3_grad, b1_grad, b2_grad, b3_grad))
#       nll : output of `negative_log_likelihood`
#       W1_grad : (M, 784) gradient of the nll with respect to the weights of first (hidden) layer
#       W2_grad : (M, M) gradient of the nll with respect to the weights of second (hidden) layer
#       W3_grad : (10, M) gradient of the nll with respect to the weights of third (output) layer
#       b1_grad : (M, 1) gradient of the nll with respect to the biases of first (hidden) layer
#       b2_grad : (M, 1) gradient of the nll with respect to the biases of second (hidden) layer
#       b3_grad : (10, 1) gradient of the nll with respect to the biases of third (output) layer
nll_gradients = value_and_grad(negative_log_likelihood, argnum=list(range(6)))

def update_parameters(w, grad_w, learning_rate=1.):
    """
    Take one gradient-descent step and return the new parameters.
    Inputs:
        w : current parameters
        grad_w : gradient of the loss with respect to `w`
        learning_rate : step size multiplying the gradient
    Outputs:
        `w` moved against the gradient; `w` itself is not modified
    """
    step = learning_rate * grad_w
    return w - step

# load the MNIST_small dataset
x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('mnist_small')


def scale_to_4bit(images, max_level=15):
    """
    Rescale every sample of `images` in place to whole-number levels 0..max_level.

    Each sample (index along axis 0) is min-max normalised on its own, multiplied
    by `max_level` and truncated to an integer; the result is written back into
    `images`, so the array keeps its original dtype.
    Inputs:
        images : array whose first axis enumerates samples
                 (assumed to be a NumPy array of shape (N, 784) - TODO confirm
                 against `load_dataset`)
        max_level : largest output level; 15 corresponds to 4 bits
    Outputs:
        the same `images` array, for convenience
    """
    sample_axes = tuple(range(1, images.ndim))
    lowest = images.min(axis=sample_axes, keepdims=True)
    span = images.max(axis=sample_axes, keepdims=True) - lowest
    # A constant sample has span 0; dividing by 1 instead maps it to all zeros
    # rather than producing NaN from 0/0.
    span[span == 0] = 1
    images[...] = (((images - lowest) / span) * max_level).astype(np.uint8)
    return images


# scale data to 4-bit black and white
for images in (x_train, x_test, x_valid):
    scale_to_4bit(images)


np.random.seed(1)  # fixed seed so the initial weights are reproducible
scale = 0.1  # multiplier applied to the standard-normal draws
M = 10 # neurons per hidden layer

# weights: scaled Gaussian noise, drawn in the order W1, W2, W3
W1 = np.random.randn(M, 784) * scale  # first (hidden) layer
W2 = np.random.randn(M, M) * scale  # second (hidden) layer
W3 = np.random.randn(10, M) * scale  # third (output) layer

# biases: column vectors that start at zero
b1, b2 = np.zeros((M, 1)), np.zeros((M, 1))  # hidden layers
b3 = np.zeros((10, 1))  # output layer

In [44]:
# set up training parameters
learning_rate = 0.0002
size = x_train.shape[0]
batch_size = 250

n = 3000            # number of gradient steps
record_every = 10   # store the losses every this many steps
print_every = 200   # print the losses every this many steps (and on step 1)
n_records = n // record_every
iteration_vec = np.zeros((n_records, 1))
log_loss_vec = np.zeros((n_records, 1))
v_log_loss_vec = np.zeros((n_records, 1))

# Row order of the training set. It is shuffled in place every step and its
# first `batch_size` entries index the minibatch. Each shuffle composes with
# the previous ones, just as repeatedly re-indexing the data itself would,
# but x_train / y_train are never copied or modified, so this cell does not
# change the data that other cells see.
order = np.arange(size)

for i in range(n):
    # select minibatch
    np.random.shuffle(order)
    batch = order[:batch_size]
    x_batch, y_batch = x_train[batch], y_train[batch]

    # compute the gradient
    (nll, (W1_grad, W2_grad, W3_grad, b1_grad, b2_grad, b3_grad)) = nll_gradients(W1, W2, W3, b1, b2, b3, x_batch, y_batch)
    # update the parameters
    W1 = update_parameters(W1, W1_grad, learning_rate)
    W2 = update_parameters(W2, W2_grad, learning_rate)
    W3 = update_parameters(W3, W3_grad, learning_rate)
    b1 = update_parameters(b1, b1_grad, learning_rate)
    b2 = update_parameters(b2, b2_grad, learning_rate)
    b3 = update_parameters(b3, b3_grad, learning_rate)

    step = i + 1
    should_record = step % record_every == 0
    should_print = i == 0 or step % print_every == 0

    # the validation loss feeds both the record and the printout; evaluate it once
    if should_record or should_print:
        val = negative_log_likelihood(W1, W2, W3, b1, b2, b3, x_valid, y_valid)

    # record performance (losses are sums, so they scale with the number of rows)
    if should_record:
        slot = step // record_every - 1  # integer arithmetic, no float rounding
        log_loss_vec[slot] = negative_log_likelihood(W1, W2, W3, b1, b2, b3, x_batch, y_batch)
        v_log_loss_vec[slot] = val
        iteration_vec[slot] = i

    # print loss if necessary
    if should_print:
        print("Iter %3d, loss = %.6f" % (i+1, negative_log_likelihood(W1, W2, W3, b1, b2, b3, x_train, y_train)))
        print("Iter %3d, Validation Loss = %.6f" % (i+1, val))

Iter   1, loss = 23176.645392
Iter   1, Validation Loss = 2318.898576
Iter 200, loss = 5121.993508
Iter 200, Validation Loss = 553.941933
Iter 400, loss = 3532.761319
Iter 400, Validation Loss = 441.581490
Iter 600, loss = 2920.872742
Iter 600, Validation Loss = 400.008008
Iter 800, loss = 3004.763573
Iter 800, Validation Loss = 409.587109
Iter 1000, loss = 2480.932017
Iter 1000, Validation Loss = 387.414678
Iter 1200, loss = 2278.807493
Iter 1200, Validation Loss = 371.790168
Iter 1400, loss = 1928.972958
Iter 1400, Validation Loss = 357.195005
Iter 1600, loss = 1881.171284
Iter 1600, Validation Loss = 361.586945
Iter 1800, loss = 1867.239615
Iter 1800, Validation Loss = 350.444879
Iter 2000, loss = 1561.761158
Iter 2000, Validation Loss = 353.347935
Iter 2200, loss = 1705.082528
Iter 2200, Validation Loss = 384.947216
Iter 2400, loss = 1486.688218
Iter 2400, Validation Loss = 373.101159
Iter 2600, loss = 2206.290896
Iter 2600, Validation Loss = 400.798510
Iter 2800, loss = 1336.70039

In [59]:
# measure the accuracy of the trained model on the test set

# rows of `preds` are the values returned by forward_pass for each test point
preds = forward_pass(W1, W2, W3, b1, b2, b3, x_test)
probs = np.exp(preds).max(axis=1)  # exponentiated score of the top class per row
pred_idx = preds.argmax(axis=1)  # predicted class per row
targets = np.argmax(y_test, axis=1)  # class index of each row of y_test

# number of test points whose predicted class matches the target
correct = (pred_idx == targets).sum()

# number of test points
total = targets.shape[0]

print('accuracy on testing data =',correct*100/total,'%')

accuracy on testing data = 91.4 %


In [85]:
# cast params into ints: multiply by 1000, then astype(int16) drops the fraction
W1_new, W2_new, W3_new = ((w * 1e3).astype(np.int16) for w in (W1, W2, W3))
b1_new, b2_new, b3_new = ((b * 1e3).astype(np.int16) for b in (b1, b2, b3))

# NOTE(review): every bias is scaled by 1e3, but with scaled weights the
# weighted sums entering layers 2 and 3 are larger by 1e6 and 1e9 respectively
# (ReLU passes a positive scale through), so b2/b3 contribute relatively less
# than in the float model - confirm this is intended.

# re-evaluate the test set with the integer parameters
preds = forward_pass(W1_new, W2_new, W3_new, b1_new, b2_new, b3_new, x_test)
probs = np.exp(preds).max(axis=1)
pred_idx = preds.argmax(axis=1)
targets = np.argmax(y_test, axis=1)

# number of test points whose predicted class matches the target
correct = (pred_idx == targets).sum()

# number of test points
total = targets.shape[0]

print('accuracy on testing data =',correct*100/total,'%')

accuracy on testing data = 92.0 %


In [137]:
# probe the integer-parameter network with one input whose 784 values are all 15
forward_pass(W1_new, W2_new, W3_new, b1_new, b2_new, b3_new, np.ones(784) * 15)

array([[-2.13617858e+10, -4.43701774e+10, -1.32617229e+10,
         0.00000000e+00, -2.43594086e+10, -1.62651976e+10,
        -4.47055490e+10, -2.89100882e+10, -2.08626348e+10,
        -2.06895244e+10]])

In [129]:
# Repeat the layers of forward_pass by hand on an all-15 input, keeping every
# intermediate so the raw magnitudes can be inspected (no log-softmax here).
x = np.ones(784) * 15

# layer 1 neurons with ReLU activation; the (1, M) bias row makes the result (1, M)
H1 = np.maximum(0, x.dot(W1_new.T) + b1_new.T)

# layer 2 neurons with ReLU activation, shape (1, M)
H2 = np.maximum(0, H1.dot(W2_new.T) + b2_new.T)

# layer 3 (output) neurons with linear activation, shape (1, 10)
Fhat = H2.dot(W3_new.T) + b3_new.T

In [133]:
# layer-2 weighted sum before the bias and ReLU are applied
H1.dot(W2_new.T)

array([[ 2.13833630e+07, -6.15938500e+07, -1.07835520e+07,
         4.48475600e+06,  3.66800290e+07,  2.94925520e+07,
         1.78744580e+07, -6.03236250e+07,  3.91116720e+07,
        -1.61259666e+08]])

In [135]:
# display the layer-2 ReLU activations from the manual forward pass
H2


array([[21383273.,        0.,        0.,  4484783., 36680023., 29492550.,
        17874525.,        0., 39111637.,        0.]])

In [136]:
# display the raw layer-3 outputs from the manual forward pass (no log-softmax applied)
Fhat




array([[ 2.69203492e+09, -2.03163567e+10,  1.07920978e+10,
         2.40538207e+10, -3.05587941e+08,  7.78862310e+09,
        -2.06517284e+10, -4.85626756e+09,  3.19118584e+09,
         3.36429625e+09]])

In [61]:
import os


def write_mem_file(path, values):
    """
    Write integers to `path` as 4-digit hexadecimal words, one per line.

    Every value is masked to its low 16 bits, so negative numbers come out in
    16-bit two's complement (for example -194 is written as ff3e).
    Inputs:
        path : file to create or overwrite
        values : iterable of integer-like values (e.g. a row of an int16 array)
    """
    with open(path, 'w') as f:
        for value in values:
            # Convert to a Python int before masking: with NumPy >= 2 the
            # expression `np.int16 & 0xffff` raises OverflowError because
            # 0xffff does not fit into int16.
            f.write('{:0>4x}\n'.format(int(value) & 0xffff))


# Create a new directory to store the output files
os.makedirs('output_folder', exist_ok=True)

# Weights: one file per matrix row, named <layer>_<row index>.mem and holding
# every entry of that row in column order.
for name, weights in (('W1', W1_new), ('W2', W2_new), ('W3', W3_new)):
    for k, row in enumerate(weights):
        write_mem_file(f'output_folder/{name}_{k}.mem', row)

# Biases: one file per layer holding the single column of the bias array.
for name, biases in (('B1', b1_new), ('B2', b2_new), ('B3', b3_new)):
    write_mem_file(f'output_folder/{name}_0.mem', biases[:, 0])

In [57]:
# display row 1 of the int16 first-layer weights (the values exported to W1_1.mem)
W1_new[1]


array([ 121, -194,  -80,    4,  -59,   86, -208,   36,   42,    4,  110,
       -122,  110,  -70,   72,  -32,   81,   78, -146,  -15,   -9,  -23,
        -75,  185,   20,  155,  -56, -106,   13,  -56,  239,   24,  115,
        -22,  -33,   -8,   11,   77,  -15,  -66, -101,   -9,   41,    3,
        -29,  -67,   14,   56,   -3,   10,  -15,   91,  -43,   18,   39,
         72,  149,   67,   59, -147,   60,  229,  -84, -108,  -49,   79,
         32,  -75,  -50,   28,   -6,  178,   82,   23,   40,  -61, -109,
          5,   43, -139,  -52,  -36,   26,  -26,   44,    9,  106, -171,
        165,  142,    1,   48,   20, -110,  -33,   21,  -68,  -16,  -74,
       -140, -145,  -93, -102,  143,   -9, -128,   35,  -16, -304,   47,
       -155,   57,  -96, -145,   49, -148,  -43,   15,   51,   32,  -57,
        -44,  136, -128,   76, -172,    7,   21, -155,   17,   14,  -33,
        -18,  112,   17, -116,  125,  -44,  137,   60,  -89,  -16,  -28,
        -90,   56,    5,  184,   19, -181,   16,   