In [1]:
import matplotlib.pyplot as plt
import numpy as np
import configparser

# just to overwrite default colab style
plt.style.use('default')
plt.style.use('seaborn-talk')

np.random.seed(10)

In [2]:
config = configparser.ConfigParser()
config.read('config.config')
default = 'DEFAULT'
config = config[default]
alpha = config['alpha']

In [49]:
# Activations
def relu(X):
    return np.maximum(0,X)

def softmax(X):
    exp_x = np.exp(X)
    probs = exp_x / np.sum(exp_x, axis=1, keepdims=True) # [N x K]
    return probs

def batchloader(X, Y, batchsize = 32):
    n = Y.shape[0]
    idx = np.random.choice(np.arange(n),size=batchsize,replace=False)
    X_batch = X[idx,:]
    Y_batch = Y[idx,:]
    
    return X_batch, Y_batch

def crossEntropy(X, Y, theta):
    n, m = X.shape
    n, p = Y.shape 
    fce = 0
    
    for i in range(n):
        Y_pred = softmax(np.dot(X[i],theta))
        fce += np.dot(Y[i].T, np.log(Y_pred))  
    fce *= (-1/n)
    return fce

# Storing all weights ndarray as list
def get_weights(layers):
    weights = []
    Nlayers = len(layers)

    for i in range(Nlayers - 1):
        w = np.random.rand(layers[i], layers[i+1])-1 # 1 added for the bias
        weights.append(w)    
    return weights

def forward(X, W):
    h = X  #h0 = X
    network = [X]
    
    for w in W[:-1]:   # in => h1 => out 
        z = np.dot(h, w) # next layer
        h = relu(z)      # ReLU activation
        network.append(h)    
    y_pred = softmax(np.dot(h, W[-1])) # output with softmax
    network.append(y_pred)
    return network # h1, h2, ... y_hat

def accuracy(Y_pred, Y):
    n,p = Y.shape
    acc = np.sum([np.argmax(Y_pred[i])==np.argmax(Y[i]) for i in range(n)])/(0.01*n)
    return acc

def SGD(X, Y, learning_rate=0.1, epochs=100, bs = 32, alpha = 0.002):

    n, m = X.shape
    n, p = Y.shape
    
    # Define layers: Input Hidden Output
    # Enter no. of perceptrons in each layer
    layers = [m] + [256] + [p]
    
    # Initialize Weights
    weights = get_weights(layers)

    COST = np.zeros(epochs)
    ACC = np.zeros(epochs)
    
    for i in range(epochs):
        
        # Get Batch 
        X_batch, Y_batch = batchloader(X, Y, bs) 
        
        # Forward Pass
        network = forward(X_batch,weights)

        # Back Propagation
        weights = backprop(network, Y_batch, weights, learning_rate)    

        # get cost
        COST[i] = crossEntropy(X_batch, Y_batch, theta)
        acc = accuracy(Y_pred, Y_batch)
        ACC[i] = acc
        print("acc:", acc)
        
    return theta, COST, ACC 

def backprop(network, y, W, lr=0.01):
    y_pred = network[-1]
    print('y:', y.shape)
    dJ_dz = y - y_pred # Initial dJ_dz (deltas)
    
    h = network[-1] # y_hat
    N = range(len(network))
    dW = []

    for i in list(reversed(N)):
        
        print('i',i)
        print(list(reversed(N)))
        print('dJ_dz:', dJ_dz.shape)
        print('network[i-1]',network[i-1].shape)
        dJ_dW = np.dot(dJ_dz.T, network[i-1]) # dJ_dw1 = dJ_dz * dz_dw
        
        dW.append(dJ_dW)

        dz_dh = W[i-1]
        
        z_im1 = np.dot(network[i-2], W[i-2]) 
        
        dh_dz = np.asarray(z_im1 >= 0, dtype=int)
        
        print('dJ_dW:', dJ_dW.shape)
        print('dJ_dz:', dJ_dz.shape)
        print('dz_dh:',dz_dh.shape)
        print('dh_dz:',dh_dz.shape)
        
        
        print('z_im1', z_im1.shape)
        print('h1:', network[i-1].shape)
        print('W2', W[i-1].shape)
        
        dJ_dz = np.dot(np.dot(dJ_dz,dz_dh.T),dh_dz.T)

    for i in N:
        W[i] += - lr * dW[i]
        
    return W     

In [50]:
theta, COST, ACC = SGD(Xtr, Ytr, learning_rate=lr_, epochs=epochs_, bs=bs_, alpha=alpha_)

y: (16, 10)
i 2
[2, 1, 0]
dJ_dz: (16, 10)
network[i-1] (16, 256)
dJ_dW: (10, 256)
dJ_dz: (16, 10)
dz_dh: (256, 10)
dh_dz: (16, 256)
z_im1 (16, 256)
h1: (16, 256)
W2 (256, 10)
i 1
[2, 1, 0]
dJ_dz: (16, 16)
network[i-1] (16, 785)


ValueError: shapes (16,10) and (256,10) not aligned: 10 (dim 1) != 256 (dim 0)

In [8]:
# Load data
Xtr = np.load("mnist_train_images.npy")
n = Xtr.shape[0]
Xtr = Xtr.reshape((n,-1))

# preprocessing Data
Xtr = np.append(Xtr, np.ones((n,1)), axis=1)

Ytr = np.load("mnist_train_labels.npy")

# Get Validation Set
Xv = np.load("mnist_validation_images.npy")
nv = Xv.shape[0]
Xv = Xv.reshape((nv,-1)) # feature vector is row vector 

# preprocessing on validation set 
Xv = np.append(Xv, np.ones((nv,1)), axis=1)

Yv = np.load("mnist_validation_labels.npy")

# Tune Hyper parameter
LR = [0.001, 0.005, 0.01, 0.05]
EPOCHS = [50, 50, 50, 50]
BATCHSIZE = [16, 32, 128, 256]
ALPHA = [0.001, 0.002, 0.005, 0.01]

cost = 1000000

iter = 0

for lr_ in LR:
    for epochs_ in EPOCHS:
        for bs_ in BATCHSIZE:
            for alpha_ in ALPHA:
                theta, COST, ACC = SGD(Xtr, Ytr, learning_rate=lr_, epochs=epochs_, bs=bs_, alpha=alpha_) 

                Y_pred = softmax(np.dot(Xv, theta))
                accu = accuracy(Y_pred, Yv)
                print(accu)
                loss = COST[-1]
                print("iter: ", iter, ", loss: ", COST[-1])
                iter += 1 

                if(loss<cost):
                    lr = lr_
                    epochs = epochs_
                    bs = bs_
                    alpha = alpha_
                    cost=loss
                    
print('learning rate: ', lr, ', Epochs: ', epochs, ', mini-batchsize (in %): ', bs*100, ', alpha: ', alpha)                    

y: (16, 10)
dJ_dz: (16, 10)
dJ_dW: (10, 256)
W[i] (785, 256)
z_im1 (16, 256)
dh_dz: (16, 256)


ValueError: operands could not be broadcast together with shapes (16,10) (785,256) 

In [None]:
epochs = 100
lr = 10
alpha = 0.001
bs = 0.1
# Training on tuned hyperparameters
theta, COST, ACC = SGD(Xtr, Ytr, learning_rate=lr, epochs=epochs, bs=bs, alpha=alpha)

plt.plot(ACC)
plt.xlabel('epochs')
plt.ylabel('MSE')
plt.title('Stochastic Gradient Descent with L2 Regularization')
plt.show()

# Testing 
X_te = np.load("mnist_test_images.npy")
n = X_te.shape[0]
X_te = X_te.reshape((n,-1))

# preprocessing Data
X_te = np.append(X_te, np.ones((n,1)), axis=1)
yte = np.load("mnist_test_labels.npy")

MSE_test = crossEntropy(X_te, yte, theta)
Y_pred = softmax(np.dot(X_te, theta))
accu = accuracy(Y_pred, yte)

print('accuracy', accu)
print('MSE on test data: ', MSE_test)
print('Tuned Hyperparameters: ')
print('learning rate: ', lr, ', Epochs: ', epochs, ', mini-batchsize (in %): ', bs*100, ', alpha: ', alpha)