In [1]:
import matplotlib.pyplot as plt
import numpy as np

# just to overwrite default colab style
plt.style.use('default')
plt.style.use('seaborn-talk')

In [166]:
# Linear Regression using Stochastic Gradient Descent
def batchloader(X, Y, batchsize = 20):
    n = Y.shape[0]
    idx = np.random.choice(np.arange(n),size=batchsize,replace=False)
    X_batch = X[idx,:]
    Y_batch = Y[idx,:]
    return X_batch, Y_batch

def softmax(X):
    exps = np.exp(X)
    return exps / np.sum(exps)

def crossEntropy(X, Y, theta):
    fce = 0
    
    n, m = X.shape
    n, p = Y.shape 
    
    for i in range(n):
        Y_pred = softmax(np.dot(X[i],theta))
        fce += np.dot(Y[i].T, np.log(Y_pred))  
    fce *= (-1/n)
#     print(fce)
    return fce
    
def SGD(X, Y, learning_rate=0.001, epochs=100, bs = 0.2, alpha = 0.2):
    
    n, m = X.shape
    n, p = Y.shape
    
    w = np.random.randn(m,p)
    b = np.random.randn(1,p)
    
    batchsize = round(bs*n)
    
    # preprocessing Data
    X = np.append(X, np.ones((n,1)), axis=1)
    
    n, mpn = X.shape

    COST = np.zeros(epochs)
    ACC = np.zeros(epochs)
    theta = np.append(w, b, axis=0)
    
    for i in range(epochs):
        alpha *= 0.99
        # Get Batch 
        X_batch, Y_batch = batchloader(X, Y, batchsize) 
        Y_pred = softmax(np.dot(X_batch,theta))
        
        # perform gradient descent
        gradJ = (1/n)*( np.dot(X_batch.T, (Y_pred - Y_batch) )) 
        theta = theta - learning_rate * gradJ - 2*alpha*np.append(w,np.zeros((1,p)),axis=0)
         
        w = theta[:m,:p]    

        # get cost
        COST[i] = crossEntropy(X_batch, Y_batch, theta)
        acc = accuracy(Y_pred, Y_batch)
        ACC[i] = acc
        print("acc:", acc)
    return theta, COST, ACC 
    
def accuracy(Y_pred, Y):
    n,p = Y.shape
    acc = np.sum([np.argmax(Y_pred[i])==np.argmax(Y[i]) for i in range(n)])/(0.01*n)
    return acc

In [151]:
# Load data
Xtr = np.load("mnist_train_images.npy")
n = Xtr.shape[0]
Xtr = Xtr.reshape((n,-1))
Ytr = np.load("mnist_train_labels.npy")

# Get Validation Set
Xv = np.load("mnist_validation_images.npy")
nv = Xv.shape[0]
Xv = Xv.reshape((nv,-1)) # feature vector is row vector 
Yv = np.load("mnist_validation_labels.npy")

# preprocessing on validation set 
Xv = np.append(Xv, np.ones((nv,1)), axis=1)

# print(Xtr.shape, Ytr.shape, Xv.shape, Yv.shape)

# Tune Hyper parameter
LR = [0.001, 0.005, 0.01, 0.05]
EPOCHS = [50, 50, 50, 50]
BATCHSIZE = [0.1, 0.2, 0.3, 0.5]
ALPHA = [0.001, 0.002, 0.005, 0.01]

cost = 1000000

iter = 0

for lr_ in LR:
#     for epochs_ in EPOCHS:
    for bs_ in BATCHSIZE:
        for alpha_ in ALPHA:
            theta, COST, ACC = SGD(Xtr, Ytr, learning_rate=lr_, epochs=epochs_, bs=bs_, alpha=alpha_) 

            Y_pred = softmax(np.dot(Xv, theta))
            accu = accuracy(Y_pred, Yv)
            print(accu)
            loss = COST[-1]
            print("iter: ", iter, ", loss: ", COST[-1])
            iter += 1 

            if(loss<cost):
                lr = lr_
                epochs = epochs_
                bs = bs_
                alpha = alpha_
                cost=loss
                    
print('learning rate: ', lr, ', Epochs: ', epochs, ', mini-batchsize (in %): ', bs*100, ', alpha: ', alpha)                    

10.96
iter:  0 , loss:  11.52473838607429
11.2
iter:  1 , loss:  11.082956664825186
6.58
iter:  2 , loss:  9.869023394071686
17.28
iter:  3 , loss:  5.435014731350501
15.42
iter:  4 , loss:  11.26120916546845
10.88
iter:  5 , loss:  9.016304011025214
6.2
iter:  6 , loss:  9.130103531862238
11.34
iter:  7 , loss:  5.7428510294623365
7.14
iter:  8 , loss:  14.782858586671411
6.7
iter:  9 , loss:  12.329391890873936
6.5
iter:  10 , loss:  7.929299528993527
13.12
iter:  11 , loss:  6.053432106652638
8.92
iter:  12 , loss:  14.897557445207749
14.02
iter:  13 , loss:  11.209017016608671
8.84
iter:  14 , loss:  8.611335422424986
12.54
iter:  15 , loss:  6.852292442695873
8.4
iter:  16 , loss:  14.375539295701412
9.18
iter:  17 , loss:  10.748271879871918
5.18
iter:  18 , loss:  11.077312872420881
10.74
iter:  19 , loss:  5.48774852457642
10.26
iter:  20 , loss:  14.141961733532023
11.06
iter:  21 , loss:  14.499004298124444
11.94
iter:  22 , loss:  9.193137736248701
12.02
iter:  23 , loss:  4

In [179]:
epochs = 100
lr = 10
alpha = 0.001
bs = 0.1
# Training on tuned hyperparameters
theta, COST, ACC = SGD(Xtr, Ytr, learning_rate=lr, epochs=epochs, bs=bs, alpha=alpha)

plt.plot(ACC)
plt.xlabel('epochs')
plt.ylabel('MSE')
plt.title('Stochastic Gradient Descent with L2 Regularization')
plt.show()

# Testing 
X_te = np.load("mnist_test_images.npy")
n = X_te.shape[0]
X_te = X_te.reshape((n,-1))

# preprocessing Data
X_te = np.append(X_te, np.ones((n,1)), axis=1)
yte = np.load("mnist_test_labels.npy")

MSE_test = crossEntropy(X_te, yte, theta)
Y_pred = softmax(np.dot(X_te, theta))
accu = accuracy(Y_pred, yte)

print('accuracy', accu)
print('MSE on test data: ', MSE_test)
print('Tuned Hyperparameters: ')
print('learning rate: ', lr, ', Epochs: ', epochs, ', mini-batchsize (in %): ', bs*100, ', alpha: ', alpha)

acc: 11.254545454545454
acc: 13.090909090909092
acc: 15.10909090909091
acc: 18.10909090909091
acc: 22.745454545454546
acc: 25.472727272727273
acc: 28.272727272727273
acc: 30.70909090909091
acc: 32.4
acc: 35.236363636363635
acc: 38.78181818181818
acc: 41.25454545454546
acc: 43.07272727272727
acc: 45.78181818181818
acc: 47.29090909090909
acc: 48.61818181818182
acc: 50.8
acc: 51.29090909090909
acc: 50.654545454545456
acc: 52.69090909090909
acc: 52.85454545454545
acc: 54.14545454545455
acc: 53.74545454545454
acc: 54.054545454545455
acc: 55.21818181818182
acc: 56.56363636363636
acc: 56.2
acc: 56.58181818181818
acc: 57.63636363636363
acc: 58.836363636363636
acc: 58.654545454545456
acc: 58.03636363636364
acc: 57.92727272727273
acc: 58.4
acc: 59.2
acc: 59.18181818181818
acc: 58.72727272727273
acc: 58.981818181818184
acc: 59.236363636363635
acc: 59.2
acc: 59.21818181818182
acc: 60.0
acc: 60.63636363636363
acc: 61.127272727272725
acc: 60.56363636363636
acc: 60.654545454545456
acc: 61.58181818181

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


acc: 9.672727272727272
acc: 9.690909090909091
acc: 10.145454545454545
acc: 9.763636363636364
acc: 9.345454545454546
acc: 10.145454545454545
acc: 9.527272727272727
acc: 10.018181818181818
acc: 9.49090909090909
acc: 9.690909090909091
acc: 9.581818181818182
acc: 10.163636363636364
acc: 9.745454545454546
acc: 9.436363636363636
acc: 9.945454545454545


KeyboardInterrupt: 