### Import and Organise the data####

In [10]:
from sklearn.datasets import fetch_mldata
import numpy as np
from sklearn.utils import shuffle
mnist = fetch_mldata('MNIST original', data_home=".")
mnist.data, mnist.target = shuffle(mnist.data, mnist.target)
X = mnist.data
y_float = mnist.target
y_int = y_float.astype(np.int64)
y = np.eye(10)[y_int]

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
s_train = np.ones(X_train.shape[0])
X_train = np.hstack((X_train, s_train.reshape(len(s_train),1)))
s_test = np.ones(X_test.shape[0])
X_test = np.hstack((X_test, s_test.reshape(len(s_test),1)))

## define useful functions 

In [12]:
def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))


### gradient descent

In [13]:
def gradient(x, y, w1, w2, w3, theta1, theta2,theta3):
    z1 = np.matmul(x, w1)
    z2 = np.matmul(x, w2)
    z3 = np.matmul(x, w3)
    zf = np.matmul(z1, theta1) + np.matmul(z2, theta2)+np.matmul(z3, theta3)
    sig = (sigmoid(zf)-y) # [N, 10]
    gradient_c_theta1 = np.matmul(np.transpose(z1), sig) / N #[S1, N] [N, 10]
    gradient_c_theta2 = np.matmul(np.transpose(z2), sig) / N #[S2, N] [N, 10]
    gradient_c_theta3 = np.matmul(np.transpose(z3), sig) / N #[S3,N] [N,10]
    gradient_c_z1 = np.matmul(theta1, np.transpose(sig))  / N #[S1, 10] [N, 10]
    gradient_c_z2 = np.matmul(theta2, np.transpose(sig)) / N 
    gradient_c_z3 = np.matmul(theta3, np.transpose(sig)) / N
    #gradient_z1_w1= np.tranpose(x).T[:,:,None]*np.transpose(np.multiply(sigmoid(z2), (1-sig(z2)))).T[:,None] #[N, S1, 780]
    gradient_z1_w1 =x
    gradient_c_w1 = np.matmul(np.transpose(x),np.transpose(gradient_c_z1))
    gradient_z2_w2=np.ones((N,S2_logistic,785))
    gradient_c_w2=np.ones((S2_logistic,785))
    for i in range(S2_logistic):
        siggy = np.multiply(sigmoid(z2),(1-sigmoid(z2))) ### [N,S2 ]
        gradient_z2_w2[:,i,:] = x * (siggy[:,i].reshape(N,1))   ###[N,S2,785]
    for i in range(S2_logistic):    
        gradient_c_w2[i,:] =np.matmul(gradient_c_z1[i,:],gradient_z2_w2[:,i,:]) / N
    gradient_z3_w3=np.ones((N,S3_Gauss,785))
    gradient_c_w3=np.ones((S3_Gauss,785))
    for i in range(S3_Gauss):
        gradient_z3_w3[:,i,:] = np.multiply((beta*(x-w3[:,i])), np.exp( - beta*(np.linalg.norm(x-w3[:,i]))))   #[N, i ,785]
    for i in range(S3_Gauss):
        gradient_c_w3[i,:] =np.matmul(gradient_c_z3[i,:],gradient_z3_w3[:,i,:]) / N
        
    return zf, gradient_c_theta1, gradient_c_theta2, gradient_c_theta3, gradient_c_w1, gradient_c_w2, gradient_c_w3

In [14]:
# z1 = linear_output_z1(X_train, int_w1)
# z2 = logistic_output_z2(X_train, int_w2)
# zf = logistic_output(X_train, z1, z2, int_theta1, int_theta2)

In [15]:
def gradient_descent(x, y, int_w1, int_w2, int_w3, int_theta1, int_theta2, int_theta3, lowtrain, uptrain, iterations,N):

    place_w1=int_w1
    place_w2=int_w2
    place_w3=int_w3
    place_theta1=int_theta1
    place_theta2=int_theta2
    place_theta3=int_theta3

    for i in range(iterations):
        trainrate=np.random.uniform(lowtrain,uptrain,1)/np.cbrt(i+1) 

        zf, gradient_theta1, gradient_theta2, gradient_theta3, gradient_w1, gradient_w2, gradient_w3 = gradient(x, y, place_w1, place_w2, place_w3, place_theta1, place_theta2, place_theta3)
        p = sigmoid(zf)
            
        place_w1 = place_w1 - (trainrate * gradient_w1) - (0.1/N)*place_w1 * trainrate
        place_w2 = place_w2 - (trainrate * np.transpose(gradient_w2)) - (0.1/N)*place_w2 * trainrate
        place_w3 = place_w3 - (trainrate * np.transpose(gradient_w3)) - (0.1/N)*place_w3 * trainrate

        place_theta1 = place_theta1 - trainrate*gradient_theta1*0.1 - (0.1/N)*place_theta1 * trainrate 
        place_theta2 = place_theta2 - trainrate*gradient_theta2 - (0.1/N)*place_theta2 * trainrate
        place_theta3 = place_theta3 - trainrate*gradient_theta3 - (0.1/N)*place_theta3 * trainrate

    return  p, place_theta1, place_theta2, place_theta3, place_w1, place_w2, place_w3

In [16]:
def predict_accuracy(xtest, ytest, theta1, theta2, theta3, w1, w2, w3):
    z1 = np.matmul(xtest, w1)
    z2 = np.matmul(xtest, w2)
    z3 = np.matmul(xtest, w3) 
    zf = np.matmul(z1, theta1) + np.matmul(z2, theta2) + np.matmul(z3, theta3)
    p = sigmoid(zf)
    prediction = np.argmax(p, axis=1)
    answer = np.argmax(ytest, axis=1)
    success_rate = answer == prediction
    success_rate = success_rate*1
    prediction_rate = sum(success_rate) / xtest.shape[0]
    
    return prediction_rate

In [17]:
S1_linear = 3
S2_logistic = 3
S3_Gauss = 5
N = 52500 # Batch size
beta = 0.1

int_w1, int_w2,int_w3 = np.random.uniform(-1,1,(X_train.shape[1],S1_linear)), np.random.uniform(-1,1,(X_train.shape[1], S2_logistic)), np.random.uniform(-1,1,(X_train.shape[1], S3_Gauss))
int_theta1, int_theta2, int_theta3 = np.random.uniform(-1,1,(S1_linear,10)), np.random.uniform(-1,1,(S2_logistic,10)), np.random.uniform(-1,1,(S3_Gauss,10))
result = gradient_descent(X_train, y_train, int_w1, int_w2, int_w3, int_theta1, int_theta2, int_theta3, 0.00001, 0.0001, 1,N)

  


In [None]:
for i in range(200):
    X_train, y_train = shuffle(X_train, y_train)
    X_train_mini, y_train_mini = X_train[0:200], y_train[0:200]
    N = X_train_mini.shape[0]
    w1 = result[4]
    w2 = result[5]
    w3 = result[6]
    theta1 = result[1]
    theta2 = result[2]
    theta3 = result[3]

    result = gradient_descent(X_train_mini, y_train_mini, w1, w2, w3, theta1, theta2, theta3, 0.0001, 0.0005, 5,N)
    output = predict_accuracy(X_test, y_test, theta1, theta2, theta3, w1, w2, w3)
    print(i, output)

  


0 0.422228571429
1 0.425028571429
2 0.4316
3 0.436
4 0.444685714286
5 0.450114285714
6 0.433885714286
7 0.432857142857
8 0.447257142857
9 0.448457142857
10 0.427257142857
11 0.4568
12 0.455314285714
13 0.442285714286
14 0.460742857143
15 0.446228571429
16 0.442114285714
17 0.468914285714
18 0.476228571429
19 0.4704
20 0.452857142857
21 0.4676
22 0.476171428571
