# Stacking and back propagation on Mnist with multiple sigmoids

### import data

In [1]:
from sklearn.datasets import fetch_mldata
import numpy as np
from sklearn.utils import shuffle
mnist = fetch_mldata('MNIST original', data_home=".")
mnist.data, mnist.target = shuffle(mnist.data, mnist.target)
X = mnist.data
y_float = mnist.target
y_int = y_float.astype(np.int64)
y = np.eye(10)[y_int]
N = X.shape[0]

### Cross Validation

In [2]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
s_train = np.ones(X_train.shape[0])
X_train = np.hstack((X_train, s_train.reshape(len(s_train),1)))
s_test = np.ones(X_test.shape[0])
X_test = np.hstack((X_test, s_test.reshape(len(s_test),1)))

### PCA

In [3]:
# from sklearn.decomposition import IncrementalPCA

# n_batches = 100
# inc_pca = IncrementalPCA(n_components = 154)
# for X_batch in np.array_split(X_train, n_batches):
#     inc_pca.partial_fit(X_batch)
    
# X_reduced_train = inc_pca.transform(X_train)
# X_reduced_train = np.hstack((X_reduced_train, s_train.reshape(len(s_train),1)))
# X_reduced_test = inc_pca.transform(X_test)
# X_reduced_test = np.hstack((X_reduced_test, s_test.reshape(len(s_test),1)))

### define function

In [4]:
def linear_output_z1(x, w1):
    z1 = np.dot(x, w1)
    return z1

def logistic_output_z2(x, w2):
    z2 = sigmoid(np.dot(x, w2))
    return z2

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

### define gradient

In [5]:
def logistic_output(x, z1, z2, theta1, theta2, theta3):
    zf = np.matmul(z1, theta1) + np.matmul(z2, theta2) + np.array(np.dot(x, theta3))
    return zf

def gradient_c_theta1(z1, zf, y):
    gradient = np.matmul(np.transpose(z1), (sigmoid(zf)-y)) / N
    return gradient

def gradient_c_theta2(z2, zf, y):
    gradient = np.matmul(np.transpose(z2), (sigmoid(zf)-y)) / N
    return gradient

def gradient_c_theta3(x, zf, y):
    gradient = np.matmul(np.transpose(x), (sigmoid(zf)-y)) / N
    return gradient

def gradient_c_w1(x, theta1, zf, y):
    gradient = np.matmul(np.transpose(x), np.transpose(np.matmul(theta1, np.transpose(np.array((sigmoid(zf) - y)))))) / N
    return gradient

def gradient_c_w2(x, w2, theta2, zf, y):
    elemsig = sigmoid(np.matmul(x, w2)) # [5000, 1]
    elem1=np.transpose(np.array(sigmoid(zf) - y)) # [10,5000]
    elem2=np.matmul(theta2, elem1) # [1, 5000]
    elem3=np.matmul(np.transpose(elemsig), np.transpose(elem2)) # [1,1]
    elem4 = np.matmul(elemsig, (1 - elem3)) # [5000,1]
    gradient= np.matmul(np.transpose(x), elem4)/ N # [785,1]

    return gradient


In [None]:
z1 = linear_output_z1(X_train, int_w1)
z2 = logistic_output_z2(X_train, int_w2)
zf = logistic_output(X_train, z1, z2, int_theta1, int_theta2, int_theta3)

elemsig = sigmoid(np.matmul(X_train, int_w2)) # [52500, 2]
elem1=np.transpose(np.array(sigmoid(zf) - y_train)) # [10,52500]
elem2=np.matmul(int_theta2, elem1) # [2, 52500]
elem3=np.matmul(np.transpose(elemsig), np.transpose(elem2)) # [2,2]
elem4 = np.matmul(elemsig, (1 - elem3)) # [5000,1]
# gradient= np.matmul(np.transpose(x), elem4)/ N # [785,1]

In [None]:
elem3.shape

### define gradient descent 

In [6]:
def gradient_descent(x, y, int_w1, int_w2, int_theta1, int_theta2, int_theta3, lowtrain, uptrain, iterations):

    place_w1=int_w1
    place_w2=int_w2
    place_theta1=int_theta1
    place_theta2=int_theta2
    place_theta3=int_theta3

    for i in range(iterations):
        trainrate=np.random.uniform(lowtrain,uptrain,1)/np.cbrt(i+1) 
        z1 = linear_output_z1(x, place_w1)
        z2 = logistic_output_z2(x, place_w2)
        zf = logistic_output(x, z1, z2, place_theta1, place_theta2, place_theta3)
        p = sigmoid(zf)

        gradient_theta1 = gradient_c_theta1(z1, zf, y)
        gradient_theta2 = gradient_c_theta2(z2, zf, y)
        gradient_theta3 = gradient_c_theta3(x, zf, y)
        gradient_w1 = gradient_c_w1(x, place_theta1, zf, y)
        gradient_w2 = gradient_c_w2(x, place_w2, place_theta2, zf, y)
        
            
        place_w1 = place_w1 - (trainrate * gradient_w1) - (0.1/N)*place_w1 * trainrate
        place_w2 = place_w2 - (trainrate * gradient_w2) - (0.1/N)*place_w2 * trainrate
        place_theta1 = place_theta1 - trainrate*gradient_theta1*0.01 - (0.1/N)*place_theta1 * trainrate 
        place_theta2 = place_theta2 - trainrate*gradient_theta2 - (0.1/N)*place_theta2 * trainrate
        place_theta3 = place_theta3 - trainrate*gradient_theta3 - (0.1/N)*place_theta3 * trainrate

    
    return  p, place_theta1, place_theta2, place_theta3, place_w1, place_w2


### initialization

In [8]:
N_linear = 2
M_logistic = 2

int_w1, int_w2 = np.random.uniform(-1,1,(X_train.shape[1],N_linear)), np.random.uniform(-1,1,(X_train.shape[1], M_logistic))
int_theta1, int_theta2, int_theta3 = np.random.uniform(-1,1,(N_linear,10)), np.random.uniform(-1,1,(M_logistic,10)),np.random.uniform(-1,1,(X_train.shape[1],10))
# int_w1 = result[4]
# int_w2 = result[5]
# int_theta1 = result[1]
# int_theta2 = result[2]
# int_theta3 = result[3]

result = gradient_descent(X_train, y_train, int_w1, int_w2, int_theta1, int_theta2, int_theta3, 0.00001, 0.0001, 50)

  # Remove the CWD from sys.path while we load stuff.


In [9]:
def predict_accuracy(xtest, ytest, theta1, theta2, theta3, w1, w2):
    z1 = linear_output_z1(xtest, w1)
    z2 = logistic_output_z2(xtest, w2)
    zf = logistic_output(xtest, z1, z2, theta1, theta2, theta3)
    p = sigmoid(zf)
    prediction = np.argmax(p, axis=1)
    answer = np.argmax(ytest, axis=1)
    success_rate = answer == prediction
    success_rate = success_rate*1
    prediction_rate = sum(success_rate) / xtest.shape[0]
    
    return prediction_rate

In [10]:
predict_accuracy(X_test, y_test, result[1], result[2], result[3], result[4], result[5])

  # Remove the CWD from sys.path while we load stuff.


0.096799999999999997

In [None]:
for i in range(20):
    int_w1 = result[4]
    int_w2 = result[5]
    int_theta1 = result[1]
    int_theta2 = result[2]
    int_theta3 = result[3]

    result = gradient_descent(X_train, y_train, int_w1, int_w2, int_theta1, int_theta2, int_theta3, 0.00001, 0.0001, 50)
    output = predict_accuracy(X_test, y_test, result[1], result[2], result[3], result[4], result[5])
    print(output)

  # Remove the CWD from sys.path while we load stuff.


0.735714285714
0.736228571429
0.736857142857
0.7376
0.738514285714
