# Stacking and back propagation on Mnist with multiple sigmoids

### import data

In [3]:
from sklearn.datasets import fetch_mldata
import numpy as np
from sklearn.utils import shuffle
mnist = fetch_mldata('MNIST original', data_home=".")
mnist.data, mnist.target = shuffle(mnist.data, mnist.target)
X = mnist.data[0:5000]
s = np.ones(X.shape[0])
X_1 = np.hstack((X, s.reshape(len(s),1)))
y_float = mnist.target[0:5000]
y_int = y_float.astype(np.int64)
y = np.eye(10)[y_int]
N = X.shape[1]

### define function

In [4]:
def linear_output_z1(x, w1):
    z1 = np.dot(x, w1)
    return z1

def logistic_output_z2(x, w2):
    z2 = sigmoid(np.dot(x, w2))
    return z2

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

### define gradient

In [25]:
def logistic_output(x, z1, z2, theta1, theta2, theta3):
    zf =  z2*theta2 + z2* theta2 + np.dot(x, theta3)
    return zf

def gradient_c_theta1(z1, zf, y):
    gradient = np.matmul(np.transpose(z1), (sigmoid(zf)-y)) / N
    return gradient

def gradient_c_theta2(z2, zf, y):
    gradient = np.matmul(np.transpose(z2), (sigmoid(zf)-y)) / N
    return gradient

def gradient_c_theta3(x, zf, y):
    gradient = np.matmul(np.transpose(x), (sigmoid(zf)-y)) / N
    return gradient

def gradient_c_w1(x, theta1, zf, y):
    gradient = np.matmul (np.transpose(x), theta1 * np.array((sigmoid(zf) - y))) / N
    return gradient

def gradient_c_w2(x, w2, theta2, zf, y):
    gradient= np.matmul(np.transpose(x), sigmoid(np.dot(x, w2)) * (1 - sigmoid(np.dot(x, w2))) * (theta2 * np.array((sigmoid(zf) - y)))) / N
    return gradient


### define gradient descent 

In [227]:
def gradient_descent(x, y, int_w1, int_w2, int_theta1, int_theta2, int_theta3, lowtrain, uptrain, iterations):

    place_w1=int_w1
    place_w2=int_w2
    place_theta1=int_theta1
    place_theta2=int_theta2
    place_theta3=int_theta3
    
    for i in range(iterations):
        trainrate=np.random.uniform(lowtrain,uptrain,1)
        z1 = linear_output_z1(x, place_w1)
        z2 = logistic_output_z2(x, place_w2)
        z2=np.dot(x,place_w2)
        zf = logistic_output(x, z1, z2, place_theta1, place_theta2, place_theta3)
        p = sigmoid(zf)

        gradient_theta1 = gradient_c_theta1(z1, zf, y)

        gradient_theta2 = gradient_c_theta1(z2, zf, y)

        gradient_theta3 = gradient_c_theta3(x, zf, y)

        gradient_w1 = gradient_c_w1(x, int_theta1, zf, y)

        gradient_w2 = gradient_c_w2(x, int_w2, int_theta2, zf, y)
            
        place_w1 = place_w1 - (trainrate * gradient_w1)
        place_w2 = place_w2 - (trainrate * gradient_w2)
#         place_theta1 = place_theta1 -  trainrate*gradient_theta1
        #place_theta2 = place_theta2 - trainrate*gradient_theta2
#         place_theta3 = place_theta3 - trainrate * gradient_theta3
        
    
    return  z2


### initialization

In [228]:
N_linear = 1
M_logistic = 1

int_w1, int_w2 = np.random.uniform(-1,1,(X.shape[1]+1,N_linear)), np.random.uniform(-1,1,(X.shape[1]+1, M_logistic))
int_theta1, int_theta2, int_theta3 = np.random.uniform(-1,1,(N_linear,10)), np.random.uniform(-1,1,(M_logistic,10)),np.random.uniform(-1,1,(X.shape[1] + 1,10))
p = gradient_descent(X_1, y, int_w1, int_w2, int_theta1, int_theta2, int_theta3, 0.01, 0.01, 3)

  # Remove the CWD from sys.path while we load stuff.


In [229]:
int_w1.shape

(785, 1)

In [230]:
p.shape

(5000, 10)

## np.dot(X_1,int_w2).shape

In [226]:
hoge  = linear_output_z1(X_1, int_w1) * int_theta1
hoge.shape

(5000, 10)

In [159]:
hogehoge = logistic_output_z2(X_1, int_w2) * int_theta2
hogehoge.shape

  # Remove the CWD from sys.path while we load stuff.


(5000, 10)

In [103]:
hogehogehoge = np.dot(X_1, int_theta3)
hogehogehoge.shape

(5000, 10)

In [32]:
hoge + hogehoge + hogehogehoge

array([[-3972.64347028,  1189.89971615,   668.08263398, ...,
        -2190.1095023 ,  1003.6853455 ,   545.80396229],
       [   19.87895425, -1160.83237475,   632.49546209, ...,
          293.41462009,  -151.03558563,   -68.38270588],
       [ -220.20907832,  -440.5234562 , -1788.59520754, ...,
         -173.186085  ,   337.75713873,  1607.99147976],
       ..., 
       [  651.22820589, -1112.51009486,   706.46515679, ...,
          425.66494343, -1061.62670583,  1024.31732073],
       [ -235.60793395,  -867.0659947 ,   469.36946344, ...,
          132.15324004,   -86.54272486,  2029.33048831],
       [-2581.60600894,  2591.90271526,  1342.2400317 , ...,
        -3391.14242151,  2599.52918758,  -883.96485382]])

In [110]:
z1 = linear_output_z1(X_1, int_w1)
z2 = logistic_output_z2(X_1, int_w2)
zf = logistic_output(X_1, z1, z2, int_theta1, int_theta2,int_theta3)
p = sigmoid(zf)

gradient_theta1 = gradient_c_theta1(z1, zf, y)

  # Remove the CWD from sys.path while we load stuff.


In [111]:
gradient_theta1.shape

(1, 10)

In [None]:
trainrate=np.random.uniform(0.01,0.01,1)
z1 = linear_output_z1(X_1, int_w1)
z2 = logistic_output_z2(X_1, int_w2)
zf = logistic_output(X_1, z1, z2, int_theta1, int_theta2, int_theta3)

In [None]:
zf