In [1]:
import pandas as pd
import numpy as np

In [2]:
def one_hot(y_train):
    ceal=np.max(y_train)+1
    temp=np.zeros([ceal,y_train.shape[1]])
    for i in range(y_train.shape[1]):
        k=y_train[0][i]                    # k Just another temp variable
        temp[k][i]=1
    return temp

In [3]:
def softmax(z):
    return(np.exp(z)/(np.sum(np.exp(z),axis=0))) # axis=0 sum column wise

In [4]:
def relu(z):
    return(np.maximum(z,0.01*z))

In [5]:
def tanh(z):
    return((np.exp(z)-np.exp(-z))/(np.exp(z)+np.exp(-z)))

In [15]:
def model(no_hd_l, hd_unit, input_size, output_size, initialization=None):
    ''' Initialize weights and bias according to type of initialization and no of hidden layers 
        defined by user
        no_hd_l        -- no of hiddel layers
        hd_unit        -- list no of units(neuron) in each layer
        input_size     -- size of input or the layer 0
        output_size    -- size of output layer(last layer) or the number of elements to classify
        initialization -- type of initialization i.e. None, He, Xavier
        '''
    if(len(hd_unit)!=no_hd_l):
        raise ValueError('Size of hidden unit list != No of hidden layer')
        return
    hd_unit=[input_size]+hd_unit
    hd_unit=hd_unit+[output_size]
    parameters=dict()
    
    if(initialization is None):
        for i in range(1,no_hd_l+2):
            parameters['W'+str(i)]=np.random.randn(hd_unit[i],hd_unit[i-1])
            parameters['b'+str(i)]=np.zeros([hd_unit[i],1])
        
    if(initialization is 'He'):
        for i in range(1,no_hd_l+2):
            parameters['W'+str(i)]=np.random.randn(hd_unit[i],hd_unit[i-1])*np.sqrt(2/hd_unit[i-1])
            parameters['b'+str(i)]=np.zeros([hd_unit[i],1])
        
    if(initialization is 'Xavier'):
        for i in range(1,no_hd_l+2):
            parameters['W'+str(i)]=np.random.randn(hd_unit[i],hd_unit[i-1])*np.sqrt(1/hd_unit[i-1])
            parameters['b'+str(i)]=np.zeros([hd_unit[i],1])
    return(parameters)

In [27]:
def feed_forward(parameters, X_train, no_hd_l, activation):
    ''' Forward Propogation
        parameters -- returned by model function
        X_train    -- training data
        no_hd_l    -- no of hidden layer
        activation -- activation for hidden layers i.e. sigmoid or tanh or relu
    '''
    cache1=dict()
    if(activation is 'relu'):
        cache1['Z1']=np.matmul(parameters['W1'],X_train)+parameters['b1']
        cache1['A1']=relu(cache1['Z1'])
        for i in range(2,no_hd_l+1):
            cache1['Z'+str(i)]=np.matmul(parameters['W'+str(i)],cache1['A'+str(i-1)])+parameters['b'+str(i)]
            cache1['A'+str(i)]=relu(cache1['Z'+str(i)])
    
    if(activation is 'sigmoid'):
        cache1['Z1']=np.matmul(parameters['W1'],X_train)+parameters['b1']
        cache1['A1']=sigmoid(cache1['Z1'])
        for i in range(2,no_hd_l+1):
            cache1['Z'+str(i)]=np.matmul(parameters['W'+str(i)],cache1['A'+str(i-1)])+parameters['b'+str(i)]
            cache1['A'+str(i)]=sigmoid(cache1['Z'+str(i)])
    
    if(activation is 'tanh'):
        cache1['Z1']=np.matmul(parameters['W1'],X_train)+parameters['b1']
        cache1['A1']=tanh(cache1['Z1'])
        for i in range(2,no_hd_l+1):
            cache1['Z'+str(i)]=np.matmul(parameters['W'+str(i)],cache1['A'+str(i-1)])+parameters['b'+str(i)]
            cache1['A'+str(i)]=tanh(cache1['Z'+str(i)])
        
    cache1['Z'+str(no_hd_l+1)]=np.matmul(parameters['W'+str(no_hd_l+1)],cache1['A'+str(no_hd_l+1-1)])+parameters['b'+str(no_hd_l+1)]
    cache1['A'+str(no_hd_l+1)]=softmax(cache1['Z'+str(no_hd_l+1)])
    return(cache1)

In [217]:
def cost(A,y,m):
    cost=-np.sum(y*np.log(A))/m
    return(cost)

In [223]:
def back_prop(parameters, cache1, X_train, y_train, no_hd_l, activation, learning_rate):
    ''' Backward Propogation '''
    cache2={}
    cache2['dZ'+str(no_hd_l+1)]=cache1['A'+str(no_hd_l+1)]-y_train
    cache2['dW'+str(no_hd_l+1)]=np.matmul(cache2['dZ'+str(no_hd_l+1)],cache1['A'+str(no_hd_l)].T)
    cache2['db'+str(no_hd_l+1)]=cache2['dZ'+str(no_hd_l+1)]
    i=no_hd_l
    if(activation is 'relu'):
        while(i>1):
            cache2['dZ'+str(i)]=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])
            cache2['dW'+str(i)]=np.matmul(cache2['dZ'+str(i)],cache1['A'+str(i-1)].T)
            cache2['db'+str(i)]=cache2['dZ'+str(i)]
            print(cache2['dW'+str(i)])
            i=i-1
        cache2['dZ1']=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])
        cache2['dW1']=np.matmul(cache2['dZ'+str(i)],X_train.T)
        cache2['db1']=cache2['dZ'+str(i)]
        print(cache2['dW1'])
    if(activation is 'sigmoid'):
        while(i>1):
            cache2['dZ'+str(i)]=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])
            cache2['dW'+str(i)]=np.matmul(cache2['dZ'+str(i)],cache1['A'+str(i-1)].T)
            cache2['db'+str(i)]=cache2['dZ'+str(i)]
            i=i-1
        cache2['dZ1']=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])
        cache2['dW1']=np.matmul(cache2['dZ'+str(i)],X_train.T)
        cache2['db1']=cache2['dZ'+str(i)]
    if(activation is 'tanh'):
        while(i>1):
            cache2['dZ'+str(i)]=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])
            cache2['dW'+str(i)]=np.matmul(cache2['dZ'+str(i)],cache1['A'+str(i-1)].T)
            cache2['db'+str(i)]=cache2['dZ'+str(i)]
            i=i-1
        cache2['dZ1']=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])
        cache2['dW1']=np.matmul(cache2['dZ'+str(i)],X_train.T)
        cache2['db1']=cache2['dZ'+str(i)]
        
    for i in range(1,no_hd_l+2):
        parameters["W"+str(i)]=parameters["W"+str(i)]-(learning_rate*cache2["dW"+str(i)])
        parameters["b"+str(i)]=parameters["b"+str(i)]-(learning_rate*cache2["db"+str(i)])
    return(parameters)

In [12]:
df=pd.read_csv("MNIST dataset/train.csv")

In [13]:
X_train=np.array(df.iloc[:,1:]).T
y_train=np.array(df.iloc[:,0])
y_train=y_train.reshape(1,y_train.shape[0])
y_train=one_hot(y_train) #one hot
del df

In [179]:
variance=np.var(X_train,axis=0)
X_train=X_train/variance

In [208]:
parameters=model(no_hd_l=2,hd_unit=[100,50],input_size=X_train.shape[0],output_size=10,initialization='Xavier')

In [188]:
print(parameters['W1'].shape,parameters['W2'].shape,parameters['W3'].shape)
print(parameters['b1'].shape,parameters['b2'].shape,parameters['b3'].shape)

(100, 784) (50, 100) (10, 50)
(100, 1) (50, 1) (10, 1)


In [224]:
cache1=feed_forward(parameters, X_train, 2, 'relu')

  
  


In [202]:
cache1['A1'].shape

(100, 42000)

In [210]:
cache1['A'+str(2+1)]-y_train

array([[ 0.09761707, -0.89583495,  0.10295436, ...,  0.09607144,
         0.09015782,  0.09748485],
       [-0.90248465,  0.15552058, -0.90025618, ...,  0.11377213,
         0.10697278,  0.08902587],
       [ 0.09794075,  0.071321  ,  0.124615  , ...,  0.12276843,
         0.12083721,  0.09718971],
       ..., 
       [ 0.09433232,  0.10875992,  0.08703732, ..., -0.90095795,
         0.09816657,  0.11126737],
       [ 0.09916331,  0.07892783,  0.07898304, ...,  0.06887466,
         0.08596704,  0.08906917],
       [ 0.1086009 ,  0.10826972,  0.1052852 , ...,  0.09333673,
         0.09132574, -0.90457609]])

In [211]:
np.max(cache1['A3'],axis=0)

array([ 0.1086009 ,  0.15552058,  0.124615  , ...,  0.12276843,
        0.12083721,  0.11362216])

In [222]:
cost(cache1['A'+str(3)],y_train,42000)

  
  


nan

In [219]:
parameters=back_prop(parameters, cache1, X_train, y_train, 2, 'relu', 0.02)

[[  9.41325386e-05   3.89644348e-05   2.47545432e-04 ...,  -3.01905767e-06
    7.41618089e-05   3.35246847e-05]
 [  1.36634987e-04   5.90202384e-05   3.72366392e-04 ...,  -4.50011952e-06
    1.01956085e-04   5.03466763e-05]
 [  4.49374890e-05   1.92200834e-05   1.10373256e-04 ...,  -1.61419821e-06
    3.10709421e-05   1.62680470e-05]
 ..., 
 [  5.00119856e-04   2.12889721e-04   1.29194110e-03 ...,  -1.72657460e-05
    3.73616516e-04   1.79905999e-04]
 [  1.12159001e-04   4.70710973e-05   3.03351076e-04 ...,  -3.56791286e-06
    8.43443920e-05   4.17232936e-05]
 [  6.83041212e-05   2.87278556e-05   1.66548320e-04 ...,  -2.44073118e-06
    5.04824461e-05   2.43346567e-05]]
[[-0. -0. -0. ..., -0. -0. -0.]
 [-0. -0. -0. ..., -0. -0. -0.]
 [-0. -0. -0. ..., -0. -0. -0.]
 ..., 
 [-0. -0. -0. ..., -0. -0. -0.]
 [-0. -0. -0. ..., -0. -0. -0.]
 [-0. -0. -0. ..., -0. -0. -0.]]
