In [2]:
import pandas as pd
import numpy as np

In [3]:
def one_hot(y_train):
    ceal=np.max(y_train)+1
    temp=np.zeros([ceal,y_train.shape[1]])
    for i in range(y_train.shape[1]):
        k=y_train[0][i]                    # k Just another temp variable
        temp[k][i]=1
    return temp

In [4]:
def softmax(z):
    return(np.exp(z)/(np.sum(np.exp(z),axis=0))) # axis=0 sum column wise

In [5]:
def relu(z):
    return(np.maximum(z,0.01*z))

In [15]:
def sigmoid(z):
    return(1/(1+np.exp(-z)))

In [6]:
def tanh(z):
    return((np.exp(z)-np.exp(-z))/(np.exp(z)+np.exp(-z)))

In [7]:
def model(no_hd_l, hd_unit, input_size, output_size, initialization=None):
    ''' Initialize weights and bias according to type of initialization and no of hidden layers 
        defined by user
        no_hd_l        -- no of hiddel layers
        hd_unit        -- list no of units(neuron) in each layer
        input_size     -- size of input or the layer 0
        output_size    -- size of output layer(last layer) or the number of elements to classify
        initialization -- type of initialization i.e. None, He, Xavier
        '''
    if(len(hd_unit)!=no_hd_l):
        raise ValueError('Size of hidden unit list != No of hidden layer')
        return
    hd_unit=[input_size]+hd_unit
    hd_unit=hd_unit+[output_size]
    parameters=dict()
    
    if(initialization is None):
        for i in range(1,no_hd_l+2):
            parameters['W'+str(i)]=np.random.randn(hd_unit[i],hd_unit[i-1])
            parameters['b'+str(i)]=np.zeros((hd_unit[i],1))
        
    if(initialization is 'He'):
        for i in range(1,no_hd_l+2):
            parameters['W'+str(i)]=np.random.randn(hd_unit[i],hd_unit[i-1])*np.sqrt(2/hd_unit[i-1])
            parameters['b'+str(i)]=np.zeros((hd_unit[i],1))
        
    if(initialization is 'Xavier'):
        for i in range(1,no_hd_l+2):
            parameters['W'+str(i)]=np.random.randn(hd_unit[i],hd_unit[i-1])*np.sqrt(1/hd_unit[i-1])
            parameters['b'+str(i)]=np.zeros((hd_unit[i],1))
    return(parameters)

In [8]:
def feed_forward(parameters, X_train, no_hd_l, activation):
    ''' Forward Propogation
        parameters -- returned by model function
        X_train    -- training data
        no_hd_l    -- no of hidden layer
        activation -- activation for hidden layers i.e. sigmoid or tanh or relu
    '''
    cache1=dict()
    if(activation is 'relu'):
        cache1['Z1']=np.matmul(parameters['W1'],X_train)+parameters['b1']
        cache1['A1']=relu(cache1['Z1'])
        for i in range(2,no_hd_l+1):
            cache1['Z'+str(i)]=np.matmul(parameters['W'+str(i)],cache1['A'+str(i-1)])+parameters['b'+str(i)]
            cache1['A'+str(i)]=relu(cache1['Z'+str(i)])
    
    if(activation is 'sigmoid'):
        cache1['Z1']=np.matmul(parameters['W1'],X_train)+parameters['b1']
        cache1['A1']=sigmoid(cache1['Z1'])
        for i in range(2,no_hd_l+1):
            cache1['Z'+str(i)]=np.matmul(parameters['W'+str(i)],cache1['A'+str(i-1)])+parameters['b'+str(i)]
            cache1['A'+str(i)]=sigmoid(cache1['Z'+str(i)])
    
    if(activation is 'tanh'):
        cache1['Z1']=np.matmul(parameters['W1'],X_train)+parameters['b1']
        cache1['A1']=tanh(cache1['Z1'])
        for i in range(2,no_hd_l+1):
            cache1['Z'+str(i)]=np.matmul(parameters['W'+str(i)],cache1['A'+str(i-1)])+parameters['b'+str(i)]
            cache1['A'+str(i)]=tanh(cache1['Z'+str(i)])
        
    cache1['Z'+str(no_hd_l+1)]=np.matmul(parameters['W'+str(no_hd_l+1)],cache1['A'+str(no_hd_l+1-1)])+parameters['b'+str(no_hd_l+1)]
    cache1['A'+str(no_hd_l+1)]=softmax(cache1['Z'+str(no_hd_l+1)])
    return(cache1)

In [9]:
def cost(A,y,m):
    cost=-np.sum(y*np.log(A)+(1-y)*np.log(1-A))/m
    return(cost)

In [17]:
def back_prop(parameters, cache1, X_train, y_train, no_hd_l, activation, learning_rate):
    ''' Backward Propogation 
        parameters     -- returned by model
        cache1         -- returned by feed_forward
        X_train        -- training data
        y_train        -- labels of training data one-hot encoded
        no_hd_l        -- scalar, no of hidden layers
        activation     -- String, type of activation function used in hidden layers i.e. relu or sigmoid or tanh
        learning_rate  -- Scalar, learning rate
    '''
    
    cache2={}
    cache2['dZ'+str(no_hd_l+1)]=cache1['A'+str(no_hd_l+1)]-y_train
    cache2['dW'+str(no_hd_l+1)]=np.matmul(cache2['dZ'+str(no_hd_l+1)],cache1['A'+str(no_hd_l)].T)
    cache2['db'+str(no_hd_l+1)]=cache2['dZ'+str(no_hd_l+1)]
    i=no_hd_l
    
    if(activation is 'relu'):
        while(i>1):
            cache2['dZ'+str(i)]=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])
            cache2['dW'+str(i)]=np.matmul(cache2['dZ'+str(i)],cache1['A'+str(i-1)].T)
            cache2['db'+str(i)]=cache2['dZ'+str(i)]
            print(cache2['dW'+str(i)])
            i=i-1
        cache2['dZ1']=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])
        cache2['dW1']=np.matmul(cache2['dZ'+str(i)],X_train.T)
        cache2['db1']=cache2['dZ'+str(i)]
        print(cache2['dW1'])
        
    if(activation is 'sigmoid'):
        while(i>1):
            cache2['dZ'+str(i)]=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])*(cache1['A'+str(i)]*np.square(cache1['A'+str(i)]))
            cache2['dW'+str(i)]=np.matmul(cache2['dZ'+str(i)],cache1['A'+str(i-1)].T)
            cache2['db'+str(i)]=cache2['dZ'+str(i)]
            i=i-1
        cache2['dZ1']=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])*(cache1['A1']*np.square(cache1['A1']))
        cache2['dW1']=np.matmul(cache2['dZ'+str(i)],X_train.T)
        cache2['db1']=cache2['dZ'+str(i)]
        
    if(activation is 'tanh'):
        while(i>1):
            cache2['dZ'+str(i)]=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])*(1-np.square(cache1['A'+str(i)]))
            cache2['dW'+str(i)]=np.matmul(cache2['dZ'+str(i)],cache1['A'+str(i-1)].T)
            cache2['db'+str(i)]=cache2['dZ'+str(i)]
            i=i-1
        cache2['dZ1']=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])*(1-np.square(cache1['A1']))
        cache2['dW1']=np.matmul(cache2['dZ'+str(i)],X_train.T)
        cache2['db1']=cache2['dZ'+str(i)]
        
    for i in range(1,no_hd_l+2):
        parameters["W"+str(i)]=parameters["W"+str(i)]-(learning_rate*cache2["dW"+str(i)])
        parameters["b"+str(i)]=parameters["b"+str(i)]-(learning_rate*cache2["db"+str(i)])
    return(parameters)

In [18]:
def data():
    df=pd.read_csv("MNIST dataset/train.csv")
    X_train=np.array(df.iloc[:,1:]).T
    y_train=np.array(df.iloc[:,0])
    y_train=y_train.reshape(1,y_train.shape[0])
    y_train=one_hot(y_train) #one hot
    del df
    variance=np.var(X_train,axis=0)
    X_train=X_train/variance
    m=X_train.shape[1]
    return((X_train, y_train, m))

In [19]:
def train(no_hd_l, hd_unit, initialization, activation, learning_rate):
    X_train, y_train, m = data()
    parameters=model(no_hd_l, hd_unit, input_size=X_train.shape[0],output_size=10,initialization='Xavier')
    
    for i in range(10):
        cache1=feed_forward(parameters, X_train, no_hd_l, activation)
        print(cost(cache1['A'+str(no_hd_l+1)], y_train, m))
        back_prop(parameters, cache1, X_train, y_train, no_hd_l, activation, learning_rate)

In [20]:
train(2, [100,50], 'He', 'tanh', 0.01)

3.25053258888
3.26072413599


  
  


nan


  
  


nan
nan
nan
nan
nan
nan
nan
