In [2]:
import pandas as pd
import numpy as np

In [3]:
def one_hot(y_train):
    max_y=np.max(y_train)+1
    temp=np.zeros([max_y,y_train.shape[1]])
    for i in range(y_train.shape[1]):
        k=y_train[0][i]                    # k Just another temp variable
        temp[k][i]=1
    return temp

In [57]:
def softmax(z):
    z_norm=np.exp(z-np.max(z,axis=0))
    return(np.divide(z_norm,np.sum(z_norm,axis=0)))

In [5]:
def relu(z):
    return(np.maximum(z,0.0))

In [6]:
def sigmoid(z):
    return(1.0/(1.0+np.exp(-z)))

In [7]:
def tanh(z):
    return(np.tanh(Z))

In [8]:
def model(no_hd_l, hd_unit, input_size, output_size, initialization=None):
    ''' Initialize weights and bias according to type of initialization and no of hidden layers 
        defined by user
        no_hd_l        -- no of hiddel layers
        hd_unit        -- list no of units(neuron) in each layer
        input_size     -- size of input or the layer 0
        output_size    -- size of output layer(last layer) or the number of elements to classify
        initialization -- type of initialization i.e. None, He, Xavier
        '''
    
    if(len(hd_unit)!=no_hd_l):
        raise ValueError('Size of hidden unit list != No of hidden layer')
        return
    
    hd_unit=[input_size]+hd_unit
    hd_unit=hd_unit+[output_size]
    parameters=dict()
    
    if(initialization is None):
        for i in range(1,no_hd_l+2):
            parameters['W'+str(i)]=np.random.randn(hd_unit[i],hd_unit[i-1])
            parameters['b'+str(i)]=np.zeros((hd_unit[i],1))
        
    if(initialization is 'He'):
        for i in range(1,no_hd_l+2):
            parameters['W'+str(i)]=np.random.randn(hd_unit[i],hd_unit[i-1])*np.sqrt(2/hd_unit[i-1])
            parameters['b'+str(i)]=np.zeros((hd_unit[i],1))
        
    if(initialization is 'Xavier'):
        for i in range(1,no_hd_l+2):
            parameters['W'+str(i)]=np.random.randn(hd_unit[i],hd_unit[i-1])*np.sqrt(1/hd_unit[i-1])
            parameters['b'+str(i)]=np.zeros((hd_unit[i],1))
    return(parameters)

In [9]:
def feed_forward(parameters, X_train, no_hd_l, activation):
    ''' Forward Propogation
        parameters -- returned by model function
        X_train    -- training data
        no_hd_l    -- no of hidden layer
        activation -- activation for hidden layers i.e. sigmoid or tanh or relu
    '''
    cache1=dict()
    if(activation is 'relu'):
        cache1['Z1']=np.matmul(parameters['W1'],X_train)+parameters['b1']
        cache1['A1']=relu(cache1['Z1'])
        for i in range(2,no_hd_l+1):
            cache1['Z'+str(i)]=np.matmul(parameters['W'+str(i)],cache1['A'+str(i-1)])+parameters['b'+str(i)]
            cache1['A'+str(i)]=relu(cache1['Z'+str(i)])
    
    if(activation is 'sigmoid'):
        cache1['Z1']=np.matmul(parameters['W1'],X_train)+parameters['b1']
        cache1['A1']=sigmoid(cache1['Z1'])
        for i in range(2,no_hd_l+1):
            cache1['Z'+str(i)]=np.matmul(parameters['W'+str(i)],cache1['A'+str(i-1)])+parameters['b'+str(i)]
            cache1['A'+str(i)]=sigmoid(cache1['Z'+str(i)])
    
    if(activation is 'tanh'):
        cache1['Z1']=np.matmul(parameters['W1'],X_train)+parameters['b1']
        cache1['A1']=tanh(cache1['Z1'])
        for i in range(2,no_hd_l+1):
            cache1['Z'+str(i)]=np.matmul(parameters['W'+str(i)],cache1['A'+str(i-1)])+parameters['b'+str(i)]
            cache1['A'+str(i)]=tanh(cache1['Z'+str(i)])
    
    cache1['Z'+str(no_hd_l+1)]=np.matmul(parameters['W'+str(no_hd_l+1)],cache1['A'+str(no_hd_l)])+parameters['b'+str(no_hd_l+1)]
    cache1['A'+str(no_hd_l+1)]=softmax(cache1['Z'+str(no_hd_l+1)])
    return(cache1)

In [11]:
def cost(A,y,m):
    #A = np.clip(A, 1e-10, 1 - 1e-10)
    
    cost= -np.sum(y*np.log(A))/m
    return(cost)

In [58]:
def cost(Z,y,m):
    z_norm=Z-np.max(Z,axis=0)
    cost=-np.sum( y * (z_norm-np.log(np.sum(np.exp(z_norm),axis=0))))
    return(cost/m)

In [59]:
def back_prop(parameters, cache1, X_train, y_train, no_hd_l, activation, learning_rate, m):
    ''' Backward Propogation 
        parameters     -- returned by model
        cache1         -- returned by feed_forward
        X_train        -- training data
        y_train        -- labels of training data one-hot encoded
        no_hd_l        -- scalar, no of hidden layers
        activation     -- String, type of activation function used in hidden layers i.e. relu or sigmoid or tanh
        learning_rate  -- Scalar, learning rate
    '''
    
    cache2={}
    cache2['dZ'+str(no_hd_l+1)]=(cache1['A'+str(no_hd_l+1)]-y_train)
    cache2['dW'+str(no_hd_l+1)]=np.matmul(cache2['dZ'+str(no_hd_l+1)],cache1['A'+str(no_hd_l)].T)
    cache2['db'+str(no_hd_l+1)]=cache2['dZ'+str(no_hd_l+1)]
    i=no_hd_l
    
    if(activation is 'relu'):
        while(i>1):
            cache2['dZ'+str(i)]=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])
            cache2['dW'+str(i)]=np.matmul(cache2['dZ'+str(i)],cache1['A'+str(i-1)].T)
            cache2['db'+str(i)]=cache2['dZ'+str(i)]
            i=i-1
        cache2['dZ1']=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])
        cache2['dW1']=np.matmul(cache2['dZ'+str(i)],X_train.T)
        cache2['db1']=cache2['dZ'+str(i)]
        
    if(activation is 'sigmoid'):
        while(i>1):
            cache2['dZ'+str(i)]=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])*(cache1['A'+str(i)]*np.square(cache1['A'+str(i)]))
            cache2['dW'+str(i)]=np.matmul(cache2['dZ'+str(i)],cache1['A'+str(i-1)].T)
            cache2['db'+str(i)]=cache2['dZ'+str(i)]
            i=i-1
        cache2['dZ1']=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])*(cache1['A1']*np.square(cache1['A1']))
        cache2['dW1']=np.matmul(cache2['dZ'+str(i)],X_train.T)
        cache2['db1']=cache2['dZ'+str(i)]
        
    if(activation is 'tanh'):
        while(i>1):
            cache2['dZ'+str(i)]=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])*(1-np.square(cache1['A'+str(i)]))
            cache2['dW'+str(i)]=np.matmul(cache2['dZ'+str(i)],cache1['A'+str(i-1)].T)
            cache2['db'+str(i)]=cache2['dZ'+str(i)]
            i=i-1
        cache2['dZ1']=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])*(1-np.square(cache1['A1']))
        cache2['dW1']=np.matmul(cache2['dZ'+str(i)],X_train.T)
        cache2['db1']=cache2['dZ'+str(i)]
        
    for i in range(1,no_hd_l+2):
        parameters["W"+str(i)]=parameters["W"+str(i)]-(learning_rate*cache2["dW"+str(i)])
        parameters["b"+str(i)]=parameters["b"+str(i)]-(learning_rate*cache2["db"+str(i)])
    return(parameters)

In [60]:
def data():
    df=pd.read_csv("MNIST dataset/train.csv")
    X_train=np.array(df.iloc[:,1:]).T
    y_train=np.array(df.iloc[:,0])
    y_train=y_train.reshape(1,y_train.shape[0])
    y_train=one_hot(y_train) #one hot
    del df
    X_train=X_train/np.max(X_train)
    m=X_train.shape[1]
    return((X_train, y_train, m))

In [61]:
def train(no_hd_l, hd_unit, initialization, activation, learning_rate):
    # Data
    X_train, y_train, m = data()
    print('No of training example {}'.format(m))
    print('X_train(data) {}'.format(X_train.shape))
    print('y_train(labels) {}'.format(y_train.shape))
    print('max and min value of data {} {}'.format(np.max(X_train),np.min(X_train)))
    print()
    
    # Initialize Parameters
    parameters=model(no_hd_l, hd_unit, input_size=X_train.shape[0],output_size=10,initialization=None)
    
    for i in range(20):
        cache1=feed_forward(parameters, X_train, no_hd_l, activation)
        print(cost(cache1['A'+str(no_hd_l+1)], y_train, m))
        back_prop(parameters, cache1, X_train, y_train, no_hd_l, activation, learning_rate,m)
    return(parameters)

In [62]:
parameters=train(2, [100,50], 'He', 'relu', 0.01)

No of training example 42000
X_train(data) (784, 42000)
y_train(labels) (10, 42000)
max and min value of data 1.0 0.0

2.3648734966007883
2.317200836112716
2.3595396085666525
2.3563644574487608
2.370793028877332
2.361697790782094
2.3563644574487608
2.36441207649638
2.3496263622106652
2.3025831760948505
2.3563644574487608


  


nan
nan


KeyboardInterrupt: 

In [28]:
df=pd.read_csv("MNIST dataset/train.csv")
X_train=np.array(df.iloc[:,1:]).T
y_train=np.array(df.iloc[:,0])
y_train=y_train.reshape(1,y_train.shape[0])
y_train=one_hot(y_train) #one hot
del df
X_train=X_train/np.max(X_train)
m=X_train.shape[1]

In [26]:
print(parameters['W1'])

[[ 1.78587222  1.00106171 -0.00382787 ... -0.28839754  0.73577846
  -0.60379733]
 [ 1.39638372  0.80782387  1.44532641 ... -0.08066657 -1.24575505
   0.5385621 ]
 [ 0.57501008 -0.40044002  0.32160629 ...  1.34828102 -0.05182009
  -1.2309356 ]
 ...
 [ 1.59618568  1.25163257 -0.86411736 ... -0.64297419  0.90103906
  -0.05913399]
 [-1.61804996 -0.09143681 -0.5021722  ...  0.44455058 -2.04691807
  -0.86930639]
 [ 0.040396    1.71215624 -0.24029863 ...  1.24237587 -1.97027679
   0.15973742]]


In [49]:
cache=feed_forward(parameters,X_train[:,0].reshape(784,1),no_hd_l=2,activation='relu')

In [50]:
cache['A'+str(2+1)]

array([[0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [40]:
np.argmax(cache['A'+str(2+1)][:,0])

3

In [42]:
np.argmax(y_train[:,4])

0