In [1]:
import pandas as pd
import numpy as np

In [2]:
def one_hot(y_train):
    max_y=np.max(y_train)+1
    temp=np.zeros([max_y,y_train.shape[1]])
    for i in range(y_train.shape[1]):
        k=y_train[0][i]                    # k Just another temp variable
        temp[k][i]=1
    return temp

In [3]:
def softmax(z):
    z_norm=np.exp(z-np.max(z,axis=0))
    return(np.divide(z_norm,np.sum(z_norm,axis=0)))

In [4]:
def relu(z):
    return(np.maximum(z,0.0))

In [5]:
def sigmoid(z):
    return(1.0/(1.0+np.exp(-z)))

In [6]:
def tanh(z):
    return(np.tanh(Z))

In [7]:
def model(no_hd_l, hd_unit, input_size, output_size, initialization=None):
    ''' Initialize weights and bias according to type of initialization and no of hidden layers 
        defined by user
        no_hd_l        -- no of hiddel layers
        hd_unit        -- list no of units(neuron) in each layer
        input_size     -- size of input or the layer 0
        output_size    -- size of output layer(last layer) or the number of elements to classify
        initialization -- type of initialization i.e. None, He, Xavier
        '''
    
    if(len(hd_unit)!=no_hd_l):
        raise ValueError('Size of hidden unit list != No of hidden layer')
        return
    
    hd_unit=[input_size]+hd_unit
    hd_unit=hd_unit+[output_size]
    parameters=dict()
    
    if(initialization is None):
        for i in range(1,no_hd_l+2):
            parameters['W'+str(i)]=np.random.randn(hd_unit[i],hd_unit[i-1])
            parameters['b'+str(i)]=np.zeros((hd_unit[i],1))
        
    if(initialization is 'He'):
        for i in range(1,no_hd_l+2):
            parameters['W'+str(i)]=np.random.randn(hd_unit[i],hd_unit[i-1])*np.sqrt(2/hd_unit[i-1])
            parameters['b'+str(i)]=np.zeros((hd_unit[i],1))
        
    if(initialization is 'Xavier'):
        for i in range(1,no_hd_l+2):
            parameters['W'+str(i)]=np.random.randn(hd_unit[i],hd_unit[i-1])*np.sqrt(1/hd_unit[i-1])
            parameters['b'+str(i)]=np.zeros((hd_unit[i],1))
    return(parameters)

In [8]:
def feed_forward(parameters, X_train, no_hd_l, activation):
    ''' Forward Propogation
        parameters -- returned by model function
        X_train    -- training data
        no_hd_l    -- no of hidden layer
        activation -- activation for hidden layers i.e. sigmoid or tanh or relu
    '''
    cache1=dict()
    if(activation is 'relu'):
        cache1['Z1']=np.matmul(parameters['W1'],X_train)+parameters['b1']
        cache1['A1']=relu(cache1['Z1'])
        for i in range(2,no_hd_l+1):
            cache1['Z'+str(i)]=np.matmul(parameters['W'+str(i)],cache1['A'+str(i-1)])+parameters['b'+str(i)]
            cache1['A'+str(i)]=relu(cache1['Z'+str(i)])
    
    if(activation is 'sigmoid'):
        cache1['Z1']=np.matmul(parameters['W1'],X_train)+parameters['b1']
        cache1['A1']=sigmoid(cache1['Z1'])
        for i in range(2,no_hd_l+1):
            cache1['Z'+str(i)]=np.matmul(parameters['W'+str(i)],cache1['A'+str(i-1)])+parameters['b'+str(i)]
            cache1['A'+str(i)]=sigmoid(cache1['Z'+str(i)])
    
    if(activation is 'tanh'):
        cache1['Z1']=np.matmul(parameters['W1'],X_train)+parameters['b1']
        cache1['A1']=tanh(cache1['Z1'])
        for i in range(2,no_hd_l+1):
            cache1['Z'+str(i)]=np.matmul(parameters['W'+str(i)],cache1['A'+str(i-1)])+parameters['b'+str(i)]
            cache1['A'+str(i)]=tanh(cache1['Z'+str(i)])
    
    cache1['Z'+str(no_hd_l+1)]=np.matmul(parameters['W'+str(no_hd_l+1)],cache1['A'+str(no_hd_l)])+parameters['b'+str(no_hd_l+1)]
    cache1['A'+str(no_hd_l+1)]=softmax(cache1['Z'+str(no_hd_l+1)])
    return(cache1)

In [9]:
def cost(A,y,m):
    cost=-np.divide(np.sum(y*np.log(A)),m)
    return(cost)

In [10]:
def back_prop(parameters, cache1, X_train, y_train, no_hd_l, activation, learning_rate,m):
    ''' Backward Propogation 
        parameters     -- returned by model
        cache1         -- returned by feed_forward
        X_train        -- training data
        y_train        -- labels of training data one-hot encoded
        no_hd_l        -- scalar, no of hidden layers
        activation     -- String, type of activation function used in hidden layers i.e. relu or sigmoid or tanh
        learning_rate  -- Scalar, learning rate
    '''
    
    cache2={}
    cache2['dZ'+str(no_hd_l+1)]=(cache1['A'+str(no_hd_l+1)]-y_train)/m
    cache2['dW'+str(no_hd_l+1)]=np.matmul(cache2['dZ'+str(no_hd_l+1)],cache1['A'+str(no_hd_l)].T)
    cache2['db'+str(no_hd_l+1)]=cache2['dZ'+str(no_hd_l+1)]
    i=no_hd_l
    
    if(activation is 'relu'):
        while(i>1):
            cache2['dZ'+str(i)]=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])
            cache2['dW'+str(i)]=np.matmul(cache2['dZ'+str(i)],cache1['A'+str(i-1)].T)
            cache2['db'+str(i)]=cache2['dZ'+str(i)]
            i=i-1
        cache2['dZ1']=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])
        cache2['dW1']=np.matmul(cache2['dZ'+str(i)],X_train.T)
        cache2['db1']=cache2['dZ'+str(i)]
        
    if(activation is 'sigmoid'):
        while(i>1):
            cache2['dZ'+str(i)]=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])*(cache1['A'+str(i)]*np.square(cache1['A'+str(i)]))
            cache2['dW'+str(i)]=np.matmul(cache2['dZ'+str(i)],cache1['A'+str(i-1)].T)
            cache2['db'+str(i)]=cache2['dZ'+str(i)]
            i=i-1
        cache2['dZ1']=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])*(cache1['A1']*np.square(cache1['A1']))
        cache2['dW1']=np.matmul(cache2['dZ'+str(i)],X_train.T)
        cache2['db1']=cache2['dZ'+str(i)]
        
    if(activation is 'tanh'):
        while(i>1):
            cache2['dZ'+str(i)]=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])*(1-np.square(cache1['A'+str(i)]))
            cache2['dW'+str(i)]=np.matmul(cache2['dZ'+str(i)],cache1['A'+str(i-1)].T)
            cache2['db'+str(i)]=cache2['dZ'+str(i)]
            i=i-1
        cache2['dZ1']=np.matmul(parameters['W'+str(i+1)].T,cache2['dZ'+str(i+1)])*(1-np.square(cache1['A1']))
        cache2['dW1']=np.matmul(cache2['dZ'+str(i)],X_train.T)
        cache2['db1']=cache2['dZ'+str(i)]
        
    for i in range(1,no_hd_l+2):
        parameters["W"+str(i)]=parameters["W"+str(i)]-(learning_rate*cache2["dW"+str(i)])
        parameters["b"+str(i)]=parameters["b"+str(i)]-(learning_rate*cache2["db"+str(i)])
    return(parameters)

In [11]:
def data():
    df=pd.read_csv("MNIST dataset/train.csv")
    X_train=np.array(df.iloc[:,1:]).T
    y_train=np.array(df.iloc[:,0])
    y_train=y_train.reshape(1,y_train.shape[0])
    y_train=one_hot(y_train) #one hot
    del df
    X_train=X_train/np.max(X_train)
    m=X_train.shape[1]
    return((X_train, y_train, m))

In [14]:
def train(no_hd_l, hd_unit, initialization, activation, learning_rate):
    # Data
    X_train, y_train, m = data()
    print('No of training example {}'.format(m))
    print('X_train(data) {}'.format(X_train.shape))
    print('y_train(labels) {}'.format(y_train.shape))
    print('max and min value of data {} {}'.format(np.max(X_train),np.min(X_train)))
    print()
    
    # Initialize Parameters
    parameters=model(no_hd_l, hd_unit, input_size=X_train.shape[0],output_size=10,initialization=None)
    
    for i in range(100):
        cache1=feed_forward(parameters, X_train, no_hd_l, activation)
        print(cost(cache1['A'+str(no_hd_l+1)], y_train, m))
        back_prop(parameters, cache1, X_train, y_train, no_hd_l, activation, learning_rate,m)
    return(parameters)

In [21]:
parameters=train(1, [100], 'He', 'sigmoid', 0.1)

No of training example 42000
X_train(data) (784, 42000)
y_train(labels) (10, 42000)
max and min value of data 1.0 0.0

10.295497664462623
8.235311397084715
7.287615733982725
6.674276804673139
6.190786346075487
5.776629217852684
5.4197347946621175
5.116268032199992
4.859435761832552
4.639664706951085
4.447609284279327
4.27575748227389
4.119125563413873
3.97478162717333
3.84097768755411
3.7165484158250743
3.600622844607252
3.4924742053823095
3.3914583106641873
3.296998761310077
3.208566110790739
3.125664507841067
3.0478377583188525
2.974673982173611
2.9057996364861283
2.840872587787859
2.779578357891488
2.721628482748409
2.6667612827558327
2.6147411235706413
2.565354457214609
2.5184061519051473
2.4737180814374278
2.4311285973050962
2.390491473623899
2.3516742691715637
2.3145565584342713
2.279028699668285
2.2449914008331144
2.212355316090883
2.181039901138951
2.1509716868812863
2.1220826197891816
2.0943088379576724
2.067589943236069
2.0418686833452937
2.01709087582955
1.993205418489177
1.

KeyboardInterrupt: 

In [48]:
df=pd.read_csv("MNIST dataset/train.csv")
X_train=np.array(df.iloc[:,1:]).T
y_train=np.array(df.iloc[:,0])
y_train=y_train.reshape(1,y_train.shape[0])
y_train=one_hot(y_train) #one hot
del df
X_train=X_train/np.max(X_train)
m=X_train.shape[1]

In [20]:
parameters['b1'].shape

(100, 42000)

In [51]:
X_train=X_train[:,1]

In [56]:
k=X_train.reshape(784,1)

In [57]:
cache=feed_forward(parameters, k, 1, activation='relu')

In [61]:
cache['A2'][:,1]

array([1.00000000e+000, 1.18092739e-114, 1.45119260e-055, 2.05879221e-048,
       4.66718644e-090, 3.93994330e-052, 4.13887143e-056, 1.51487185e-041,
       3.85347659e-071, 3.09069724e-080])

In [64]:
y_train[:,1]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [29]:
a3.shape

(10, 42000)

In [30]:
np.mean(np.log(a3))

-93.6275855105389

In [35]:
np.sum(np.log(a3))

-936.275855105389

In [37]:
np.min(np.log(a3),axis=0)

array([-320.3742967 , -249.43129893, -200.6275154 , ..., -205.64943907,
       -215.50247646, -130.13198492])

In [20]:
np.log(a3)

array([[-2.13884995e+00, -1.83304510e+02, -1.13283187e+02, ...,
        -5.95625085e+01, -1.30842016e+02, -1.11185455e+02],
       [-5.27925465e+01, -2.81982274e+02, -6.93987315e+01, ...,
        -1.36085294e+02, -1.67524819e+02, -1.04953814e+02],
       [-1.25326877e-01,  0.00000000e+00, -2.36667524e+01, ...,
         0.00000000e+00, -6.49766965e+01,  0.00000000e+00],
       ...,
       [-8.84971322e+01, -2.69095766e+02, -1.27364478e+02, ...,
        -1.85114422e+02, -1.66354761e+02, -1.55430867e+02],
       [-2.01849786e+01, -3.72821305e+02, -1.28451469e+02, ...,
        -2.20443778e+02, -1.62420445e+02, -1.29474062e+02],
       [-9.01705653e+01, -3.26647515e+02, -2.55702711e+01, ...,
        -1.38812370e+02, -1.49923575e+02, -1.38546675e+02]])

In [21]:
print(a3[:,1])

[2.46525952e-080 3.44075877e-123 1.00000000e+000 2.23592645e-107
 4.41951705e-103 4.83279891e-092 9.68402224e-096 1.35891924e-117
 1.21832877e-162 1.37653384e-142]


In [22]:
print(np.sum(a3,axis=0))

[1. 1. 1. ... 1. 1. 1.]


In [276]:
k=np.random.randn(3,3)*10000

In [277]:
m=np.mean(k)

In [278]:
print(k-m)

[[14368.39345186  -794.23021787 -3908.52201022]
 [  357.56779982  4538.40048224 -7576.72308331]
 [ -496.0912204  -5096.47685493 -1392.3183472 ]]
