# Part 1:

Description:
Logistic regression[1] tries to classify by the probability returned by a sigmoid function (the hypothesis 
in this case) as below: 
            hyp(X.W) = hyp(z) = 1/(1 + exp(-z)) 
            where X = vector of input features  
            W = vector of weights for corresponding feature of X 
            z = product of theta and X = W0 + W1.x1+W2.x2+…Wn.xn 
            where x1,x2,x3…xn are feature values  
The algorithm tries to classify one class from another by this probability. For example, one can say 
the prediction belongs to class 1 when the probability is greater than or equal to 0.5 and belongs to 
class 0 when the probability is less than 0.5. 
            The cost function comes out to be: 
            cost = (-log(hyp(z)).y) - log(1-hyp(z)).(1-y)))  
            where y = actual value                          
            hyp(z) = probability given by the sigmoid function

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator

class LogisticRegression(BaseEstimator): # implemented class for the model
   """A class that does performs logistic regression."""
   
   def __init__(self,W=[],b=[],learningFactor = 0.05,iterations=1000): # some parameters can be set
      """Initializes values.Takes in the supplied learningFactor and the iterations"""
      self.learningFactor = learningFactor                             # set the learning factor
      self.iterations = iterations                                     # set the iterations
      self.X = X                                                       # set the X value
      self.y = y                                                       # set the y value
      self.W = W
      self.b = b
      self.v_delW = []
      self.v_delb = []
      self.class_labels = []

   def sigmoid(self,z): # code for the sigmoid function
      """This function returns the sigmoid value of the input z"""
      return (1/(1+np.exp(-z))) 

   

   def costFunctionAndGradient(self,x,y,W_learn,b_learn):
      """This function returns the costFunctionAndGradient"""
      a = self.sigmoid(np.dot(x,W_learn.T) + b_learn)               # calculate the value of hypothesis
      cost = (-np.dot(np.log(a),y.T) - np.dot(np.log(1-a),(1-y).T)) # calculate cost for Stochastic Gradient descent
      gradient_W = np.dot((a-y),x)                                  # calculate the gradient for Weights W
      gradient_b = a-y                                              # calculate the gradient for bias b
      return cost,gradient_W,gradient_b

   def gradientDescent(self,x,y,W_learn,b_learn):                   # code for the Gradient Descent algorithm
      """This function does Gradient Descent and learns the weight values"""
      costValue,gradient_W,gradient_b = self.costFunctionAndGradient(x,y,W_learn,b_learn)
      self.W = self.W - self.learningFactor * gradient_W            # update the weight values in every iteration
      self.b = self.b - self.learningFactor * gradient_b            # update the bias values in every iteration
      return costValue
        
   def fit(self,X,y): # the fit function which takes in the features & output vectors
      """This function helps the model learn from the given labelled examples"""
      self.X = X
      self.y = y
      seed = 1
      rows,columns = X.shape
      self.W = np.random.random(columns) # start from random values for weights
      self.b = np.random.random()
      self.convertLabels(list(set(y)))   # identify all unique labels
      for i in range(self.iterations):
        sample = np.random.randint(len(X))
        self.gradientDescent(X[sample],y[sample],self.W,self.b) # save the learned weight values
   
   def convertLabels(self,labels):
        self.class_labels = { 0 : labels[0] , 1 : labels[1]}

   def predict(self,X): # function to predict class for given feature vector X
      """This function predicts the class for the given features X"""
      x = np.array(X)
      #print(self.b)  
      a = self.sigmoid(np.dot(x,self.W.T) + self.b)
      #print(f"Prediction: {a}")
      if a >= 0.5:
         return self.class_labels[1]            # choose the class which returns highest value/probability for our hypothesis
      return self.class_labels[0]               # choose the other class

   def score(self,X,y):                         # function which returns the accuracy value
      """This function returns the accuracy score for the X values and compares them with output y."""
      count = 0  
      for x,y_ in zip(X,y):
        predictedLabel = self.predict(x)
        if predictedLabel == y_:
            count += 1
      return count/len(y)                       # calculate accuracy


# Part 2:

Moons dataset:
On this dataset, I got a mean test accuracy of 0.8583333333333334 as seen below. From looking at the plot of the data,it is not linearly seperable.


Blobs dataset:
On this dataset, I got a mean test accuracy of 1.0 as seen below. From looking at the plot of the data,it is linearly seperable and can be easily classified into two distinguishable classes. This explains the high test accuracy.

In [39]:
# Check Logistic regression for Moons data
df1 = pd.read_csv('moons400.csv',sep=',')
features = df1.drop(columns=['Class'])
classes = df1['Class']
X = np.array(features)
y = np.array(classes)
accuracyValues = []
parameters = {'learningFactor' : [1e-3,1e-2,5e-3,5e-2],'iterations':[100,500,1000,2000]}    # draw up a list of hyperparameters
#gs = GridSearchCV(estimator=LogisticRegression(),param_grid=parameters)                    # do a grid search
#gs.fit(X,y)                                                                                # fit to the model
#print(gs.best_estimator_)                                                                  # see the best model/parameters
for i in range(10):                                                                     # use 10 models
   X_train,X_test,y_train,y_test = train_test_split(X, y,test_size=0.3,random_state=i) # use 15% of data as the test set
   X_val,X_test,y_val,y_test = train_test_split(X_test, y_test,test_size=0.5,random_state=i)
   lr = LogisticRegression()                                           # create an object
   lr.fit(X_train,y_train)                                                              # train the model
   score = lr.score(X_test,y_test)
   print('The accuracy is ', score)                                                     # get the accuracy/score of the given inputs
   accuracyValues.append(score)
print('The average accuracy is',np.mean(accuracyValues))                                # get the mean accuracy


The accuracy is  0.85
The accuracy is  0.8833333333333333
The accuracy is  0.9333333333333333
The accuracy is  0.7833333333333333
The accuracy is  0.8833333333333333
The accuracy is  0.8833333333333333
The accuracy is  0.9
The accuracy is  0.8
The accuracy is  0.85
The accuracy is  0.8166666666666667
The average accuracy is 0.8583333333333334


In [3]:
# Check Logistic regression for blobs data
df2 = pd.read_csv('blobs250.csv',sep=',')
features = df2.drop(columns=['Class'])
classes = df2['Class']
X = np.array(features)
y = np.array(classes)
accuracyValues = []
parameters = {'learningFactor' : [1e-3,1e-2,5e-3,5e-2],'iterations':[100,500,1000,2000]}    # draw up a list of hyperparameters
#gs = GridSearchCV(estimator=LogisticRegression(),param_grid=parameters)                    # do a grid search
#gs.fit(X,y)                                                                                # fit to the model
#print(gs.best_estimator_)                                                                  # see the best model/parameters
for i in range(10):                                                                         # use 10 models
   X_train,X_test,y_train,y_test = train_test_split(X, y,test_size=0.3,random_state=i) # use 15% of data as the test set
   X_val,X_test,y_val,y_test = train_test_split(X_test, y_test,test_size=0.5,random_state=i)
   lr = LogisticRegression(learningFactor=0.01,iterations=100)                              # create an object
   lr.fit(X_train,y_train)                                                                  # train the model
   score = lr.score(X_test,y_test)
   print('The accuracy is ', score)                                                    # get the accuracy/score of the given inputs
   accuracyValues.append(score)
print('The average accuracy is',np.mean(accuracyValues))                               # get the mean accuracy

The accuracy is  1.0
The accuracy is  1.0
The accuracy is  1.0
The accuracy is  1.0
The accuracy is  1.0
The accuracy is  1.0
The accuracy is  1.0
The accuracy is  1.0
The accuracy is  1.0
The accuracy is  1.0
The average accuracy is 1.0


# Part 3: Implement Shallow NN

Description:
Shallow Neural Networks [2] are Neural Networks with a maximum of two hidden layers. This in contrast to Deep NNs where usually there are more than two hidden layers (and hence the name 'Deep'). For the assignment, I have implemented a Shallow NN with one hidden layer and output layer. The input data points (X) are passed to this hidden layer in a linear combination of weights and bias.For example:<br>
     z1 = 𝑊1,1 * 𝑥1 + 𝑊1,2 * 𝑥2 + 𝑊1,3 * 𝑥3 + 𝑏1 <br>
     where z1 = linear combination that goes into one of the node in the hidden layer <br>
           Wi,j = combination of weights for the xi inputs <br>
           bi = bias <br>
Each node in the hidden layer will then activate this linear combination as: <br>
     a1 = f(𝑊1,1 * 𝑥1 + 𝑊1,2 * 𝑥2 + 𝑊1,3 * 𝑥3 + 𝑏1)                      <br>
     where a1 = value of activation                                         <br>
           f = the activation function                                      <br>
In my case, I have chosen the activation function as sigmoid for all the layers. The model learns in the simple steps as below:
1) We randomly intialise the weight and bias values
2) The forward propagation: For all the layers,we calculate the activation value and pass it on to the next layer.
3) We compare the final output of our network to the actual value and then backpropagate back the errors into the network.This means that we readjust our weights so that they are more closer to the output.
4) We do this until the weights no longer change or till we have minimized the loss function. 

Blobs dataset:
On this dataset, I got a mean test accuracy of 1.0 as seen below.This is similar to the Logistic Regression tests in part 2. From looking at the plot of the data,it is linearly seperable and can be easily classified into two distinguishable classes. This explains the high test accuracy.

Moons dataset:
From looking at the plot of the data,it is not linearly seperable as a whole.On this dataset, I got a mean test accuracy of 0.8716666666666667 as seen below which is better than the Logistic regression. It needed more training than the LR model(Logistic regression) understandably because this model needs sufficient learning to adjust weights and parameters for the nodes in the hidden layers.


In [11]:
# Shallow NN

class ShallowNN(BaseEstimator):
    def __init__(self,alpha=0.05,noOfNodes=2,activation='sigmoid',iterations=1000,epsilon = 1e-06): # some parameters can be set
        self.noOfNodes = noOfNodes                                                  # set the number of nodes in the hidden layer
        self.activation = activation                                                # set the activation
        self.nodesList = []                                                         # initialise a list of nodes
        self.X = []                                                                 # initialise X values               
        self.y = []                                                                 # initialise y values
        self.W = []
        self.iterations = iterations                                                # initialise weight values
        self.b = []                                                                 # initialise bias values
        self.alpha = alpha                                                          # initialise learning factor
        self.class_labels = {}
        self.epsilon = epsilon                                                      # initialise a value for epsilon
    
    def sigmoid(self,z): # code for the sigmoid function
      """This function returns the sigmoid value of the input z"""
      return (1/(1+np.exp(-z)))
    
    def relu(self,z):
      """This function returns the sigmoid value of the input z"""  
      if abs(z) >= 0:
        return z
      else:
        return np.zeros(z)
        
    def backProp(self,x,y,a2,a1,z1,z2):
        del_z2 = a2-y                                                         # del is the partial derivative                                                         
        del_b2 =  del_z2                                                      # calculate del_b2, the partial derivative of bias for the output layer
        del_w2 = del_z2 * a1                                                  # calculate del_w2, the partial derivative of weights in the output layer   
        fz1 = self.sigmoid(z1)                                                # calculate sigmoid of z1, the linear combination from the first layer
        f_z1 = fz1*(1-fz1)                                                    # calculate the derivative of z1
        #print(f_z1)
        #print(np.dot(del_z2,self.W))
        del_z1 = f_z1 * np.dot(del_z2,self.W)                                 # calculate del_z1, the partial derivative for z1 wrt loss
        del_w1 = del_z1 * np.expand_dims(x,axis=1)                            # calculate del_w1, the partial derivative of weights in the first layer   
        del_b1 = del_z1                                                       # calculate del_b1, the partial derivative of bias in the first layer   
        self.W -= self.alpha * del_w2                                         # gradient descent step
        self.b -= self.alpha * del_b2                                         # gradient descent step
        for i in range(self.noOfNodes):                                       # do this for all the nodes in the hidden layer
           node = self.nodesList[i]
           node.W -= self.alpha * del_w1[:,i]                                 # gradient descent step
           node.b -= self.alpha * del_b1[i]                                   # gradient descent step
    
    def forwardProp(self,x):
        a1 = []                                                               # intialise values
        z1 = []                                                               # intialise values
        for i in range(self.noOfNodes):                                       # for all the nodes in the hidden layer
            node = self.nodesList[i]
            z = np.dot(x,node.W.T) + node.b                                   # calculate the linear combination x.W + b
            z1.append(z)                                                      # store the value
            activateNode = self.sigmoid(z)                                    # calculate the activation for the hidden layer
            a1.append(activateNode)                                           # store the activation value
        a1 = np.array(a1)
        z2 = np.dot(self.W,a1) + self.b                                       # calculate the linear combination a1.W + b
        a2 = self.sigmoid(z2)                                                 # calculate the prediction
        return a1,a2,np.array(z1),z2
    
    def loss(self,y,y_):
        return (-np.dot(y,np.log(y_.T+self.epsilon)) - np.dot((1-y),np.log((1-y_+self.epsilon).T)))  #return loss
        
    def fit(self,X,y):
      """This function trains the NN"""
      self.X = X                                                             # save X
      self.convertLabels(list(set(y)))                                       # store distinct values of y and convert them to class numbers
      self.y = self.encodeLabels(y)                                          # encode values of y  
      rows,columns = X.shape
      mu,sigma = 0,0.01                                                      # take mean=0, standard deviation = 0.01
      seed = 1                                                               # initialise seed
      self.W = np.random.normal(mu,sigma,size=self.noOfNodes)                # choose intial weight values
      self.b = np.random.random()                                            # choose intial bias values
      
      for i in range(self.noOfNodes):
          W_node = np.random.normal(mu,sigma,size=columns)                   # start from random values for weights
          b_node = np.random.random()                                        # choose intial bias values
          lr = LogisticRegression()                                          # use the model from part 1
          lr.W = W_node                                                      # save the weights
          lr.b = b_node                                                      # save the bias 
          self.nodesList.append(lr)                                          # store to list 
      for i in range(self.iterations):
          sample = np.random.randint(len(X))                                 # choose a random sample (its SGD)
          a1,a2,z1,z2 = self.forwardProp(X[sample])                          # do forward propagation
          if (i%10000 == 0):                                                 # check the loss value at intervals of 10000  
              lossValue = self.loss(self.y[sample],a2)
              print('loss is:',lossValue)
          self.backProp(X[sample],self.y[sample],a2,a1,z1,z2)                # do back propagation
    
    def convertLabels(self,labels):
        self.class_labels = { 0 : labels[0] , 1 : labels[1]}                 # convert class values to numbers 
    
    def encodeLabels(self,y):
        y_ = []
        for i in range(len(y)):                                              # encode labels as 1 or 0
            if y[i] == self.class_labels[0]:
                y_.append(0)
            else:
                y_.append(1)
        return y_        
    
    def predict(self,X):
        a1,a2,z1,z2 = self.forwardProp(X)                                   # use forward propagation to get the prediction
        #print('prediction is',a2)
        if a2 >= 0.5:
            return self.class_labels[1]                                     # return the highest class
        else:  
            return self.class_labels[0]                                     # return the lowest class
        
    def score(self,X,y): # function which returns the accuracy value
      """This function returns the accuracy score for the X values and compares them with output y."""
      count = 0  
      for x,y_ in zip(X,y):
        predictedLabel = self.predict(x)                                   # get the prediction
        if predictedLabel == y_:                                           # check if it matches actual
            count += 1                                                     # increase the count if True
      return count/len(y)                                                  # calculate accuracy    
        

In [27]:
# Test Moons data for the shallow NN
df1 = pd.read_csv('moons400.csv',sep=',')
features = df1.drop(columns=['Class'])
classes = df1['Class']
X = np.array(features)
y = np.array(classes)
accuracyValues = []
parameters = {'alpha' : [1e-3,1e-2,5e-3,5e-2],'noOfNodes':[2,3,5,7,9],'iterations':[1000,2000,5000,10000]}
#gs = GridSearchCV(estimator=ShallowNN(),param_grid=parameters)
#gs.fit(X,y)
#print(gs.best_estimator_)
for i in range(10): # use 10 iterations as required
     X_train,X_test,y_train,y_test = train_test_split(X, y,test_size=0.3,random_state=i) # use 15 percent of data as the test set
     X_val,X_test,y_val,y_test = train_test_split(X_test, y_test,test_size=0.5,random_state=i)
     sn = ShallowNN(noOfNodes=9,iterations=10000) # create an object
     sn.fit(X_train,y_train) # train the model
     score = sn.score(X_test,y_test)
     print('The accuracy is ', score) # get the accuracy/score of the given inputs
     accuracyValues.append(score)
print('The average accuracy is',np.mean(accuracyValues))     


The accuracy is  0.8833333333333333
The accuracy is  0.8833333333333333
The accuracy is  0.9
The accuracy is  0.8166666666666667
The accuracy is  0.8666666666666667
The accuracy is  0.85
The accuracy is  0.9166666666666666
The accuracy is  0.85
The accuracy is  0.9
The accuracy is  0.85
The average accuracy is 0.8716666666666667


In [6]:
# Test blobs data for the shallow NN
df2 = pd.read_csv('blobs250.csv',sep=',')
features = df2.drop(columns=['Class'])
classes = df2['Class']
X = np.array(features)
y = np.array(classes)
accuracyValues = []
parameters = {'alpha' : [1e-3,1e-2,5e-3,5e-2],'noOfNodes':[2,3,5,7,9]}
#gs = GridSearchCV(estimator=ShallowNN(),param_grid=parameters)
#gs.fit(X,y)
#print(gs.best_estimator_)
for i in range(10): # use 10 iterations as required
   X_train,X_test,y_train,y_test = train_test_split(X, y,test_size=0.3,random_state=i) # use 1/3 rd of data as the test set
   X_val,X_test,y_val,y_test = train_test_split(X_test, y_test,test_size=0.5,random_state=i)
   sn = ShallowNN(alpha=0.01) # create an object
   sn.fit(X_train,y_train) # train the model
   score = sn.score(X_test,y_test)
   print('The accuracy is ', score) # get the accuracy/score of the given inputs
   accuracyValues.append(score)
print('The average accuracy is',np.mean(accuracyValues)) 

The accuracy is  1.0
The accuracy is  1.0
The accuracy is  1.0
The accuracy is  1.0
The accuracy is  1.0
The accuracy is  1.0
The accuracy is  1.0
The accuracy is  1.0
The accuracy is  1.0
The accuracy is  1.0
The average accuracy is 1.0


# Part 4: Challenging task

Results and observations:
For this task, I used a Shallow NN with a hidden layer of 50 nodes. Every image in the chosen batch had a data of length 3072 (1024 * 3 colour channels). I filtered out the classes given to me and used the red channel of the image only. After this, I normalized the data and did a train,test,validate split.

I got a test accuracy score of 0.8888888888888888 for 200 epochs.The training times was considerably higher because the hidden layer had significantly more nodes than the ShallowNN model used for the moons dataset.I tried for different combinations of nodes in the hidden layer and the number of epochs and I realized that this model will need sufficient training to be a good enough image classifier. 



In [14]:
# This function taken from the CIFAR website

import matplotlib.pyplot as plt
import pickle

def unpickle(file):
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

# Loaded in this way, each of the batch files contains a dictionary with the following elements:
#   data -- a 10000x3072 numpy array of uint8s. Each row of the array stores a 32x32 colour image. 
#           The first 1024 entries contain the red channel values, the next 1024 the green, and the final 1024 the blue. 
#           The image is stored in row-major order, so that the first 32 entries of the array are the red channel values 
#           of the first row of the image.
#   labels -- a list of 10000 numbers in the range 0-9. 
#             The number at index i indicates the label of the ith image in the array data.

def loadbatch(batchname):
    folder = 'cifar-10-batches-py'
    batch = unpickle(folder+"/"+batchname)
    return batch

def loadlabelnames():
    folder = 'cifar-10-batches-py'
    meta = unpickle(folder+"/"+'batches.meta')
    return meta[b'label_names']

def visualise(data, index):
    # MM Jan 2019: Given a CIFAR data nparray and the index of an image, display the image.
    # Note that the images will be quite fuzzy looking, because they are low res (32x32).

    picture = data[index]
    # Initially, the data is a 1D array of 3072 pixels; reshape it to a 3D array of 3x32x32 pixels
    # Note: after reshaping like this, you could select one colour channel or average them.
    picture.shape = (3,32,32) 
    
    # Plot.imshow requires the RGB to be the third dimension, not the first, so need to rearrange
    picture = picture.transpose([1, 2, 0])
    plt.imshow(picture)
    plt.show()
    
batch1 = loadbatch('data_batch_1')
print("Number of items in the batch is", len(batch1))

# Display all keys, so we can see the ones we want
print('All keys in the batch:', batch1.keys())

data = batch1[b'data']
labels = batch1[b'labels']
print ("size of data in this batch:", len(data), ", size of labels:", len(labels))
print (type(data))
print(data.shape)

names = loadlabelnames()

Number of items in the batch is 4
All keys in the batch: dict_keys([b'batch_label', b'labels', b'data', b'filenames'])
size of data in this batch: 10000 , size of labels: 10000
<class 'numpy.ndarray'>
(10000, 3072)


In [15]:
# Display a few images from the batch
cifar_dataset = []
cifar_classes = []
for i in range (len(data)):
   if (names[labels[i]] == b'ship') or (names[labels[i]] == b'dog'): # filter the images belonging to the classes given to me for this task (ship,dog)       
       img = np.resize(data[i],(3,1024))                             # reshape according to rgb channel values
       img = img[0]/255                                              # choose one colour channel and normalize it
       cifar_dataset.append(img)                                     # save the image to list
       cifar_classes.append(names[labels[i]])                        # save the label of the above image to a list

In [30]:
# Test CIFAR data for the shallow NN

X = np.array(cifar_dataset)            # create a numpy version of the cifar data
y = np.array(cifar_classes)            # create a numpy version for the classes

accuracyValues = []
parameters = {'alpha' : [1e-3,1e-2,5e-3,5e-2],'noOfNodes':[50,150,200]}
#gs = GridSearchCV(estimator=ShallowNN(),param_grid=parameters)    # do a grid search
#gs.fit(X,y)                                                       # fit to data
#print(gs.best_estimator_)                                         # check the best model
X_train,X_test,y_train,y_test = train_test_split(X, y,test_size=0.3,random_state=1) # use 1/3 rd of data as the test set
X_val,X_test,y_val,y_test = train_test_split(X, y,test_size=0.5,random_state=1)
sn = ShallowNN(alpha=0.01, iterations=len(y_train) * 200, noOfNodes=50) # create an object
sn.fit(X_train,y_train) # train the model
score = sn.score(X_test,y_test)
print('The accuracy is ', score) # get the accuracy/score of the given inputs
accuracyValues.append(score)
print('The average accuracy is',np.mean(accuracyValues))     

In [19]:
#sn.score(X_test,y_test)                            
with open('ShallowNN.pkl', 'wb') as f:              #save model to disk
    pickle.dump(sn, f)

In [36]:
with open('ShallowNN.pkl', 'rb') as f:             #read model from disk
    sn = pickle.load(f)
print(sn.score(X_test,y_test))    

0.8888888888888888


# Part 5: Deep Learning Enhancements

Results and observations:
I implemented Backprop with Momentum as a Deep Learning Enhancement. The fundamental idea [3] to the idea of 'Momentum' is that the change in parameters (or gradients) must be influenced by the previous changes to the same parameters only.This allows the individual parameters to change at a rate appropriate to itself. This is achieved by computing an exponential moving average of previous gradients and using that to update the current parameter.

So, when momentum is introduced to back propagation, the below changes apply:
VΔW = (1 − β) ΔW + β VΔW (the current value of VΔW is based on an exponential moving average) (for weights)
VΔb = (1 − β) Δb + β VΔb (for biases)

W −= αVΔW , b −= αVΔb (Gradient descent update)

Studies have shown that Momentum helps the model learn quickly than just batch gradient and stochastic gradient descent.


Results:
I got a test accuracy score of 0.8878695208970439 for 200 epochs which is similar to the results of the ShallowNN model .The training times was considerably higher than the ShallowNN model used for the moons dataset because the hidden layer had significantly more nodes. With momentum,it helps the model train quicker than SGD and I feel it is a good enhancement to have.


In [21]:
# Shallow NN with Deep Learning Enhancement

class ShallowNNWithMomentum(BaseEstimator):
    def __init__(self,alpha=0.05,noOfNodes=2,activation='sigmoid',iterations=1000,epsilon = 1e-06,beta = 0.9):
        self.noOfNodes = noOfNodes
        self.activation = activation
        self.nodesList = []
        self.X = []
        self.y = []
        self.W = []
        self.v_delW = []
        self.v_delB = []
        self.iterations = iterations
        self.b = []
        self.alpha = alpha
        self.class_labels = {}
        self.epsilon = epsilon
        self.beta = beta
    
    def sigmoid(self,z): # code for the sigmoid function
      """This function returns the sigmoid value of the input z"""
      return (1/(1+np.exp(-z)))
    
    def relu(self,z):
      """This function returns the sigmoid value of the input z"""  
      if abs(z)>=0:
        return z
      else:
        return np.zeros(z)
        
    def backProp(self,x,y,a2,a1,z1,z2,initial=False):
        del_z2 = a2-y
        del_b2 =  del_z2
        del_w2 = np.dot(del_z2,a1.T)
        fz1 = self.sigmoid(z1)
        f_z1 = fz1*(1-fz1)
        del_z1 = f_z1 * np.dot(del_z2,self.W)
        del_w1 = del_z1 * np.expand_dims(x,axis=1)
        del_b1 = del_z1
        if initial:                                                          
            self.v_delW = del_w2                                           # intitially vdelW = delW
            self.v_delb = del_b2                                           # intitially vdelb = delb
        else:
            self.v_delW = (1-self.beta) * del_w2 + self.beta * self.v_delW
            self.v_delb = (1-self.beta) * del_b2 + self.beta * self.v_delb
        self.W -= self.alpha * self.v_delW
        self.b -= self.alpha * self.v_delb
        
        for i in range(self.noOfNodes):
           node = self.nodesList[i]
           if initial:
              node.v_delW = del_w1[:,i]                                   # intitially vdelW = delw
              node.v_delb = del_b1[i]                                     # intitially vdelb = delb
           else:
              node.v_delW = (1-self.beta) * del_w1[:,i] + self.beta * node.v_delW
              node.v_delb = (1-self.beta) * del_b1[i] + self.beta * node.v_delb
           node.W -= self.alpha * node.v_delW
           node.b -= self.alpha * node.v_delb     
    
    def forwardProp(self,x):
        a1 = []
        z1 = []
        for i in range(self.noOfNodes):
            node = self.nodesList[i]
            z = np.dot(x,node.W.T) + node.b
            z1.append(z)
            activateNode = self.sigmoid(z)
            a1.append(activateNode)
        a1 = np.array(a1)
        z2 = np.dot(self.W,a1) + self.b
        a2 = self.sigmoid(z2)
        return a1,a2,np.array(z1),z2
    
    def loss(self,y,y_):
        return (-np.dot(y,np.log(y_.T+self.epsilon)) - np.dot((1-y),np.log((1-y_+self.epsilon).T)))
        
    def fit(self,X,y):
      """This function trains the NN"""
      self.X = X
      self.convertLabels(list(set(y)))  
      self.y = self.encodeLabels(y)
      rows,columns = X.shape
      mu,sigma = 0,0.01
      seed = 1  
      self.W = np.random.normal(mu,sigma,size=self.noOfNodes)
      self.b = np.random.random()  
      #loss = 0
        
      for i in range(self.noOfNodes):
          rows,columns = X.shape
          W_node = np.random.normal(mu,sigma,size=columns) # start from random values for weights
          #b_node = float(np.random.normal(mu,sigma,size=1))  
          b_node = np.random.random()
          lr = LogisticRegression()
          lr.W = W_node
          lr.b = b_node
          self.nodesList.append(lr)
      for i in range(self.iterations):
          sample = np.random.randint(len(X))
          a1,a2,z1,z2 = self.forwardProp(X[sample])
          #print('a2 is',a1,a2,z1,z2)  
          lossValue = self.loss(self.y[sample],a2)
          if i%10000 == 0:
            print('loss is:',lossValue)
          if i==0:
             self.backProp(X[sample],self.y[sample],a2,a1,z1,z2,True)
          else:      
             self.backProp(X[sample],self.y[sample],a2,a1,z1,z2)  
    
    def convertLabels(self,labels):
        self.class_labels = { 0 : labels[0] , 1 : labels[1]}
    
    def encodeLabels(self,y):
        y_ = []
        for i in range(len(y)):
            if y[i] == self.class_labels[0]:
                y_.append(0)
            else:
                y_.append(1)
        return y_        
    
    def predict(self,X):
        a1,a2,z1,z2 = self.forwardProp(X)
        #print('prediction is',a2)
        if a2 >= 0.5:
            return self.class_labels[1]
        else:
            return self.class_labels[0]
        
    def score(self,X,y): # function which returns the accuracy value
      """This function returns the accuracy score for the X values and compares them with output y."""
      count = 0  
      for x,y_ in zip(X,y):
        predictedLabel = self.predict(x)
        #print('Predicted is',predictedLabel)
        #print('Actual is',y_)
        if predictedLabel == y_:
            count += 1
      return count/len(y) # calculate accuracy    
        

In [33]:
# Test CIFAR data for the shallow NN with momentum

X = np.array(cifar_dataset)
y = np.array(cifar_classes)

accuracyValues = []
parameters = {'alpha' : [1e-3,1e-2,5e-3,5e-2],'noOfNodes':[50,100,150]}
#gs = GridSearchCV(estimator=ShallowNNWithMomentum(),param_grid=parameters)
#gs.fit(X,y)
#print(gs.best_estimator_)
#for i in range(1): # use 10 iterations as required
X_train,X_test,y_train,y_test = train_test_split(X, y,test_size=0.3,random_state=1) # use 1/3 rd of data as the test set
X_val,X_test,y_val,y_test = train_test_split(X, y,test_size=0.5,random_state=1)
snw = ShallowNNWithMomentum(alpha=0.01, iterations=200 * len(y_train), noOfNodes=50) # create an object
snw.fit(X_train,y_train) # train the model
score = sn.score(X_train,y_train)
print('The accuracy is ', score) # get the accuracy/score of the given inputs
accuracyValues.append(score)
print('The average accuracy is',np.mean(accuracyValues))

In [23]:
print(snw.score(X_test,y_test))
with open('ShallowNNWithMomentum.pkl', 'wb') as f:          #save model to disk
    pickle.dump(snw, f)

0.8878695208970439


In [35]:
with open('ShallowNNWithMomentum.pkl', 'rb') as f:           #read model from disk
    snw = pickle.load(f)
snw.score(X_test,y_test)

0.8878695208970439

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(X[:,0], X[:,1], c=y)

Reference:
1) Learning Multiple Layers of Features from Tiny Images, Alex Krizhevsky, 2009.