<a href="https://colab.research.google.com/github/samibahig/IFT6135/blob/main/FeedForwardNeuralNetwork.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle
import numpy as np


class NN(object):
    """Fully connected feed-forward classifier trained with mini-batch gradient descent.

    Layers are numbered 1..m with m = len(hidden_dims) + 1. Layers 1..m-1 are
    hidden layers using the configured activation; layer m is a softmax output.
    Parameters are stored in ``self.weights`` under the keys W1, b1, ..., Wm, bm.
    """

    def __init__(self,
                 hidden_dims=(512, 256),
                 datapath='cifar10.pkl',
                 n_classes=10,
                 epsilon=1e-6,
                 lr=7e-4,
                 batch_size=1000,
                 seed=None,
                 activation="relu",
                 init_method="glorot",
                 normalization=False
                 ):
        """Store the hyper-parameters and optionally load the dataset.

        Args:
            hidden_dims: sizes of the hidden layers, in order.
            datapath: path to a pickle holding ``(train, valid, test)``, each
                unpacked elsewhere as an ``(X, y)`` pair. ``None`` skips loading
                and leaves the three splits set to ``None``.
            n_classes: number of classes; lower bound for the one-hot width.
            epsilon: clipping bound applied to probabilities in ``loss``.
            lr: learning rate used by ``update``.
            batch_size: mini-batch size used by ``train_loop``.
            seed: if not ``None``, passed to ``np.random.seed`` before the
                weights are drawn.
            activation: "relu", "sigmoid", "tanh" or "leakyrelu".
            init_method: stored only; ``initialize_weights`` always uses the
                Glorot-uniform scheme.
            normalization: if True, standardise the splits right after loading.
        """
        self.hidden_dims = hidden_dims
        self.n_hidden = len(hidden_dims)
        self.datapath = datapath
        self.n_classes = n_classes
        self.lr = lr
        self.batch_size = batch_size
        self.init_method = init_method
        self.seed = seed
        self.activation_str = activation
        self.epsilon = epsilon

        self.train_logs = {'train_accuracy': [], 'validation_accuracy': [], 'train_loss': [], 'validation_loss': []}

        if datapath is not None:
            # NOTE(security): unpickling can execute arbitrary code; only load trusted files.
            # The context manager closes the file handle, which the previous code leaked.
            with open(datapath, 'rb') as dataset_file:
                self.train, self.valid, self.test = pickle.load(dataset_file, encoding='latin1')
            if normalization:
                self.normalize()
        else:
            self.train, self.valid, self.test = None, None, None

    def initialize_weights(self, dims):
        """Create ``self.weights`` with zero biases and Glorot-uniform weight matrices.

        Args:
            dims: ``[input_dim, output_dim]`` of the network.
        """
        if self.seed is not None:
            np.random.seed(self.seed)

        self.weights = {}
        # all_dims has n_hidden + 2 entries, so index layer_n is valid for every layer.
        all_dims = [dims[0]] + list(self.hidden_dims) + [dims[1]]
        for layer_n in range(1, self.n_hidden + 2):
            fan_in = all_dims[layer_n - 1]
            fan_out = all_dims[layer_n]
            self.weights[f"b{layer_n}"] = np.zeros((1, fan_out))
            bound = np.sqrt(6 / (fan_in + fan_out))
            self.weights[f"W{layer_n}"] = np.random.uniform(-bound, bound, (fan_in, fan_out))

    # ref: https://ml-cheatsheet.readthedocs.io/en/latest/activation_functions.html
    # ref: https://stackoverflow.com/questions/32109319/how-to-implement-the-relu-function-in-numpy
    def relu(self, x, grad=False):
        """Return relu(x), or its derivative when ``grad`` is True (0 at x == 0)."""
        if grad:
            return 1.0 * (x > 0)
        return x * (x > 0)

    def sigmoid(self, x, grad=False):
        """Return sigmoid(x), or its derivative sigmoid(x) * (1 - sigmoid(x)) when ``grad`` is True."""
        # exp(-log(1 + exp(-x))) equals 1 / (1 + exp(-x)) but never evaluates exp
        # of a large positive number, so very negative inputs do not overflow.
        sig = np.exp(-np.logaddexp(0, -x))
        if grad:
            return sig * (1 - sig)
        return sig

    def tanh(self, x, grad=False):
        """Return tanh(x), or its derivative 1 - tanh(x)^2 when ``grad`` is True."""
        # np.tanh saturates to +/-1; the explicit exp ratio produced inf/inf = nan for large |x|.
        value = np.tanh(x)
        if grad:
            return 1 - value * value
        return value

    # ref: https://stackoverflow.com/questions/48102882/how-to-implement-the-derivative-of-leaky-relu-in-python/48102959
    def leakyrelu(self, x, grad=False):
        """Return leaky-relu(x) with slope 0.01 for x < 0, or its derivative when ``grad`` is True.

        The derivative is undefined at x == 0; this implementation returns 0 there.
        """
        alpha = 0.01
        if grad:
            return (1.0 * (x > 0)) + (alpha * (x < 0))
        return (x * (x > 0)) + (alpha * x * (x < 0))

    def activation(self, x, grad=False):
        """Apply the activation selected by ``self.activation_str`` (or its derivative).

        Raises:
            ValueError: if ``self.activation_str`` is not a supported name.
        """
        handlers = {
            "relu": self.relu,
            "sigmoid": self.sigmoid,
            "tanh": self.tanh,
            "leakyrelu": self.leakyrelu,
        }
        handler = handlers.get(self.activation_str)
        if handler is None:
            raise ValueError(f"invalid activation: {self.activation_str!r}")
        return handler(x, grad)

    def softmax(self, x):
        """Return the softmax of ``x`` along its last axis.

        The per-row maximum is subtracted first; softmax(x - C) == softmax(x),
        and the shift keeps exp from overflowing.
        """
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=-1, keepdims=True)

    # ref: https://stackoverflow.com/questions/34968722/how-to-implement-the-softmax-function-in-python
    def softmax_old(self, x):
        """Legacy name kept for compatibility; delegates to ``softmax``.

        The earlier body printed debug messages and summed without ``keepdims``,
        which normalised 2-D inputs incorrectly.
        """
        return self.softmax(x)

    def forward_original(self, x):
        """Unfilled assignment template: returns a cache holding only the input under "Z0"."""
        cache = {"Z0": x}
        return cache

    def forward(self, x):
        """Run a forward pass and return every intermediate value.

        The cache has keys Z0, A1, Z1, ..., Am, Zm where Ai is the pre-activation
        of layer i and Zi its activation (Z0 is the input, Zm the softmax output):

            A1 = Z0 W1 + b1, Z1 = act(A1), ..., Am = Z(m-1) Wm + bm, Zm = softmax(Am)
        """
        cache = {"Z0": x}
        output_layer = self.n_hidden + 1

        for layer in range(1, output_layer + 1):
            pre_activation = np.matmul(cache[f"Z{layer - 1}"], self.weights[f"W{layer}"]) + self.weights[f"b{layer}"]
            cache[f"A{layer}"] = pre_activation
            if layer == output_layer:
                cache[f"Z{layer}"] = self.softmax(pre_activation)
            else:
                cache[f"Z{layer}"] = self.activation(pre_activation)

        return cache

    def backward_2(self, cache, labels):
        """Legacy name kept for compatibility; delegates to ``backward``.

        The earlier body multiplied Z{n}.T (instead of Z{n-1}.T) into dA{n},
        which yields weight gradients of the wrong shape.
        """
        return self.backward(cache, labels)

    def backward(self, cache, labels):
        """Back-propagate the mean cross-entropy loss through the network.

        Args:
            cache: dictionary returned by ``forward``.
            labels: one-hot labels with one row per sample.

        Returns:
            Dictionary with keys dAm, dWm, dbm, dZ(m-1), dA(m-1), ..., dW1, db1.
        """
        output_layer = self.n_hidden + 1
        # Average over the samples actually present so the gradient matches ``loss``
        # even when the final mini-batch is shorter than self.batch_size.
        n_samples = labels.shape[0]

        # Softmax combined with cross-entropy: dL/dA_m = output - labels (per sample).
        grads = {f"dA{output_layer}": cache[f"Z{output_layer}"] - labels}

        for layer in range(output_layer, 0, -1):
            d_pre = grads[f"dA{layer}"]
            grads[f"dW{layer}"] = np.matmul(np.transpose(cache[f"Z{layer - 1}"]), d_pre) / n_samples
            grads[f"db{layer}"] = np.sum(d_pre, axis=0, keepdims=True) / n_samples

            if layer > 1:
                grads[f"dZ{layer - 1}"] = np.matmul(d_pre, np.transpose(self.weights[f"W{layer}"]))
                # The activation derivative must be taken at the pre-activation A,
                # not at Z; the two only coincide for relu-like activations.
                slope = self.activation(cache[f"A{layer - 1}"], grad=True)
                grads[f"dA{layer - 1}"] = grads[f"dZ{layer - 1}"] * slope

        return grads

    def update(self, grads):
        """Apply one gradient-descent step with learning rate ``self.lr`` (in place)."""
        for layer in range(1, self.n_hidden + 2):
            self.weights[f"b{layer}"] -= grads[f"db{layer}"] * self.lr
            self.weights[f"W{layer}"] -= grads[f"dW{layer}"] * self.lr

    def one_hot(self, y):
        """Return the one-hot encoding of the integer labels ``y``.

        The width is ``self.n_classes`` (or ``max(y) + 1`` if that is larger), so a
        batch that happens to miss the highest class still gets a full-width
        encoding. Empty input yields an array with zero rows.
        """
        labels = np.asarray(y).reshape(-1)
        if labels.size == 0:
            return np.zeros((0, self.n_classes), dtype=int)

        width = max(self.n_classes, int(labels.max()) + 1)
        encoded = np.zeros((labels.shape[0], width), dtype=int)
        encoded[np.arange(labels.shape[0]), labels] = 1
        return encoded

    # Ref: https://pmirla.github.io/2016/10/09/Basics-crossEntropy.html
    def loss(self, prediction, labels):
        """Return the mean cross-entropy between ``prediction`` and one-hot ``labels``.

        Because the labels are one-hot, only the probability assigned to the true
        class contributes: loss = mean_i(-log p_i[true_i]). Probabilities are
        clipped to [epsilon, 1 - epsilon] on a copy, so the caller's array (for
        example the forward cache) is left untouched.
        """
        clipped = np.clip(prediction, self.epsilon, 1 - self.epsilon)
        true_class = np.argmax(labels, axis=1)
        picked = clipped[np.arange(labels.shape[0]), true_class]
        return -np.mean(np.log(picked))

    def compute_loss_and_accuracy(self, X, y):
        """Return ``(loss, accuracy, predictions)`` for inputs ``X`` and integer labels ``y``."""
        one_y = self.one_hot(y)
        probabilities = self.forward(X)[f"Z{self.n_hidden + 1}"]
        predictions = np.argmax(probabilities, axis=1)
        accuracy = np.mean(y == predictions)
        loss = self.loss(probabilities, one_y)
        return loss, accuracy, predictions

    def train_loop(self, n_epochs):
        """Initialise the weights and train for ``n_epochs`` passes over ``self.train``.

        After every epoch the loss and accuracy on the training and validation
        splits are appended to ``self.train_logs`` and printed.

        Returns:
            ``self.train_logs``.
        """
        X_train, y_train = self.train
        X_valid, y_valid = self.valid
        y_onehot = self.one_hot(y_train)
        self.initialize_weights([X_train.shape[1], y_onehot.shape[1]])

        n_batches = int(np.ceil(X_train.shape[0] / self.batch_size))

        for epoch in range(n_epochs):
            for batch in range(n_batches):
                start = self.batch_size * batch
                stop = self.batch_size * (batch + 1)
                minibatchX = X_train[start:stop, :]
                minibatchY = y_onehot[start:stop, :]

                cache = self.forward(minibatchX)
                grads = self.backward(cache, minibatchY)
                self.update(grads)

            train_loss, train_accuracy, _ = self.compute_loss_and_accuracy(X_train, y_train)
            valid_loss, valid_accuracy, _ = self.compute_loss_and_accuracy(X_valid, y_valid)

            self.train_logs['train_accuracy'].append(train_accuracy)
            self.train_logs['validation_accuracy'].append(valid_accuracy)
            self.train_logs['train_loss'].append(train_loss)
            self.train_logs['validation_loss'].append(valid_loss)

            print(f"Epoch {epoch+1} | Train loss {train_loss:.04f} | Train acc {train_accuracy:.04f} |"
                  f" Valid loss {valid_loss:.04f} | Valid acc {valid_accuracy:.04f}")

        return self.train_logs

    def evaluate(self):
        """Return ``(loss, accuracy)`` on the test split."""
        X_test, y_test = self.test
        test_loss, test_accuracy, _ = self.compute_loss_and_accuracy(X_test, y_test)
        return (test_loss, test_accuracy)

    def normalize(self):
        """Standardise all three splits with the per-feature mean/std of the training inputs.

        The splits are rebuilt as new ``(X, y)`` tuples rather than assigned
        element-wise, so this also works when they were loaded as tuples.
        """
        mean = np.mean(self.train[0], axis=0)
        std = np.std(self.train[0], axis=0)
        # A constant feature has std == 0; divide by 1 there so it becomes 0 instead of nan.
        scale = np.where(std > 0, std, 1.0)

        self.train = ((self.train[0] - mean) / scale, self.train[1])
        self.valid = ((self.valid[0] - mean) / scale, self.valid[1])
        self.test = ((self.test[0] - mean) / scale, self.test[1])


def testrelu(x, grad=False):
    if grad:
       return 1.0 * (x > 0)
    else:
       return x * (x > 0)

'''
Trying forward prop using a neural network of hidden dimensions (101, 102, 300) and seed 5692 and relu activation on a subset of svhn
Test Failed: The values inside your cache dictionary are wrong
'''
def test_one_hot( y):
        # WRITE CODE HERE
        
        batch_size = len(y)
        num = np.amax(y)
        out = np.zeros ((batch_size, num+1),dtype=int)

        for i,yy in enumerate (y):
            out[i][yy]=1
        
        return out

def SVHN_Q4_depth_and_width(datapath='/content/drive/MyDrive/svhn (2) (1).pkl',
                            hidden_dims=(512, 120, 120, 120, 120, 120, 120),
                            n_epochs=30):
    """Train the deep/wide network of question 4 and return its training logs.

    Args:
        datapath: pickle file passed to ``NN`` as the dataset.
        hidden_dims: hidden layer sizes.
        n_epochs: number of training epochs.

    Returns:
        The log dictionary returned by ``NN.train_loop``. It was previously only
        printed, which left nothing for later cells to plot.
    """
    nn1 = NN(datapath=datapath, hidden_dims=hidden_dims, lr=0.03, batch_size=100, seed=0, normalization=False)

    logs = nn1.train_loop(n_epochs)

    print ("Finished with training")
    print (logs)
    return logs



def SVHN_Q3_with_normalization(datapath='svhn.pkl', n_epochs=30):
    """Train the default two-hidden-layer network on normalised data (question 3).

    Uses lr=0.03, batch_size=100 and seed=0 in place of the ``NN`` defaults
    (lr=7e-4, batch_size=1000, seed=None).

    Args:
        datapath: pickle file passed to ``NN`` as the dataset.
        n_epochs: number of training epochs.

    Returns:
        The log dictionary returned by ``NN.train_loop``. It was previously only
        printed, which left nothing for later cells to plot.
    """
    nn1 = NN(datapath=datapath, lr=0.03, batch_size=100, seed=0, normalization=True)

    logs = nn1.train_loop(n_epochs)

    print ("Finished with training")
    print (logs)
    return logs


if __name__ == "__main__":
    # Checklist kept from the assignment notes:
    #   - check self.valid[0] is a 3000x3072 matrix and self.valid[1] an array of 3000 labels
    #   - self.train has 67000 samples, self.valid 3000, self.test 20000
    #   - report the evolution of training and validation accuracies
    #   - report the evolution of training and validation losses
    #   - check that at epoch 30 the training accuracy is > 0.7
    #
    # Alternative entry points, currently disabled:
    #   n1 = NN(datapath=None)
    #   SVHN_Q3_with_normalization()
    SVHN_Q4_depth_and_width()

    print ("Hello!")

Epoch 1 | Train loss 2.1234 | Train acc 0.2588 | Valid loss 2.1216 | Valid acc 0.2620
Epoch 2 | Train loss 1.7353 | Train acc 0.4051 | Valid loss 1.7280 | Valid acc 0.4047
Epoch 3 | Train loss 1.6373 | Train acc 0.4394 | Valid loss 1.6535 | Valid acc 0.4297
Epoch 4 | Train loss 1.2176 | Train acc 0.6182 | Valid loss 1.2191 | Valid acc 0.6140
Epoch 5 | Train loss 1.3433 | Train acc 0.5664 | Valid loss 1.3415 | Valid acc 0.5730
Epoch 6 | Train loss 1.1031 | Train acc 0.6485 | Valid loss 1.1308 | Valid acc 0.6420
Epoch 7 | Train loss 1.2465 | Train acc 0.5993 | Valid loss 1.2805 | Valid acc 0.5927
Epoch 8 | Train loss 0.9242 | Train acc 0.7058 | Valid loss 0.9663 | Valid acc 0.6930
Epoch 9 | Train loss 1.1294 | Train acc 0.6375 | Valid loss 1.1727 | Valid acc 0.6287
Epoch 10 | Train loss 1.1167 | Train acc 0.6397 | Valid loss 1.1693 | Valid acc 0.6217
Epoch 11 | Train loss 0.9704 | Train acc 0.6905 | Valid loss 1.0277 | Valid acc 0.6747
Epoch 12 | Train loss 0.9379 | Train acc 0.6983 | Va

In [None]:
import shutil

from google.colab import drive

drive.mount('/content/drive/', force_remount=True)
# The file name contains spaces and parentheses; passed unquoted to `!cp` they are
# parsed by bash as syntax, and the command also had no destination. Copying with
# shutil avoids shell quoting entirely and places the dataset at 'svhn.pkl', the
# path SVHN_Q3_with_normalization reads.
shutil.copy('/content/drive/MyDrive/svhn (2) (1).pkl', 'svhn.pkl')
#https://drive.google.com/file/d/1oaKLzGnU3F9BOD-13422T__E9QaDtLEh/view?usp=sharing
#https://drive.google.com/file/d/1dTVDhHyi3nIqaOgJ_5tmnypzHPEi_WRC/view?usp=sharing.pkl
#!cp /content/drive/MyDrive/6390/Data/test.npz /content/test.npz

Mounted at /content/drive/
/bin/bash: -c: line 0: syntax error near unexpected token `('
/bin/bash: -c: line 0: `cp /content/drive/MyDrive/svhn (2) (1).pkl'


In [None]:
import matplotlib.pyplot as plt


def plot_accuracy(train_logs):
    """Plot per-epoch training and validation accuracy.

    Args:
        train_logs: dictionary returned by ``NN.train_loop``; the
            'train_accuracy' and 'validation_accuracy' lists are read.

    Returns:
        The matplotlib Axes holding the two curves.
    """
    train_acc = train_logs['train_accuracy']
    valid_acc = train_logs['validation_accuracy']
    # Derive the x axis from the logs instead of assuming exactly 30 epochs.
    epochs = range(1, len(train_acc) + 1)

    fig, ax = plt.subplots()
    ax.plot(epochs, train_acc, label="training accuracy")
    ax.plot(epochs, valid_acc, label="validation accuracy")
    ax.set(title="training and validation accuracy", xlabel="epoch", ylabel="accuracy")
    ax.legend()  # without this the curve labels are never rendered
    plt.show()
    return ax


# No earlier cell leaves the training logs in a global variable (the names
# train_accuracy / valid_accuracy were never defined), so train here and plot the
# logs that train_loop returns.
accuracy_model = NN(datapath='/content/drive/MyDrive/svhn (2) (1).pkl',
                    hidden_dims=(512, 120, 120, 120, 120, 120, 120),
                    lr=0.03, batch_size=100, seed=0, normalization=False)
plot_accuracy(accuracy_model.train_loop(30));

NameError: ignored