In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import os
import imageio
from sklearn.utils import shuffle
from sklearn.datasets import load_digits

## 1. Perceptron Algorithm

In [2]:
## funtion for normalizing the features
def normalize_features(X,append=True):
    X = (X - np.mean(X, 0)) / np.std(X, 0) #normalize the features
    if append:
        X = np.append(np.ones(X.shape[0]).reshape(-1,1),X,1) #append column of ones for intercept
    return X

In [3]:
class Perceptron:
    def __init__ (self, X, y, iterations=50):
        self.X = X
        self.y = y.reshape(-1,1)
        self.m, self.n = self.X.shape
        self.i = iterations #number of max iterations
    
    def fit(self, lr=0.1, tolerance=1e-3):
        #initialize random weight vector
        W = np.ones(shape=(self.n,1))
        #to check if the updation becomes insignificant
        old_W = W
        #to keep track how the value changes
        w_hist = []
        
        #iterating till max iters
        for i in range(self.i):
            
            #forward pass
            Z = self.X @ old_W
            
            #taking sigmoid
            p = 1./(1+np.exp(-Z))
            
            #calculating loss
            loss = (-1/self.m * (np.dot(self.y.T,np.log(p))+np.dot(np.transpose(1-self.y),np.log(1-p))))
            
            #calaculating gradient
            g = -1/self.m * np.dot(self.X.T,(self.y-p))
            
            #updating value using gradient descent
            new_W = old_W - lr*g

            if np.all(np.abs(new_W - old_W) <= tolerance):
                break
            
            #storing value of updated weights
            w_hist.append(old_W)  
            
            #replacing old weights with new
            old_W = new_W
        
        #storing final weights and history of weights for visualization purposes
        self.W = old_W
        self.hist = w_hist
        
    def predict(self,x):
        scores = x @ self.W
        return 1./(1+np.exp(-scores))

### 1. Linearly Separable Data

In [4]:
#for loading the data
with open('Xlin_sep.npy','rb') as f:
    X = np.load(f)
with open('ylin_sep.npy','rb') as f:
    y = np.load(f)

In [5]:
#to set lables as binary 0 1
y[y == -1] = 0

In [6]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=101)

#normalize training set
X_train_normalized = normalize_features(X_train)

#normalizing test set
X_test_normalized = normalize_features(X_test)

In [7]:
#making the model
p = Perceptron(X_train_normalized,y_train)
#fitting the perceptron
p.fit()

In [8]:
#to get predictions on train and test set
preds_train = p.predict(X_train_normalized)
preds_test = p.predict(X_test_normalized)

In [9]:
print('The accuracy on Train Set is',accuracy_score(y_train,preds_train>0.5))
print('The accuracy on Test Set is',accuracy_score(y_test,preds_test>0.5))

The accuracy on Train Set is 1.0
The accuracy on Test Set is 1.0


In [10]:
class Visualize:
    
    def __init__(self, X, y, W):
        self.X = X
        self.y = y
        self.W = W
    
    def plot_decision_boundary_2D(self,w):
        bias = w[0]
        weights = w[1:]
        bp = (-bias /(weights.T @ weights)) * weights
        xx , yy = bp
        m = yy/xx
        m = -1/m
        xp = np.linspace(-3,3,100)
        yp = yy - m*xx +m*xp
        return xp, yp
    
    def create_frame(self,t,w,name):
    
        if not os.path.exists(name):
            os.mkdir(name)

        fig = plt.figure(figsize=(6, 4))
        xp,yp = self.plot_decision_boundary_2D(w)
        plt.title(f'Decision Boundary of {name} at iteration {t}',fontsize = 14)
        plt.plot(xp,yp,color='green')
        scatter = plt.scatter(self.X[:,1],self.X[:,2],c=self.y,cmap = 'coolwarm', label=self.y)
        plt.ylim(-3, 3)
        classes = ['0','1']
        plt.xlabel('Feature 1',fontsize = 14);
        plt.ylabel('Feature 2',fontsize = 14);
        plt.legend(handles=scatter.legend_elements()[0], labels=classes);
        plt.savefig(f'./{name}/img_{t}.png', 
                    transparent = False,  
                    facecolor = 'white'
                   )
        plt.close()
        
    def create_animation(self,name):

        for t,w in enumerate(self.W):
            self.create_frame(t,w,name)

        frames = []
        for t in range(len(self.W)):
            image = imageio.v2.imread(f'./{name}/img_{t}.png')
            frames.append(image)

        imageio.mimsave(f'./{name}.gif', frames, fps = 10)

In [11]:
#history of weight vectors
W = p.hist

In [13]:
Visualize(X_train_normalized,y_train,W).create_animation('train_lin_sep_')
Visualize(X_test_normalized,y_test,W).create_animation('test_lin_sep_')

<img src="train_lin_sep_.gif" width="350" align="center">

<img src="test_lin_sep_.gif" width="350" align="center">

### 2. Data with Noise

In [14]:
#for loading the data
with open('Xlinnoise_sep.npy','rb') as f:
    X = np.load(f)
with open('ylinnoise_sep.npy','rb') as f:
    y = np.load(f)

In [15]:
#to set lables as binary 0 1
y[y == -1] = 0

In [16]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=158)

#normalize training set
X_train_normalized = normalize_features(X_train)

#normalizing test set
X_test_normalized = normalize_features(X_test)

In [17]:
#making the model
p = Perceptron(X_train_normalized,y_train,iterations=50)
#fitting the perceptron
p.fit()

In [18]:
#to get predictions on train and test set
preds_train = p.predict(X_train_normalized)
preds_test = p.predict(X_test_normalized)

In [19]:
print('The accuracy on Train Set is',accuracy_score(y_train,preds_train>0.5))
print('The accuracy on Test Set is',accuracy_score(y_test,preds_test>0.5))

The accuracy on Train Set is 0.7866666666666666
The accuracy on Test Set is 0.92


In [20]:
#history of weight vectors
W = p.hist

In [21]:
Visualize(X_train_normalized,y_train,W).create_animation('train_linnoise_sep_')
Visualize(X_test_normalized,y_test,W).create_animation('test_linnoise_sep_')

<img src="train_linnoise_sep_.gif" width="350" align="center">

<img src="test_linnoise_sep_.gif" width="350" align="center">

### 3. Circle Data (Non-Linear)

In [22]:
#for loading the data
with open('circles_x.npy','rb') as f:
    X = np.load(f)
with open('circles_y.npy','rb') as f:
    y = np.load(f)

In [23]:
#to set lables as binary 0 1
y[y == -1] = 0

Since the data is not linearly separable, we will add polynomial features into the data. This means instead of giving the perceptron x1 and x2 as input we will now give it  $ x_1^2, x_2^2$

In [24]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(np.square(X), y, test_size=0.25, random_state=42)

#normalize training set
X_train_normalized = normalize_features(X_train)

#normalizing test set
X_test_normalized = normalize_features(X_test)

In [25]:
#making the model
p = Perceptron(X_train_normalized,y_train,iterations=100)
#fitting the perceptron
p.fit()

In [26]:
#to get predictions on train and test set
preds_train = p.predict(X_train_normalized)
preds_test = p.predict(X_test_normalized)

In [27]:
print('The accuracy on Train Set is',accuracy_score(y_train,preds_train>0.5))
print('The accuracy on Test Set is',accuracy_score(y_test,preds_test>0.5))

The accuracy on Train Set is 1.0
The accuracy on Test Set is 1.0


In [28]:
#history of weight vectors
W = p.hist

In [30]:
Visualize(X_train_normalized,y_train,W).create_animation('train_circle_')
Visualize(X_test_normalized,y_test,W).create_animation('test_circle_')

<img src="train_circle_.gif" width="350" align="center">

<img src="test_circle_.gif" width="350" align="center">

### 2. Feed-Forward Neural Network.

In [64]:
##Note: The derivatives of the activation functions are formulated in a way to support the presented implementation of NN
## In reality the derivative of tanh is 1 - tanh(x)^2 and not 1 - x^2

def ReLU(x,y=None):
    return np.maximum(0,x), np.where(x > 0, 1, 0)

def tanh(x,y=None):
    v = np.divide((np.exp(x) - np.exp(-x)),(np.exp(x) + np.exp(-x)))
    d = 1 - np.square(x) #original derivative of function is 1 - np.square(v)
    return v,d

def softmax(x, y=None):
    e = np.exp(x)
    s = np.sum(e,axis=1,keepdims=True)
    p = np.divide(e,s)
    if y is not None:
        m = x.shape[0]
        d = x.copy()
        d[np.arange(m),y] -= 1
        return p, d/m 
    else:
        return p, None

In [166]:
class NN:
    def __init__ (self, X, y, num_neurons, activations, num_classes, dropout):
        self.X = X #features
        self.y = y #targets
        self.activations = activations #activation functions as list
        self.num_neurons = num_neurons #number of neurons in each layer as list
        self.m, self.n = X.shape #number of obs, num of features
        self.num_classes = num_classes #number of classes
        self.dropout = dropout
    
    #to initialize the weights of the vectors, Kaiming Initialization is used to avoid exploding gradient problem
    def initialize_random_weights(self):
        layers_weights = [] #list to store the weights of each layer
        input_dim = self.n #input dimension
        
        #iterate over each layer of neural network and initialize the random weights for its neurons
        for l,n in enumerate(self.num_neurons): #for all layers
            
            #using kaiming initializations to avoid very small and very large gradient values
            w = np.random.randn(input_dim, n) * np.sqrt(2.0 / input_dim)
            b = np.ones((1, n)) #bias term
            
            layers_weights.append((w,b)) #append weights and biases of the layer as tuple in a list
            input_dim = n #the rows of next layer is going to be equal to columns of the previous layer
        
        #weights and biases of the last layer using kaiming initialization scheme
        w = np.random.randn(input_dim,self.num_classes)*np.sqrt(2.0 / input_dim)
        b = np.ones(shape=(1,self.num_classes))
        
        #append layer weights of last layer
        layers_weights.append((w,b))
        
        return layers_weights #returns the weights and biases of all layers
     
    #to calculate the cross entropy loss based on the current weights of the network
    def calculate_loss(self, inputs, layers_weights, y):
        m = inputs.shape[0] #number of sample in current batch
        probs = self.forward_pass(inputs, layers_weights)[0] #forward pass of the network
        y_pred = probs[np.arange(m),y] #obtain the probability of the correct class
        loss = np.sum(-np.log(y_pred)/m) #summing the CE-Loss for each observation and normalizing it by number of samples
        return loss #return the cross entropy loss
    
    #forward pass of the Neural Network given inputs and layer weights
    def forward_pass(self, inputs, layers_weights):
        outputs_cached = [inputs] #cache the activations of each layer to be used later in backprop
        
        for w, a in zip(layers_weights,self.activations): #iterating over one layer at a time
            out = inputs @ w[0] + w[1] #taking the weighted sum of the inputs
            out = a(out)[0] #applying the activation function on the weighted sum of inputs
            outputs_cached.append(out) #caching the outputs after activations(to be used in backprop)
            inputs = out #setting the output of previous layer as input to next layer
        return out,outputs_cached #returning the output of final layer and cached activations of all hidden layers
    
    #calculating gradient of the network (alternatively called as back propagation)
    def calculate_gradients(self, inputs, layers_weights, y):
        #to store the gradients of each layer
        grads = []
        #getting the output and cached activations of the network based on current weights
        _, cached_output = self.forward_pass(inputs,layers_weights)
        #number of samples in current batch
        m = inputs.shape[0]
        
        dz = True #set the initial accumulated to any arbitrary value
        a = cached_output.pop(-1) #to get the activations of last layer
        
        #iterating over all layers and their activations from last to first this is why the procedure is called backprop
        for lw,act in zip(reversed(layers_weights),reversed(self.activations)): 
            dz = np.multiply(dz,act(a,y)[1]) 
            a = cached_output.pop(-1) #to get the activations of previous layer
            
            dw = np.dot(a.T,dz) #the derivative of weights of current layer
            db = np.sum(dz, axis = 0, keepdims=True) #the derivative of biases of current layer
            
            grads.append((dw,db)) #to store the gradients of weights and biases as a tuple
            
            dz = dz @ lw[0].T  #to update the accumulated gradient
        
        return grads[::-1] #since the gradients are calculated from last to first, reverse the gradient list
    
    #early stopping to avoid overfitting on training data
    def early_stopping(self,current_loss,weights,min_delta=1e-9,patience=5):
        #checks if the current loss is lower than the lowest loss, if not increment the counter
        if self.best_loss is not None and (current_loss - min_delta) > self.best_loss:
            self.wait += 1 #If the metric is not improving, increment the wait counter
            if self.wait >= patience: #if the loss stops decreasing for some time, stop the training
                self.stop_training = True #to stop training
        #if the loss is lower than the lowest loss
        else:
            self.wait = 0 #resets the counter
            self.best_loss = current_loss #sets current loss as lowest loss
            self.weights = weights #best weights soo far
    
    #dropout layer to drop random neurons from each layer
    def dropout_weights(self,lw,dp_list):
        dropped_weights = [] #to store weights after turning off some neurons in each layer
        for ws,p in zip(lw[:-1],dp_list): #iterate over all layers till last layer
            #to make copy of data to avoid overwritting
            w = ws[0].copy()
            b = ws[1].copy()
            
            #number of neurons in each layer
            n_neurons = w.shape[1]
            
            #randomly choose some neurons based on the proportion of dropout probability
            r_n = np.random.choice(np.arange(n_neurons),int(n_neurons*p),replace=False)
            
            #set weights to zero
            w[:,r_n] = 0
            b[:,r_n] = 0
            
            dropped_weights.append((w*1/(1-p),b*1/(1-p))) #multiply the layers with inverse of keep probability (normalization)
        dropped_weights.append(lw[-1]) #add last layer too
        
        return dropped_weights #return the weights after dropping some neurons
    
    #main function to fit the neural network to given data
    def fit(self, num_epochs, lr, validation_data=None, batch_size=32):
        if batch_size == None:
            batch_size = self.m//num_epochs #batch size, to be feed into the network for each iteration
        batch_ids = np.arange(0,self.m,batch_size) #start and end indexes of the data based on the size of batch
        lw = self.initialize_random_weights() #initialize the weights of the network
        curr_loss = self.calculate_loss(self.X,lw,self.y) #calculate the loss before the update
        self.wait = 0 #the wait counter to be used in early stopping
        self.best_loss = None #best loss to stop training
        self.stop_training = False #to stop training
        
        #iterating over number of epochs
        for epoch in range(num_epochs):
            #shuffle the data each time before splitting into the batches and feeding in the network 
            x_full , y_full = shuffle(self.X,self.y)
            #prints the current loss and epoch number
            print('Currently at',epoch,'Epoch, Current Training Loss is',np.mean(curr_loss))
            
            #if validation data is provided, use early stopping to stop training
            if validation_data:
                x_valid,y_valid = validation_data #target and features of validation data
                test_loss = self.calculate_loss(x_valid,lw,y_valid) #calculate the validation loss
                print('Testing loss is',test_loss,'\n') #printing the loss
                self.early_stopping(test_loss,lw) #for early stopping
            
            #if the loss stops decreasing, stop the training procedure
            if self.stop_training:
                print('Early Stopped..')
                return None
            
            #list to store the losses of each iteration
            curr_loss = []
            
            #iterating over each mini batch (SGD)
            for i in range(len(batch_ids[:-1])):
                si = batch_ids[i] #start index of the current batch
                ei = batch_ids[i+1] #end index of the current batch
                x = x_full[si:ei] #batch features
                y = y_full[si:ei] #batch targets
                if self.dropout:
                    lwa = self.dropout_weights(lw,self.dropout)
                else:
                    lwa = lw
                grads = self.calculate_gradients(x, lwa, y) #gradients of the weights and biases of the layers
                lwu = [] #store the weights after SGD
                
                #iterate over each layer
                for w,g in zip(lw,grads):
                    #to update weights and biases of the layers
                    wu = w[0] - lr*g[0]
                    bu = w[1] - lr*g[1]
                    #store the updated weights and biases
                    lwu.append((wu,bu))
                
                lw = lwu #set the new weights as the current weights
                curr_loss.append(self.calculate_loss(self.X,lwu,self.y)) #append the loss of current network
                
        self.weights = lwu #once training is done, set the weights in class variable
        
    #prediction function to output the class of current inputs    
    def predict(self,x): 
        #to compute probabilites of given inputs
        probs = self.forward_pass(x,self.weights)[0]
        return np.argmax(probs,axis=1) #output the class label of the input

In [167]:
#to load the MNIST Data by SKlearn
digits = load_digits()

In [168]:
#to separate in the features and labels
X = digits['data']
y = digits['target']

In [169]:
#scalar for scaling the features aka pixels
scalar = StandardScaler()
X = scalar.fit_transform(X) 
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#train valid split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [170]:
#fitting the model - with dropout
model = NN(X = X_train,y = y_train, num_neurons=[50,20], activations=[tanh,ReLU,softmax],num_classes=10,dropout=None)

In [171]:
model.fit(num_epochs = 120, lr = 0.2, validation_data=(X_valid,y_valid))

Currently at 0 Epoch, Current Training Loss is 4.378131214017997
Testing loss is 4.482959084163465 

Currently at 1 Epoch, Current Training Loss is 0.7739740011718188
Testing loss is 0.34949843285594573 

Currently at 2 Epoch, Current Training Loss is 0.19879634258885828
Testing loss is 0.2239722330052484 

Currently at 3 Epoch, Current Training Loss is 0.10341943068953463
Testing loss is 0.16568576307182253 

Currently at 4 Epoch, Current Training Loss is 0.06534101245096022
Testing loss is 0.14696100129106462 

Currently at 5 Epoch, Current Training Loss is 0.045175616319192514
Testing loss is 0.1309491249776022 

Currently at 6 Epoch, Current Training Loss is 0.033650089352488546
Testing loss is 0.12531940964932023 

Currently at 7 Epoch, Current Training Loss is 0.025021539416859195
Testing loss is 0.12145273812169163 

Currently at 8 Epoch, Current Training Loss is 0.020029986237009544
Testing loss is 0.11170412508640748 

Currently at 9 Epoch, Current Training Loss is 0.015995466

Currently at 77 Epoch, Current Training Loss is 0.0007639684895194713
Testing loss is 0.08079913336654843 

Currently at 78 Epoch, Current Training Loss is 0.0007516890498718571
Testing loss is 0.08071820159640071 

Currently at 79 Epoch, Current Training Loss is 0.0007401022611635017
Testing loss is 0.08069486678382612 

Currently at 80 Epoch, Current Training Loss is 0.0007283894243981776
Testing loss is 0.08042463090765267 

Currently at 81 Epoch, Current Training Loss is 0.0007175741396551801
Testing loss is 0.08069559533736678 

Currently at 82 Epoch, Current Training Loss is 0.0007073914493938999
Testing loss is 0.0804866770077827 

Currently at 83 Epoch, Current Training Loss is 0.0006969090091509229
Testing loss is 0.08020806383054524 

Currently at 84 Epoch, Current Training Loss is 0.0006865347345547132
Testing loss is 0.0802156638569886 

Currently at 85 Epoch, Current Training Loss is 0.0006764887054243548
Testing loss is 0.08022232033925702 

Currently at 86 Epoch, Current

In [172]:
print('The Accuracy of the Neural Network on test set is',accuracy_score(y_test,model.predict(X_test)))

The Accuracy of the Neural Network on test set is 0.9638888888888889


## 4. **Bonus: Dropout

In [173]:
#fitting the model - with dropout
model = NN(X = X_train,y = y_train, num_neurons=[50,20], activations=[tanh,ReLU,softmax],num_classes=10,dropout=[0.2,0.3,None])

In [174]:
#here we will not use validation data
model.fit(num_epochs = 20, lr = 0.2)

Currently at 0 Epoch, Current Training Loss is 4.082958340459529
Currently at 1 Epoch, Current Training Loss is 1.1136455540057433
Currently at 2 Epoch, Current Training Loss is 0.3643219508175049
Currently at 3 Epoch, Current Training Loss is 0.19032758683142878
Currently at 4 Epoch, Current Training Loss is 0.14508525447646195
Currently at 5 Epoch, Current Training Loss is 0.10340994078174832
Currently at 6 Epoch, Current Training Loss is 0.08025315975532595
Currently at 7 Epoch, Current Training Loss is 0.0680460087321394
Currently at 8 Epoch, Current Training Loss is 0.05908268640847355
Currently at 9 Epoch, Current Training Loss is 0.04655971129008196
Currently at 10 Epoch, Current Training Loss is 0.04456253471798619
Currently at 11 Epoch, Current Training Loss is 0.036498629964305127
Currently at 12 Epoch, Current Training Loss is 0.033735897690187235
Currently at 13 Epoch, Current Training Loss is 0.03019930492167891
Currently at 14 Epoch, Current Training Loss is 0.02658314228

In [175]:
print('The Accuracy of the Neural Network on test set with Dropout is',accuracy_score(y_test,model.predict(X_test)))

The Accuracy of the Neural Network on test set with Dropout is 0.9777777777777777


## 3. MLP using Sklearn

In [18]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV

In [46]:
sklearn_mlp = MLPClassifier(max_iter=1000)

We will test with random hidden layer size, random activation functions applied to each layer, the optimizer and Learning Rate

In [47]:
#Creating Grid
grid = {
    'hidden_layer_sizes': [(50,30,20),(50,25,10),(25,10)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd'],
    'alpha': [0.001, 0.001, 0.01, 0.1],
}

In [48]:
clf = RandomizedSearchCV(sklearn_mlp, grid, n_jobs=-1, cv=5)
clf.fit(X_train, y_train)

In [91]:
clf.best_params_

{'solver': 'sgd',
 'hidden_layer_sizes': (50, 30, 20),
 'alpha': 0.001,
 'activation': 'tanh'}

**Result:**
The Neural Network with 3 layers with 50,30, and 20 neurons in each with activation function tanh, trained with SGD with Learning Rate 0.001, gave the best accuracy

In [51]:
print('The Accuracy on Test Data is',accuracy_score(y_test,clf.predict(X_test)))

The Accuracy on Test Data is 0.9638888888888889


## 5 **Bonus: Convolutional Neural Networks

In [18]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
    from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
import cv2
import warnings
warnings.filterwarnings("ignore")

Let's first make a custom dataset class which will be used by the Pytorch Dataloader to load the data in batches and give it to the CNN. Here we will input it the Pandas Dataframe which contains two columns, First column is names of the images and second column contains their corresponding labels.

In [6]:
class DrivingData(Dataset): #inherit from pytorch Dataset class
    def __init__ (self, data_dir, data):
        super(DrivingData,self).__init__() #initialize the parent class
        self.data_dir = data_dir #directory where the images and labels are present
        self.data = data #dataframe which contains two columns, name of the file and its label

    def __len__ (self):
        return len(self.data)
    
    #to give freedom to torch to call as many sample as it wants depaneding on the batch size
    def __getitem__(self, index):
        img_name = self.data.iloc[index,0] #name of the particular image
        img_path = os.path.join(self.data_dir, img_name) #complete path of the particular image
        
        #the input is rescaled to 66,200 to match the dimensions as presented in the paper
        img = cv2.resize(cv2.cvtColor(cv2.imread(img_path),cv2.COLOR_BGR2RGB),(66,200)).reshape(3,66,200)
        label = self.data.iloc[index,-1] #to take the label
        return img,label

The architecture of the network is:
<img src="architecture.png" width="250" align="center">

In [25]:
class NN(nn.Module):
    def __init__ (self, in_channels=3, out_channels=1):
        super(NN,self).__init__()
        self.in_channels = in_channels #input channels which is 3
        self.norm = nn.LayerNorm([in_channels,66,200], elementwise_affine=False) #normalization layer: takes channel wise norm
        self.c1 = nn.Conv2d(in_channels=in_channels, out_channels=24, kernel_size=5, stride=2) #first conv layer
        self.c2 = nn.Conv2d(in_channels=24, out_channels=36, kernel_size=5, stride=2) #second conv layer
        self.c3 = nn.Conv2d(in_channels=36, out_channels=48, kernel_size=5, stride=2) #third conv layer
        self.c4 = nn.Conv2d(in_channels=48, out_channels=64, kernel_size=3, stride=1) #fourth conv layer
        self.c5 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1) #fifth conv layer
        self.fl = nn.Flatten() #flatten layer
        self.fc1 = nn.Linear(in_features=1152, out_features=100) #first fc layer
        self.fc2 = nn.Linear(in_features=100, out_features=50) #second fc layer
        self.fc3 = nn.Linear(in_features=50, out_features=10) #third fc layer
        self.fc = nn.Linear(in_features=10, out_features=out_channels) #final fc layer
        self.act = nn.ELU() #activation function which is ELU in this case

    def forward(self,x):
        #forward pass of convolutional layers
        conv_output = self.act(self.c5(self.act(self.c4(self.act(self.c3(self.act(self.c2(self.act(self.c1(x))))))))))
        
        #forward pass of fc layers till last layer (excluded)
        fc_layers = self.act(self.fc3(self.act(self.fc2(self.act(self.fc1(self.fl(conv_output)))))))
        
        #forward pass of last fc layer
        out = self.fc(fc_layers)
        return out #return the prediction of steering angle

The early stopping helps us in avoiding overfitting. The below implementation of EarlyStopping is used.

In [26]:
##Implementation of Early Stopping Adopted from https://github.com/Bjarten/early-stopping-pytorch
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print            
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func
    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [27]:
## Hyperparams
BATCH_SIZE = 128 #in each iteration batch of 128 images will be feed to CNN
LR = 3e-4 #learning rate
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') #this ensures to train on GPU if avialble
NUM_EPOCHS = 500 #number of epochs, we can set it to any high number because we are using early stopping anyways
PATIENCE = 7 #patience value for early stopping
PATH = r"/content/driving_dataset" #path where the images and labels are stored

## Defining Optimizer and Loss
model = NN().double().to(DEVICE) #make the model, send it to GPU and set double precision to avoid errors
optimizer = Adam(model.parameters(),lr=LR) #the adam optimizer is used to train the network
criterion = nn.MSELoss() #MSE loss, when loss is applied, sqrt is taken to make it RMSE

## Data Loaders
data = pd.read_csv(os.path.join(PATH,'angles.txt'), header=None, sep=' ') #loading the label files which also has image names
dt,dv = train_test_split(data.iloc[:-10000,:], test_size=0.2, random_state=42) #train validation split
train_data = DrivingData(PATH, dt) #train data
valid_data = DrivingData(PATH, dv) #validation data
test_data = DrivingData(PATH, data.iloc[-10000:,:]) #testing data (last 10k images are used for testing)
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True) #train data loader
valid_dataloader = DataLoader(valid_data, batch_size=BATCH_SIZE) #validation data
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE) #testing data loader

In [None]:
#training loop is also taken from https://github.com/Bjarten/early-stopping-pytorch

# to track the training loss as the model trains
train_losses = []
# to track the validation loss as the model trains
valid_losses = []
# to track the average training loss per epoch as the model trains
avg_train_losses = []
# to track the average validation loss per epoch as the model trains
avg_valid_losses = [] 

# initialize the early_stopping object
early_stopping = EarlyStopping(patience=PATIENCE, verbose=True)

#iterate till max epochs
for e in range(NUM_EPOCHS):
    model.train() # prep model for training
    for batch_idx, (data,target) in enumerate(train_dataloader): #iterating over each batch
        data = data.to(DEVICE).double() #send data to GPU if available and set the data in double precision
        target = target.to(DEVICE).double() #send target to GPU if available and set the data in double precision

        ## forward pass
        scores = model(data) #calculate the steering angle prediction
        loss = torch.sqrt(criterion(scores, target)) #calculate the MSE

        optimizer.zero_grad() #to avoid accumulating gradients of previous batches
        loss.backward() #backpropagation

        #update the parameters
        optimizer.step()

        # record training loss
        train_losses.append(loss.item())

    model.eval() # prep model for evaluation
    for data, target in valid_dataloader: #iterating over each validation batch after an epoch
        data = data.to(DEVICE).double() #send data to GPU if available and set the data in double precision
        target = target.to(DEVICE).double() #send target to GPU if available and set the data in double precision
        
        # forward pass: compute predicted steering angles by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = torch.sqrt(criterion(output, target))
        # record validation loss
        valid_losses.append(loss.item())

    # print training/validation statistics 
    
    # calculate average loss over an epoch
    train_loss = np.average(train_losses) #mean of RMSE for each epoch in training set
    valid_loss = np.average(valid_losses) #mean of RMSE for each epoch in validation set
    avg_train_losses.append(train_loss)
    avg_valid_losses.append(valid_loss)

    epoch_len = len(str(NUM_EPOCHS))
    
    #printing the training and validation loss after each epoch
    print_msg = (f'[{e:>{epoch_len}}/{NUM_EPOCHS:>{epoch_len}}] ' +
                  f'train_loss: {train_loss:.5f} ' +
                  f'valid_loss: {valid_loss:.5f}')
    
    print(print_msg+'\n')
    
    # clear lists to track next epoch
    train_losses = []
    valid_losses = []
    
    # early_stopping needs the validation loss to check if it has decresed, 
    # and if it has, it will make a checkpoint of the current model
    early_stopping(valid_loss, model)
    if early_stopping.early_stop:
        print("Early stopped..")
        break

[  0/500] train_loss: 31.18434 valid_loss: 32.25564

Validation loss decreased (inf --> 32.255641).  Saving model ...
[  1/500] train_loss: 30.94227 valid_loss: 32.25326

Validation loss decreased (32.255641 --> 32.253258).  Saving model ...
[  2/500] train_loss: 30.98432 valid_loss: 32.25113

Validation loss decreased (32.253258 --> 32.251133).  Saving model ...
[  3/500] train_loss: 30.96847 valid_loss: 32.25476

EarlyStopping counter: 1 out of 7
[  4/500] train_loss: 30.96962 valid_loss: 32.25139

EarlyStopping counter: 2 out of 7
[  5/500] train_loss: 31.00691 valid_loss: 32.25099

Validation loss decreased (32.251133 --> 32.250990).  Saving model ...
[  6/500] train_loss: 31.09519 valid_loss: 32.25281

EarlyStopping counter: 1 out of 7
[  7/500] train_loss: 31.02684 valid_loss: 32.25110

EarlyStopping counter: 2 out of 7
[  8/500] train_loss: 31.03365 valid_loss: 32.25144

EarlyStopping counter: 3 out of 7
[  9/500] train_loss: 31.01149 valid_loss: 32.25192

EarlyStopping counter:

Note: The training was stopped at patience 6 of early stopping becuase runtime expired, but the model was not improving so the retraining was not necessary

In [32]:
#load the weights of the best model which gave the minimum error
model.load_state_dict(torch.load('checkpoint.pt'))

<All keys matched successfully>

In [34]:
test_losses = [] #to store RMSE of each batch in testing set
model.eval() # prep model for evaluation
for data, target in test_dataloader: #iterating over each batch in test set
    data = data.to(DEVICE).double() #send data to GPU if available and set the data in double precision
    target = target.to(DEVICE).double() #send target to GPU if available and set the data in double precision
    
    # forward pass: compute predicted steering angles by passing inputs to the model
    output = model(data)
    # calculate the loss
    loss = torch.sqrt(criterion(output, target))
    # record validation loss
    test_losses.append(loss.item())

In [40]:
#to get a single value of RMSE
test_loss = np.mean(test_losses)
print(f'The RMSE of test set is {test_loss:.03f}')

The RMSE of test set is 15.169


References: 
1. https://towardsdatascience.com/how-to-create-a-gif-from-matplotlib-plots-in-python-6bec6c0c952c
2. https://github.com/aladdinpersson/Machine-Learning-Collection/blob/master/ML/algorithms/neuralnetwork/NN.py
3. https://panjeh.medium.com/scikit-learn-hyperparameter-optimization-for-mlpclassifier-4d670413042b
4. https://github.com/Bjarten/early-stopping-pytorch