In [1]:
import numpy as np

# Shared random generator with a fixed seed, so that every run of the notebook
# draws the same sequence of random numbers.
rng = np.random.default_rng(seed=0)

### Activation Functions:

In [2]:
# from IPython.display import Image
  
# # get the image
# Image(data=r"C:\Users\rachi\DL\ANN\Activations.png",width=500, height=350)

In [3]:
def sigmoid(z):
    """Logistic sigmoid 1 / (1 + e^(-z)), applied element-wise."""
    denominator = 1 + np.exp(-z)
    return 1/denominator

def grad_sigmoid(z):
    """Derivative of the sigmoid at z: s * (1 - s) with s = sigmoid(z)."""
    s = sigmoid(z)
    return s*(1-s)

def relu(z):
    """Rectified linear unit: keeps entries with z >= 0 and replaces the rest by 0."""
    non_negative = z >= 0
    return np.where(non_negative, z, 0)

def grad_relu(z):
    """Derivative of ReLU: 1 where z >= 0 (including z == 0), 0 elsewhere."""
    non_negative = z >= 0
    return np.where(non_negative, 1, 0)

def prelu(z):
    """Leaky rectifier with a fixed negative slope of 0.01.

    Entries with z >= 0 pass through unchanged; the others are scaled by 0.01.
    """
    leaky_branch = 0.01*z
    return np.where(z >= 0, z, leaky_branch)

def grad_prelu(z):
    """Derivative of the leaky rectifier: 1 where z >= 0, 0.01 elsewhere."""
    non_negative = z >= 0
    return np.where(non_negative, 1, 0.01)

def erelu(z):
    """Exponential linear unit (ELU) with alpha = 0.05, applied element-wise.

    Returns z where z >= 0 and 0.05 * (exp(z) - 1) where z < 0, so the
    function is continuous at 0 and tends to -0.05 for very negative z.
    """
    # The scale has to multiply (exp(z) - 1) as a whole: 0.05*exp(z) - 1 would
    # jump from 0 to -0.95 when crossing z = 0.
    # np.where evaluates both branches for every entry, so the exponent is
    # clamped to <= 0 to keep large positive z from overflowing in exp().
    negative_part = np.minimum(z, 0)
    return np.where(z >= 0, z, 0.05 * (np.exp(negative_part) - 1))

def grad_erelu(z):
    """Derivative of the exponential linear unit with alpha = 0.05.

    Returns 1 where z >= 0 and 0.05 * exp(z) where z < 0.
    """
    # Computed directly instead of as erelu(z) + 0.05: that shortcut is only
    # valid when erelu returns exactly 0.05 * (exp(z) - 1), whereas
    # 0.05 * exp(z) is the derivative of the negative branch in any case.
    # np.where evaluates both branches, so clamp the exponent to <= 0 to avoid
    # overflow for large positive z.
    negative_part = np.minimum(z, 0)
    return np.where(z >= 0, 1, 0.05 * np.exp(negative_part))

def tanh(z):
    """Hyperbolic tangent, computed element-wise as 2 / (1 + e^(-2z)) - 1."""
    scaled_sigmoid = 2/(1+np.exp(-2*z))
    return scaled_sigmoid-1

def grad_tanh(z):
    """Derivative of tanh at z: 1 - tanh(z)^2."""
    t = tanh(z)
    return 1 - t**2

# define dictionary of activation functions and their derivatives
# Both dictionaries use the same keys: grad_act_func[name] must be the
# derivative of act_func[name], because the networks look both up by the same
# activation name.
act_func = {'sigmoid':sigmoid,'relu':relu,'tanh':tanh,'prelu':prelu,'erelu':erelu}
grad_act_func = {'sigmoid':grad_sigmoid,'relu':grad_relu,'tanh':grad_tanh,'prelu':grad_prelu,'erelu':grad_erelu}

The output layer uses either the identity (regression) or the softmax (classification) activation.

The input to the softmax function will always be a matrix of size n × k, where n is the number of data points and k is the number of classes. Since we need a probability distribution for each data point, the softmax is computed row-wise. Softmax is used in the last layer for classification problems.

np keepdims: https://stackoverflow.com/questions/39441517/in-numpy-sum-there-is-parameter-called-keepdims-what-does-it-do

In [4]:
def identity(z):
    """Identity output activation: hands back its argument unchanged (same object)."""
    result = z
    return result

def softmax(z):
    """Row-wise softmax of a 2-D array.

    Parameters
    ----------
    z : 2-D array; each row is turned into a probability distribution.

    Returns
    -------
    Array of the same shape whose rows are non-negative and sum to 1.

    Raises
    ------
    AssertionError if z is not 2-dimensional.
    """
    assert z.ndim == 2

    # Subtract the row maximum to prevent overflow in exp(). This is done on a
    # new array (not with `z -= ...`) so the caller's array is left untouched.
    shifted = z - z.max(axis=1, keepdims=True)

    # Exponentiate once and reuse the result for numerator and denominator.
    exps = np.exp(shifted)
    return exps / exps.sum(axis=1, keepdims=True)

# Output-layer activations, looked up by name.
output_layer = {
    'identity': identity,
    'softmax': softmax,
}

#### Loss functions:
https://towardsdatascience.com/loss-functions-and-their-use-in-neural-networks-a470e703f1e9#:~:text=A%20loss%20function%20is%20a,the%20predicted%20and%20target%20outputs.

In [5]:
def mse_loss(y,yp):
    """Sum of squared errors between yp and y, divided by the number of rows of y."""
    error = yp - y
    n_rows = y.shape[0]
    return np.sum(error*error)/n_rows

def r2_score(y,yp):
    """Coefficient of determination: 1 - SS_res / SS_tot.

    SS_res is the sum of squared differences between yp and y; SS_tot is the
    sum of squared differences between y and its overall mean.
    """
    residual = yp - y
    ss_res = np.sum(residual*residual)

    centered = np.mean(y) - y
    ss_tot = np.sum(centered*centered)

    return 1 - (ss_res/ss_tot)

def cce_loss(y,yp,eps=1e-12):
    """Categorical cross-entropy, divided by the number of rows of y.

    Parameters
    ----------
    y   : target array (e.g. one-hot rows), same shape as yp.
    yp  : predicted probabilities.
    eps : lower bound applied to yp before taking the logarithm.

    Returns
    -------
    -sum(y * log(max(yp, eps))) / y.shape[0]
    """
    # A predicted probability can underflow to exactly 0. log(0) is -inf and
    # 0 * -inf is NaN, which would poison the whole loss, so bound yp from
    # below first. Values of yp above eps are unaffected.
    safe_yp = np.maximum(yp, eps)
    return -1*np.sum(y*np.log(safe_yp))/(y.shape[0])

# Loss functions, looked up by name.
loss_func = {
    'mse': mse_loss,
    'cce': cce_loss,
}

In [6]:
# count number of parameters

def count_params(layers):
    """Total number of weights and biases of a fully connected network.

    `layers` is the sequence of layer sizes. Each pair of consecutive layers
    contributes layers[i-1] * layers[i] weights plus layers[i] biases.
    """
    return sum(
        fan_in * fan_out + fan_out
        for fan_in, fan_out in zip(layers, layers[1:])
    )

#### Parameter initialization:

The weight list **W** has the same length as `layers`, where **W[ i ]** stores the matrix of weights connecting layer i-1 to layer i.
Hence the matrix at **W[ i ]** has shape layers[i-1] × layers[i]. The bias list **b** also has the same length as `layers`, and **b[ i ]** contains an array of biases for layer i. The first element of **W** and **b** is always None.

To make the gradient descent update simpler, it will be useful to have a **master vector, $\theta$,** that has a reference to all the parameters in the network.<br>
We will do the same for the gradients  $\theta^{(g)}$. So, whenever $\theta$ is updated, the weights W, will also be updated and vice-versa.


One way to do this is to first start with the master vector and then **reshape chunks of it into the dimensions of a weight matrix.** Reshaping an
array usually returns a view of the array and not a copy. To understand this behavior better, refer to NumPy's documentation on "Copies and
Views": https://numpy.org/doc/stable/user/basics.copies.html

In [7]:
def init_params(layers):
    """Allocate flat parameter/gradient vectors and per-layer views into them.

    Parameters
    ----------
    layers : sequence of layer sizes, input layer first.

    Returns
    -------
    theta, gtheta : flat arrays holding every parameter / every gradient entry.
    w, b          : lists as long as `layers`; w[i] has shape
                    (layers[i-1], layers[i]) and b[i] has shape (layers[i],).
                    Both are reshaped slices of `theta`. Index 0 is None.
    gw, gb        : the same layout, sliced out of `gtheta`.
    """
    depth = len(layers)
    n_params = count_params(layers)

    # Both vectors are filled with standard-normal draws, theta first and
    # gtheta second (the order matters for the state of the shared generator).
    theta = rng.standard_normal(n_params)
    gtheta = rng.standard_normal(n_params)

    w, b, gw, gb = ([None] * depth for _ in range(4))

    # `offset` marks where the next layer's block starts inside the flat
    # vectors. Each block is laid out as: weight matrix, then bias vector.
    offset = 0
    for i in range(1, depth):
        fan_in, fan_out = layers[i - 1], layers[i]

        # Weight matrix of layer i: reshaped slice of the flat vector.
        w_end = offset + fan_in * fan_out
        w[i] = theta[offset:w_end].reshape(fan_in, fan_out)
        gw[i] = gtheta[offset:w_end].reshape(fan_in, fan_out)

        # Bias vector of layer i directly follows its weights.
        b_end = w_end + fan_out
        b[i] = theta[w_end:b_end].reshape(fan_out)
        gb[i] = gtheta[w_end:b_end].reshape(fan_out)

        offset = b_end

    return theta, gtheta, w, b, gw, gb

#### Network class
1. Forward Pass
Z is the linear combination or pre-activations of a layer 
$$
Z[l] = A[l-1] \cdot w[l] + b[l]
$$

$A[l-1]$ is the activations of previous layer

2. Backward Pass:
Gradients of the loss with respect to the output-layer pre-activations Z are given by $ Z_{L}^{(g)} = \hat{Y} - Y $, which is the same for regression and classification. The other gradients can then be updated iteratively using the following formulas:<br>
2.1 $ W_{L}^{(g)} =  A_{L-1}^{T} Z_{L}^{(g)}$<br><br>
2.2 $ B_{L}^{(g)} = Z_{L}^{(g)}{^{T}} \mathbf{1}$ (i.e. $Z_{L}^{(g)}$ summed over the data points)<br><br>
2.3 $ A_{L-1}^{(g)} = Z_{L}^{(g)} W_{L}^{T}$<br><br>
2.4 $ Z_{L-1}^{(g)} = A_{L-1}^{(g)} \odot g^{'}(Z_{L-1})$<br><br>

In [8]:
class NN_regressor:
    """Fully connected feed-forward network trained by full-batch gradient descent.

    Parameters
    ----------
    layers        : sequence of layer sizes, input layer first.
    activation    : key into `act_func` / `grad_act_func` (hidden layers).
    out           : key into `output_layer` (activation of the last layer).
    loss_function : key into `loss_func`.

    Attributes set by `fit`: `losses` and `accuracy_score` (one value per
    epoch; the latter holds the R^2 score).
    """

    def __init__(self, layers, activation, out, loss_function):
        self.layers = layers
        # w/b (and gw/gb) are the per-layer arrays returned by init_params
        # together with the flat vectors theta (and gtheta).
        self.theta, self.gtheta, self.w, self.b, self.gw, self.gb = init_params(layers)

        self.hid_act = act_func[activation]
        self.grad_act = grad_act_func[activation]
        self.out_layer_act = output_layer[out]

        self.loss = loss_func[loss_function]

    def forward(self, X):
        """Propagate X through the network and return the output activations.

        The pre-activations and activations of every layer are stored in
        `self.z` and `self.a` (index 0 holds X) for use by `backward`.
        """
        n_layers = len(self.layers)
        self.z = [None] * n_layers
        self.a = [None] * n_layers

        self.z[0] = X
        self.a[0] = X

        last = n_layers - 1
        for i in range(1, n_layers):
            self.z[i] = (self.a[i-1] @ self.w[i]) + self.b[i]
            # The hidden activation is only needed for hidden layers; the last
            # layer gets the output activation below, so evaluating hid_act on
            # it would be discarded work.
            if i < last:
                self.a[i] = self.hid_act(self.z[i])

        self.a[-1] = self.out_layer_act(self.z[-1])

        return self.a[-1]

    def backward(self, y, ypred):
        """Back-propagate and write the gradients into `self.gw` / `self.gb`.

        Must be called after `forward`, whose stored `self.a` / `self.z` it
        reads. The gradients are summed over the samples (not averaged).
        """
        # Gradient w.r.t. the output pre-activations.
        gz = ypred - y

        # Walk from the last layer back to layer 1.
        for i in range(len(self.layers)-1, 0, -1):
            # Written in place so that the existing gradient arrays are kept.
            self.gw[i][:, :] = self.a[i-1].T @ gz
            self.gb[i][:] = np.sum(gz, axis=0)

            # Propagate to the previous layer. Layer 0 is the input and has no
            # parameters, so its gradient is never needed.
            if i > 1:
                ga = gz @ self.w[i].T
                gz = ga*self.grad_act(self.z[i-1])

    def fit(self, X, y, lr, epochs):
        """Run `epochs` full-batch gradient-descent steps with learning rate `lr`.

        Records the loss and the R^2 score of every epoch in `self.losses` and
        `self.accuracy_score`, and prints both every 10th epoch.
        """
        self.losses = []
        self.accuracy_score = []

        for i in range(epochs):
            # Forward pass
            ypred = self.forward(X)

            # Loss and score of the current parameters
            loss_val = self.loss(y,ypred)
            acc_score = r2_score(y,ypred)
            self.losses.append(loss_val)
            self.accuracy_score.append(acc_score)

            # Backward pass
            self.backward(y,ypred)

            # Gradient-descent step, in place on the flat parameter vector
            self.theta -= lr*self.gtheta

            if i % 10 == 0:
                print(f"Epoch:{i} Loss: {loss_val} Accuracy: {acc_score}",end='\n---------------------\n')

    def predict(self, X):
        """Return the network output for X."""
        ypred = self.forward(X)
        return ypred

In [9]:
class NN_classifier:
    """Fully connected feed-forward classifier trained by full-batch gradient descent.

    Parameters
    ----------
    layers        : sequence of layer sizes, input layer first.
    activation    : key into `act_func` / `grad_act_func` (hidden layers).
    out           : key into `output_layer` (activation of the last layer).
    loss_function : key into `loss_func`.

    Attribute set by `fit`: `losses` (one value per epoch).
    """

    def __init__(self, layers, activation, out, loss_function):
        self.layers = layers
        # w/b (and gw/gb) are the per-layer arrays returned by init_params
        # together with the flat vectors theta (and gtheta).
        self.theta, self.gtheta, self.w, self.b, self.gw, self.gb = init_params(layers)

        self.hid_act = act_func[activation]
        self.grad_act = grad_act_func[activation]
        self.out_layer_act = output_layer[out]

        self.loss = loss_func[loss_function]

    def forward(self, X):
        """Propagate X through the network and return the output activations.

        The pre-activations and activations of every layer are stored in
        `self.z` and `self.a` (index 0 holds X) for use by `backward`.
        """
        n_layers = len(self.layers)
        self.z = [None] * n_layers
        self.a = [None] * n_layers

        self.z[0] = X
        self.a[0] = X

        last = n_layers - 1
        for i in range(1, n_layers):
            self.z[i] = (self.a[i-1] @ self.w[i]) + self.b[i]
            # The hidden activation is only needed for hidden layers; the last
            # layer gets the output activation below, so evaluating hid_act on
            # it would be discarded work.
            if i < last:
                self.a[i] = self.hid_act(self.z[i])

        self.a[-1] = self.out_layer_act(self.z[-1])

        return self.a[-1]

    def backward(self, y, ypred):
        """Back-propagate and write the gradients into `self.gw` / `self.gb`.

        Must be called after `forward`, whose stored `self.a` / `self.z` it
        reads. The gradients are summed over the samples (not averaged).
        """
        # Gradient w.r.t. the output pre-activations.
        gz = ypred - y

        # Walk from the last layer back to layer 1.
        for i in range(len(self.layers)-1, 0, -1):
            # Written in place so that the existing gradient arrays are kept.
            self.gw[i][:, :] = self.a[i-1].T @ gz
            self.gb[i][:] = np.sum(gz, axis=0)

            # Propagate to the previous layer. Layer 0 is the input and has no
            # parameters, so its gradient is never needed.
            if i > 1:
                ga = gz @ self.w[i].T
                gz = ga*self.grad_act(self.z[i-1])

    def fit(self, X, y, lr, epochs):
        """Run `epochs` full-batch gradient-descent steps with learning rate `lr`.

        Records the loss of every epoch in `self.losses` and prints it every
        10th epoch.
        """
        self.losses = []

        for i in range(epochs):
            # Forward pass
            ypred = self.forward(X)

            # Loss of the current parameters
            loss_val = self.loss(y,ypred)
            self.losses.append(loss_val)

            # Backward pass
            self.backward(y,ypred)

            # Gradient-descent step, in place on the flat parameter vector
            self.theta -= lr*self.gtheta

            if i % 10 == 0:
                print(f"Epoch:{i} Loss: {loss_val}",end='\n---------------------\n')

    def predict(self, X):
        """Return, for every row of X, the index of the largest output activation."""
        ypred = self.forward(X)
        return np.argmax(ypred, axis=1)