# Neural Networks and Deep Learning 
### Course 1 of the Deep Learning Specialisation

### House Price prediction

![House Price prediction](./img/motivating-example.drawio.svg)

### Scale driving performance

![Scale driving performance](./img/scale-driving-performance.drawio.svg)

### Training data notation

![Training data notation](./img/training-data-notation.drawio.svg)

### Logistic regression

![Logistic regression](./img/logistic-regression.drawio.svg)

### Neural network

![Neural network](./img/nn.drawio.svg)


In [1]:
from typing import Callable

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from scipy.special import expit


def relu(x):
    """Rectified linear unit: keep the positive entries of ``x``, zero the rest.

    Implemented by multiplying ``x`` with its own ``x > 0`` mask, so it works
    elementwise on NumPy arrays as well as on plain numbers.
    """
    is_positive = x > 0
    return x * is_positive

def relu_derivative(x):
    """Derivative of ReLU: 1.0 where ``x > 0`` and 0.0 elsewhere (including at 0).

    The boolean mask is multiplied by ``1.0`` so the result is floating point.
    """
    is_positive = x > 0
    return is_positive * 1.0

# Logistic sigmoid; this is simply an alias for scipy.special.expit.
sigmoid = expit

def sigmoid_derivative(z):
    """Derivative of the logistic sigmoid evaluated at pre-activation ``z``.

    Uses the identity sigma'(z) = sigma(z) * (1 - sigma(z)). The sigmoid is
    evaluated once and reused rather than being computed twice.
    """
    s = sigmoid(z)
    return s * (1.0 - s)

def leaky_relu(x, leaky_constant:float = 0.01):
    """Leaky ReLU: pass positive entries through, scale the rest by ``leaky_constant``.

    Entries that are not strictly positive (including 0) take the scaled branch.
    """
    scaled = x * leaky_constant
    is_positive = x > 0.0
    return np.where(is_positive, x, scaled)

def leaky_relu_derivative(x, leaky_constant=0.01):
    """Derivative of leaky ReLU: 1 where ``x > 0``, ``leaky_constant`` elsewhere.

    The value at ``x == 0`` is taken from the non-positive branch.
    """
    is_positive = x > 0
    return np.where(is_positive, 1, leaky_constant)

def log_loss(A, Y, eps: float = 1e-15):
    """Mean binary cross-entropy between predictions ``A`` and labels ``Y``.

    Args:
        A: Predicted probabilities; ``A.shape[1]`` is used as the sample count,
            so samples are expected along axis 1 (e.g. shape ``(1, m)``).
        Y: Labels with the same layout as ``A``.
        eps: Predictions are clipped to ``[eps, 1 - eps]`` before taking logs.

    Returns:
        The result of the matrix products, i.e. a ``(1, 1)`` array when ``A``
        and ``Y`` have shape ``(1, m)``.
    """
    m = A.shape[1]  # m is number samples
    # A saturated prediction (exactly 0 or 1) would otherwise produce
    # log(0) = -inf, and 0 * -inf = nan would poison the whole cost.
    A_safe = np.clip(A, eps, 1 - eps)
    return -(1 / m) * (Y @ np.log(A_safe).T + (1 - Y) @ np.log(1 - A_safe).T)

def square_loss(A, Y):
    """Mean squared error between predictions ``A`` and targets ``Y``.

    ``A.shape[1]`` is used as the sample count, so samples are expected along
    axis 1. For ``(1, m)`` inputs the result is a ``(1, 1)`` array holding
    ``(1/m) * sum((A - Y) ** 2)``.

    The previous version carried a leading minus sign (as in the log-loss
    formula), which made the reported squared error negative.
    """
    m = A.shape[1]  # m is number samples
    residual = A - Y
    return (1 / m) * residual @ residual.T

# Registry mapping an activation name (as used in NeuralNetwork's
# ``layer_activations``) to the function applied in the forward pass.
ACTIVATION_FUNCTIONS : dict[str, Callable] = {
    "relu": relu,
    "leaky_relu": leaky_relu,
    "sigmoid": expit,
}

# Registry mapping the same activation names to their derivatives, looked up
# during back-propagation. Keys must stay in sync with ACTIVATION_FUNCTIONS.
ACTIVATION_FUNCTION_DERIVATIVES: dict[str, Callable] = {
    "relu": relu_derivative,
    "leaky_relu": leaky_relu_derivative,
    "sigmoid": sigmoid_derivative,
}

# Registry of the available cost functions by name.
LOSS_FUNCTIONS: dict[str, Callable] = {
    "log_loss": log_loss,
    "square_loss": square_loss
}

In [2]:
import logging

# No name is passed, so this is the root logger (records show up as "root").
logger = logging.getLogger()

class NeuralNetwork:

    def __init__(
        self,
        layer_sizes: list[int],
        learning_rate: float = 0.5,
        layer_activations: dict[int, str] | None = None,
        cost_function: str = "log_loss"
    ):
        self.learning_rate = learning_rate
        self.layer_sizes = layer_sizes
        self.L = len(layer_sizes) - 1
        self.m = layer_sizes[0]
        self.cost_function = cost_function
        self.learning_rate
        self.layer_activations = layer_activations
        if not layer_activations:
            self.layer_activations = {l:"sigmoid" for l in range(1, self.L)} | {self.L:"sigmoid"}

    def initialise_weights(self) -> None:
        # Note layer indexes are off by one because python indexes by 0
        # so Ws[0] is really W^{[1]}

        # This is using He initialisation. Try changing to * 0.01 and see the change in cost plot.
        self.Ws = {
            l:np.random.normal(size=(n_l, n_l_minus_1)) * np.sqrt(2 / n_l_minus_1)
            for (l, (n_l, n_l_minus_1)) in enumerate(zip(self.layer_sizes[1:], self.layer_sizes), start=1)
        }
        self.bs = {l:np.zeros((n_l, 1)) for l, n_l in enumerate(self.layer_sizes[1:], start=1)}
        logger.info("Weights initialised")
        logger.debug(f"{self.Ws=}")

    def forward(self, X, cache=False) -> None:
        Zs, As = {}, {0:X}
        for l in range(1, self.L + 1):
            Zs[l] = self.Ws[l] @ As[l-1] + self.bs[l]
            g = ACTIVATION_FUNCTIONS[self.layer_activations[l]]
            logger.debug(f"Applying {self.layer_activations[l]} in layer{l}")
            As[l] = g(Zs[l])
        if cache:
            self.Zs, self.As = Zs, As
        return As[self.L]

    def backward(self, Y) -> None:
        dZs = {self.L: self.As[self.L] - Y}
        m = self.As[0].shape[1]
        dWs, dbs = {}, {}
        # [w1, w2, w3]
        for l in range(self.L, 0, -1):
            logger.debug(f"calculating dZ for layer_id {l}")
            if l != self.L:
                dZs[l] = self.Ws[l+1].T @ dZs[l+1] * \
                    ACTIVATION_FUNCTION_DERIVATIVES[self.layer_activations[l]](self.Zs[l])
                    # For sigmoid we could just use self.As[l] * (1 - self.As[l])
            dWs[l] = (1 / m) * dZs[l] @ self.As[l-1].T
            dbs[l] = (1 / m) * np.sum(dZs[l], axis=1, keepdims=True)
        self.dWs, self.dbs = dWs, dbs

    def update_weights(self):
        for l in range(1, self.L + 1):
            self.Ws[l] -= self.learning_rate * self.dWs[l]
            self.bs[l] -= self.learning_rate * self.dbs[l]

    def train(self, X, Y, n_epochs=10, log_every=100, plot_cost=False):
        import plotly.graph_objs as go
        from IPython.display import display, clear_output

        costs = []
        epochs = []

        if plot_cost:
            fig = go.FigureWidget()
            fig.add_scatter(x=[], y=[], mode='lines+markers', name='Cost')
            fig.update_layout(title='Training Cost over Epochs', xaxis_title='Epoch', yaxis_title='Cost')
            display(fig)

        for epoch in range(n_epochs):
            A = self.forward(X, cache=True)
            cost = self.cost(A, Y)
            if epoch % log_every == 0:
                logger.info(f"Cost after epoch {epoch} = {cost}")
            if plot_cost:
                costs.append(float(cost))
                epochs.append(epoch)
                if epoch % 10 == 0:
                    with fig.batch_update():
                        fig.data[0].x = epochs
                        fig.data[0].y = costs
            self.backward(Y)
            self.update_weights()

    def cost(self, A, Y):
        if self.cost_function == "log_loss":
            return log_loss(A, Y)
        elif self.cost_function == "square_loss":
            return square_loss(A, Y)
        else:
            raise Exception(f"Incorrect value for self.cost_function:= {self.cost_function}")

    def predict(self, X, return_probability=False):
        Y_hat = self.forward(X)
        if return_probability:
            return Y_hat
        return np.where(Y_hat>0.5, 1, 0)


In [4]:
def example():
    """Train a small network on the pre-processed Titanic data and report test accuracy.

    Reads four feather files from ``../titanic/processed``, trains for 2000
    epochs with the live cost plot enabled, then prints array shapes and the
    accuracy on the test split.
    """
    data_dir = '../titanic/processed'

    def load(name):
        # Transposed so that each column is one sample, the layout NeuralNetwork uses.
        return pd.read_feather(f'{data_dir}/{name}.feather').to_numpy().T

    X_train = load('X_train')
    y_train = load('y_train')
    X_test = load('X_test')
    y_test = load('y_test')

    logging.basicConfig(level=logging.INFO, force=True)

    # Input width comes from the data instead of a hard-coded 30.
    layer_sizes = [X_train.shape[0], 50, 20, 1] # L = 3, A[3] = Yhat
    neural_network = NeuralNetwork(layer_sizes=layer_sizes)

    # To try ReLU hidden layers instead (the original note says this converges
    # better), build the network like this. The three activations match the
    # three non-input layers of layer_sizes above:
    # neural_network = NeuralNetwork(
    #     layer_sizes=layer_sizes,
    #     layer_activations={1:"relu", 2:"relu", 3:"sigmoid"},
    # )

    neural_network.initialise_weights()
    neural_network.train(X_train, y_train, n_epochs=2000, plot_cost=True)

    y_test_pred = neural_network.predict(X_test)
    accuracy = (y_test_pred == y_test).sum() / y_test.shape[1]

    print(f"X_train.shape: {X_train.shape}")
    print(f"layer_activations = {neural_network.layer_activations}")
    print(f"L = {neural_network.L}")
    print("ws shapes:",  [(i, w.shape) for i, w in neural_network.Ws.items()])
    print("As shapes:",  [(i, a.shape) for i, a in neural_network.As.items()])
    print("zs shapes:",  [(i, z.shape) for i, z in neural_network.Zs.items()])
    print(f"Accuracy on test: {accuracy}")

# Run the demo; the lines that follow in this export are its captured output.
example()

INFO:root:Weights initialised


FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Cost',
              'type': 'scatter',
              'uid': '0ce1cee0-b776-4139-a483-7648ff80e29e',
              'x': [],
              'y': []}],
    'layout': {'template': '...',
               'title': {'text': 'Training Cost over Epochs'},
               'xaxis': {'title': {'text': 'Epoch'}},
               'yaxis': {'title': {'text': 'Cost'}}}
})

INFO:root:Cost after epoch 0 = [[0.82054377]]

Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)

INFO:root:Cost after epoch 100 = [[0.48283017]]
INFO:root:Cost after epoch 200 = [[0.43837833]]
INFO:root:Cost after epoch 300 = [[0.42343525]]
INFO:root:Cost after epoch 400 = [[0.41595906]]
INFO:root:Cost after epoch 500 = [[0.41096105]]
INFO:root:Cost after epoch 600 = [[0.4069837]]
INFO:root:Cost after epoch 700 = [[0.40356924]]
INFO:root:Cost after epoch 800 = [[0.40051146]]
INFO:root:Cost after epoch 900 = [[0.39769467]]
INFO:root:Cost after epoch 1000 = [[0.39504862]]
INFO:root:Cost after epoch 1100 = [[0.39253103]]
INFO:root:Cost after epoch 1200 = [[0.3901173]]
INFO:root:Cost after epoch 1300 = [[0.38779262]]
INFO:root:Cost after epoch 1400 = [[0.38554666]]
INFO:root:Cost after epoch 1500 = [[0.38337106]]
INFO:root:Cost after epoch 

X_train.shape: (30, 712)
layer_activations = {1: 'sigmoid', 2: 'sigmoid', 3: 'sigmoid'}
L = 3
ws shapes: [(1, (50, 30)), (2, (20, 50)), (3, (1, 20))]
As shapes: [(0, (30, 712)), (1, (50, 712)), (2, (20, 712)), (3, (1, 712))]
zs shapes: [(1, (50, 712)), (2, (20, 712)), (3, (1, 712))]
Accuracy on test: 0.8324022346368715
