# In-class exercise 7: Deep Learning 1 (Part B)
In this notebook we will gain some hands-on experience with backpropagation.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time

%matplotlib inline

from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

In [None]:
# Load the scikit-learn digits data: feature matrix X and integer class labels y
X, y = load_digits(return_X_y=True)
# One-hot encode the labels, one column per distinct class value
Y = label_binarize(y, classes=np.unique(y))
# D = number of features, K = number of classes
D, K = X.shape[1], Y.shape[1]

# Seed NumPy's global RNG before splitting so the 70/30 train/test split is reproducible
np.random.seed(123)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

Check shapes

# 1. Simple backpropagation example

Addition of two vectors

In [None]:
class Add:
    """Computational-graph node for element-wise addition, z = x + y."""

    def forward(self, x, y):
        """Return x + y.

        Nothing is cached: the local gradient of a sum is 1 w.r.t. both
        inputs, so the backward pass does not need x or y.
        """
        return x + y

    def backward(self, d_out):
        """Propagate the upstream gradient to both inputs.

        Args:
            d_out: gradient of the final output w.r.t. z (same shape as z).

        Returns:
            d_x, d_y: gradients w.r.t. x and y; both equal d_out.
        """
        d_x = d_out
        d_y = d_out
        return d_x, d_y

Element-wise multiplication of two vectors

In [None]:
class Multiply:
    """Computational-graph node for element-wise multiplication, z = x * y."""

    def forward(self, x, y):
        """Return x * y and remember the inputs for the backward pass."""
        # The gradient w.r.t. one factor is the other factor, so both are cached
        self.cache = (x, y)
        return x * y

    def backward(self, d_out):
        """Propagate the upstream gradient through the product.

        Must be called after `forward`, which fills `self.cache`.

        Args:
            d_out: gradient of the final output w.r.t. z (same shape as z).

        Returns:
            d_x, d_y: gradients w.r.t. x and y.
        """
        x, y = self.cache
        d_x = d_out * y
        d_y = d_out * x
        return d_x, d_y

Sum of a vector

In [None]:
class Sum:
    """Computational-graph node that sums all entries of a vector, z = sum_i x_i."""

    def forward(self, x):
        """Return the sum of x and remember x for the backward pass."""
        # Only the shape/dtype of x is needed later, but caching x itself is simplest
        self.cache = x
        return np.sum(x)

    def backward(self, d_out):
        """Propagate the upstream gradient to every entry of x.

        Must be called after `forward`, which fills `self.cache`.

        Args:
            d_out: scalar gradient of the final output w.r.t. z.

        Returns:
            d_x: gradient w.r.t. x, same shape as x (d_out in every entry,
                since dz/dx_i = 1).
        """
        x = self.cache
        d_x = d_out * np.ones_like(x)
        return d_x

### Dot product of two vectors as composition of multiplication and summation

Dot product:
$$\mathbf{x}\cdot\mathbf{y} = \sum_{i=1}^{n}x_i y_i$$

In [None]:
# Two small float32 test vectors for the dot-product example
x = np.array([1, 2, 3, 4], dtype=np.float32)
y = np.array([-1, 0, 1, 2], dtype=np.float32)

In [None]:
# Build the two nodes of the graph: element-wise product followed by a sum
mult = Multiply()
vec_sum = Sum()

# Forward pass: w = x * y (element-wise), z = sum(w)
w = mult.forward(x, y)
z = vec_sum.forward(w)

# Backward pass, in reverse order of the forward pass.
# z is the terminal node, so its upstream gradient is 1.0.
d_w = vec_sum.backward(1.0)
d_x, d_y = mult.backward(d_w)

In [None]:
# Display the result and both gradients; for a dot product we expect d_x == y and d_y == x
z, d_x, d_y

### Dot product of two vectors as one operation

Dot product of two vectors

In [None]:
class DotProduct:
    """Computational-graph node for the dot product of two vectors, z = x . y."""

    def forward(self, x, y):
        """Return the scalar dot product of x and y and cache the inputs."""
        # dz/dx = y and dz/dy = x, so both inputs are needed in the backward pass
        self.cache = (x, y)
        return np.dot(x, y)

    def backward(self, d_out):
        """Propagate the upstream gradient through the dot product.

        Must be called after `forward`, which fills `self.cache`.

        Args:
            d_out: scalar gradient of the final output w.r.t. z.

        Returns:
            d_x, d_y: gradients w.r.t. x and y, with the shapes of x and y.
        """
        x, y = self.cache
        d_x = d_out * y
        d_y = d_out * x
        return d_x, d_y

In [None]:
# Same float32 test vectors as in the composed example
x = np.array([1, 2, 3, 4], dtype=np.float32)
y = np.array([-1, 0, 1, 2], dtype=np.float32)

In [None]:
# Same computation as the Multiply + Sum composition, now as a single node
dp = DotProduct()
z = dp.forward(x, y)
# z is the terminal node, so its upstream gradient is 1.0
d_x, d_y = dp.backward(1.0)

In [None]:
# Display the result and both gradients; for a dot product we expect d_x == y and d_y == x
z, d_x, d_y

**Lessons:**
1. By implementing `forward` and `backward` method we can compute gradients of an arbitrary composition of functions
2. Use `cache` to store values that will be needed in the backward pass
3. Multiple operations can be combined into a single module (i.e. function)
4. Use `1.0` as `d_out` for the terminal node in our computational graph

# 2. Multi-class logistic regression (without backprop)

Multi-class logistic regression model

Data:
* Data matrix $\mathbf{X} \in \mathbb{R}^{N \times D}$.
* Target labels in one-hot format $\mathbf{Y} \in \mathbb{R}^{N \times K}$.
$Y_{nk} = 1$ if sample $n$ belongs to class $k$, $Y_{nk} = 0$ otherwise.

Model parameters:
* Weight matrix $\mathbf{W} \in \mathbb{R}^{D \times K}$.
* Bias vector $\mathbf{b} \in \mathbb{R}^{K}$.

Making predictions with the model:
* Logits 
$$\mathbf{a}_n = \mathbf{W}^T \mathbf{x}_n + \mathbf{b}$$
* Denote the matrix of logits as 
$$\mathbf{A} = \mathbf{XW} +  \mathbf{1}_N \mathbf{b}^T \in \mathbb{R}^{N \times K}$$
* Convert logits to probabilities using softmax function
$$p(Y_{nk} = 1 \mid \mathbf{x}_n, \mathbf{W}, \mathbf{b}) = \frac{\exp(A_{nk})}{\sum_{c = 1}^{K} \exp(A_{nc})}$$

Negative log-likelihood


\begin{align}
-\log p(\mathbf{Y} \mid \mathbf{X}, \mathbf{W}, \mathbf{b}) &= - \frac{1}{N}\sum_{n=1}^{N} \sum_{k=1}^{K} Y_{nk} \log p(Y_{nk} = 1 \mid \mathbf{x}_n, \mathbf{W}, \mathbf{b})\\
&= \frac{1}{N} \sum_{n=1}^{N} \sum_{k=1}^{K} Y_{nk} \left(-A_{nk} + \log \left( \sum_{c=1}^{K} \exp(A_{nc}) \right) \right)\\
%&= \frac{1}{N} \sum_{n=1}^{N} \left(\sum_{k=1}^{K} -Y_{nk} A_{nk} \right) + \log \left( \sum_{c=1}^{K} \exp(A_{nc}) \right)
\end{align}


In [None]:
from scipy.special import softmax

In [None]:
def predict(X, W, b):
    """Generate predictions for a multi-class logistic regression model.

    Args:
        X: data matrix, shape (N, D)
        W: weight matrix, shape (D, K)
        b: bias vector, shape (K)

    Returns:
        Y_pred: Predicted class probabilities, shape (N, K).
            Y_pred[n, k] = probability that sample n belongs to class k.
    """
    # Logits A = XW + 1 b^T; the bias is broadcast over the N rows
    logits = X @ W + b
    # Normalize each row (axis=1) so the K class probabilities sum to one
    return softmax(logits, axis=1)

Negative log-likelihood of multiclass logistic regression 

In [None]:
def nll_loss(X, W, b, Y):
    """Compute negative log-likelihood of a logistic regression model.

    Also known as categorical cross entropy loss. The loss is averaged
    over the N samples.

    Args:
        X: data matrix, shape (N, D)
        W: weight matrix, shape (D, K)
        b: bias vector, shape (K)
        Y: true labels in one-hot format, shape (N, K)

    Returns:
        loss: loss of the logistic regression model, shape ()
    """
    num_samples = X.shape[0]
    logits = X @ W + b
    # Log-softmax computed with the max-shift trick: subtracting the row-wise
    # maximum leaves the result unchanged but keeps exp() from overflowing,
    # and working in log space avoids log(0) for very confident predictions.
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    # Y is one-hot, so Y * log_probs picks the log-probability of the true class
    return -np.sum(Y * log_probs) / num_samples

In [None]:
def nll_grad(X, W, b, Y):
    """Compute gradient of the NLL loss w.r.t. W and b.

    The loss is the negative log-likelihood averaged over the N samples,
    so the gradients carry the same 1/N factor.

    Args:
        X: data matrix, shape (N, D)
        W: weight matrix, shape (D, K)
        b: bias vector, shape (K)
        Y: true labels in one-hot format, shape (N, K)

    Returns:
        d_W: gradient of the loss w.r.t. W, shape (D, K)
        d_b: gradient of the loss w.r.t. b, shape (K)
    """
    num_samples = X.shape[0]
    # Predicted class probabilities, one row per sample
    probs = softmax(X @ W + b, axis=1)
    # Gradient of the averaged softmax cross-entropy w.r.t. the logits
    d_logits = (probs - Y) / num_samples
    # Chain rule through the affine map A = XW + 1 b^T
    d_W = X.T @ d_logits
    d_b = d_logits.sum(axis=0)
    return d_W, d_b

In [None]:
# Learnable parameters start at zero: weights of shape (D, K), biases of shape (K,)
W = np.zeros((D, K))
b = np.zeros(K)

In [None]:
# Specify optimization parameters
learning_rate = 1e-2  # step size of each gradient-descent update
max_epochs = 301  # number of full-batch updates; 301 so that epoch 300 is also reported
report_frequency = 25  # print the train loss every this many epochs

In [None]:
# Full-batch gradient descent on the training set
for epoch in range(max_epochs):
    # Compute train loss
    loss_train = nll_loss(X_train, W, b, Y_train)

    # Print train loss every `report_frequency` epochs
    if epoch % report_frequency == 0:
        print(f"Epoch {epoch:4d}, loss = {loss_train:.4f}")

    # Perform the update
    d_W, d_b = nll_grad(X_train, W, b, Y_train)
    # Rebind instead of updating in place so no array is mutated behind the caller's back
    W = W - learning_rate * d_W
    b = b - learning_rate * d_b

In [None]:
# Compute test loss
loss_test = nll_loss(X_test, W, b, Y_test)

# Compute test accuracy
# Labels are one-hot and predictions are probabilities, so compare the arg-max class of each row
Y_pred_test = predict(X_test, W, b)
acc_test = accuracy_score(Y_test.argmax(axis=1), Y_pred_test.argmax(axis=1))

# Print test loss and accuracy
print(f"loss_test = {loss_test:.4f}, accuracy_test = {acc_test:.4f}")

# 3. Multi-class logistic regression (with backprop)

In [None]:
import nn

In [None]:
class LogisticRegression:
    """Logistic regression model.

    Gradients are computed with backpropagation: a forward pass through an
    affine layer and a softmax cross-entropy loss caches the intermediate
    values, and a backward pass visits the same nodes in reverse order.

    NOTE(review): the notebook imports a course module `nn` just before this
    class, but its API is not visible here, so the affine layer and the loss
    are implemented inline as private helpers. Confirm whether they should be
    replaced by the corresponding `nn` layers.
    """

    def __init__(self, num_features, num_classes, learning_rate=1e-2):
        # Initialize hyperparameters
        self.learning_rate = learning_rate

        # Initialize the model parameters
        self.params = {
            "W": np.zeros([num_features, num_classes]),
            "b": np.zeros([num_classes]),
        }

        # Define layers
        # Cache of the affine layer: the input X seen in the last forward pass
        self._affine_cache = None

        # Define loss
        # Cache of the loss: (predicted probabilities, one-hot targets)
        self._loss_cache = None

    @staticmethod
    def _log_softmax(logits):
        """Row-wise log-softmax, shifted by the row maximum for numerical stability."""
        shifted = logits - logits.max(axis=1, keepdims=True)
        return shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))

    def _affine_forward(self, X):
        """Compute logits XW + b and cache X for the backward pass."""
        self._affine_cache = X
        return X @ self.params["W"] + self.params["b"]

    def _affine_backward(self, d_logits):
        """Return gradients w.r.t. W and b given the gradient w.r.t. the logits."""
        X = self._affine_cache
        return {"W": X.T @ d_logits, "b": d_logits.sum(axis=0)}

    def _loss_forward(self, logits, Y):
        """Compute the mean categorical cross-entropy and cache what backward needs."""
        log_probs = self._log_softmax(logits)
        self._loss_cache = (np.exp(log_probs), Y)
        return -np.sum(Y * log_probs) / Y.shape[0]

    def _loss_backward(self, d_out):
        """Return the gradient of the loss w.r.t. the logits."""
        probs, Y = self._loss_cache
        return d_out * (probs - Y) / Y.shape[0]

    def predict(self, X):
        """Generate predictions for one minibatch.

        Args:
            X: data matrix, shape (N, D)

        Returns:
            Y_pred: predicted class probabilities, shape (N, K)
            Y_pred[n, k] = probability that sample n belongs to class k
        """
        # Computed directly rather than via _affine_forward, so that predicting
        # never overwrites the cache of a pending training step
        logits = X @ self.params["W"] + self.params["b"]
        return np.exp(self._log_softmax(logits))

    def step(self, X, Y):
        """Perform one step of gradient descent on the minibatch of data.

        Args:
            X: data matrix, shape (N, D)
            Y: true labels in one-hot format, shape (N, K)

        Returns:
            loss: training loss on (X, Y) computed before the update, shape ()
        """
        # Forward  - compute the loss on training data
        logits = self._affine_forward(X)
        loss = self._loss_forward(logits, Y)

        # Backward  - compute the gradients of loss w.r.t. all the model parameters
        # The loss is the terminal node of the graph, so its upstream gradient is 1.0
        d_logits = self._loss_backward(1.0)
        grads = self._affine_backward(d_logits)

        # Apply the gradients
        for name, grad in grads.items():
            self.params[name] = self.params[name] - self.learning_rate * grad

        return loss

In [None]:
# Specify optimization parameters
learning_rate = 1e-2  # step size of each gradient-descent update
max_epochs = 301  # number of full-batch updates; 301 so that epoch 300 is also reported
report_frequency = 25  # print the train loss every this many epochs

In [None]:
# Build the model with one input per feature (D) and one output per class (K)
log_reg = LogisticRegression(num_features=D, num_classes=K, learning_rate=learning_rate)

In [None]:
# Full-batch gradient descent on the training set
for epoch in range(max_epochs):
    # Perform one step of gradient descent
    # NOTE(review): assumes `step` returns the training loss of this step - confirm
    loss_train = log_reg.step(X_train, Y_train)

    # Print train loss every `report_frequency` epochs
    if epoch % report_frequency == 0:
        print(f"Epoch {epoch:4d}, loss = {loss_train:.4f}")

In [None]:
# Compute test loss
# Only the public `predict` interface is used; probabilities are clipped away
# from zero so that log() stays finite for very confident wrong predictions
Y_pred_test = log_reg.predict(X_test)
log_probs_test = np.log(np.clip(Y_pred_test, 1e-12, 1.0))
loss_test = -np.mean(np.sum(Y_test * log_probs_test, axis=1))

# Compute test accuracy
# Labels are one-hot and predictions are probabilities, so compare the arg-max class of each row
acc_test = accuracy_score(Y_test.argmax(axis=1), Y_pred_test.argmax(axis=1))

# Print test loss and accuracy
print(f"loss_test = {loss_test:.4f}, accuracy_test = {acc_test:.4f}")