# In-class exercise 7: Deep Learning 1 (Part B)
In this notebook we will gain some hands-on experience with backpropagation.

In [136]:
import numpy as np
import matplotlib.pyplot as plt
import time

%matplotlib inline

from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

In [137]:
# Load the digits data as a feature matrix X and a vector of integer class labels y
X, y = load_digits(return_X_y=True)
# Convert labels into one-hot format
Y = label_binarize(y, classes=np.unique(y))
K = Y.shape[1]  # number of classes
D = X.shape[1]  # number of features

# Seed NumPy's global RNG right before splitting. train_test_split is called
# without `random_state`, so it presumably draws from this global state
# (verify against the scikit-learn docs); keep the two lines together.
np.random.seed(123)
# Hold out 30% of the samples as the test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

Check shapes

In [139]:
# Shapes of the training features and of the one-hot training labels
X_train.shape, Y_train.shape

((1257, 64), (1257, 10))

# 1. Simple backpropagation example

Addition of two vectors

In [142]:
class Add:
    """Computational-graph node for element-wise addition, ``out = x + y``."""

    def forward(self, x, y):
        """Return the element-wise sum of ``x`` and ``y``."""
        total = x + y
        return total

    def backward(self, d_out):
        """Propagate the upstream gradient to both inputs.

        Args:
            d_out: gradient of the final output w.r.t. this node's output.

        Returns:
            (d_x, d_y): gradients w.r.t. ``x`` and ``y``; both are ``d_out``.
        """
        # The local derivative of x + y w.r.t. either input is 1, so the
        # upstream gradient passes through unchanged and nothing is cached.
        return d_out, d_out

Element-wise multiplication of two vectors

In [144]:
class Multiply:
    """Computational-graph node for element-wise multiplication, ``out = x * y``."""

    def forward(self, x, y):
        """Return ``x * y`` and remember both inputs for the backward pass."""
        self.cache = x, y
        product = x * y
        return product

    def backward(self, d_out):
        """Propagate the upstream gradient to both inputs.

        Args:
            d_out: gradient of the final output w.r.t. this node's output.

        Returns:
            (d_x, d_y): gradients w.r.t. ``x`` and ``y``.
        """
        lhs, rhs = self.cache
        # Product rule: each input's gradient is scaled by the *other* input.
        return d_out * rhs, d_out * lhs

Sum of a vector

In [146]:
class Sum:
    """Computational-graph node that sums all entries of its input."""

    def forward(self, x):
        """Return the sum of all entries of ``x`` and remember ``x``."""
        self.cache = x
        total = np.sum(x)
        return total

    def backward(self, d_out):
        """Propagate the upstream gradient to the input.

        Args:
            d_out: gradient of the final output w.r.t. the sum.

        Returns:
            d_x: gradient w.r.t. ``x``, same shape as ``x``.
        """
        # Every entry contributes to the sum with weight 1, so each entry
        # receives the full upstream gradient. Only the shape of the cached
        # input is needed here.
        ones = np.ones_like(self.cache)
        return d_out * ones

### Dot product of two vectors as composition of multiplication and summation

Dot product:
$$\mathbf{x}\cdot\mathbf{y} = \sum_{i=1}^{n}x_i y_i$$

In [149]:
# Two small float32 example vectors, written out explicitly
x = np.array([1, 2, 3, 4], dtype=np.float32)
y = np.array([-1, 0, 1, 2], dtype=np.float32)

In [150]:
# Build the dot product out of two nodes: element-wise multiply, then sum
mult = Multiply()
vec_sum = Sum()

# Forward pass: w = x * y (element-wise), z = sum(w)
w = mult.forward(x, y)
z = vec_sum.forward(w)

# Backward pass in reverse order, starting from d_out = 1.0 at the terminal node z
d_w = vec_sum.backward(1.0)
d_x, d_y = mult.backward(d_w)

In [151]:
# Show the dot product and its gradients (d_x should equal y, d_y should equal x)
z, d_x, d_y

(10.0,
 array([-1.,  0.,  1.,  2.], dtype=float32),
 array([1., 2., 3., 4.], dtype=float32))

### Dot product of two vectors as one operation

Dot product of two vectors

In [154]:
class DotProduct:
    """Computational-graph node computing the dot product in a single operation."""

    def forward(self, x, y):
        """Return ``np.dot(x, y)`` and remember both inputs for the backward pass."""
        self.cache = x, y
        result = np.dot(x, y)
        return result

    def backward(self, d_out):
        """Propagate the upstream gradient to both inputs.

        Args:
            d_out: gradient of the final output w.r.t. the dot product.

        Returns:
            (d_x, d_y): gradients w.r.t. ``x`` and ``y``.
        """
        lhs, rhs = self.cache
        # d(x . y)/dx = y and d(x . y)/dy = x, each scaled by the upstream gradient.
        return d_out * rhs, d_out * lhs

In [155]:
x = np.arange(1, 5, dtype=np.float32)  # [1, 2, 3, 4]; the stop value 5 is excluded
y = np.arange(-1, 3, dtype=np.float32)

In [156]:
# Same computation as the Multiply + Sum composition, now as a single node
dp = DotProduct()
z = dp.forward(x, y)
# z is the terminal node, so the backward pass starts from d_out = 1.0
d_x, d_y = dp.backward(1.0)

In [157]:
# Show the dot product and its gradients (d_x should equal y, d_y should equal x)
z, d_x, d_y

(10.0,
 array([-1.,  0.,  1.,  2.], dtype=float32),
 array([1., 2., 3., 4.], dtype=float32))

**Lessons:**
1. By implementing `forward` and `backward` method we can compute gradients of an arbitrary composition of functions
2. Use `cache` to store values that will be needed in the backward pass
3. Multiple operations can be combined into a single module (i.e. function)
4. Use `1.0` as `d_out` for the terminal node in our computational graph

# 2. Multi-class logistic regression (without backprop)

Multi-class logistic regression model

Data:
* Data matrix $\mathbf{X} \in \mathbb{R}^{N \times D}$.
* Target labels in one-hot format $\mathbf{Y} \in \mathbb{R}^{N \times K}$.
$Y_{nk} = 1$ if sample $n$ belongs to class $k$, $Y_{nk} = 0$ otherwise.

Model parameters:
* Weight matrix $\mathbf{W} \in \mathbb{R}^{D \times K}$.
* Bias vector $\mathbf{b} \in \mathbb{R}^{K}$.

Making predictions with the model:
* Logits 
$$\mathbf{a}_n = \mathbf{W}^T \mathbf{x}_n + \mathbf{b}$$
* Denote the matrix of logits as 
$$\mathbf{A} = \mathbf{XW} +  \mathbf{1}_N \mathbf{b}^T \in \mathbb{R}^{N \times K}$$
* Convert logits to probabilities using softmax function
$$p(Y_{nk} = 1 \mid \mathbf{x}_n, \mathbf{W}, \mathbf{b}) = \frac{\exp(A_{nk})}{\sum_{c = 1}^{K} \exp(A_{nc})}$$

Negative log-likelihood


\begin{align}
-\log p(\mathbf{Y} \mid \mathbf{X}, \mathbf{W}, \mathbf{b}) &= - \frac{1}{N}\sum_{n=1}^{N} \sum_{k=1}^{K} Y_{nk} \log p(Y_{nk} = 1 \mid \mathbf{x}_n, \mathbf{W}, \mathbf{b})\\
&= \frac{1}{N} \sum_{n=1}^{N} \sum_{k=1}^{K} Y_{nk} \left(-A_{nk} + \log \left( \sum_{c=1}^{K} \exp(A_{nc}) \right) \right)
%&= \frac{1}{N} \sum_{n=1}^{N} \left(\sum_{k=1}^{K} -Y_{nk} A_{nk} \right) + \log \left( \sum_{c=1}^{C} \exp(A_{nc}) \right)
\end{align}


In [161]:
from scipy.special import softmax

In [162]:
def predict(X, W, b):
    """Compute class probabilities of a multi-class logistic regression model.

    Args:
        X: data matrix, shape (N, D)
        W: weight matrix, shape (D, K)
        b: bias vector, shape (K)

    Returns:
        Y_pred: predicted class probabilities, shape (N, K).
            Y_pred[n, k] = probability that sample n belongs to class k.
    """
    # Affine map to logits; b is broadcast over the N rows.
    logits = X @ W + b
    # Normalize each row (one sample) into a probability distribution.
    return softmax(logits, axis=1)

Negative log-likelihood of multiclass logistic regression 

In [175]:
def nll_loss(X, W, b, Y):
    """Average negative log-likelihood of a multi-class logistic regression model.

    Also known as categorical cross entropy loss.

    Args:
        X: data matrix, shape (N, D)
        W: weight matrix, shape (D, K)
        b: bias vector, shape (K)
        Y: true labels in one-hot format, shape (N, K)

    Returns:
        loss: loss of the logistic regression model, shape ()
    """
    scores = X @ W + b
    # Subtracting the row-wise maximum does not change the softmax but keeps
    # exp() from overflowing for large logits.
    scores = scores - scores.max(axis=1, keepdims=True)
    log_partition = np.log(np.exp(scores).sum(axis=1, keepdims=True))
    # We are already in log space, so the softmax normalization is a
    # subtraction here (it would be a division in probability space).
    log_probs = scores - log_partition
    num_samples = X.shape[0]
    return -np.sum(Y * log_probs) / num_samples

In [177]:
def nll_grad(X, W, b, Y):
    """Compute the gradient of the average NLL loss w.r.t. W and b.

    Args:
        X: data matrix, shape (N, D)
        W: weight matrix, shape (D, K)
        b: bias vector, shape (K)
        Y: true labels in one-hot format, shape (N, K)

    Returns:
        d_W: gradient of the loss w.r.t. W, shape (D, K)
        d_b: gradient of the loss w.r.t. b, shape (K)
    """
    # Gradient of the per-sample loss w.r.t. the logits: predicted
    # probabilities minus the one-hot targets.
    residual = softmax(X @ W + b, axis=1) - Y
    num_samples = X.shape[0]
    # Chain rule through the affine map, averaged over the samples.
    d_W = (X.T @ residual) / num_samples
    d_b = residual.sum(axis=0) / num_samples
    return d_W, d_b

In [179]:
# Initialize learnable model parameters
W = np.zeros([D, K])
b = np.zeros([K])

In [181]:
# Specify optimization parameters
learning_rate = 1e-2
max_epochs = 301
report_frequency = 25

In [185]:
for epoch in range(max_epochs):
    # Compute train loss with the current parameters, i.e. before this epoch's update
    loss = nll_loss(X_train,W,b,Y_train)

    # Print train loss every `report_frequency` epochs
    if epoch %report_frequency == 0:
        # True whenever epoch is a multiple of report_frequency
        # (epoch = 0, 25, 50, ... with the settings above).
        print(f"epoch:{epoch:4d}, loss = {loss:4f}" )
        # `:4d` right-aligns the integer in a field at least 4 characters wide,
        # padding with spaces.
        # `:4f` sets a minimum field width of 4 and keeps the default 6 decimal
        # places; use `.4f` to get exactly 4 decimal places instead.
    # Perform the update: one full-batch gradient descent step on W and b
    d_W,d_b = nll_grad(X_train,W,b,Y_train)
    W=W-learning_rate*d_W
    b=b-learning_rate*d_b

epoch:   0, loss = 2.302585
epoch:  25, loss = 0.341059
epoch:  50, loss = 0.229275
epoch:  75, loss = 0.186234
epoch: 100, loss = 0.161038
epoch: 125, loss = 0.143956
epoch: 150, loss = 0.131355
epoch: 175, loss = 0.121533
epoch: 200, loss = 0.113577
epoch: 225, loss = 0.106946
epoch: 250, loss = 0.101296
epoch: 275, loss = 0.096397
epoch: 300, loss = 0.092090


In [189]:
# Compute test loss
# TODO
loss_test = nll_loss(X_test, W, b,Y_test)
print(loss_test)

# Compute test accuracy
# TODO
y_pred = predict(X_test, W, b).argmax(axis=1)
y_test = Y_test.argmax(axis=1)
acc_test = accuracy_score(y_test, y_pred)

# Print test loss and accuracy
# TODO
print(acc_test)

0.1511052805399898
0.9555555555555556


# 3. Multi-class logistic regression (with backprop)

In [None]:
import nn

In [None]:
class LogisticRegression:
    """Logistic regression model.

    Gradients are computed with backpropagation: `step` runs a forward pass
    through an affine layer followed by a softmax cross-entropy loss, and then
    propagates `d_out = 1.0` from the loss back to the parameters.

    The two layers are written out with NumPy inside this class, so it only
    relies on `np` being imported.

    Attributes:
        learning_rate: step size used by `step`.
        W: weight matrix, shape (D, K), initialized to zeros.
        b: bias vector, shape (K), initialized to zeros.
    """

    def __init__(self, num_features, num_classes, learning_rate=1e-2):
        # Initialize hyperparameters
        self.learning_rate = learning_rate

        # Initialize the model parameters
        self.W = np.zeros([num_features, num_classes])
        self.b = np.zeros([num_classes])

    def _affine(self, X):
        """Affine layer forward pass: logits = X W + b, shape (N, K)."""
        return X @ self.W + self.b

    @staticmethod
    def _log_softmax(logits):
        """Row-wise log-softmax, shifted by the row maximum so exp() cannot overflow."""
        shifted = logits - logits.max(axis=1, keepdims=True)
        return shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))

    def predict(self, X):
        """Generate predictions for one minibatch.

        Args:
            X: data matrix, shape (N, D)

        Returns:
            Y_pred: predicted class probabilities, shape (N, K)
            Y_pred[n, k] = probability that sample n belongs to class k
        """
        return np.exp(self._log_softmax(self._affine(X)))

    def step(self, X, Y):
        """Perform one step of gradient descent on the minibatch of data.

        Args:
            X: data matrix, shape (N, D)
            Y: true labels in one-hot format, shape (N, K)

        Returns:
            loss: average negative log-likelihood on (X, Y), evaluated with
                the parameters *before* the update.
        """
        num_samples = X.shape[0]

        # Forward  - compute the loss on training data
        logits = self._affine(X)
        log_probs = self._log_softmax(logits)
        loss = -np.sum(Y * log_probs) / num_samples

        # Backward  - compute the gradients of loss w.r.t. all the model parameters
        d_loss = 1.0  # the loss is the terminal node of the graph
        # Softmax cross-entropy backward. This closed form relies on every row
        # of Y summing to 1, which holds for one-hot labels.
        d_logits = d_loss * (np.exp(log_probs) - Y) / num_samples
        # Affine layer backward
        d_W = X.T @ d_logits
        d_b = d_logits.sum(axis=0)

        # Apply the gradients
        self.W = self.W - self.learning_rate * d_W
        self.b = self.b - self.learning_rate * d_b

        return loss

In [None]:
# Specify optimization parameters
learning_rate = 1e-2
max_epochs = 301
report_frequency = 25

In [None]:
# Build the backprop-based model for D input features and K classes
log_reg = LogisticRegression(num_features=D, num_classes=K, learning_rate=learning_rate)

In [None]:
for epoch in range(max_epochs):
    # Print train loss every `report_frequency` epochs.
    # The loss is computed from the model's predicted probabilities *before*
    # this epoch's update, so epoch 0 reports the loss of the initial model.
    if epoch % report_frequency == 0:
        probs = log_reg.predict(X_train)
        # Clip so that a probability that underflowed to 0 cannot produce log(0).
        loss = -np.sum(Y_train * np.log(np.clip(probs, 1e-12, None))) / X_train.shape[0]
        print(f"epoch:{epoch:4d}, loss = {loss:.6f}")

    # Perform one step of gradient descent on the full training set
    log_reg.step(X_train, Y_train)

In [None]:
# Compute test loss
# TODO

# Compute test accuracy
# TODO

# Print test loss and accuracy
# TODO