# Homework 3 - Luka Radić
## SCIPER: 354502

## Import the libraries needed

In [1]:
import numpy as np

## Exercise 1: Backpropagation with logistic loss

In [2]:
# Defining the constants used throughout the exercise
D = 5
K = 6

def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))

def grad_sigmoid(x):
    return sigmoid(x) * (1 - sigmoid(x))

### Task 1: `predict` function

In [3]:
def predict(X,W):
    X_0 = X

    W_1 = W["w_1"]
    w_2 = W["w_2"]

    Z_1 = X_0 @ W_1
    X_1 = sigmoid(Z_1)
    z_2 = X_1 @ w_2
    y_hat = sigmoid(z_2)

    return Z_1, z_2, y_hat

### Task 2: `logistic_loss` function

In [4]:
def logistic_loss(y, y_hat):
    eps = 1e-12
    log_loss = -y.dot(np.log(y_hat+eps)) - (1-y).dot(np.log(1-y_hat+eps))
    return np.mean( log_loss )

In [5]:
B = 5 # esentially it can be anything, the result does not depend on it
y = np.zeros(B)
y_hat = np.zeros(B)
log_loss = logistic_loss(y, y_hat)
print("The average log-loss of the batch is {}".format(log_loss))

The average log-loss of the batch is -5.000444502909205e-12


### Task 3: `stable_logistic_loss` function

In [6]:
def stable_logistic_loss(y, z_2):
    log_loss = y.dot(np.logaddexp(0,-z_2)) + (1-y).dot(np.logaddexp(0,z_2))
    return np.mean( log_loss )

In [7]:
B = 5 # esentially it can be anything, the result does not depend on it
y = np.zeros(B)
z_2 = -1e10 * np.ones(B)
stable_log_loss = stable_logistic_loss(y, z_2)
print("The average stable-log-loss of the batch is {}".format(stable_log_loss))

The average stable-log-loss of the batch is 0.0


### Task 4: Analytical backpropagation

Let us denote with:
- $x_i^{(0)}$ the $i$-th component of the input vector
- $w_{ij}^{(1)}$ the weight of the edge connecting $x_i^{(0)}$ and the $j$-th node in the hidden layer $z_j^{(1)}$
- $w_j^{(2)}$ the weight of the edge connecting $z_j^{(1)}$ and the output node $z^{(2)}$
- $\hat{y}$ the final prediction

where $i=\overline{1,D},\; j=\overline{1,K}$. Note that the inputs of activation functions for each node in the NN can be calculated as:
$$
z_j^{(1)} = \sum_{i=1}^D \; w_{ij}^{(1)} x_i^{(0)}, \quad z^{(2)} = \sum_{j=1}^K \; w_j^{(2)} \sigma\left(z_j^{(1)}\right)
$$
We use the stable implementation of logistic loss:
$$
\mathcal{L}(x,y,w) = y \log\left( 1 + e^{-z^{(2)}} \right) + (1-y) \log\left( 1 + e^{z^{(2)}} \right)
$$

We first find the partial derivatives of *hidden-output* weights:
$$
\begin{align*}
    \frac{\partial \mathcal{L}(x,y,w)}{\partial w_j^{(2)}} &= - y \frac{e^{-z^{(2)}}}{1 + e^{-z^{(2)}}} \frac{\partial z^{(2)}}{\partial w_j^{(2)}} + (1-y) \frac{e^{z^{(2)}}}{1 + e^{z^{(2)}}} \frac{\partial z^{(2)}}{\partial w_j^{(2)}} \\
    &= \left[ - y \frac{e^{-z^{(2)}}}{1 + e^{-z^{(2)}}} + (1-y) \frac{e^{z^{(2)}}}{1 + e^{z^{(2)}}} \right] \; \sigma\left(z_j^{(1)}\right)
\end{align*}
$$

Now we find the partial derivatives of *input-hidden* weights. For that purpose, it is useful to rewrite $z^{(2)}$ as:
$$
z^{(2)} = \sum_{j=1}^K \; w_j^{(2)} \sigma\left(z_j^{(1)}\right) = \sum_{j=1}^K \; w_j^{(2)} \sigma\left(\sum_{i=1}^D \; w_{ij}^{(1)} x_i^{(0)}\right)
$$

$$
\begin{align*}
    \frac{\partial \mathcal{L}(x,y,w)}{\partial w_{ij}^{(1)}} &= \left[ - y \frac{e^{-z^{(2)}}}{1 + e^{-z^{(2)}}} + (1-y) \frac{e^{z^{(2)}}}{1 + e^{z^{(2)}}} \right] \frac{\partial z^{(2)}}{\partial w_{ij}^{(1)}} \\
    &= \left[ - y \frac{e^{-z^{(2)}}}{1 + e^{-z^{(2)}}} + (1-y) \frac{e^{z^{(2)}}}{1 + e^{z^{(2)}}} \right] \; w_j^{(2)} \sigma'\left(\sum_{i=1}^D \; w_{ij}^{(1)} x_i^{(0)}\right) x_i^{(0)}
\end{align*}
$$

Considering that
$$
\frac{d\sigma(x)}{dx} = \frac{e^{-x}}{(1+e^{-x})^2} = \frac{1}{1+e^{-x}} \left(\frac{1+e^{-x}}{1+e^{-x}}-\frac{1}{1+e^{-x}}\right) = \sigma(x)\left[1-\sigma(x)\right]
$$
we can finally write the partial derivative as
$$
    \frac{\partial \mathcal{L}(x,y,w)}{\partial w_{ij}^{(1)}} = \left[ - y \frac{e^{-z^{(2)}}}{1 + e^{-z^{(2)}}} + (1-y) \frac{e^{z^{(2)}}}{1 + e^{z^{(2)}}} \right] \; w_j^{(2)} \sigma\left(z_j^{(1)}\right)\left[1-\sigma\left(z_j^{(1)}\right)\right] x_i^{(0)}
$$

### Task 5: `gradient` function

In [8]:
def gradient(X, y, W):
    B = X.shape[0]
    # Feedforward
    Z_1, z_2, y_hat = predict(X, W)
    X_1 = sigmoid(Z_1)
    # Backpropagation
    delta_2 = -y * np.exp(-z_2)/(1+np.exp(-z_2)) + (1-y)*np.exp(z_2)/(1+np.exp(z_2))
    delta_w_2 = delta_2.reshape(-1,1) * X_1
    delta_w_2_mean = np.mean(delta_w_2, axis=0)
    delta_1 = delta_2.reshape(-1,1) * grad_sigmoid(Z_1) * W["w_2"]
    delta_w_1_mean = 1/B * X.T @ delta_1
    
    return {
        "w_1": delta_w_1_mean,
        "w_2": delta_w_2_mean
    }

In [9]:
X = np.array([[0.3,0.3,0.3,0.3],[0.3,0.3,0.3,0.3],[0.01,0.2,0.01,0.3],[0.3,0.3,0.3,0.3]])
W = {
    "w_1": np.ones((4, 5)),
    "w_2": np.ones(5)
}
y = np.ones(4)

In [10]:

expected = 0.93244675427215695
_, _, yours = predict(X, W)
print(yours)
print(np.sum((yours - expected) ** 2) < 1e-15)

[0.97901263 0.97901263 0.9583431  0.97901263]
False


In [11]:

expected = {
    'w_1': np.array([
        [ -1.06113639e-05,  -1.06113639e-05,  -1.06113639e-05, -1.06113639e-05,  -1.06113639e-05],
        [ -2.12227277e-05,  -2.12227277e-05,  -2.12227277e-05, -2.12227277e-05,  -2.12227277e-05],
        [ -3.18340916e-05,  -3.18340916e-05,  -3.18340916e-05, -3.18340916e-05,  -3.18340916e-05],
        [ -4.24454555e-05,  -4.24454555e-05,  -4.24454555e-05, -4.24454555e-05,  -4.24454555e-05]]),
    'w_2': np.array(
        [-0.00223387, -0.00223387, -0.00223387, -0.00223387, -0.00223387])
}
yours = gradient(X,y,W)
print(yours)
print( np.sum(
    [np.sum((yours[key] - expected[key]) ** 2) for key in expected.keys()]) < 1e-15 )


{'w_1': array([[-0.0008644 , -0.0008644 , -0.0008644 , -0.0008644 , -0.0008644 ],
       [-0.00132708, -0.00132708, -0.00132708, -0.00132708, -0.00132708],
       [-0.0008644 , -0.0008644 , -0.0008644 , -0.0008644 , -0.0008644 ],
       [-0.0015706 , -0.0015706 , -0.0015706 , -0.0015706 , -0.0015706 ]]), 'w_2': array([-0.01862824, -0.01862824, -0.01862824, -0.01862824, -0.01862824])}
False
