# Backpropagation

## References

* _Rumelhart et al. 1986, Learning representations by back-propagating errors_, [nature](https://www.nature.com/articles/323533a0)

In [None]:
# Reload imported modules automatically before each cell executes, so edits
# to the local `random_neural_net_models` package are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import sklearn.datasets as sk_datasets
import sklearn.model_selection as model_selection
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn import metrics
from torch.utils.data import DataLoader

import random_neural_net_models.backprop_rumelhart as backprop_rumelhart
import random_neural_net_models.data as rnnm_data
import random_neural_net_models.learner as rnnm_learner
import random_neural_net_models.utils as utils

In [None]:
# Single seed shared by the data generators and the train/validation splits.
SEED = 42

# Also seed the global RNGs. `DataLoader(shuffle=True)` draws from torch's
# global generator, and model weight initialisation presumably uses one of
# these two as well (TODO confirm against `backprop_rumelhart`). Without this,
# every Restart & Run All produces different training curves.
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Toy regression problem: 100 noisy samples of a single feature.
# Note: `X` / `y` are re-bound to the blob classification data further down.
X, y = sk_datasets.make_regression(
    random_state=SEED,
    n_samples=100,
    n_features=1,
    noise=20,
)

In [None]:
# Hold out 20% of the regression samples; shuffled with a fixed seed so the
# split is repeatable.
regression_split = model_selection.train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=SEED
)
X0, X1, y0, y1 = regression_split

In [None]:
# Training portion of the regression data: the single feature against the target.
regression_feature = X0[:, 0]
sns.scatterplot(x=regression_feature, y=y0)
plt.tight_layout()

In [None]:
# Binary classification toy data: 1,000 two-dimensional points around two
# centres. This re-binds `X` / `y`, replacing the regression data above.
X, y = sk_datasets.make_blobs(
    n_samples=1_000, centers=2, n_features=2, random_state=SEED
)

In [None]:
# 80/20 train/validation split of the blob data, shuffled with a fixed seed.
blob_split = model_selection.train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=SEED
)
X_train, X_val, y_train, y_val = blob_split

In [None]:
# Training points in feature space, coloured by class label; the trailing
# semicolon suppresses the Axes repr.
blob_x0, blob_x1 = X_train[:, 0], X_train[:, 1]
sns.scatterplot(x=blob_x0, y=blob_x1, hue=y_train, alpha=0.3);

## Rumelhart et al. 1986, "Learning representations by back-propagating errors"
> The following is in the spirit of the paper, *cough*. Found the presentation in the paper somewhat hard to read.

With input $x$ we predict $y$ using a function $f$ like

$$

y = f(x)

$$

$f$ uses weight $w$ and some non-linearity $g$ like

$$
z =  x \cdot w^T \\
f(x) = g(z)
$$

In the paper they use the sigmoid
$$
g(z) = \frac{1}{1+\exp(-z)}
$$

Stacking several such layers, and indexing each repeated weight multiplication and non-linearity with the layer index $i$, we can write

$$

z_1 =  x \cdot w_1 ^ T \\
a_1 = g_1(z_1) \\

...\\

z_{i} = a_{i-1} \cdot w_{i}^T \\
a_{i} = g_{i}(z_{i}) \\

...\\

z_N = a_{N-1} \cdot w_N^T \\
a_N = g_N(z_N) \\

f(x) = a_N

$$

where $a_i$ is the "activation" of layer $i$.

Comparing predictions $y$ to desired values $d$, we denote the deviation / the loss as $l(d,y)$.

With this we can determine in which direction, and by how much, to change $w_i$ to obtain the strongest improvement of $l(d,y)$ for any $x$, $y$, $d$ by differentiating $l$ with respect to $w_i$. A straightforward way is to scale that gradient and apply it like

$$

w_{i,\text{new}} = w_{i,\text{old}} - \epsilon \frac{d}{dw_i}l(d,y)

$$

with some factor $\epsilon \in [0,1]$

$\frac{d}{dw_i}l(d,y)$ can analytically be derived using the chain rule as 

$$

\frac{d}{dw_i}l(d,y) = l^\prime(d,y) \cdot g_N^\prime(z_N) \cdot z_N^\prime \cdot g_{N-1}^\prime(z_{N-1}) \cdot z_{N-1}^\prime \cdot \space ... \space \cdot g_{i}^\prime(z_{i}) \cdot a_{i-1}

$$

Substituting $z^\prime = \frac{\partial}{\partial a}z = w ^ T$ gives
$$

\frac{d}{dw_i}l(d,y) = l^\prime(d,y) \cdot g_N^\prime(z_N) \cdot w_N ^ T \cdot g_{N-1}^\prime(z_{N-1}) \cdot w_{N-1} ^ T \cdot \space ... \space \cdot g_{i}^\prime(z_{i}) \cdot a_{i-1}

$$

Another version proposed in Rumelhart et al. is
$$

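\Delta w_{i,\text{new}} = -\epsilon \frac{d}{dw_i}l(d,y) + \alpha \, \Delta w_{i,\text{old}} \\
w_{i,\text{new}} = w_{i,\text{old}} + \Delta w_{i,\text{new}}

$$

with some factor $\alpha \in [0,1]$, where $\Delta w_{i,\text{old}}$ is the weight change that was applied in the previous update step (a momentum term).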

In [None]:
# Evaluate the project's sigmoid and its derivative on 100 evenly spaced
# points in [-10, 10] so both curves can be plotted in the next cell.
x = torch.linspace(-10, 10, 100)
a = backprop_rumelhart.sigmoid(x)  # g(z)
a_prime = backprop_rumelhart.sigmoid_derivative(x)  # dg(z)/dz

In [None]:
# Draw the sigmoid and its derivative on one explicitly created, labelled
# axis. `plt.show()` renders the figure without leaking the Axes repr into
# the cell output.
fig, ax = plt.subplots()
sns.lineplot(x=x, y=a, label="g(z): sigmoid", ax=ax)
sns.lineplot(x=x, y=a_prime, label="dg(z)/dz: sigmoid derivative", ax=ax)
ax.set(xlabel="z", title="Sigmoid and its derivative")
plt.show()

In [None]:
# Classifier from the project's `backprop_rumelhart` module, trained for 10
# epochs with verbose output.
# NOTE(review): `n_hidden` is presumably the hidden layer widths, and `eps` /
# `alpha` presumably the step-size and momentum factors from the markdown
# above — confirm against `backprop_rumelhart`.
model = backprop_rumelhart.Rumelhart1986PerceptronClassifier(
    n_hidden=(10, 5), epochs=10, verbose=True, eps=1e-3, alpha=1e-3
)

In [None]:
# Train the classifier on the blob training split.
model.fit(X_train, y_train)

In [None]:
# NOTE(review): presumably the errors recorded while fitting — confirm what
# `errors_` holds in `backprop_rumelhart`.
model.errors_

In [None]:
# Regular 100 x 100 grid spanning the observed range of both blob features,
# flattened into an (n_points, 2) array of grid points.
# Note: `X0` / `X1` now hold the mesh coordinates and no longer the
# regression split from the top of the notebook.
x0, x1 = (
    np.linspace(X[:, col].min(), X[:, col].max(), 100) for col in (0, 1)
)
X0, X1 = np.meshgrid(x0, x1)
X_plot = np.array([X0.ravel(), X1.ravel()]).T
X_plot[:4]

In [None]:
# Score every grid point with the fitted classifier; the result is reshaped
# to the grid in the plotting cell below.
y_prob = model.predict_proba(X_plot)

In [None]:
# Model output over the grid as a translucent background, with the full blob
# data set drawn on top and coloured by label.
prob_grid = y_prob.reshape(X0.shape)
fig, ax = plt.subplots()
im = ax.pcolormesh(X0, X1, prob_grid, alpha=0.2)
fig.colorbar(im, ax=ax)
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, alpha=0.3, ax=ax)
plt.show()

In [None]:
# Predictions for the held-out validation split.
y_pred = model.predict(X_val)

In [None]:
# Text report of the validation predictions against the true labels;
# `print` is used so the multi-line report string renders with line breaks.
print(metrics.classification_report(y_val, y_pred))

## and now with `Learner` & pytorch tools

In [None]:
# Device handed to the `Learner` below, as chosen by the project helper.
# NOTE(review): presumably picks an accelerator when one is available —
# confirm in `utils.get_device`.
device = utils.get_device()
device

In [None]:
# Wrap the NumPy train / validation splits in the project's dataset class so
# they can be fed to a `DataLoader`.
ds_train = rnnm_data.NumpyTrainingDataset(X_train, y_train)
ds_val = rnnm_data.NumpyTrainingDataset(X_val, y_val)

In [None]:
# Sanity check: look at the first item the training dataset yields.
ds_train[0]

In [None]:
# Both loaders share the batch size and the project's collate function; only
# the training loader reshuffles its samples.
loader_kwargs = dict(
    batch_size=10,
    collate_fn=rnnm_data.collate_numpy_dataset_to_xyblock,
)
dl_train = DataLoader(ds_train, shuffle=True, **loader_kwargs)
dl_val = DataLoader(ds_val, shuffle=False, **loader_kwargs)

In [None]:
# PyTorch variant of the perceptron; re-binds `model`, replacing the
# classifier fitted above.
# NOTE(review): `n_hidden=(2, 10, 5, 1)` presumably lists all layer widths
# including the 2 input features and the single output — confirm.
model = backprop_rumelhart.Rumelhart1986PytorchPerceptron(
    n_hidden=(2, 10, 5, 1)
)

In [None]:
n_epochs = 2

# SGD with a small momentum term. `lr` is only a starting value: OneCycleLR
# overwrites the optimizer's learning rate with its own schedule.
optimizer = optim.SGD(model.parameters(), lr=10, momentum=1e-3)

# One-cycle schedule over all training batches (n_epochs * len(dl_train)
# steps), peaking at `max_lr`.
# OneCycleLR cycles momentum by default (between 0.85 and 0.95), which would
# silently replace the `momentum=1e-3` configured on the optimizer above.
# `cycle_momentum=False` keeps the configured value in effect.
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer=optimizer,
    max_lr=10,
    epochs=n_epochs,
    steps_per_epoch=len(dl_train),
    cycle_momentum=False,
)
loss = backprop_rumelhart.BCELoss()
save_dir = Path("./models")

# Callbacks recording training diagnostics, plotted in the cells below.
# NOTE(review): `every_n=10` presumably means "record every 10th batch", and
# `name_patterns` look like regexes matched against sub-module names
# (".*act" for activations, ".*lin" for linear layers) — confirm in
# `rnnm_learner`.
loss_callback = rnnm_learner.TrainLossCallback()
activations_callback = rnnm_learner.TrainActivationsCallback(
    every_n=10, max_depth_search=4, name_patterns=(".*act",)
)
gradients_callback = rnnm_learner.TrainGradientsCallback(
    every_n=10, max_depth_search=4, name_patterns=(".*lin",)
)
parameters_callback = rnnm_learner.TrainParametersCallback(
    every_n=10, max_depth_search=4, name_patterns=(".*lin",)
)
# Hands the scheduler to the learner's callback mechanism.
# NOTE(review): the class name suggests it steps the scheduler once per
# batch, matching the OneCycleLR step count above — confirm.
scheduler_callback = rnnm_learner.EveryBatchSchedulerCallback(scheduler)
callbacks = [
    loss_callback,
    activations_callback,
    gradients_callback,
    parameters_callback,
    scheduler_callback,
]

# Kept out of `callbacks`: it is passed explicitly to
# `learner.find_learning_rate` in the next cell.
# NOTE(review): the positional arguments are presumably
# (min lr, max lr, number of steps) — confirm.
lr_find_callback = rnnm_learner.LRFinderCallback(1e-5, 100, 100)

learner = rnnm_learner.Learner(
    model,
    optimizer,
    loss,
    callbacks=callbacks,
    save_dir=save_dir,
    device=device,
)

In [None]:
# Run the learner's learning-rate search on the training loader for 2 epochs,
# recording into `lr_find_callback` (plotted in the next cell).
# NOTE(review): whether the model weights are restored afterwards is not
# visible from here — confirm in `rnnm_learner.Learner`.
learner.find_learning_rate(
    dl_train, n_epochs=2, lr_find_callback=lr_find_callback
)

In [None]:
# Plot what the learning-rate search recorded.
lr_find_callback.plot()

In [None]:
# Train for `n_epochs` epochs on the training loader, passing the validation
# loader along as well.
learner.fit(dl_train, n_epochs=n_epochs, dataloader_valid=dl_val)

In [None]:
# Plot what the loss callback recorded during training.
loss_callback.plot()

In [None]:
# Plot what the parameters callback recorded for the modules matching ".*lin".
parameters_callback.plot()

In [None]:
# Plot what the gradients callback recorded for the modules matching ".*lin".
gradients_callback.plot()

In [None]:
# Plot what the activations callback recorded for the modules matching ".*act".
activations_callback.plot()

In [None]:
# Rebuild the 100 x 100 evaluation grid over both blob features (same
# construction as in the earlier decision-surface cell), flattened into an
# (n_points, 2) array of grid points.
x0, x1 = (
    np.linspace(X[:, col].min(), X[:, col].max(), 100) for col in (0, 1)
)
X0, X1 = np.meshgrid(x0, x1)
X_plot = np.array([X0.ravel(), X1.ravel()]).T
X_plot[:4]

In [None]:
# Wrap the grid points (features only) for batched inference. No shuffling,
# so predictions come back in grid order.
ds_plot = rnnm_data.NumpyInferenceDataset(X_plot)
dl_plot = DataLoader(
    dataset=ds_plot,
    batch_size=5,
    collate_fn=rnnm_data.collate_numpy_dataset_to_xblock,
)

In [None]:
# Run the trained model over every grid point; the next cell converts the
# result to a NumPy array.
y_prob = learner.predict(dl_plot)

In [None]:
# Convert the predictions to a NumPy array for plotting.
# `.cpu()` guards against predictions that still live on an accelerator:
# `.numpy()` raises for non-CPU tensors, and `.cpu()` is a no-op for CPU ones.
# The type check keeps the cell re-runnable, since on a second execution
# `y_prob` is already an ndarray and has no `.detach()`.
if isinstance(y_prob, torch.Tensor):
    y_prob = y_prob.detach().cpu().numpy()
y_prob

In [None]:
# Learner predictions over the grid as a translucent background, with the
# full blob data set drawn on top and coloured by label.
prob_grid = y_prob.reshape(X0.shape)
fig, ax = plt.subplots()
im = ax.pcolormesh(X0, X1, prob_grid, alpha=0.2)
fig.colorbar(im, ax=ax)
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, alpha=0.3, ax=ax)
plt.show()