# LeCun et al. 1990, "Handwritten Digit Recognition: Applications of Neural Net Chips and Automatic Learning"
> The following tries to reproduce the original paper. Note that the digits dataset actually used in the paper could not be found and [MNIST 784](https://www.openml.org/search?type=data&status=active&id=554) is used instead

## References

* LeCun et al. 1990, _Handwritten Digit Recognition: Applications of Neural Net Chips and Automatic Learning_, [Neurocomputing](https://link.springer.com/chapter/10.1007/978-3-642-76153-9_35)

specifics in LeCun et al. 1990:

* neural net
    * weight initialization: drawn uniformly at random from $[-2.4 / F_i, 2.4 / F_i]$, where $F_i$ is the number of inputs (fan-in) of the unit
    * "tanh activation": $A \cdot \tanh (S \cdot a)$ with $A = 1.716$, $S = 2/3$ and $a = \text{weights} \cdot \text{input}$
    * 256 input (16 x 16 pixel images)
    * layer #1: 
        * convolution with 12 5x5-kernels and stride 2 (output: 8 x 8 x 12 = 768 "units")
        * tanh activation
        * $F_i = 5 \cdot 5 \cdot n_\text{input-channels} = 5 \cdot 5 \cdot 1 = 25$
    * layer #2: 
        * convolution with 12 5x5-kernels and stride 2 (output: 4 x 4 x 12 = 192 "units")
        * tanh activation
        * $F_i = 5 \cdot 5 \cdot n_\text{input-channels} = 5 \cdot 5 \cdot 12 = 300$
    * layer #3:
        * dense with 30 neurons
        * tanh activation
        * $F_i = 4 \cdot 4 \cdot 12 = 192$
    * layer #4:
        * dense output layer with 10 neurons
        * tanh activation
        * $F_i = 30$
* target: vector of 10 values either 1 or -1 (so 9x -1 and 1x 1)
* loss: mean squared error between prediction and target (paper reached 1.8e-2 on test and 2.5e-3 on train)
* error rates: 0.14% on train, 5% on test
* training:
    * stochastic gradient descent (1 sample per backpropagation)
    * samples always in the same order, no shuffling
    * 23 or 30 epochs, paper is ambiguous
    * the learning rate was set using a second-order derivative method that the paper does not specify

## Setup

In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell is executed so that edits to imported source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import random
import typing as T
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchinfo
import tqdm
from einops import rearrange
from einops.layers.torch import Rearrange
from sklearn import metrics
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from torch.optim import SGD
from torch.utils.data import DataLoader, Dataset

import random_neural_net_models.convolution_lecun1990 as conv_lecun1990
import random_neural_net_models.data as rnnm_data
import random_neural_net_models.learner as rnnm_learner
import random_neural_net_models.losses as rnnm_losses
import random_neural_net_models.telemetry as telemetry
import random_neural_net_models.utils as utils

sns.set_theme()

In [None]:
# Switch for a short run: when True, the notebook stops (via SystemExit) right
# after the overfitting sections and skips the training on 10k digits.
DO_OVERFITTING_ONLY = False

In [None]:
# Fetch version 1 of the "mnist_784" dataset from OpenML; `cache=True` lets
# scikit-learn reuse a previously downloaded copy.
mnist = fetch_openml("mnist_784", version=1, cache=True, parser="auto")

Setting seeds

In [None]:
utils.make_deterministic(42)

Getting device

In [None]:
device = utils.get_device()
device

In [None]:
# Pull the features and the targets out of the fetched dataset and show
# their shapes as the cell output.
X = mnist["data"]
y = mnist["target"]
X.shape, y.shape

Selecting a few images to overfit on

In [None]:
# Keep only the first `n` rows: a tiny subset used to check that the model
# is able to overfit.
n = 100
X0 = X.iloc[:n]
y0 = y.iloc[:n]

## Defining dataset and dataloader

In [None]:
ds = conv_lecun1990.DigitsDataset(X0, y0)

In [None]:
# Draw the image of the dataset item at index 4, with its label as the title.
item = ds[4]
image, digit = item[0], item[1]
plt.imshow(image, cmap="gray", origin="upper")
plt.title(f"Label: {digit}")
plt.axis("off")
plt.tight_layout()

defining a dataloader

In [None]:
# One sample per batch and no shuffling, matching the training setup listed
# in the paper notes above (one sample per backpropagation, fixed order).
batch_size = 1
dataloader = DataLoader(ds, batch_size=batch_size, shuffle=False)

In [None]:
train_features, train_labels = next(iter(dataloader))

inspecting one image of the first batch

In [None]:
# Report the batch shapes, then display the first image of the batch and
# print its label.
img, label = train_features[0], train_labels[0]
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")
plt.imshow(img, cmap="gray")
plt.axis("off")
plt.show()
print(f"Label: {label}")

estimating convolution block height / width

In [None]:
# Output edge length of the two stacked convolutions: the first call starts
# from an edge of 28, the second from an edge of 14.
# NOTE(review): the remaining positional arguments presumably are kernel size,
# stride and padding — confirm against `calc_conv_output_dim`.
edge_after_conv1 = conv_lecun1990.calc_conv_output_dim(28, 5, 2, 2)
edge_after_conv2 = conv_lecun1990.calc_conv_output_dim(14, 5, 2, 2)
edge_after_conv1, edge_after_conv2

## Custom 2d convolution

defining a 2d convolution component

In [None]:
# A single custom convolution (one input channel, one output channel, 5 x 5
# kernel, stride 2, padding 2) for an input edge of 28; it is applied to one
# image further down to visualise its effect.
kh = kw = 5
n_in_channels = 1
n_out_channels = 1

myconv2d = conv_lecun1990.Conv2d(
    edge=28,
    n_in_channels=n_in_channels,
    n_out_channels=n_out_channels,
    kernel_width=kw,
    kernel_height=kh,
    stride=2,
    padding=2,
    dilation=1,
)

In [None]:
# Take the first batch and insert an extra axis at position 1
# (presumably the channel axis expected by the convolution — confirm).
first_batch = next(iter(dataloader))
train_features, train_labels = first_batch
train_features = torch.unsqueeze(train_features, dim=1)

print(f"{train_features.shape=}")

applying the convolution to an image

In [None]:
conv_features = myconv2d(train_features)

print(f"{conv_features.shape=}")

visualizing the effect

In [None]:
# Top panel: the input image; bottom panel: the first output channel of the
# custom convolution for that image.
label = train_labels[0]
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(5, 5))

panels = (
    train_features[0][0],
    conv_features.detach().numpy()[0][0],
)
for ax, img in zip(axs, panels):
    ax.imshow(img, cmap="gray")
    ax.axis("off")

plt.show()
print(f"Label: {label}")

## trying to overfit

using `densify_y` to convert a label to a vector of -1/1, i.e. False/True

In [None]:
train_labels[0:3], conv_lecun1990.densify_y(train_labels[0:3])

creating the model in the paper

In [None]:
# Create the network with the paper's weight initialisation and activation
# switched on, then wrap it in the telemetry helper.
model = conv_lecun1990.Model(lecun_init=True, lecun_act=True)
# NOTE(review): presumably activations and gradients are sampled every 100
# iterations for sub-modules whose names match the given regex patterns —
# confirm against `telemetry.ModelTelemetry`. Unlike the later 10k-digit run,
# no `parameters_every_n` is passed here.
model = telemetry.ModelTelemetry(
    model,
    activations_every_n=100,
    gradients_every_n=100,
    activations_name_patterns=(r".*act.*",),
    gradients_name_patterns=(r"conv\d$", r"lin\d"),
    parameters_name_patterns=(r"conv\d$", r"lin\d"),
)
# Switch the model to double precision; the return value is the cell output.
model.double()

In [None]:
torchinfo.summary(model, input_size=(1, 28, 28), dtypes=[torch.double])

In [None]:
# The paper does not state a learning rate; 0.1 is an arbitrary choice.
learning_rate = 0.1
opt = SGD(model.parameters(), lr=learning_rate)

In [None]:
# Mean squared error between the prediction and the -1/+1 target vector, as
# listed in the paper notes above. Alternative kept for experimentation:
# loss_func = nn.CrossEntropyLoss()
loss_func = nn.MSELoss()

In [None]:
model.to(device);

training

In [None]:
# Per-sample SGD over the small overfitting subset. After every optimiser
# step the loss and the step counter are handed to the telemetry wrapper.
n_epochs = 20
_iter = 0
model.train()
for epoch in tqdm.tqdm(range(n_epochs), desc="Epochs", total=n_epochs):
    batches = tqdm.tqdm(dataloader, desc="Batches", total=len(dataloader))
    for xb, yb in batches:
        inputs = xb.to(device)
        # integer labels -> vectors of -1/+1
        targets = conv_lecun1990.densify_y(yb.to(device))
        loss = loss_func(model(inputs), targets)

        opt.zero_grad()
        loss.backward()
        opt.step()

        model.loss_history_train(loss, _iter)
        model.parameter_history(_iter)

        _iter += 1

print("Done!")

In [None]:
model.draw_gradient_stats()

In [None]:
model.draw_activation_stats()

plotting losses

In [None]:
model.draw_loss_history_train()

In [None]:
# NOTE(review): the overfitting loop above never calls
# `model.loss_history_test`, so there may be nothing to draw here — confirm.
model.draw_loss_history_test()

In [None]:
model.draw_parameter_stats()

In [None]:
model.clean_hooks()

inference for a few samples

In [None]:
train_features, train_labels = next(iter(dataloader))

In [None]:
model.eval();

inspecting predictions

In [None]:
# Forward pass for inspection only. Run it under `torch.no_grad()` so no
# autograd graph is built for the prediction.
train_features = train_features.to(device)
with torch.no_grad():
    pred_probs = model(train_features)
pred_probs

In [None]:
y_pred = pred_probs.detach().cpu().numpy().argmax(axis=1)
y_pred

In [None]:
train_labels

In [None]:
# Report the batch shapes, show the first image (moved to the CPU for
# plotting) and print its true and predicted label.
img, label = train_features[0].cpu(), train_labels[0]
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")
plt.imshow(img, cmap="gray")
plt.axis("off")
plt.show()
print(f"Label: {label}, pred: {y_pred[0]}")

inspecting the effect of the learned filters on an image

In [None]:
# Show the activated output of every first-layer filter for the first image
# of the batch, one panel per filter.
n_filters = int(model.model.conv1.n_out_channels.item())

# Derive the grid from the number of filters instead of hard-coding a layout
# that only fits 12 filters; 12 filters still give the 4 x 3 grid.
n_cols = 3
n_rows = max(1, -(-n_filters // n_cols))  # ceiling division
fig, axs = plt.subplots(
    nrows=n_rows, ncols=n_cols, figsize=(12, 12), squeeze=False
)
with torch.no_grad():
    conv_features = model.model.act_conv1(
        model.model.conv1(train_features.unsqueeze(1))
    ).cpu()
    for i, ax in enumerate(axs.flatten()):
        # panels beyond the last filter stay empty and are only hidden
        if i < n_filters:
            ax.imshow(conv_features[0][i], cmap="gray")
            ax.set_title(f"Filter {i+1}")
        ax.axis("off")

plt.show()
print(f"Label: {label}")

## overfitting with `Learner`

In [None]:
ds_train = rnnm_data.MNISTDatasetWithLabels(X0, y0)

In [None]:
int(ds_train.y.iloc[0])

In [None]:
ds_train[0];

In [None]:
# Training loader for the `Learner`: one sample per batch, collated with the
# project's MNIST collate function.
# NOTE: `shuffle=True` deviates from the paper notes above, which state that
# samples were always presented in the same order.
dl_train = DataLoader(
    ds_train,
    batch_size=1,
    collate_fn=rnnm_data.collate_mnist_dataset_to_block_with_labels,
    shuffle=True,
)

In [None]:
next(iter(dl_train))

In [None]:
model = conv_lecun1990.Model2(
    lecun_init=True, lecun_act=True, dtype=torch.float
)

In [None]:
# Set up optimiser, scheduler, loss, callbacks and the `Learner` for the
# overfitting run.
n_epochs = 20
lr = 1e-2
optimizer = optim.SGD(model.parameters(), lr=lr)  # , momentum=1e-3
# One-cycle schedule sized for n_epochs * len(dl_train) optimiser steps.
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer=optimizer,
    max_lr=lr,
    epochs=n_epochs,
    steps_per_epoch=len(dl_train),
)
loss = rnnm_losses.MSELossMNIST1HotLabel()
save_dir = Path("./models")

# Callbacks that collect the training loss and activation / gradient /
# parameter telemetry.
# NOTE(review): presumably `every_n=100` means "every 100 iterations" and
# `name_patterns` are regexes matched against module names — confirm.
loss_callback = rnnm_learner.TrainLossCallback()
activations_callback = rnnm_learner.TrainActivationsCallback(
    every_n=100, max_depth_search=4, name_patterns=(r".*act.*",)
)
gradients_callback = rnnm_learner.TrainGradientsCallback(
    every_n=100, max_depth_search=4, name_patterns=(r"conv\d$", r"lin\d")
)
parameters_callback = rnnm_learner.TrainParametersCallback(
    every_n=100, max_depth_search=4, name_patterns=(r"conv\d$", r"lin\d")
)


# NOTE(review): `scheduler_callback` is created but not added to `callbacks`
# below, so nothing visible in this notebook steps the OneCycleLR scheduler —
# confirm whether that is intended.
scheduler_callback = rnnm_learner.EveryBatchSchedulerCallback(scheduler)
callbacks = [
    loss_callback,
    activations_callback,
    gradients_callback,
    parameters_callback,
]

# Passed separately to `learner.find_learning_rate` in the next cell.
lr_find_callback = rnnm_learner.LRFinderCallback(1e-5, 10, 100)

learner = rnnm_learner.Learner(
    model,
    optimizer,
    loss,
    callbacks=callbacks,
    save_dir=save_dir,
    device=device,
)

In [None]:
learner.find_learning_rate(
    dl_train, n_epochs=2, lr_find_callback=lr_find_callback
)

In [None]:
lr_find_callback.plot()

In [None]:
learner.fit(dl_train, n_epochs=n_epochs)

In [None]:
loss_callback.plot()

In [None]:
# Inference loader over the same training subset: fixed order (no shuffling)
# so the predictions can be compared with `y0`, in batches of 10.
dl_inf = DataLoader(
    ds_train,
    batch_size=10,
    collate_fn=rnnm_data.collate_mnist_dataset_to_block_with_labels,
    shuffle=False,
)

y_prob = learner.predict(dl_inf)

In [None]:
y_prob.argmax(dim=1)

In [None]:
y0

In [None]:
# Stop executing the notebook here when only the overfitting checks are
# wanted; everything below trains on the larger 10k-digit subset.
if DO_OVERFITTING_ONLY:
    raise SystemExit("Skipping training beyond overfitting.")

## Getting ~95% accuracy on 10k digits

splitting 10k digits

In [None]:
# Two consecutive 80/20 splits of the first 10k samples:
# first into train+valid (X0) / test (X2), then X0 into train (X0) / valid (X1).
# The cell output shows the three resulting sizes.
n = 10_000  # bit larger than the number of samples in the paper
X0, X2, y0, y2 = train_test_split(
    X.iloc[:n], y.iloc[:n], test_size=0.2, random_state=42
)  # NOTE: the split is not stratified by label
X0, X1, y0, y1 = train_test_split(X0, y0, test_size=0.2, random_state=42)
len(X0), len(X1), len(X2)

In [None]:
ds = conv_lecun1990.DigitsDataset(X0, y0)
ds_valid = conv_lecun1990.DigitsDataset(X1, y1)
ds_test = conv_lecun1990.DigitsDataset(X2, y2)

In [None]:
# Training uses one sample per step in a fixed order (as in the paper notes
# above); validation and test are only evaluated, so they use batches of 500.
batch_size = 1
dataloader = DataLoader(ds, batch_size=batch_size, shuffle=False)
dataloader_valid = DataLoader(ds_valid, batch_size=500, shuffle=False)
dataloader_test = DataLoader(ds_test, batch_size=500, shuffle=False)

In [None]:
# Fresh network for the 10k-digit run, again with the paper's initialisation
# and activation, wrapped in the telemetry helper.
model = conv_lecun1990.Model(lecun_init=True, lecun_act=True)
# NOTE(review): presumably parameters, activations and gradients are sampled
# every 100 iterations for sub-modules whose names match the given regex
# patterns — confirm against `telemetry.ModelTelemetry`.
model = telemetry.ModelTelemetry(
    model,
    parameters_every_n=100,
    activations_every_n=100,
    gradients_every_n=100,
    activations_name_patterns=(".*act.*",),
    gradients_name_patterns=(r"conv\d$", r"lin\d"),
    parameters_name_patterns=(r"conv\d$", r"lin\d"),
)
# Switch the model to double precision; the return value is the cell output.
model.double()

In [None]:
# The paper does not state a learning rate; 0.01 is an arbitrary choice.
learning_rate = 0.01
opt = SGD(model.parameters(), lr=learning_rate)

In [None]:
# loss_func = nn.CrossEntropyLoss()
loss_func = nn.MSELoss()

In [None]:
model.to(device);

training and validation

In [None]:
# Per-sample SGD for `n_epochs` epochs; after every epoch the loss on the
# validation split is computed and handed to the telemetry wrapper.
n_epochs = 10
_iter = 0  # running step counter passed to the telemetry calls
model.train()
for epoch in tqdm.tqdm(range(n_epochs), desc="Epochs", total=n_epochs):
    for i, (xb, yb) in tqdm.tqdm(
        enumerate(dataloader), desc="Batches", total=len(dataloader)
    ):
        xb = xb.to(device)
        yb = yb.to(device)
        # integer labels -> vectors of -1/+1
        yb = conv_lecun1990.densify_y(yb)
        loss = loss_func(model(xb), yb)

        opt.zero_grad()
        loss.backward()
        opt.step()

        model.loss_history_train(loss, _iter)
        model.parameter_history(_iter)

        _iter += 1

    # compute validation loss
    with torch.no_grad():
        # evaluation mode for the validation pass; switched back below
        model.eval()
        ys_pred, ys_true = [], []
        for xb, yb in dataloader_valid:
            xb = xb.to(device)
            yb = yb.to(device)
            yb = conv_lecun1990.densify_y(yb)
            yp = model(xb)
            ys_pred.append(yp)
            ys_true.append(yb)
        # one loss value over the whole validation split rather than a
        # per-batch average
        y_pred = torch.cat(ys_pred, dim=0)
        y_true = torch.cat(ys_true, dim=0)
        loss_test = loss_func(y_pred, y_true)
        model.loss_history_test(loss_test, _iter)

        model.train()

print("Done!")

plotting gradients

In [None]:
model.draw_gradient_stats()

plotting activations

In [None]:
model.draw_activation_stats()

plotting losses

In [None]:
model.draw_loss_history_train()

In [None]:
model.draw_loss_history_test()

drawing histograms of the weights and biases across training iterations

In [None]:
model.draw_parameter_stats()

In [None]:
model.clean_hooks()

inference for a few samples

In [None]:
train_features, train_labels = next(iter(dataloader))

In [None]:
model.eval();

inspecting predictions

In [None]:
# Forward pass for inspection only. Run it under `torch.no_grad()` so no
# autograd graph is built for the prediction.
train_features = train_features.to(device)
with torch.no_grad():
    pred_probs = model(train_features)
pred_probs

In [None]:
y_pred = pred_probs.cpu().detach().numpy().argmax(axis=1)
y_pred

In [None]:
train_labels

In [None]:
# Report the batch shapes, show the first image (moved to the CPU for
# plotting) and print its true and predicted label.
img, label = train_features[0].cpu(), train_labels[0]
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")
plt.imshow(img, cmap="gray")
plt.axis("off")
plt.show()
print(f"Label: {label}, pred: {y_pred[0]}")

inspecting the effect of the learned filters on an image

In [None]:
# Show the activated output of every first-layer filter for the first image
# of the batch, one panel per filter.
n_filters = int(model.model.conv1.n_out_channels.item())

# Derive the grid from the number of filters instead of hard-coding a layout
# that only fits 12 filters; 12 filters still give the 4 x 3 grid.
n_cols = 3
n_rows = max(1, -(-n_filters // n_cols))  # ceiling division
fig, axs = plt.subplots(
    nrows=n_rows, ncols=n_cols, figsize=(12, 12), squeeze=False
)
with torch.no_grad():
    conv_features = model.model.act_conv1(
        model.model.conv1(train_features.unsqueeze(1))
    ).cpu()
    for i, ax in enumerate(axs.flatten()):
        # panels beyond the last filter stay empty and are only hidden
        if i < n_filters:
            ax.imshow(conv_features[0][i], cmap="gray")
            ax.set_title(f"Filter {i+1}")
        ax.axis("off")

plt.show()
print(f"Label: {label}")

computing test set performance

In [None]:
# Predict the digit for every test sample, batch by batch, and collect the
# predictions alongside the true labels as flat numpy arrays.
ys_pred = []
ys_true = []
model.eval()  # make sure the model is in evaluation mode
# Evaluation needs no gradients: `no_grad` avoids building an autograd graph
# for each batch of 500 images.
with torch.no_grad():
    for test_features, test_labels in dataloader_test:
        test_features = test_features.to(device)
        pred_probs = model(test_features)

        # index of the largest of the 10 outputs = predicted digit
        y_pred = pred_probs.detach().cpu().numpy().argmax(axis=1)

        ys_true.append(test_labels.numpy())
        ys_pred.append(y_pred)


ys_true = np.concatenate(ys_true)
ys_pred = np.concatenate(ys_pred)

In [None]:
ys_true, ys_pred

In [None]:
accuracy = metrics.accuracy_score(ys_true, ys_pred)
error_rate = 1 - accuracy
print(f"* Accuracy: {accuracy:.2%}")
print(f"* Error rate: {error_rate:.2%}")

* Accuracy: 95.75%
* Error rate: 4.25%

In [None]:
# Confusion matrix of true vs. predicted digits on the test set.
# `from_predictions` returns a display object, not an Axes.
disp = metrics.ConfusionMatrixDisplay.from_predictions(ys_true, ys_pred)
# Switch off only the grid lines (the seaborn theme enables them) and keep
# the tick and axis labels that identify the digit of each row and column.
disp.ax_.grid(False)
plt.show()

## using `Learner`

In [None]:
ds_train = rnnm_data.MNISTDatasetWithLabels(X0, y0)
ds_valid = rnnm_data.MNISTDatasetWithLabels(X1, y1)
ds_test = rnnm_data.MNISTDatasetWithLabels(X2, y2)

In [None]:
ds_train[0];

In [None]:
# All three loaders share the project's collate function. Training is
# shuffled with one sample per batch; validation and test keep their order
# and use batches of 200.
collate_fn = rnnm_data.collate_mnist_dataset_to_block_with_labels
dl_train = DataLoader(
    ds_train, batch_size=1, collate_fn=collate_fn, shuffle=True
)
eval_loader_kwargs = dict(batch_size=200, collate_fn=collate_fn, shuffle=False)
dl_valid = DataLoader(ds_valid, **eval_loader_kwargs)
dl_test = DataLoader(ds_test, **eval_loader_kwargs)

In [None]:
next(iter(dl_train))

In [None]:
model = conv_lecun1990.Model2(
    lecun_init=True, lecun_act=True, dtype=torch.float
)

In [None]:
# Set up optimiser, loss, callbacks and the `Learner` for the 10k-digit run.
# This run uses Adam without a learning-rate scheduler; the SGD / OneCycleLR
# variant is kept below as commented-out code.
n_epochs = 10
lr = 1e-3
# optimizer = optim.SGD(model.parameters(), lr=lr)  # , momentum=1e-3
optimizer = optim.Adam(model.parameters(), lr=lr)
# scheduler = optim.lr_scheduler.OneCycleLR(
#     optimizer=optimizer,
#     max_lr=lr,
#     epochs=n_epochs,
#     steps_per_epoch=len(dl_train),
# )
loss = rnnm_losses.MSELossMNIST1HotLabel()
save_dir = Path("./models")

# Callbacks that collect the training loss and activation / gradient /
# parameter telemetry, plus early stopping with patience=3.
# NOTE(review): presumably `every_n=100` means "every 100 iterations" and
# `name_patterns` are regexes matched against module names — confirm.
loss_callback = rnnm_learner.TrainLossCallback()
activations_callback = rnnm_learner.TrainActivationsCallback(
    every_n=100, max_depth_search=4, name_patterns=(r".*act.*",)
)
gradients_callback = rnnm_learner.TrainGradientsCallback(
    every_n=100, max_depth_search=4, name_patterns=(r"conv\d$", r"lin\d")
)
parameters_callback = rnnm_learner.TrainParametersCallback(
    every_n=100, max_depth_search=4, name_patterns=(r"conv\d$", r"lin\d")
)
early_stopping_callback = rnnm_learner.EarlyStoppingCallback(patience=3)

# scheduler_callback = rnnm_learner.EveryBatchSchedulerCallback(scheduler)
callbacks = [
    loss_callback,
    activations_callback,
    gradients_callback,
    parameters_callback,
    early_stopping_callback,
    # scheduler_callback
]

# Passed separately to `learner.find_learning_rate` in the next cell.
lr_find_callback = rnnm_learner.LRFinderCallback(1e-5, 10, 100)

# NOTE(review): unlike the Learner in the overfitting section, no `device` is
# passed here — confirm that the Learner's default device is the intended one.
learner = rnnm_learner.Learner(
    model, optimizer, loss, callbacks=callbacks, save_dir=save_dir
)

In [None]:
learner.find_learning_rate(
    dl_train, n_epochs=2, lr_find_callback=lr_find_callback
)

In [None]:
lr_find_callback.plot()

In [None]:
learner.fit(dl_train, n_epochs=n_epochs, dataloader_valid=dl_valid)

In [None]:
loss_callback.plot(window=100)

In [None]:
losses_valid = loss_callback.get_losses_valid()
losses_valid

In [None]:
parameters_callback.plot()

In [None]:
gradients_callback.plot()

In [None]:
activations_callback.plot()

computing test set performance

In [None]:
y_prob = learner.predict(dl_test)

In [None]:
# Predicted digit = index of the largest output per sample. Move the result
# to the CPU first: `.numpy()` raises for tensors on another device, and
# `.cpu()` is a no-op when the tensor is already there.
ys_pred = y_prob.argmax(dim=1).cpu().numpy()
ys_pred

In [None]:
# True test labels as an integer array. Comparing them element-wise with
# `ys_pred` relies on `dl_test` being created with `shuffle=False`, so the
# predictions are in the same order as `y2`.
ys_true = np.array([int(v) for v in y2.values])
ys_true

In [None]:
accuracy = metrics.accuracy_score(ys_true, ys_pred)
error_rate = 1 - accuracy
print(f"* Accuracy: {accuracy:.2%}")
print(f"* Error rate: {error_rate:.2%}")

* Accuracy: 95.15%
* Error rate: 4.85%

In [None]:
# Confusion matrix of true vs. predicted digits on the test set.
# `from_predictions` returns a display object, not an Axes.
disp = metrics.ConfusionMatrixDisplay.from_predictions(ys_true, ys_pred)
# Switch off only the grid lines (the seaborn theme enables them) and keep
# the tick and axis labels that identify the digit of each row and column.
disp.ax_.grid(False)
plt.show()