In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import torch

# MNIST dataset

In [None]:
import tensorflow as tf

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

In [None]:
# Flatten every image into a 784-long row and divide by 255 to rescale the pixel
# values. The row count is inferred (-1) instead of hard-coding the split sizes,
# so the cell keeps working on a subset of the data.
# NOTE: this cell rebinds x_train/x_test/y_train/y_test in place, so re-running it
# without re-running the loading cell would divide by 255 a second time.
x_train = torch.tensor(x_train.reshape(-1, 784) / 255, dtype=torch.float32)
x_test = torch.tensor(x_test.reshape(-1, 784) / 255, dtype=torch.float32)

# Integer class labels; torch.long is the 64-bit integer dtype (same as torch.int64).
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

In [None]:
# Seed the global RNG so the random initial parameters (and the training run that
# starts from them) are reproducible after a kernel restart.
torch.manual_seed(0)

# Parameters of a single linear layer mapping 784 inputs to 10 outputs.
# requires_grad=True makes autograd fill in .grad when backward() is called.
w = torch.randn(784, 10, requires_grad=True)
b = torch.randn(10, requires_grad=True)

In [None]:
x_train @ w + b

tensor([[ -2.5125, -15.0687,  -6.5273,  ...,  -8.1245,  -5.7722, -16.6154],
        [ -3.8493,  -2.0906,  -0.2493,  ..., -20.8255,   5.2351, -10.7412],
        [ -3.2036,   8.5659,   4.7551,  ...,   0.2433,   5.6426,  10.8439],
        ...,
        [  5.5522,  -3.6008,  -1.8677,  ...,  -6.2139,  -0.3410, -16.5657],
        [  5.7218,  -1.8805,  -0.4007,  ...,  -7.4124,  13.2211,  -0.5049],
        [  7.7313,  -2.1718,  13.4819,  ...,  -7.4111,  -1.5386,   4.1648]])

In [None]:
def log_softmax(x):
    """Return the log-softmax of ``x`` taken over its last dimension.

    Mathematically this is ``x - log(sum(exp(x)))`` per row. The row maximum is
    subtracted before exponentiating so that ``exp`` cannot overflow to ``inf``
    (large positive logits) and the sum cannot underflow to ``0`` (large negative
    logits); either would otherwise yield ``inf``/``-inf`` results.

    Args:
        x: tensor of scores; the last dimension indexes the classes.

    Returns:
        Tensor of the same shape as ``x`` holding log-probabilities.
    """
    # Subtracting a per-row constant does not change the softmax, so the shift can
    # be treated as a constant by autograd (hence detach()).
    row_max = x.amax(dim=-1, keepdim=True).detach()
    shifted = x - row_max
    return shifted - shifted.exp().sum(-1, keepdim=True).log()

def model(xb):
    """Apply the linear layer and return log-probabilities for the batch ``xb``.

    Reads the notebook-level parameters ``w`` and ``b``.
    """
    logits = xb @ w + b
    return log_softmax(logits)

# Maximum Likelihood (= minimize the negative log likelihood)

In [None]:
y_train

tensor([5, 0, 4,  ..., 5, 6, 8])

In [None]:
pred = x_train @ w + b
pred

tensor([[ -2.5125, -15.0687,  -6.5273,  ...,  -8.1245,  -5.7722, -16.6154],
        [ -3.8493,  -2.0906,  -0.2493,  ..., -20.8255,   5.2351, -10.7412],
        [ -3.2036,   8.5659,   4.7551,  ...,   0.2433,   5.6426,  10.8439],
        ...,
        [  5.5522,  -3.6008,  -1.8677,  ...,  -6.2139,  -0.3410, -16.5657],
        [  5.7218,  -1.8805,  -0.4007,  ...,  -7.4124,  13.2211,  -0.5049],
        [  7.7313,  -2.1718,  13.4819,  ...,  -7.4111,  -1.5386,   4.1648]])

## Ideal situation

0 0 0 0 0 1 0 0 0 0 -> 5

1 0 0 0 0 0 0 0 0 0 -> 0

0 0 0 0 1 0 0 0 0 0 -> 4

...

0 0 0 0 0 0 0 0 1 0 -> 8


In [None]:
pred[range(y_train.shape[0]),y_train]

tensor([-21.1225,  -3.8493, -11.3915,  ..., -20.0689,  -4.9855,  -1.5386])

In [None]:
pred[range(10), y_train[0:10]]

In [None]:
def nll(pred, target):
    """Mean negative log-likelihood of the target classes.

    Args:
        pred: per-sample scores indexed as ``pred[sample, class]``; expected to be
            log-probabilities (e.g. the output of a log-softmax).
        target: integer class index for each sample.

    Returns:
        Scalar tensor: minus the mean of the entries ``pred[i, target[i]]``.
    """
    rows = range(target.shape[0])
    # Pick, for every sample, the score assigned to its true class.
    picked = pred[rows, target]
    return -picked.mean()

# The training code refers to the loss through this alias.
loss_fn = nll

In [None]:
pred = model(x_train)

In [None]:
nll(pred, y_train)

tensor(14.7952)

In [None]:
def accuracy(out, yb):
    """Fraction of samples whose highest-scoring class equals the label.

    Args:
        out: scores indexed as ``out[sample, class]``.
        yb: integer class label for each sample.

    Returns:
        Scalar float tensor between 0 and 1.
    """
    predicted_class = out.argmax(dim=1)
    hits = predicted_class == yb
    return hits.float().mean()

# Manual Training

In [None]:
pred = model(x_train)
loss = loss_fn(pred, y_train)

In [None]:
loss

tensor(12.3007, grad_fn=<NegBackward0>)

In [None]:
loss.backward()

In [None]:
#w.grad

In [None]:
#b.grad

In [None]:
# One gradient-descent step (step size 0.1). no_grad() keeps the in-place
# parameter update itself from being recorded by autograd.
with torch.no_grad():
    for param in (w, b):
        param.sub_(param.grad * 0.1)
        # Clear the gradient so the next backward() does not add onto it.
        param.grad.zero_()

# Training

In [None]:
epochs = 100

for epoch in range(epochs):
    # Full-batch gradient descent: each step uses the whole training set.
    pred = model(x_train)
    loss = loss_fn(pred, y_train)
    if not epoch % 10:
        # Reported every 10th epoch, before that epoch's parameter update.
        print(loss)

    loss.backward()
    with torch.no_grad():
        # Step each parameter against its gradient (step size 0.1), then clear
        # the gradient so the next backward() does not accumulate onto it.
        for param in (w, b):
            param -= param.grad * 0.1
            param.grad.zero_()

tensor(3.9096, grad_fn=<NegBackward0>)
tensor(3.6638, grad_fn=<NegBackward0>)
tensor(3.4509, grad_fn=<NegBackward0>)
tensor(3.2654, grad_fn=<NegBackward0>)
tensor(3.1025, grad_fn=<NegBackward0>)
tensor(2.9585, grad_fn=<NegBackward0>)
tensor(2.8302, grad_fn=<NegBackward0>)
tensor(2.7153, grad_fn=<NegBackward0>)
tensor(2.6117, grad_fn=<NegBackward0>)
tensor(2.5178, grad_fn=<NegBackward0>)


In [None]:
accuracy(model(x_train), y_train)

tensor(0.5835)

In [None]:
accuracy(model(x_test), y_test)

In [None]:
# exp(-loss) converts a mean negative log-likelihood back to a probability scale.
# NOTE(review): 0.0615 does not match any loss printed in this notebook's saved
# outputs -- presumably taken from an earlier run; confirm or update.
np.exp(-0.0615)

# Mini-batch

In [None]:
lr = 0.1      # learning rate (gradient-descent step size)
epochs = 100  # number of passes over the training set
bs = 64       # mini-batch size
n, c = x_train.shape

for epoch in range(epochs):
    # Visit the training set in consecutive, unshuffled slices of `bs` rows; the
    # last slice is shorter when `n` is not a multiple of `bs`.
    for start_i in range(0, n, bs):
        xb = x_train[start_i:start_i + bs]
        yb = y_train[start_i:start_i + bs]
        pred = model(xb)
        loss = loss_fn(pred, yb)

        loss.backward()
        with torch.no_grad():
            # Update each parameter, then reset its gradient for the next batch.
            for param in (w, b):
                param -= param.grad * lr
                param.grad.zero_()

    # `loss` is the loss of the final mini-batch of the epoch, not an epoch average.
    if epoch % 10 == 0:
        print(loss)

tensor(0.4836, grad_fn=<NegBackward0>)
tensor(0.1746, grad_fn=<NegBackward0>)
tensor(0.1239, grad_fn=<NegBackward0>)
tensor(0.1040, grad_fn=<NegBackward0>)
tensor(0.0953, grad_fn=<NegBackward0>)
tensor(0.0908, grad_fn=<NegBackward0>)
tensor(0.0880, grad_fn=<NegBackward0>)
tensor(0.0859, grad_fn=<NegBackward0>)
tensor(0.0843, grad_fn=<NegBackward0>)
tensor(0.0829, grad_fn=<NegBackward0>)


In [None]:
pred

tensor([[-9.6767e+00, -1.8695e+01, -1.3041e+01, -7.9581e+00, -7.5522e+00,
         -4.8227e-03, -5.6319e+00, -1.3945e+01, -8.2594e+00, -1.0424e+01],
        [-9.6451e+00, -2.7131e+01, -1.3113e+01, -5.3890e+00, -5.8191e+00,
         -1.2668e+00, -1.5849e+01, -4.0046e+00, -6.7992e+00, -3.6914e-01],
        [-1.5248e+01, -2.7981e+01, -2.0218e-04, -1.1274e+01, -2.6447e+01,
         -1.1560e+01, -1.6773e+01, -3.5689e+01, -8.6245e+00, -2.7485e+01],
        [-1.2293e+01, -3.7065e+01, -1.3351e-05, -1.3595e+01, -1.9377e+01,
         -1.4956e+01, -1.3748e+01, -2.9878e+01, -1.2139e+01, -2.5337e+01],
        [-1.4458e-03, -3.9381e+01, -1.0708e+01, -1.3764e+01, -1.9548e+01,
         -6.5566e+00, -1.7076e+01, -2.4014e+01, -1.3542e+01, -1.6410e+01],
        [-1.0877e+01, -2.9444e+01, -1.5817e+01, -9.0341e+00, -8.1696e+00,
         -8.9930e+00, -1.8581e+01, -3.0543e+00, -8.4968e+00, -4.9093e-02],
        [-1.1209e+01, -2.1981e+01, -1.3725e+00, -6.9566e+00, -4.5177e-01,
         -6.5802e+00, -2.3093e+0

In [None]:
accuracy(model(x_train), y_train)

tensor(0.9287)

In [None]:
accuracy(model(x_test), y_test)

tensor(0.9207)

In [None]:
# The loss is the mean of -log p(correct class), so exp(-loss) is the geometric
# mean of the probability the model gives to the correct class. 0.0829 is the last
# loss printed by the mini-batch loop, i.e. it covers only that final mini-batch.
np.exp(-0.0829)

0.9204431869367559

# Refactoring with torch.nn

In [None]:
import torch.nn.functional as F

loss_fn = F.cross_entropy

def model(xb):
    """Return the raw linear-layer scores (logits) for the batch ``xb``.

    No log-softmax is applied here; the scores are returned as-is. Reads the
    notebook-level parameters ``w`` and ``b``.
    """
    projected = xb @ w
    return projected + b

In [None]:
# Seed the global RNG so this re-initialisation of the parameters is reproducible
# after a kernel restart.
torch.manual_seed(0)

# Fresh parameters for the linear layer (784 inputs -> 10 outputs), tracked by autograd.
w = torch.randn(784, 10, requires_grad=True)
b = torch.randn(10, requires_grad=True)

In [None]:
lr = 0.1      # learning rate (gradient-descent step size)
epochs = 100  # number of passes over the training set
bs = 64       # mini-batch size
n, c = x_train.shape

for epoch in range(epochs):
    # Same mini-batch schedule as before: consecutive, unshuffled slices of `bs`
    # rows, with a shorter final slice when `n` is not a multiple of `bs`.
    for start_i in range(0, n, bs):
        batch_rows = slice(start_i, start_i + bs)
        xb = x_train[batch_rows]
        yb = y_train[batch_rows]
        pred = model(xb)
        loss = loss_fn(pred, yb)

        loss.backward()
        with torch.no_grad():
            # Update each parameter, then reset its gradient for the next batch.
            for param in (w, b):
                param -= param.grad * lr
                param.grad.zero_()

    # `loss` is the loss of the final mini-batch of the epoch, not an epoch average.
    if epoch % 10 == 0:
        print(loss)

tensor(0.5285, grad_fn=<NllLossBackward0>)
tensor(0.1856, grad_fn=<NllLossBackward0>)
tensor(0.1382, grad_fn=<NllLossBackward0>)
tensor(0.1124, grad_fn=<NllLossBackward0>)
tensor(0.0959, grad_fn=<NllLossBackward0>)
tensor(0.0846, grad_fn=<NllLossBackward0>)
tensor(0.0765, grad_fn=<NllLossBackward0>)
tensor(0.0706, grad_fn=<NllLossBackward0>)
tensor(0.0663, grad_fn=<NllLossBackward0>)
tensor(0.0632, grad_fn=<NllLossBackward0>)


In [None]:
accuracy(model(x_train), y_train)

tensor(0.9286)