In [1]:
!nvidia-smi

Wed Nov  9 02:07:58 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor
from urllib.request import urlretrieve
from fastcore.test import test_close
torch.manual_seed(42)

mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

MNIST_URL='https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true'
path_data = Path('data')
path_data.mkdir(exist_ok=True)
path_gz = path_data/'mnist.pkl.gz'

if not path_gz.exists(): urlretrieve(MNIST_URL, path_gz)

In [3]:
# Load the gzipped MNIST pickle fetched from MNIST_URL in the setup cell.
# SECURITY NOTE: `pickle.load` can execute arbitrary code embedded in the file,
# so only run this against an archive obtained from a source you trust.
# The third element of the pickled tuple is discarded (presumably the test
# split -- TODO confirm against the dataset's documentation).
with gzip.open(path_gz, 'rb') as f: 
  ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
  
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

In [4]:
!ls -l data

total 16656
-rw-r--r-- 1 root root 17051982 Nov  9 02:08 mnist.pkl.gz


# Foundations data

## Basic architecture

In [5]:
row, col = x_train.shape
c = y_train.max() + 1
row, col, c

(50000, 784, tensor(10))

In [6]:
y_train.unique()

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [7]:
# num_hidden
nh = 50

In [8]:
w1 = torch.randn(col, nh) # 784 50
b1 = torch.zeros(nh) # 50
w2 = torch.randn(nh, 1) # 50, 1
b2 = torch.zeros(1)

w1.shape, b1.shape, w2.shape, b2.shape

(torch.Size([784, 50]), torch.Size([50]), torch.Size([50, 1]), torch.Size([1]))

In [9]:
def lin(x, w, b):
  """Affine map: matrix-multiply `x` by `w`, then add `b` (broadcast over rows)."""
  projected = x @ w
  return projected + b

In [10]:
print(x_valid.shape)

torch.Size([10000, 784])


In [11]:
t = lin(x_valid, w1, b1) # [10000, 784] * [784, 50] -> [10000, 50] + [50]
t.shape

torch.Size([10000, 50])

In [12]:
t

tensor([[ -0.09,  11.87, -11.39,  ...,   5.48,   2.14,  15.30],
        [  5.38,  10.21, -14.49,  ...,   0.88,   0.08,  20.23],
        [  3.31,   0.12,   3.10,  ...,  16.89,  -6.05,  24.74],
        ...,
        [  4.01,  10.35, -11.25,  ...,   0.23,  -5.30,  18.28],
        [ 10.62,  -4.27,  10.72,  ...,  -2.87,  -2.87,  18.23],
        [  2.84,  -0.22,   1.43,  ...,  -3.91,   5.75,   2.12]])

In [13]:
# all negative numbers get set to 0.00
def relu(x):
  """Rectified linear unit: values below zero become 0, everything else is kept."""
  floor = 0.
  return x.clamp_min(floor)

In [14]:
t = relu(t)
t

tensor([[ 0.00, 11.87,  0.00,  ...,  5.48,  2.14, 15.30],
        [ 5.38, 10.21,  0.00,  ...,  0.88,  0.08, 20.23],
        [ 3.31,  0.12,  3.10,  ..., 16.89,  0.00, 24.74],
        ...,
        [ 4.01, 10.35,  0.00,  ...,  0.23,  0.00, 18.28],
        [10.62,  0.00, 10.72,  ...,  0.00,  0.00, 18.23],
        [ 2.84,  0.00,  1.43,  ...,  0.00,  5.75,  2.12]])

In [15]:
def model(xb):
  """Two-layer network: linear (w1, b1) -> ReLU -> linear (w2, b2).

  Uses the module-level weights; with w2 of shape [nh, 1] the result has one
  column per input row.
  """
  hidden = relu(lin(xb, w1, b1))  # negatives in the hidden layer are zeroed
  return lin(hidden, w2, b2)

In [16]:
res = model(x_valid)
res.shape

torch.Size([10000, 1])

# Loss function: MSE

( Of course, `mse` is not a suitable loss function for multi-class classification; we'll use a better loss function soon. We'll use `mse` for now to keep things simple. )

In [17]:
res.shape, y_valid.shape

(torch.Size([10000, 1]), torch.Size([10000]))

In [18]:
(res - y_valid).shape

torch.Size([10000, 10000])

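Subtracting a `[10000]` tensor from a `[10000, 1]` tensor broadcasts both operands to `[10000, 10000]`, which is not what we want. We need to get rid of that trailing unit dimension `(, 1)` in order to use `mse`.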

In [19]:
# option 1
res[:, 0].shape

torch.Size([10000])

In [20]:
# option 2
res.squeeze(1).shape

torch.Size([10000])

In [21]:
(res[:,0]-y_valid).shape

torch.Size([10000])

In [22]:
y_train, y_valid = y_train.float(), y_valid.float()

preds = model(x_train)
preds.shape

torch.Size([50000, 1])

In [23]:
def mse(output, target):
  """Mean squared error between column 0 of `output` and `target`."""
  residual = output[:, 0] - target
  squared = residual.pow(2)
  return squared.mean()

In [24]:
mse(preds, y_train)

tensor(4308.76)

## Gradients and backward pass

In [25]:
from sympy import symbols, diff

x, y = symbols('x y')
# find deriv.
diff(x**2, x)

2*x

In [26]:
diff(3*x**2+9, x)

6*x

In [30]:
w1.shape, b1.shape, w2.shape, b2.shape

(torch.Size([784, 50]), torch.Size([50]), torch.Size([50, 1]), torch.Size([1]))

In [31]:
x_train.shape, y_train.shape

(torch.Size([50000, 784]), torch.Size([50000]))

In [35]:
def lin_grad(input, output, weight, bias):
  """Back-propagate through `output = input @ weight + bias`.

  Reads the upstream gradient from `output.g` and stores the results as `.g`
  attributes on `input`, `weight` and `bias`. Returns nothing.
  Expects 2-D `input` and `output.g` (rows are samples).
  """
  # Gradient w.r.t. the input: push the upstream gradient back through the weights.
  input.g = output.g @ weight.t()
  # Gradient w.r.t. the weights: the sum over rows of the outer products of
  # each input row with its upstream-gradient row. Expressed as one matmul so
  # the (rows, n_in, n_out) intermediate that an unsqueeze-and-broadcast
  # formulation would allocate is never materialised.
  weight.g = input.t() @ output.g
  # The bias is added to every row, so its gradient is the sum over rows.
  bias.g = output.g.sum(0)

In [36]:
# def lin(x, w, b):
#   return x@w + b

def forward_and_backward(input, target):
  """Run the two-layer net on `input`, then back-propagate the MSE loss by hand.

  Gradients are written to `.g` attributes (on the module-level weights, on
  `input`, and on the intermediate activations). Nothing is returned.
  """
  # ---- forward pass ----
  hidden_pre = lin(input, w1, b1)        # first linear layer
  hidden_act = relu(hidden_pre)          # negatives replaced with 0
  output = lin(hidden_act, w2, b2)       # second linear layer, one column
  err = output[:, 0] - target
  loss = err.pow(2).mean()               # MSE; computed for reference only

  # ---- backward pass ----
  n_rows = input.shape[0]
  output.g = 2. * err[:, None] / n_rows  # derivative of the mean of squares
  lin_grad(hidden_act, output, w2, b2)
  # ReLU passes gradient only where its input was positive.
  hidden_pre.g = (hidden_pre > 0).float() * hidden_act.g
  lin_grad(input, hidden_pre, w1, b1)

In [37]:
forward_and_backward(x_train, y_train)

In [38]:
def get_grad(x):
  """Return an independent copy of the gradient stored in `x.g`."""
  stored = x.g
  return stored.clone()

# Tensors whose hand-computed `.g` gradients are snapshotted below.
chks = w1, w2, b1, b2, x_train
# NOTE(review): `map` returns a one-shot iterator. The chained assignment binds
# that iterator to `grads` and then unpacks the same object into w1g..ig, which
# consumes it, so iterating `grads` afterwards yields nothing. Wrap the call in
# `tuple(...)` if `grads` is meant to be reused.
grads = w1g, w2g, b1g, b2g, ig = map(get_grad, chks)

In [39]:
def mkgrad(x):
  """Clone `x` and mark the clone as requiring gradients; `x` itself is untouched."""
  duplicate = x.clone()
  duplicate.requires_grad_(True)
  return duplicate
# NOTE(review): as with any `map` result, the unpacking into w12..xt2 consumes
# the iterator that `ptgrads` is bound to, so `ptgrads` is empty when iterated
# later. Wrap the call in `tuple(...)` if `ptgrads` is meant to be reused.
ptgrads = w12,w22,b12,b22,xt2 = map(mkgrad, chks)

In [40]:
def forward(input, target):
  """Forward pass through the gradient-tracking weight copies; returns the MSE loss."""
  hidden = relu(lin(input, w12, b12))
  prediction = lin(hidden, w22, b22)
  return mse(prediction, target)

In [41]:
loss = forward(xt2, y_train)
loss.backward()

In [42]:
# Compare each hand-computed gradient with the one autograd produced.
# The manual gradients are plain tensors (copies of `.g`); autograd stores its
# result in the `.grad` attribute of the requires-grad copies, so `.grad` is
# read from the second element of each pair.
# The unpacked names are listed explicitly because `grads` and `ptgrads` were
# bound to `map` iterators that the unpacking assignments already consumed;
# zipping those would compare nothing.
manual_grads = (w1g, w2g, b1g, b2g, ig)
autograd_tensors = (w12, w22, b12, b22, xt2)
for manual, auto in zip(manual_grads, autograd_tensors):
  test_close(manual, auto.grad, eps=0.01)

# Refactor model

## Layers as classes

In [49]:
class Relu():
  """ReLU layer that remembers its last input and output so `backward` needs no arguments."""

  def __call__(self, input):
    self.input, self.output = input, input.clamp_min(0.)
    return self.output

  def backward(self):
    # Gradient flows only through positions where the input was strictly positive.
    passthrough = (self.input > 0).float()
    self.input.g = passthrough * self.output.g

In [50]:
class Lin():
  """Linear layer over externally supplied weight `w` and bias `b`.

  Calling it caches the input and output; `backward` then reads `output.g`
  and writes `.g` on the input, the weight and the bias.
  """

  def __init__(self, w, b):
    self.w, self.b = w, b

  def __call__(self, input):
    self.input = input
    self.output = lin(input, self.w, self.b)
    return self.output

  def backward(self):
    upstream = self.output.g
    self.input.g = upstream @ self.w.t()
    self.w.g = self.input.t() @ upstream
    self.b.g = upstream.sum(0)

In [51]:
class Mse():
  """Mean-squared-error loss that caches its operands for a later `backward`."""

  def __call__(self, input, target):
    self.input, self.target = input, target
    self.output = mse(input, target)
    return self.output

  def backward(self):
    # Derivative of mean((input - target)^2) with respect to `input`.
    n_targets = self.target.shape[0]
    err = self.input.squeeze() - self.target
    self.input.g = 2. * err.unsqueeze(-1) / n_targets

In [52]:
class Model():
  """Linear -> ReLU -> Linear network with an MSE loss attached."""

  def __init__(self, w1, b1, w2, b2):
    self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
    self.loss = Mse()

  def __call__(self, x, targ):
    # Thread the activation through every layer, then score it against the target.
    activation = x
    for layer in self.layers:
      activation = layer(activation)
    return self.loss(activation, targ)

  def backward(self):
    # Loss first, then the layers from last to first.
    self.loss.backward()
    for layer in self.layers[::-1]:
      layer.backward()

In [53]:
model = Model(w1, b1, w2, b2)

In [54]:
loss = model(x_train, y_train)

In [55]:
model.backward()

In [56]:
test_close(w2g, w2.g, eps=0.01)
test_close(b2g, b2.g, eps=0.01)
test_close(w1g, w1.g, eps=0.01)
test_close(b1g, b1.g, eps=0.01)
test_close(ig, x_train.g, eps=0.01)

# Module.forward()

In [57]:
class Module():
  """Base class for hand-rolled layers.

  Calling an instance records the positional arguments in `self.args`, runs
  `forward`, and stores the result in `self.out`. `backward()` then hands the
  stored output and arguments to `bwd`. Subclasses implement `forward` and `bwd`.
  """

  def __call__(self, *args):
    self.args = args
    self.out = self.forward(*args)
    return self.out

  def forward(self, *args):
    # Accept any arguments so an un-overridden call reports "not implemented"
    # rather than an argument-count TypeError.
    raise NotImplementedError('not implemented')

  def backward(self):
    self.bwd(self.out, *self.args)

  def bwd(self, *args):
    raise NotImplementedError('not implemented')

In [None]:
class Relu(Module):
  """ReLU expressed through the `Module` forward/bwd protocol."""

  def forward(self, input):
    floor = 0.
    return input.clamp_min(floor)

  def bwd(self, output, input):
    # Only strictly positive inputs let the gradient through.
    passthrough = (input > 0).float()
    input.g = passthrough * output.g

In [None]:
class Lin(Module):
  """Linear layer (`input @ w + b`) expressed through the `Module` protocol."""

  def __init__(self, w, b):
    self.w = w
    self.b = b

  def forward(self, input):
    return input@self.w + self.b

  def bwd(self, output, input):
    # Read the upstream gradient from the `output` argument: the `Module` base
    # stores the forward result as `self.out`, and no `self.output` attribute is
    # ever set on this class, so `self.output.g` would raise AttributeError.
    input.g = output.g @ self.w.t()
    self.w.g = input.t() @ output.g
    self.b.g = output.g.sum(0)

In [None]:
class Mse(Module):
  """Mean-squared-error loss expressed through the `Module` protocol."""

  def forward(self, input, target):
    err = input.squeeze() - target
    return err.pow(2).mean()

  def bwd(self, output, input, target):
    # Derivative of the mean of squared errors with respect to `input`.
    n_targets = target.shape[0]
    input.g = 2*(input.squeeze() - target).unsqueeze(-1) / n_targets

In [None]:
model = Model(w1, b1, w2, b2) 

In [None]:
loss = model(x_train, y_train)

In [None]:
model.backward()

In [None]:
test_close(w2g, w2.g, eps=0.01)
test_close(b2g, b2.g, eps=0.01)
test_close(w1g, w1.g, eps=0.01)
test_close(b1g, b1.g, eps=0.01)
test_close(ig, x_train.g, eps=0.01)

# Autograd

In [1]:
from torch import nn
import torch.nn.functional as F

In [3]:
class Linear(nn.Module):
  """Linear layer whose weight and bias are registered `nn.Parameter`s.

  Args:
    n_input: number of input features (rows of `w`).
    n_output: number of output features (columns of `w`, length of `b`).
  """

  def __init__(self, n_input, n_output):
    super().__init__()
    # Wrapping in nn.Parameter registers the tensors with the module, so they
    # show up in `parameters()` / `state_dict()` and follow `.to(...)`.
    # Parameters require gradients by default, so `.grad` is still populated.
    self.w = nn.Parameter(torch.randn(n_input, n_output))
    self.b = nn.Parameter(torch.zeros(n_output))

  def forward(self, input):
    return input@self.w + self.b

In [None]:
class Model(nn.Module):
  """Linear -> ReLU -> Linear network that returns the MSE loss against `target`.

  Args:
    n_input: number of input features.
    n_hidden: width of the hidden layer.
    n_output: number of output features.
  """

  def __init__(self, n_input, n_hidden, n_output):
    super().__init__()
    # nn.ModuleList (rather than a plain list) registers the layers as
    # sub-modules; indexing and iteration work the same as with a list.
    self.layers = nn.ModuleList([Linear(n_input, n_hidden), nn.ReLU(), Linear(n_hidden, n_output)])

  def forward(self, x, target):
    # nn.Module.__call__ dispatches to `forward`, so `model(x, target)` lands here.
    for l in self.layers:
      x = l(x)
    # target[:, None] adds a trailing unit dimension to `target` before the
    # comparison (presumably so its shape matches the network output -- verify
    # against the caller's n_output).
    return F.mse_loss(x, target[:,None])

In [None]:
# Build the network from the input width (`col`, taken from x_train.shape) and
# the hidden size (`nh`) defined earlier in the notebook; one output column.
model = Model(col, nh, 1)
loss = model(x_train, y_train)
loss.backward()

In [None]:
l0 = model.layers[0]
l0.b.grad