# Lesson 2

## 1. Getting Data and Normalizing

In [153]:
#export

# In the course, Jeremy builds on top of previous notebooks by importing them
# Since we are storing previous files in Drive, synced with git, this is a workaround to import them
# ref: https://stackoverflow.com/questions/4383571/importing-files-from-different-folder

import sys
import math
sys.path.insert(1, '/content/drive/MyDrive/git/fast-ai-part2/exp/')
from nb_01 import *

def get_data():
  """Download the MNIST pickle archive and return its arrays as tensors.

  Returns:
    A lazy `map` object yielding, in order, x_train, x_valid, y_train and
    y_valid, each passed through `tensor`. The third element of the pickle
    is discarded.
  """
  archive = datasets.download_data(MNIST_URL, ext='.gz')
  with gzip.open(archive, 'rb') as fh:
    # NOTE(review): pickle.load can execute arbitrary code; only use with a trusted download source.
    train_pair, valid_pair, _ = pickle.load(fh, encoding='latin-1')
  x_train, y_train = train_pair
  x_valid, y_valid = valid_pair
  return map(tensor, (x_train, x_valid, y_train, y_valid))

def normalize(x,m,s):
  """Standardize `x`: subtract the mean `m`, then divide by the std `s`."""
  centered = x - m
  return centered / s

In [154]:
# get_data() returns a lazy map over four tensors; unpacking consumes it in
# the order x_train, x_valid, y_train, y_valid.
x_train, x_valid, y_train, y_valid = get_data()

In [155]:
# Statistics of the raw training inputs; reused below to normalize both splits.
train_mean, train_std = x_train.mean(), x_train.std() 
train_mean, train_std 

(tensor(0.1304), tensor(0.3073))

In [156]:
# The validation set is normalized with the TRAINING mean/std so both splits
# share one scale.
# NOTE(review): this overwrites x_train/x_valid in place, so re-running the cell
# normalizes already-normalized data; re-run get_data() first.
x_train = normalize(x_train, train_mean, train_std)
x_valid = normalize(x_valid, train_mean, train_std)

In [157]:
x_train.mean(), x_train.std()

(tensor(3.8966e-08), tensor(1.))

In [158]:
def test_near_zero(a, atol = 1e-03): assert abs(a)<atol, f"{a}:Near Zero"

In [159]:
test_near_zero(x_train.mean())

In [160]:
test_near_zero(1-x_train.std())

In [161]:
# n, m: the two dimensions of the training input matrix (rows, columns).
# c: largest training label plus one; note it is a 0-dim tensor, not a Python int.
n,m = x_train.shape
c = y_train.max()+1
n,m,c

(50000, 784, tensor(10))

## 2. Creating the Layers

### Xavier init

In [162]:
# Number of hidden layer nodes
# 1 Hidden layer with 50 nodes
nh = 50

In [163]:
# xavier init
# Weights are standard-normal draws divided by sqrt(number of inputs to the
# layer) (m for layer 1, nh for layer 2); biases start at zero.
# w2 has a single output column, so the network emits one value per example.
w1 = torch.randn(m, nh)/math.sqrt(m)
b1 = torch.zeros(nh)
w2 = torch.randn(nh,1)/math.sqrt(nh)
b2 = torch.zeros(1)

In [164]:
w1.mean(), w1.std()

(tensor(0.0002), tensor(0.0357))

In [165]:
def lin(x,w,b):
  """Affine layer: matrix-multiply `x` by `w`, then add the bias `b`."""
  projected = x @ w
  return projected + b

In [166]:
y1 = lin(x_valid,w1,b1)

In [167]:
y1.mean(), y1.std()

(tensor(-0.0328), tensor(1.0056))

In [168]:
def relu(x):
  """Rectified linear unit: every entry of `x` below 0 is replaced by 0."""
  floor = 0.
  return x.clamp_min(floor)

In [169]:
y2 = relu(y1)

In [170]:
y2.mean(), y2.std()

(tensor(0.3818), tensor(0.5924))

Because the ReLU activation sets every negative value to zero, the output is no longer centred on 0 and its standard deviation drops sharply (from about 1.0 to about 0.59 above). This shrinkage compounds with every additional layer, which hurts convergence in deep neural networks.

In [171]:
# Kaiming/ He init

In [172]:
# Kaiming/He scaling: standard-normal draws multiplied by sqrt(2/m), where m is
# the number of inputs to the layer.
w1 = torch.randn(m, nh)*math.sqrt(2/m)

In [173]:
w1.mean(), w1.std()

(tensor(-0.0002), tensor(0.0505))

In [174]:
y1 = lin(x_valid, w1, b1)

In [175]:
y1.mean(), y1.std()

(tensor(0.1674), tensor(1.4082))

In [176]:
y2 = relu(y1)

In [177]:
y2.mean(), y2.std()

(tensor(0.6478), tensor(0.8696))

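The standard deviation after the ReLU is much closer to 1 with Kaiming init. The only change is a factor of 2 in the numerator of the weight scale (sqrt(2/m) instead of sqrt(1/m)), which helps preserve the variance of the activations as they propagate through the network.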

In [178]:
# Covered initialization. So can use pytorch init.
from torch.nn import init

In [179]:
# Fill w1 using PyTorch's built-in Kaiming-normal initializer.
# `kaiming_normal_` (trailing underscore) is the in-place, non-deprecated name;
# the un-suffixed `kaiming_normal` emits a deprecation warning.
# mode="fan_out" is passed because w1 is stored as (inputs, outputs), the
# transpose of the layout torch.nn.Linear uses for its weight.
w1 = torch.zeros(m,nh)
w1 = init.kaiming_normal_(w1, mode="fan_out")

  


In [180]:
y2 = relu(lin(x_valid, w1, b1))

In [181]:
y2.mean(), y2.std()

(tensor(0.5640), tensor(0.8094))

In [182]:
# Jeremy modifies Relu a little to bring mean to 0
def relu(x):
  """Shifted ReLU: clamp `x` at 0 from below, then subtract 0.5."""
  rectified = x.clamp_min(0.)
  return rectified - 0.5

In [183]:
# kaiming init / he init for relu
# Re-draw w1 with the sqrt(2/m) scale, then check the mean/std of the first
# layer's activations using the shifted relu defined above.
w1 = torch.randn(m,nh)*math.sqrt(2./m )
t1 = relu(lin(x_valid, w1, b1))
t1.mean(),t1.std()

(tensor(0.0082), tensor(0.7793))

In [184]:
def model(x):
  """Forward pass: linear -> relu -> linear, using the notebook-level w1, b1, w2, b2."""
  hidden = relu(lin(x, w1, b1))
  return lin(hidden, w2, b2)

In [185]:
%timeit -n 10 _=model(x_valid)

10 loops, best of 3: 16 ms per loop


## Loss Function

In [186]:
model(x_valid).shape

torch.Size([10000, 1])

In [187]:
def mse(output, target):
  """Mean squared error between `output` (with dim 1 squeezed away) and `target`."""
  residual = output.squeeze(1) - target
  return residual.pow(2).mean()

In [188]:
mse(model(x_valid), y_valid)

tensor(27.5448)

## Backward Pass

In [189]:
def mse_grad(inp, targ):
  """Store the gradient of the mean-squared-error loss w.r.t. `inp` on `inp.g`."""
  diff = inp.squeeze() - targ
  # unsqueeze restores the trailing dimension that squeeze removed
  inp.g = 2. * diff.unsqueeze(-1) / inp.shape[0]

In [190]:
def relu_grad(inp, out):
  """Store on `inp.g` the upstream gradient `out.g`, zeroed wherever `inp` is not positive."""
  mask = (inp > 0).float()
  inp.g = mask * out.g

In [191]:
def lin_grad(inp, out, w, b):
  """Backward pass of `out = inp @ w + b`.

  Reads the upstream gradient from `out.g` and stores the gradients of the
  loss on `inp.g`, `w.g` and `b.g`.
  """
  inp.g = out.g @ w.t()
  # Equivalent to (inp.unsqueeze(-1) * out.g.unsqueeze(1)).sum(0) but avoids
  # materializing a (batch, n_in, n_out) intermediate tensor.
  w.g = inp.t() @ out.g
  b.g = out.g.sum(0)

In [192]:
def forward_and_backward(inp, targ):
  """Run the two-layer network forward, then back-propagate by hand.

  Uses the notebook-level w1, b1, w2, b2. Gradients are left on the `.g`
  attribute of `inp` and of each parameter; nothing is returned.
  """
  # forward pass
  hidden_pre = inp @ w1 + b1
  hidden_act = relu(hidden_pre)
  pred = hidden_act @ w2 + b2

  # the loss value itself is not used by the gradient computation below
  loss = mse(pred, targ)

  # backward pass, in reverse layer order
  mse_grad(pred, targ)
  lin_grad(hidden_act, pred, w2, b2)
  relu_grad(hidden_pre, hidden_act)
  lin_grad(inp, hidden_pre, w1, b1)

In [193]:
forward_and_backward(x_train, y_train)

In [194]:
# Snapshot the hand-computed gradients. clone() makes independent copies, so
# they remain available after the `.g` attributes are reset to None further down.
w1g = w1.g.clone()
w2g = w2.g.clone()
b1g = b1.g.clone()
b2g = b2.g.clone()
ig = x_train.g.clone()

In [195]:
# Comparing results with Pytorch autograd

In [196]:
# Independent copies of the input and parameters with autograd tracking enabled,
# so PyTorch can compute reference gradients without touching the originals.
x2t = x_train.clone().requires_grad_(True)
w12 = w1.clone().requires_grad_(True)
w22 = w2.clone().requires_grad_(True)
b12 = b1.clone().requires_grad_(True)
b22 = b2.clone().requires_grad_(True)

In [197]:
def forward(inp, targ):
  """Forward pass through the autograd-tracked copies (w12, b12, w22, b22); returns the MSE loss."""
  hidden = relu(inp @ w12 + b12)
  pred = hidden @ w22 + b22
  return mse(pred, targ)

In [198]:
loss = forward(x2t, y_train)

In [199]:
loss.backward()

In [200]:
# Check each hand-computed gradient against the one autograd produced.
# NOTE(review): test_near is not defined in this notebook; presumably it comes
# from the `nb_01` star-import — verify.
test_near(w1.g, w12.grad)
test_near(w2.g, w22.grad)
test_near(b1.g, b12.grad)
test_near(b2.g, b22.grad)
test_near(ig, x2t.grad)


## Refactoring

In [201]:
class Relu:
  """Shifted ReLU layer (clamp at 0, then subtract 0.5) that remembers its input and output."""
  def __call__(self, inp):
    self.inp = inp
    self.out = self.inp.clamp_min(0.) - 0.5
    return self.out

  def backward(self):
    """Store the gradient w.r.t. the input on `self.inp.g`.

    The layer before this one reads its upstream gradient from the `.g`
    attribute of its own output, which is this layer's input, so the result
    must be stored there and not merely returned.
    """
    self.inp.g = (self.inp > 0).float() * self.out.g
    # Still returned so code that used the previous return value keeps working.
    return self.inp.g

In [202]:
class Linear:
  """Affine layer `inp @ w + b` that remembers its input and output for the backward pass."""
  def __init__(self, w, b):
    self.w, self.b = w,b

  def __call__(self, inp):
    self.inp = inp
    self.out =  inp@self.w + self.b
    return self.out

  def backward(self):
    """Read the upstream gradient from `self.out.g`; store gradients on inp.g, w.g and b.g."""
    self.inp.g =  self.out.g @ self.w.t()
    # Same sum as (inp.unsqueeze(-1) * out.g.unsqueeze(1)).sum(0), without
    # materializing a (batch, n_in, n_out) intermediate tensor.
    self.w.g = self.inp.t() @ self.out.g
    self.b.g = self.out.g.sum(0)

In [203]:
class Mse:
  """Mean-squared-error loss that remembers its input and target for the backward pass."""
  def __call__(self, inp, targ):
    self.inp = inp
    self.targ = targ
    self.loss = (inp.squeeze() - targ).pow(2).mean()
    # Return the loss so that callers (e.g. Model.__call__) receive it instead of None.
    return self.loss

  def backward(self):
    """Store the gradient of the loss w.r.t. the input on `self.inp.g`."""
    # squeeze must be CALLED; the bare method object cannot be subtracted from a tensor.
    self.inp.g = 2 * (self.inp.squeeze() - self.targ).unsqueeze(-1)/self.targ.shape[0]

In [204]:
class Model:
  """Linear -> Relu -> Linear network with an Mse loss, built from the given weights and biases."""
  def __init__(self, w1, b1, w2, b2):
    first, activation, second = Linear(w1, b1), Relu(), Linear(w2, b2)
    self.layers = [first, activation, second]
    self.loss = Mse()

  def __call__(self, x, targ):
    """Feed `x` through every layer in order, then return whatever the loss object returns."""
    out = x
    for layer in self.layers:
      out = layer(out)
    return self.loss(out, targ)

  def backward(self):
    """Back-propagate: the loss first, then the layers from last to first."""
    self.loss.backward()
    for layer in self.layers[::-1]:
      layer.backward()

In [205]:
# Clear the gradients left over from the earlier manual backward pass so the
# refactored model starts from a clean state.
w1.g,b1.g,w2.g,b2.g = [None]*4

In [206]:
model = Model(w1, b1, w2, b2)

In [207]:
%time loss = model(x_train, y_train)

CPU times: user 92.1 ms, sys: 2.48 ms, total: 94.6 ms
Wall time: 97 ms


## Module

In [300]:
class Module:
  '''Abstract Module class.

  Subclasses implement `forward(*args)` and `bwd(out, *args)`. Calling the
  module stores the arguments on `self.args` and the result on `self.out`,
  so that `backward()` can hand both to `bwd`.
  '''
  def __call__(self, *args):
    self.args = args
    self.out = self.forward(*args)
    return self.out

  # Accept *args so that calling an un-overridden forward raises the intended
  # error and not an arity TypeError. NotImplementedError is a subclass of
  # Exception, so existing `except Exception` handlers still catch it.
  def forward(self, *args): raise NotImplementedError("Not implemented")

  def bwd(self, out, *args): raise NotImplementedError("Not implemented")

  def backward(self): self.bwd(self.out, *self.args)

In [301]:
class Relu(Module):
  """Shifted ReLU: clamp at 0 from below, then subtract 0.5."""

  def forward(self, inp):
    rectified = inp.clamp_min(0.)
    return rectified - 0.5

  def bwd(self, out, inp):
    # pass the upstream gradient through only where the input was positive
    positive = (inp > 0).float()
    inp.g = positive * out.g

In [302]:
class Linear(Module):
  """Affine layer computing `inp @ w + b`."""
  def __init__(self, w, b):
    self.w = w
    self.b = b

  def forward(self, inp):
    return inp @ self.w + self.b

  def bwd(self, out, inp):
    # upstream gradient, as stored on this layer's output
    upstream = out.g
    inp.g = upstream @ self.w.t()
    self.w.g = inp.t() @ upstream
    self.b.g = upstream.sum(0)

In [303]:
class Mse(Module):
  """Mean-squared-error loss between the squeezed input and the target."""

  def forward(self, inp, targ):
    err = inp.squeeze() - targ
    return err.pow(2).mean()

  def bwd(self, out, inp, targ):
    err = inp.squeeze() - targ
    # unsqueeze restores the trailing dimension removed by squeeze
    inp.g = 2 * err.unsqueeze(-1) / inp.shape[0]

In [304]:
class Model:
  """Linear -> Relu -> Linear network with an Mse loss, assembled from Module subclasses."""
  def __init__(self, w1, b1, w2, b2):
    self.layers = [
      Linear(w1, b1),
      Relu(),
      Linear(w2, b2),
    ]
    self.loss = Mse()

  def __call__(self, x, targ):
    """Run `x` through the layers in order and return the loss against `targ`."""
    activation = x
    for layer in self.layers:
      activation = layer(activation)
    return self.loss(activation, targ)

  def backward(self):
    """Back-propagate: the loss first, then the layers from last to first."""
    self.loss.backward()
    for layer in self.layers[::-1]:
      layer.backward()

In [305]:
# Clear any previously stored gradients, then build the Module-based model
# around the same weight and bias tensors.
w1.g, b1.g, w2.g, b2.g = [None]*4
model = Model(w1, b1, w2, b2)

In [306]:
%time loss = model(x_train, y_train)

CPU times: user 93.8 ms, sys: 0 ns, total: 93.8 ms
Wall time: 97.2 ms


In [307]:
model.layers[0].args

(tensor([[-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
         [-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
         [-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
         ...,
         [-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
         [-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
         [-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245]]),)

In [308]:
%time model.backward()

CPU times: user 176 ms, sys: 1.24 ms, total: 177 ms
Wall time: 192 ms


In [309]:
# Compare the gradients produced by the Module-based model with the snapshots
# (w1g, w2g, b1g, b2g, ig) saved from the earlier function-based backward pass.
# NOTE(review): test_near is not defined in this notebook; presumably it comes
# from the `nb_01` star-import — verify.
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)

## Torch NN

In [310]:
from torch import nn

In [294]:
??mse

In [311]:
class Model(nn.Module):
  """Two-layer network built on torch.nn.Linear, with the notebook's Relu and Mse.

  Args:
    n_in: number of input features.
    nh: number of hidden units.
    n_out: number of outputs.
  """
  def __init__(self, n_in, nh, n_out):
    super().__init__()
    # Assigning the nn.Linear layers as attributes registers their parameters
    # with nn.Module. Layers held only inside a plain Python list are invisible
    # to model.parameters(), state_dict() and .to(device).
    self.lin1 = nn.Linear(n_in, nh)
    self.lin2 = nn.Linear(nh, n_out)
    # Relu here is the notebook's own class, not an nn.Module, so the ordered
    # layer sequence stays a plain list.
    self.layers = [self.lin1, Relu(), self.lin2]
    self.loss = Mse()

  # Define forward() rather than overriding __call__, so that nn.Module's own
  # __call__ machinery (e.g. hooks) stays in effect. `model(x, targ)` works as before.
  def forward(self, x, targ):
    """Run `x` through the layers in order and return the loss against `targ`."""
    for l in self.layers: x = l(x)
    return self.loss(x, targ)


In [312]:
# Use the notebook-level hidden size `nh` in place of a second hard-coded 50,
# so this model stays in step with the hand-built layers if `nh` is changed.
model = Model(m, nh, 1)

In [313]:
%time loss = model(x_train, y_train)

CPU times: user 78 ms, sys: 132 µs, total: 78.1 ms
Wall time: 79 ms


In [315]:
%time loss.backward()

CPU times: user 62.2 ms, sys: 0 ns, total: 62.2 ms
Wall time: 64.7 ms
