In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision
import torchvision.transforms as transforms
import torch.optim as optim

In [2]:
from exp.nb_01 import *

def get_data():
    """Download the gzipped pickle at ``MNIST_URL`` and return its train/valid arrays as tensors.

    Returns:
        A lazy ``map`` yielding, in order, ``x_train, y_train, x_valid, y_valid``,
        each passed through ``tensor``. The third split stored in the file is
        loaded but discarded.
    """
    archive_path = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(archive_path, 'rb') as fh:
        # NOTE(review): unpickling runs arbitrary code from the file; only use a trusted URL.
        train_split, valid_split, _ = pickle.load(fh, encoding='latin-1')
    (x_train, y_train), (x_valid, y_valid) = train_split, valid_split
    return map(tensor, (x_train, y_train, x_valid, y_valid))

def normalize(x, m, s):
    """Standardise ``x`` by subtracting ``m`` and dividing the result by ``s``."""
    centered = x - m
    return centered / s

In [3]:
# get_data() yields four tensors in this order: train inputs, train labels, validation inputs, validation labels.
x_train,y_train,x_valid,y_valid = get_data()

In [4]:
class Model(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of outputs.
    """

    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList (rather than a plain Python list) registers each layer as a
        # submodule, so parameters(), state_dict(), .to(device), etc. can see them.
        # It is still iterable and indexable, so loops over `model.layers` keep working.
        self.layers = nn.ModuleList([
            nn.Linear(n_in, nh),
            nn.ReLU(),
            nn.Linear(nh, n_out),
        ])

    def forward(self, x):
        """Apply every layer in order to ``x`` and return the final activations."""
        # Implemented as forward() instead of overriding __call__, so that
        # nn.Module.__call__ can still run registered hooks around it.
        for layer in self.layers:
            x = layer(x)
        return x

In [5]:
# Loss used by every training loop below.
loss_func = F.cross_entropy


In [6]:
def accuracy(preds, labels):
    """Return the fraction of rows in ``preds`` whose arg-max along dim 1 equals ``labels``.

    The result is a 0-dim float tensor.
    """
    predicted = preds.argmax(dim=1)
    hits = (predicted == labels).float()
    return hits.mean()

In [7]:
n,m = x_train.shape     # n: number of training rows, m: number of columns (features) per row
c = y_train.max()+1     # largest label + 1, i.e. the class count if labels run 0..c-1 (assumed)
nh = 50                 # hidden-layer width passed to the models below
(n,m)

# m = 784 ---> features

(50000, 784)

In [8]:
model = Model(m, nh, 10)

In [9]:
bs = 64                 # mini-batch size

xb = x_train[:bs]       # first mini-batch of inputs

preds = model(xb)       # forward pass on that mini-batch
preds[0], preds.shape


(tensor([ 0.1306, -0.0799,  0.0040,  0.1322,  0.0154, -0.0362, -0.1753,  0.1845,
          0.0927, -0.0243], grad_fn=<SelectBackward>), torch.Size([64, 10]))

In [10]:
lr = 0.5        # learning rate (step size) used by the SGD updates
epochs = 10     # number of full passes over the training data

# Basic training loop

In [11]:
# Plain SGD written out by hand: slice a mini-batch, back-propagate, then update
# every layer that owns a `weight` and reset its gradients for the next step.
for epoch in range(epochs):
    for i in range((n - 1) // bs + 1):
        start_i, end_i = i * bs, (i + 1) * bs
        xb, yb = x_train[start_i:end_i], y_train[start_i:end_i]

        loss = loss_func(model(xb), yb)
        loss.backward()

        with torch.no_grad():       # the update itself must not be tracked by autograd
            for l in model.layers:
                if not hasattr(l, 'weight'):
                    continue        # e.g. the ReLU layer has nothing to update
                l.weight.sub_(lr * l.weight.grad)
                l.bias.sub_(lr * l.bias.grad)
                l.weight.grad.zero_()
                l.bias.grad.zero_()

In [12]:
# NOTE: xb/yb are whatever the training loop left behind (its final mini-batch),
# so this reports loss/accuracy on that one training batch, not a validation score.
loss_func(model(xb), yb), accuracy(model(xb), yb)


(tensor(0.0013, grad_fn=<NllLossBackward>), tensor(1.))

# Using parameters & optim

In [13]:
class Model(nn.Module):
    """Linear -> ReLU -> Linear network with both layers stored as attributes.

    Assigning the layers as attributes lets nn.Module register them, so they
    show up in ``named_children()`` and ``parameters()``.

    Args:
        n_in: number of input features.
        n_nh: number of hidden units.
        out: number of outputs.
    """

    def __init__(self, n_in, n_nh, out):
        super().__init__()
        self.l1 = nn.Linear(n_in, n_nh)
        self.l2 = nn.Linear(n_nh, out)

    def forward(self, x):
        """Return ``l2(relu(l1(x)))``."""
        # forward() rather than __call__: nn.Module.__call__ dispatches here and
        # additionally runs any registered hooks.
        hidden = F.relu(self.l1(x))
        return self.l2(hidden)

In [14]:
model = Model(m, nh, 10)

In [15]:
# No call parentheses: this displays the bound method itself, whose repr includes the model.
model.named_children

<bound method Module.named_children of Model(
  (l1): Linear(in_features=784, out_features=50, bias=True)
  (l2): Linear(in_features=50, out_features=10, bias=True)
)>

In [16]:
# Print each direct child as "<attribute name>_<module repr>".
for name,l in model.named_children():
    print(f"{name}_{l}")

l1_Linear(in_features=784, out_features=50, bias=True)
l2_Linear(in_features=50, out_features=10, bias=True)


In [17]:
model

Model(
  (l1): Linear(in_features=784, out_features=50, bias=True)
  (l2): Linear(in_features=50, out_features=10, bias=True)
)

In [18]:
def fit():
    """Train the global ``model`` with hand-written SGD.

    Reads the globals ``epochs``, ``n``, ``bs``, ``x_train``, ``y_train``, ``model``,
    ``loss_func`` and ``lr``; updates the model's parameters in place and returns ``None``.
    """
    for _ in range(epochs):
        n_batches = (n - 1) // bs + 1
        for batch_idx in range(n_batches):
            batch = slice(batch_idx * bs, (batch_idx + 1) * bs)
            loss = loss_func(model(x_train[batch]), y_train[batch])
            loss.backward()

            # Parameter updates and the gradient reset are kept out of the autograd graph.
            with torch.no_grad():
                for param in model.parameters():
                    param.sub_(lr * param.grad)
                model.zero_grad()

In [19]:
fit()

In [20]:
# fit() keeps its batches in local variables, so xb/yb here are still the globals set by
# the earlier hand-written loop (the final training mini-batch).
loss_func(model(xb), yb), accuracy(model(xb), yb)


(tensor(0.0022, grad_fn=<NllLossBackward>), tensor(1.))

Behind the scenes, PyTorch overrides the `__setattr__` method in `nn.Module` so that any submodule you assign as an attribute is registered with the model, which is how its parameters become part of `model.parameters()`.


In [21]:
class DummyModule():
    """Minimal stand-in for nn.Module showing how attribute assignment can register submodules.

    Every attribute whose name does not start with an underscore is recorded in
    ``self._modules`` (in assignment order) in addition to being set normally.
    """

    def __init__(self, n_in, nh, n_out):
        # `_modules` starts with "_", so this assignment is stored but not registered.
        self._modules = {}
        self.l1 = nn.Linear(n_in, nh)
        self.l2 = nn.Linear(nh, n_out)

    def __setattr__(self, k, v):
        is_private = k.startswith("_")
        if not is_private:
            self._modules[k] = v
        super().__setattr__(k, v)

    def __repr__(self):
        return f'{self._modules}'

    def parameters(self):
        """Yield the parameters of every registered module, in registration order."""
        for module in self._modules.values():
            yield from module.parameters()

In [22]:
mdl = DummyModule(m,nh,10)
mdl

{'l1': Linear(in_features=784, out_features=50, bias=True), 'l2': Linear(in_features=50, out_features=10, bias=True)}

In [23]:
[o.shape for o in mdl.parameters()]


[torch.Size([50, 784]),
 torch.Size([50]),
 torch.Size([10, 50]),
 torch.Size([10])]

# Registering modules
We can use the original layers approach, but we have to register the modules.


In [24]:
layers = [nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10)]

In [25]:
class Model(nn.Module):
    """Sequential network built from a caller-supplied list of layers.

    The list is kept as a plain attribute and each layer is registered
    explicitly with ``add_module`` under the name ``layer_<index>``.

    Args:
        layers: list of modules to apply in order.
    """

    def __init__(self, layers):
        super().__init__()
        self.layers = layers
        # A plain list is not tracked by nn.Module, so register every layer by hand.
        for i, l in enumerate(self.layers):
            self.add_module(f'layer_{i}', l)

    def forward(self, x):
        """Apply every layer in order to ``x`` and return the result."""
        # forward() rather than __call__, so nn.Module.__call__ can run registered hooks.
        for l in self.layers:
            x = l(x)
        return x

In [26]:
model = Model(layers)

In [27]:
model

Model(
  (layer_0): Linear(in_features=784, out_features=50, bias=True)
  (layer_1): ReLU()
  (layer_2): Linear(in_features=50, out_features=10, bias=True)
)

# nn.ModuleList
nn.ModuleList does this for us.


In [28]:
class SequentialModel(nn.Module):
    """Sequential network whose layers are registered through ``nn.ModuleList``.

    Args:
        layers: iterable of modules to apply in order.
    """

    def __init__(self, layers):
        super().__init__()
        # ModuleList registers every contained layer as a submodule.
        self.layers = nn.ModuleList(layers)

    def forward(self, x):
        """Apply every layer in order to ``x`` and return the result."""
        # forward() rather than __call__, so nn.Module.__call__ can run registered hooks.
        for l in self.layers:
            x = l(x)
        return x

In [29]:
model = SequentialModel(layers)

In [30]:
model

SequentialModel(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=10, bias=True)
  )
)


# nn.Sequential
nn.Sequential is a convenient class which does the same as the above:

In [31]:
model = nn.Sequential(nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10))


In [32]:
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)

(tensor(0.0025, grad_fn=<NllLossBackward>), tensor(1.))

In [33]:
nn.Sequential??

In [34]:
model

Sequential(
  (0): Linear(in_features=784, out_features=50, bias=True)
  (1): ReLU()
  (2): Linear(in_features=50, out_features=10, bias=True)
)

# optim
Let's replace our previous manually coded optimization step: 

with torch.no_grad():
    for p in model.parameters(): p -= p.grad * lr
    model.zero_grad()


and instead use just:


opt.step()
opt.zero_grad()

In [35]:
class Optimizer():
    """Minimal SGD optimizer: ``p <- p - lr * p.grad`` for every parameter.

    Parameters whose ``.grad`` is ``None`` (no backward pass has touched them yet)
    are skipped by both ``step`` and ``zero_grad`` instead of raising.

    Args:
        params: iterable of tensors to optimise; materialised into a list so a
            generator such as ``model.parameters()`` can be iterated on every step.
        lr: learning rate.
    """

    def __init__(self, params, lr=0.5):
        self.params, self.lr = list(params), lr

    def step(self):
        """Apply one gradient-descent update in place, outside of autograd tracking."""
        with torch.no_grad():
            for p in self.params:
                if p.grad is not None:
                    p -= p.grad * self.lr

    def zero_grad(self):
        """Reset every existing gradient to zero in place."""
        # no_grad replaces the discouraged `.data` access for the in-place reset.
        with torch.no_grad():
            for p in self.params:
                if p.grad is not None:
                    p.grad.zero_()

In [36]:
model = nn.Sequential(nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10))

In [37]:
opt = Optimizer(model.parameters())

In [38]:
# One pass over the training data per epoch; the parameter update and the
# gradient reset are delegated to `opt`.
for epoch in range(epochs):
    for i in range((n - 1) // bs + 1):
        start_i, end_i = i * bs, (i + 1) * bs
        xb, yb = x_train[start_i:end_i], y_train[start_i:end_i]

        pred = model(xb)
        loss = loss_func(pred, yb)
        loss.backward()

        opt.step()
        opt.zero_grad()

In [39]:
# Loss and accuracy on the last mini-batch of the loop above (training data, not validation).
loss,acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
loss,acc

(tensor(0.0013, grad_fn=<NllLossBackward>), tensor(1.))

PyTorch already provides this exact functionality in optim.SGD (it also handles stuff like momentum, which we'll look at later - except we'll be doing it in a more flexible way!)

In [40]:
from torch import optim

In [41]:
optim.SGD.step??


In [42]:
def get_model():
    """Build a fresh Linear -> ReLU -> Linear network together with an SGD optimizer for it.

    Layer sizes come from the globals ``m`` and ``nh`` (with 10 outputs) and the
    learning rate from the global ``lr``.

    Returns:
        ``(model, optimizer)``.
    """
    layers = (nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh, 10))
    net = nn.Sequential(*layers)
    optimizer = optim.SGD(net.parameters(), lr=lr)
    return net, optimizer

In [43]:
model,opt = get_model()
loss_func(model(xb), yb)    # loss of the freshly initialised model, before any training

tensor(2.3576, grad_fn=<NllLossBackward>)

In [44]:
# Same mini-batch loop as before, now driven by the optimizer returned by get_model().
for epoch in range(epochs):
    for i in range((n - 1) // bs + 1):
        start_i, end_i = i * bs, (i + 1) * bs
        xb, yb = x_train[start_i:end_i], y_train[start_i:end_i]

        pred = model(xb)
        loss = loss_func(pred, yb)
        loss.backward()

        opt.step()
        opt.zero_grad()

In [45]:
# Loss and accuracy on the last mini-batch of the loop above (training data, not validation).
loss,acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
loss,acc

(tensor(0.0022, grad_fn=<NllLossBackward>), tensor(1.))

In [46]:
assert acc>0.9

# Raises AssertionError if the accuracy computed in the previous cell is not above 90%.

# Dataset and DataLoader

In [47]:
class Dataset():
    """Pairs inputs ``x`` with targets ``y`` so both can be indexed together.

    Args:
        x: indexable collection of inputs (must support ``len``).
        y: indexable collection of targets, aligned with ``x``.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        # Must return an int: returning `self.x` itself makes `len(ds)` raise TypeError.
        return len(self.x)

    def __getitem__(self, i):
        """Return ``(x[i], y[i])``; ``i`` may be anything both collections accept (index, slice, ...)."""
        return self.x[i], self.y[i]
        

In [48]:
train_ds, valid_ds = Dataset(x_train,y_train), Dataset(x_valid, y_valid)
valid_Ds = valid_ds  # alias for the earlier mis-capitalised name, in case other cells reference it

# The length checks below need Dataset.__len__ to return an int, i.e. len(self.x);
# if it returns the tensor self.x itself, len() raises a TypeError.
# Enable them once __len__ returns an int.

#assert len(train_ds)==len(x_train)
#assert len(valid_ds) == len(x_valid)

In [49]:
# Slicing the dataset returns the matching slices of inputs and targets.
xb, yb = train_ds[:5]
assert xb.shape == (5, 784)     # 784 == 28*28 features per row
assert yb.shape == (5,)
xb, yb

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]), tensor([5, 0, 4, 1, 9]))