<a href="https://colab.research.google.com/github/sauravakolia/Fastai2/blob/master/minibatch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import torch.nn.functional as F

In [3]:
import operator

def test(a,b,cmp,cname=None):
    if cname is None: cname=cmp.__name__
    assert cmp(a,b),f"{cname}:\n{a}\n{b}"

def test_eq(a,b): test(a,b,operator.eq,'==')


In [4]:
#export
def near(a,b): return torch.allclose(a, b, rtol=1e-3, atol=1e-5)
def test_near(a,b): test(a,b,near)

In [5]:
#export
from pathlib import Path
from IPython.core.debugger import set_trace
from fastai import datasets
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor

MNIST_URL='http://deeplearning.net/data/mnist/mnist.pkl'

In [6]:
#export
def get_data():
    path = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(path, 'rb') as f:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
    return map(tensor, (x_train,y_train,x_valid,y_valid))

def normalize(x, m, s): return (x-m)/s

In [7]:
mpl.rcParams['image.cmap'] = 'gray'

In [8]:
x_train,y_train,x_valid,y_valid = get_data()
# train_mean,train_std = x_train.mean(),x_train.std()
# x_train = normalize(x_train, train_mean, train_std)
# x_valid = normalize(x_valid, train_mean, train_std)

Downloading http://deeplearning.net/data/mnist/mnist.pkl.gz


In [9]:

n,m = x_train.shape
c = y_train.max()+1
nh = 50

In [10]:
from torch import nn

In [11]:
class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = [nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)]
        
    def __call__(self, x):
        for l in self.layers: x = l(x)
        return x

In [12]:
model = Model(m, nh, 10)

In [13]:
pred=model(x_train)

### Cross entropy loss

In [14]:
def log_softmax(x): return (x.exp()/(x.exp().sum(-1,keepdims=True))).log()

In [15]:
sm_pred=log_softmax(pred)

In [16]:
y_train[:3]

tensor([5, 0, 4])

The cross entropy loss for some target $x$ and some prediction $p(x)$ is given by:

$$ -\sum x\, \log p(x) $$

But since our $x$s are 1-hot encoded, this can be rewritten as $-\log(p_{i})$ where i is the index of the desired target.

In [17]:
sm_pred[[0,1,2], [5,0,4]]

tensor([-2.3473, -2.4834, -2.3253], grad_fn=<IndexBackward>)

In [18]:
def nll(input,target): return -input[range(target.shape[0]),target].mean()

# def nll(input, target): return -input[range(target.shape[0]), target].mean()

In [19]:
range(y_train.shape[0])
y_train

tensor([5, 0, 4,  ..., 8, 4, 8])

In [20]:
loss = nll(sm_pred, y_train)

In [21]:
loss

tensor(2.3124, grad_fn=<NegBackward>)

Note that the formula 

$$\log \left ( \frac{a}{b} \right ) = \log(a) - \log(b)$$ 

gives a simplification when we compute the log softmax, which was previously defined as `(x.exp()/(x.exp().sum(-1,keepdim=True))).log()`

In [22]:
def log_softmax(x):return x-(x.exp().sum(-1,keepdims=True).log())
# def log_softmax(x): return x - x.exp().sum(-1,keepdim=True).log()

In [23]:
sm_pred=log_softmax(pred)

In [24]:
test_near(nll(log_softmax(pred), y_train), loss)

Then, there is a way to compute the log of the sum of exponentials in a more stable way, called the [LogSumExp trick](https://en.wikipedia.org/wiki/LogSumExp). The idea is to use the following formula:

$$\log \left ( \sum_{j=1}^{n} e^{x_{j}} \right ) = \log \left ( e^{a} \sum_{j=1}^{n} e^{x_{j}-a} \right ) = a + \log \left ( \sum_{j=1}^{n} e^{x_{j}-a} \right )$$

where a is the maximum of the $x_{j}$.

In [25]:
def logsumexp(x):
  m = x.max(-1)[0] 
  return m+(x-m[:,None]).exp().sum(-1).log()

In [26]:
test_near(logsumexp(pred), pred.logsumexp(-1))

In [27]:
def log_softmax(x): return x-x.logsumexp(-1,keepdim=True)

In [28]:
test_near(nll(log_softmax(pred), y_train), loss)

In [29]:
test_near(F.nll_loss(F.log_softmax(pred, -1), y_train), loss)

In [30]:
test_near(F.cross_entropy(pred, y_train), loss)

# Basic training loop

In [31]:
loss_func=F.cross_entropy

In [32]:
# def accuracy(out,yb):return (torch.argmax(out,dim=1) ==yb).float().mean()

def accuracy(out, yb): return (torch.argmax(out, dim=1)==yb).float().mean()

In [33]:
bs=64                  # batch size

xb = x_train[0:bs]     # a mini-batch from x
preds = model(xb)      # predictions
preds[0], preds.shape

(tensor([-0.1617, -0.1613, -0.1309,  0.0157, -0.0181, -0.0447,  0.0741,  0.1564,
          0.0920,  0.1175], grad_fn=<SelectBackward>), torch.Size([64, 10]))

In [34]:
preds[0]

tensor([-0.1617, -0.1613, -0.1309,  0.0157, -0.0181, -0.0447,  0.0741,  0.1564,
         0.0920,  0.1175], grad_fn=<SelectBackward>)

In [35]:
yb=y_train[0:bs]
accuracy(preds,yb)

tensor(0.0938)

In [36]:
loss_func(preds,yb)

tensor(2.3215, grad_fn=<NllLossBackward>)

In [37]:
lr=0.5
epochs=1

In [38]:
# for epoch in range(epochs):
#   for i in range(n-1//bs):
#     start_i=i*bs
#     end_i=i*bs+bs
#     xb = x_train[start_i:end_i]
#     yb=y_train[start_i:end_i]
#     preds=model(xb)

#     loss=loss_func(preds,yb)

#     loss.backward()
#     with torch.no_grad():
#       for l in model.layers:
#         if hasattr(l,'weight'):
#           l.weight-=l.weight.grad *lr
#           l.bias-=lr*l.bias.grad
#           l.weight.grad.zero_()
#           l.bias.grad.zero_()  

for epoch in range(epochs):
    for i in range((n-1)//bs + 1):
#         set_trace()
        start_i = i*bs
        end_i = start_i+bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        loss = loss_func(model(xb), yb)

        loss.backward()
        with torch.no_grad():
            for l in model.layers:
                if hasattr(l, 'weight'):
                    l.weight -= l.weight.grad * lr
                    l.bias   -= l.bias.grad   * lr
                    l.weight.grad.zero_()
                    l.bias  .grad.zero_()

In [39]:
loss_func(model(xb), yb), accuracy(model(xb), yb)

(tensor(0.1519, grad_fn=<NllLossBackward>), tensor(0.9375))

## Using parameters and optim

### Parameters

In [40]:
class Model(nn.Module):
  def __init__(self,n_in,nh,n_out):
    super().__init__()
    self.l1=nn.Linear(n_in,nh)
    self.l2=nn.Linear(nh,n_out)

  # def __call__(self,x):
  #   return self.l2(F.ReLU(self.l1(x)))

  def __call__(self, x): return self.l2(F.relu(self.l1(x)))  


In [41]:
model=Model(m,nh,10)

In [42]:
model

Model(
  (l1): Linear(in_features=784, out_features=50, bias=True)
  (l2): Linear(in_features=50, out_features=10, bias=True)
)

In [43]:
model.l1

Linear(in_features=784, out_features=50, bias=True)

In [44]:
def fit():
  for epoch in range(epochs):
    for i in range((n-1)//bs +1):
      start_i=i*bs
      end_i=start_i+bs
      xb=x_train[start_i:end_i]
      yb=y_train[start_i:end_i]
      pred=model(xb)
      loss=loss_func(pred,yb)

      loss.backward()
      with torch.no_grad():
        for p in model.parameters():
          p-=lr*p.grad
          model.zero_grad()

          # if hasattr(l,'weight'):
          #   l.weight-=lr*l.weight.grad
          #   l.bias-=lr*l.bias.grad

          #   l.weight.grad.zero_()
          #   l.bias.grad.zero_()


In [45]:
fit()

In [46]:
loss_func(model(xb), yb), accuracy(model(xb), yb)

(tensor(0.4747, grad_fn=<NllLossBackward>), tensor(0.8750))

In [47]:
class DummyModule():
  def __init__(self,n_in,nh,n_out):
    self._modules={}
    self.l1=nn.Linear(n_in,nh)
    self.l2=nn.Linear(nh,n_out)

  def __setattr__(self,k,v):
    if not k.startswith("_"):self._modules[k]=v
    super().__setattr__(k,v)

  def __repr__(self):return f'{self._modules}'

  def parameters(self):
    for l in self._modules.values():
      for p in l.parameters(): yield p

In [48]:
mdl=DummyModule(m,nh,10)

In [49]:
mdl

{'l1': Linear(in_features=784, out_features=50, bias=True), 'l2': Linear(in_features=50, out_features=10, bias=True)}

In [50]:
[o.shape for o in mdl.parameters()]

[torch.Size([50, 784]),
 torch.Size([50]),
 torch.Size([10, 50]),
 torch.Size([10])]

### Registering modules

We can use the original `layers` approach, but we have to register the modules.

In [51]:
layers=[nn.Linear(m,nh),nn.ReLU(),nn.Linear(nh,10)]

In [52]:
 class Model(nn.Module):
  def __init__(self,layers):
    super().__init__()
    self.layers=layers
    for i,l in enumerate(self.layers):self.add_module(f'layer_{i}',l)

  def __call__(self,x):
    for l in self.layers: l=l(x)  
    return x


In [53]:
model = Model(layers)
model

Model(
  (layer_0): Linear(in_features=784, out_features=50, bias=True)
  (layer_1): ReLU()
  (layer_2): Linear(in_features=50, out_features=10, bias=True)
)

### nn.ModuleList

In [54]:
class SequentialModel(nn.Module):
  def __init__(self,layers):
    super().__init__()
    self.layers=nn.ModuleList(layers)

  def __call__(self,x):
    for l in self.layers:  x=l(x)
    return x  


In [55]:
model=SequentialModel(layers)

In [56]:
model

SequentialModel(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=10, bias=True)
  )
)

In [57]:
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)

(tensor(0.4207, grad_fn=<NllLossBackward>), tensor(0.8750))

### nn.Sequential

In [58]:
model=nn.Sequential(nn.Linear(m,nh),nn.ReLU(),nn.Linear(nh,10))

In [59]:
fit()
loss_func(model(xb), yb), accuracy(model(xb), yb)

(tensor(0.4104, grad_fn=<NllLossBackward>), tensor(0.8750))

In [61]:
model

Sequential(
  (0): Linear(in_features=784, out_features=50, bias=True)
  (1): ReLU()
  (2): Linear(in_features=50, out_features=10, bias=True)
)

### optim

Let's replace our previous manually coded optimization step:

```python
with torch.no_grad():
    for p in model.parameters(): p -= p.grad * lr
    model.zero_grad()
```

and instead use just:

```python
opt.step()
opt.zero_grad()
```

In [62]:
class optimizer():
  def __init__(self,params,lr=0.5):
    self.params,self.lr=list(params),lr

  def one_step(self):  
    with torch.no_grad():
      for p in self.params:
        p-=self.lr*p.grad

  def zero_grad(self):
    for p in self.params:
      p.grad.data.zero_()        


In [63]:
model=nn.Sequential(nn.Linear(m,nh),nn.ReLU(),nn.Linear(nh,10))

In [64]:
opt=optimizer(model.parameters())

In [65]:
for epoch in range(epochs):
  for i in range((n-1)//bs +1):
    start_i=i*bs
    end_i=start_i+bs
    xb=x_train[start_i:end_i]
    yb=y_train[start_i:end_i]
    preds=model(xb)
    loss=loss_func(preds,yb)

    loss.backward()
    opt.one_step()
    opt.zero_grad()


In [66]:
loss,acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
loss,acc

(tensor(0.0678, grad_fn=<NllLossBackward>), tensor(1.))

PyTorch already provides this exact functionality in `optim.SGD` (it also handles stuff like momentum, which we'll look at later - except we'll be doing it in a more flexible way!)

In [67]:
from torch import optim

In [68]:
def get_model():
  model=nn.Sequential(nn.Linear(m,nh),nn.ReLU(),nn.Linear(nh,10))
  return model,optim.SGD(model.parameters(),lr=lr)

In [69]:
model,opt=get_model()

In [70]:
loss_func(model(xb), yb)

tensor(2.2706, grad_fn=<NllLossBackward>)

In [71]:
for epoch in range(epochs):
  for i in range((n-1)//bs +1):
    start_i=i*bs
    end_i=start_i+bs
    xb=x_train[start_i:end_i]
    yb=y_train[start_i:end_i]
    preds=model(xb)
    loss=loss_func(preds,yb)

    loss.backward()
    opt.step()
    opt.zero_grad()


In [72]:
loss,acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
loss,acc

(tensor(0.1621, grad_fn=<NllLossBackward>), tensor(0.9375))

## Dataset and DataLoader

### Dataset

It's clunky to iterate through minibatches of x and y values separately:

```python
    xb = x_train[start_i:end_i]
    yb = y_train[start_i:end_i]
```

Instead, let's do these two steps together, by introducing a `Dataset` class:

```python
    xb,yb = train_ds[i*bs : i*bs+bs]
```

In [73]:
class Dataset():
  def __init__(self,x,y):
    self.x=x
    self.y=y

  def __len__(self):
    return len(self.x) 

  def __getitem__(self,i):
    return self.x[i],self.y[i]

In [74]:
train_ds,valid_ds=Dataset(x_train,y_train),Dataset(x_valid,y_valid)
assert len(train_ds)==len(x_train)
assert len(valid_ds)==len(x_valid)

In [75]:
model,opt=get_model()

In [76]:
for epoch in range(epochs):
  for i in range((n-1)//bs +1):
    start_i=i*bs
    end_i=start_i+bs
    xb,yb=train_ds[start_i:end_i]
    
    preds=model(xb)
    loss=loss_func(preds,yb)

    loss.backward()
    opt.step()
    opt.zero_grad()
  

In [77]:
loss,acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
assert acc>0.7
loss,acc

(tensor(0.0900, grad_fn=<NllLossBackward>), tensor(1.))

### DataLoader

Previously, our loop iterated over batches (xb, yb) like this:

```python
for i in range((n-1)//bs + 1):
    xb,yb = train_ds[i*bs : i*bs+bs]
    ...
```

Let's make our loop much cleaner, using a data loader:

```python
for xb,yb in train_dl:
    ...
```

In [78]:
class DataLoader():
  def __init__(self,ds,bs):
    self.ds=ds
    self.bs=bs

  def __iter__(self): 
    for i in range(0,len(self.ds),self.bs): yield self.ds[i:i+self.bs]

In [79]:
train_dl,valid_dl=DataLoader(train_ds,bs),DataLoader(valid_ds,bs)

In [80]:
xb,yb = next(iter(valid_dl))
assert xb.shape==(bs,28*28)
assert yb.shape==(bs,)

In [81]:
def fit():
  for epoch in range(epochs):  
    for xb,yb in train_dl: 
      preds=model(xb)
      loss=loss_func(preds,yb)

      loss.backward()
      opt.step()
      opt.zero_grad()
  

In [82]:
fit()

In [83]:
loss,acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
assert acc>0.7
loss,acc

(tensor(0.0653, grad_fn=<NllLossBackward>), tensor(0.9844))

### Random sampling

We want our training set to be in a random order, and that order should differ each iteration. But the validation set shouldn't be randomized.

In [84]:
class Sampler():
  def __init__(self,ds,bs,shuffle=False):
    self.n=len(ds)
    self.bs=bs
    self.shuffle=shuffle
  
  def __iter__(self):
    self.idxs=torch.randperm(self.n)if self.shuffle  else torch.arange(self.n)
    for i in range(0,self.n,self.bs):yield self.idxs[i:i+self.bs]

In [85]:
small_ds = Dataset(*train_ds[:10])

In [86]:
s = Sampler(small_ds,3,False)
[o for o in s]

[tensor([0, 1, 2]), tensor([3, 4, 5]), tensor([6, 7, 8]), tensor([9])]

In [87]:
s = Sampler(small_ds,3,True)
[o for o in s]

[tensor([0, 7, 3]), tensor([5, 4, 1]), tensor([8, 2, 6]), tensor([9])]

In [88]:
def collate(b):
    xs,ys = zip(*b)
    return torch.stack(xs),torch.stack(ys)

class DataLoader():
  def __init__(self,ds,sampler,collate_fn=collate):
    self.ds=ds
    self.sampler=sampler
    self.collate_fn=collate_fn

  def __iter__(self):
    for s in self.sampler: yield self.collate_fn([self.ds[i] for i in s])
      

In [89]:
train_samp=Sampler(train_ds,bs,shuffle=True)
valid_samp=Sampler(valid_ds,bs,shuffle=True)


In [90]:
train_dl = DataLoader(train_ds, sampler=train_samp, collate_fn=collate)
valid_dl = DataLoader(valid_ds, sampler=valid_samp, collate_fn=collate)

In [91]:
fit()

In [92]:
loss,acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
assert acc>0.7
loss,acc

(tensor(0.0794, grad_fn=<NllLossBackward>), tensor(0.9688))

### PyTorch DataLoader

In [93]:
from torch.utils.data import DataLoader,RandomSampler,SequentialSampler

In [94]:
train_dl=DataLoader(train_ds,bs,sampler=RandomSampler(train_ds),collate_fn=collate)
valid_dl=DataLoader(valid_ds,bs,sampler=RandomSampler(valid_ds),collate_fn=collate)

In [95]:
fit()
loss,acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
assert acc>0.7
loss,acc

(tensor(0.0724, grad_fn=<NllLossBackward>), tensor(0.9688))

PyTorch's defaults work fine for most things however:

In [96]:
train_dl = DataLoader(train_ds, bs, shuffle=True, drop_last=True)
valid_dl = DataLoader(valid_ds, bs, shuffle=False)

In [97]:
fit()
loss,acc = loss_func(model(xb), yb), accuracy(model(xb), yb)
assert acc>0.7
loss,acc

(tensor(0.0346, grad_fn=<NllLossBackward>), tensor(1.))

## Validation

In [98]:
# def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
#    for epoch in range(epochs):
#      model.train()  
#      for xb,yb in train_dl: 
#       preds=model(xb)
#       loss=loss_func(preds,yb)

#       loss.backward()
#       opt.step()
#       opt.zero_grad()

#      model.eval()

#      with torch.no_grad():
#        tot_loss,tot_acc = 0.,0.
#        for epoch in range(epochs):
#          for xb,yb in train_dl: 
#            preds=model(xb)
#            tot_loss+=loss_func(preds,yb)
#            tot_acc+=accuracy(preds,yb)

#        nv=len(valid_dl)
#        print(epoch, tot_loss/nv, tot_acc/nv)
#     return tot_loss/nv, tot_acc/nv
#     # for epoch in range(epochs):
#     # return tot_loss/nv, tot_acc/nv


In [99]:
def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
  for epoch in range(epochs):
    model.train()  
    for xb,yb in train_dl:
      preds=model(xb)
      loss=loss_func(preds,yb)

      loss.backward()
      opt.step()
      opt.zero_grad()
    model.eval()
    with torch.no_grad():
      tot_loss,tot_acc = 0.,0.
      for xb,yb in valid_dl:
        pred = model(xb)
        tot_loss += loss_func(pred, yb)
        tot_acc  += accuracy (pred,yb)
    nv = len(valid_dl)
    print(epoch, tot_loss/nv, tot_acc/nv)
  return tot_loss/nv, tot_acc/nv

`get_dls` returns dataloaders for the training and validation sets:

In [100]:
def get_dls(train_ds,valid_ds,bs,**kwargs):
  return (DataLoader(train_ds, batch_size=bs, shuffle=True, **kwargs),
            DataLoader(valid_ds, batch_size=bs*2, **kwargs))


In [101]:
train_dl,valid_dl=get_dls(train_ds,valid_ds,bs)
model,opt=get_model()
loss,acc=fit(5, model, loss_func, opt, train_dl, valid_dl)

0 tensor(0.1773) tensor(0.9493)
1 tensor(0.3361) tensor(0.9013)
2 tensor(0.1008) tensor(0.9715)
3 tensor(0.1212) tensor(0.9642)
4 tensor(0.1698) tensor(0.9475)


In [102]:
assert acc>0.9