In [1]:
#default_exp lesson3

In [2]:
import os
os.chdir("..")

In [3]:
# export
import torch
from torch import nn
from torch.optim import Adam
from solutions.lesson1 import *
from solutions.lesson2 import *
from fastai.datasets import download_data
from torch.functional import F
from functools import partial

In [4]:
# export
def get_mnist_data():
    """Returns X_train, y_train, X_test, y_test for MNIST dataset."""
    MNIST_URL='http://deeplearning.net/data/mnist/mnist.pkl'
    path = download_data(MNIST_URL, ext=".gz")
    return get_data(path) 

In [5]:
# export
def stats(x):
    return x.mean(), x.std()

In [6]:
X_train, y_train, X_test, y_test = get_mnist_data()

# Basic Model

In [7]:
x_train_resized = X_train.reshape(-1, 1, 28, 28)

In [8]:
mean, std = stats(x_train_resized)

In [9]:
x_train_resized.size()

torch.Size([50000, 1, 28, 28])

In [10]:
x_train_norm = (x_train_resized - mean)/std

In [11]:
stats(x_train_norm)

(tensor(-3.0466e-06), tensor(1.))

In [12]:
nh = 100
mdl = nn.Sequential(
    nn.Conv2d(1, 5, 5),
    nn.ReLU()
)

In [13]:
for m in mdl.modules():
    if hasattr(m, 'weight'):
        nn.init.kaiming_normal_(m.weight, mode='fan_in')

In [14]:
stats(mdl(x_train_norm))

(tensor(0.4363, grad_fn=<MeanBackward0>),
 tensor(0.8158, grad_fn=<StdBackward0>))

In [15]:
# export
class Lambda(nn.Module):
    def __init__(self, f):
        super().__init__()
        self.f = f
        
    def forward(self, x):
        return self.f(x)

In [16]:
# def squeeze(x):
#     return torch.squeeze(x)

In [17]:
def conv_layer(ni, nf, size, stride=2):
    return nn.Sequential(
        nn.Conv2d(ni, nf, size, stride, padding=size//2),
        nn.ReLU()
    )

In [18]:
mdl = nn.Sequential(
    conv_layer(1, 8, 5),
    conv_layer(8, 16, 3),
    conv_layer(16, 32, 3),
    conv_layer(32, 64, 3),
    conv_layer(64, 64, 3),
    conv_layer(64, 10, 3),
    nn.AdaptiveAvgPool2d(1),
    nn.LogSoftmax(dim=1),
    Lambda(torch.squeeze)
)

In [19]:
mdl(x_train_norm).size()

torch.Size([50000, 10])

In [20]:
mdl(x_train_norm).squeeze().size()

torch.Size([50000, 10])

In [21]:
x_valid_norm = X_test.reshape(-1, 1, 28, 28)
x_valid_norm = (x_valid_norm - mean)/std

In [22]:
stats(x_valid_norm)

(tensor(-0.0059), tensor(0.9924))

In [23]:
train_data = Dataset(x_train_norm, y_train)
valid_data = Dataset(x_valid_norm, y_test)

In [24]:
c = (torch.max(y_test) + 1).item()

In [25]:
train_dl = DataLoader(train_data, 512)
valid_dl = DataLoader(valid_data, 1024)
data = DataBunch(train_dl, valid_dl, c=c)

In [26]:
data.c

10

In [27]:
g = nn.Conv2d(8, 16, 3)

In [28]:
# export
class GeneralReLU(nn.Module):
    def __init__(self, a=0.01, subtract=0.4):
        super().__init__()
        self.a = a
        self.subtract = subtract
        
    def forward(self, x):
        return F.leaky_relu(x, self.a) - self.subtract

In [29]:
# export
def init_cnn_(mdl):
    for layer in mdl.children():
        if isinstance(layer, nn.Conv2d):
            print("initializing conv2d...")
            nn.init.kaiming_normal_(layer.weight)
            if hasattr(layer, 'bias'):
                nn.init.zeros_(layer.bias)
        if isinstance(layer, nn.Sequential): init_cnn_(layer)
            
def conv_layer(ni, nf, size, stride=2, **kwargs):
    return nn.Sequential(
        nn.Conv2d(ni, nf, size, stride, padding=size//2),
        GeneralReLU(**kwargs)
    )
    
def get_model():
    conv_layers = [ 
        conv_layer(1, 8, 5),
        conv_layer(8, 16, 3),
        conv_layer(16, 32, 3),
        conv_layer(32, 64, 3),
        conv_layer(64, 10, 3),
    ]
             
    mdl = nn.Sequential(
        *conv_layers,
        nn.AdaptiveAvgPool2d(1),
        nn.LogSoftmax(dim=1),
        Lambda(torch.squeeze)
    ) 
    
    init_cnn_(mdl)
    
    return mdl 

In [30]:
mdl = get_model()

initializing conv2d...
initializing conv2d...
initializing conv2d...
initializing conv2d...
initializing conv2d...


In [31]:
mdl

Sequential(
  (0): Sequential(
    (0): Conv2d(1, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (1): GeneralReLU()
  )
  (1): Sequential(
    (0): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): GeneralReLU()
  )
  (2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): GeneralReLU()
  )
  (3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): GeneralReLU()
  )
  (4): Sequential(
    (0): Conv2d(64, 10, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): GeneralReLU()
  )
  (5): AdaptiveAvgPool2d(output_size=1)
  (6): LogSoftmax()
  (7): Lambda()
)

In [54]:
opt = Adam(mdl.parameters(), lr=1e-3)
loss = F.cross_entropy
learn = Learn(mdl, opt, data, loss)
runner = Runner(learn)

In [36]:
nn.NLLLoss()(F.log_softmax(torch.tensor([[1,2,3]]).float()), torch.tensor([2]))

  """Entry point for launching an IPython kernel.


tensor(0.4076)

In [37]:
# %time runner.fit(epochs=1)

# Some useful callbacks

In [38]:
torch.cuda.current_device()

0

In [39]:
torch.cuda.is_available()

True

In [40]:
# export
def current_gpu():
    print(torch.cuda.get_device_name(torch.cuda.current_device()))

In [41]:
# export
class AverageStatsCallback(Callback):
    def __init__(self, f, name, runner):
        self.f = f
        self.name = name
        self.runner = runner
        
    def on_epoch_start(self):
        self.value = 0.
        self.count = 0
        
    def on_batch_end(self):
        if runner.mode == ModelMode.VALID:
            batch_count = self.runner.xb.size()[0]
            self.count += batch_count
            self.value += self.f(self.runner.pred, self.runner.yb) * batch_count
            
    def on_epoch_end(self):
        if runner.mode == ModelMode.VALID:
            print("{} after epoch {}: {}".format(self.name, self.runner.epochs, self.value/self.count))

In [55]:
# export 
def accuracy(preds, actuals):
    return (torch.argmax(preds, axis=1) == actuals).float().sum()/(preds.size()[0])

In [56]:
test_pred = torch.tensor([[0.1, 0.5, 0.3]])
test_actual = torch.tensor([1])
assert accuracy(test_pred, test_actual) == 1.

test_pred_2 = torch.tensor([[0.1, 0.5, 0.3], [0.9, 0.8, 0.7]])
test_actual_2 = torch.tensor([1, 2])
assert accuracy(test_pred_2, test_actual_2) == 0.5

In [57]:
# export
# class AccuracyCallback(Callback):
#     def __init__(self, runner):
#         self.runner = runner
        
#     def on_epoch_start(self):
#         if runner.mode == ModelMode.VALID:
#             self.correct = 0
#             self.total = 0
        
#     def on_batch_end(self):
#         if runner.mode == ModelMode.VALID:
#             self.total += self.runner.xb.shape[0]
#             preds = torch.argmax(self.runner.pred, axis=1) 
#             self.correct += (preds == self.runner.yb).int().sum().item()
        
#     def on_epoch_end(self):
#         if runner.mode == ModelMode.VALID:
#             print("Validation accuracy: {}".format(self.correct/self.total)) 
AccuracyCallback = partial(AverageStatsCallback, accuracy, 'accuracy')

In [58]:
stats(data.train_dl.ds.x)

(tensor(-3.0466e-06), tensor(1.))

In [59]:
stats(data.valid_dl.ds.x)

(tensor(-0.0059), tensor(0.9924))

In [88]:
mdl = get_model()
opt = Adam(mdl.parameters(), lr=1e-3)
loss = F.cross_entropy
learn = Learn(mdl, opt, data, loss)
runner = Runner(learn)

initializing conv2d...
initializing conv2d...
initializing conv2d...
initializing conv2d...
initializing conv2d...


In [89]:
scheduler = CombinedScheduler([0.3, 0.7], CosineScheduler(1e-3, 1e-2, 'lr'), CosineScheduler(1e-2, 1e-5, 'lr'))
runner = Runner(learn, cb_funcs=[AccuracyCallback, scheduler])

In [90]:
runner.fit(epochs=10)

accuracy after epoch 0: 0.9495000243186951
Validation loss: 0.1687
accuracy after epoch 1: 0.9724000096321106
Validation loss: 0.1293
accuracy after epoch 2: 0.9807999730110168
Validation loss: 0.1095
accuracy after epoch 3: 0.9789999723434448
Validation loss: 0.1000
accuracy after epoch 4: 0.9814000129699707
Validation loss: 0.0928
accuracy after epoch 5: 0.982699990272522
Validation loss: 0.0885
accuracy after epoch 6: 0.9854999780654907
Validation loss: 0.0839
accuracy after epoch 7: 0.9889000058174133
Validation loss: 0.0792
accuracy after epoch 8: 0.9879000186920166
Validation loss: 0.0752
accuracy after epoch 9: 0.9883999824523926
Validation loss: 0.0722


# Convenience Method for Generating a Model

In [92]:
# export
def get_runner(lr, loss, data, cbs): 
    mdl = get_model()
    opt = Adam(mdl.parameters(), lr=lr)
    learn = Learn(mdl, opt, data, loss)
    return Runner(learn, cb_funcs=cbs)

# Hooks

In [144]:
class Hook:
    def __init__(self, f, m):
        self.f = f 
        self.m = m
        self.hook = m.register_forward_hook(f)
        
    def remove(self):
        self.hook.remove() 
            
    def __del__(self): self.remove()
        
    def __repr__(self): return "Hook(" + self.f.__name__ + ")" 

In [149]:
class ListContainer:
    def __init__(self, *items):
        self.items = list(items)
        
    def __getitem__(self, idx):
        if isinstance(idx, int):
            return self.items[idx]
        
        if isinstance(idx, list):
            if len(idx) == 0:
                return []
            
            if isinstance(idx[0], int):
                return [self.items[i] for i in idx]
            
            if isinstance(idx[0], bool) and len(idx) == len(self.items):
                return [self.items[i] for i in range(len(self.items)) if idx[i]] 
            
        raise ValueError("idx must be an integer, list of integers, or list of bools.")
        
    def __setitem__(self, idx, item):
        self.items[idx] = item
        
    def __iter__(self): return iter(self.items)
    
    def __repr__(self):
        if len(self.items) <= 10:
            return str(self.items)
        else:
            strs = [str(i) for i in self.items[:10]]
            strs.append("...")
            return str(strs).replace("\'", "") 
        
    def __str__(self): return self.__repr__()

In [154]:
class Hooks(ListContainer):
    def __init__(self, *hooks):
        self.hooks = list(hooks)
            
    def remove(self):
        for hook in self.hooks: hook.remove()
            
    def __enter__(self):
        return self
    
    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.remove()

In [155]:
# Next steps:
# - write a hook that stores the stats of some given activations after each call to `forward`
# - plot those stats across iterations for each layer

In [152]:
r = get_runner(1e-3, F.cross_entropy, data, [AccuracyCallback, scheduler])

initializing conv2d...
initializing conv2d...
initializing conv2d...
initializing conv2d...
initializing conv2d...


In [153]:
mdl = r.learn.model

In [114]:
def print_hello_world(model, input, result):
    print("hello world!")
    
h = Hook(print_hello_world)

In [115]:
h(mdl)

In [116]:
mdl(x_train_norm)

hello world!
hello world!
hello world!
hello world!
hello world!
hello world!
hello world!
hello world!


tensor([[-2.5997, -2.1941, -1.8751,  ..., -2.5991, -2.5953, -2.5916],
        [-2.6487, -1.8077, -2.0634,  ..., -2.6493, -2.6570, -2.6551],
        [-2.3922, -2.3923, -2.3425,  ..., -2.0967, -2.3948, -2.3954],
        ...,
        [-2.6028, -2.0407, -2.3006,  ..., -2.5544, -2.6006, -1.8758],
        [-2.4840, -2.2429, -1.8023,  ..., -2.4810, -2.4837, -2.4890],
        [-2.5992, -1.7751, -1.6194,  ..., -2.6045, -2.6031, -2.5989]],
       grad_fn=<SqueezeBackward0>)

# Implement Batch Norm

In [47]:
# export 
class BatchNorm1d(nn.Module):
    def __init__(self, size, eps=1e-5, mom=0.1):
        super().__init__()
        self.eps = eps
        self.mom = mom
        self.register_buffer('gamma', torch.ones(size))
        self.register_buffer('beta', torch.zeros(size))
        self.mean = 0
        self.std = 1
        
    def forward(self, x):
        if self.mean is not None:
            self.mean = x.mean(axis=0) * self.mom + self.mean * (1 - self.mom)
        else:
            self.mean = x.mean(axis=0)
            
        if self.std is not None:
            self.std = x.std(axis=0) * self.mom + self.std * (1 - self.mom)
        else:
            self.std = x.std(axis=0)
            
        x_hat = (x - self.mean) / (self.std + self.eps)
        return self.gamma * x_hat + self.beta

In [70]:
bn.gamma, bn.beta

NameError: name 'bn' is not defined

In [None]:
X_train, y_train, X_test, y_test = get_mnist_data()

In [None]:
bn_test = nn.Sequential(
    nn.Linear(784, 100),
    BatchNorm1d(100)
)

In [None]:
def stats(x):
    return x.mean(), x.std()

In [None]:
stats(bn_test(X_train))

In [None]:
stats(X_train)

In [63]:
mean, std = stats(X_train)
stats((X_train - mean)/std)

(tensor(-3.0466e-06), tensor(1.))

# Utilities

# 