In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
#export
from exp.nb_07 import *

## Layer-Sequential Unit-Variance (LSUV) initialization

In [15]:
# Load the raw splits, then normalise the inputs with `normalize_to`.
x_train, y_train, x_valid, y_valid = get_data()
x_train, x_valid = normalize_to(x_train, x_valid)

# Wrap each split in a Dataset.
train_ds = Dataset(x_train, y_train)
valid_ds = Dataset(x_valid, y_valid)

nh = 50
bs = 512                        # batch size handed to get_dls
c = y_train.max().item() + 1    # largest label + 1
loss_func = F.cross_entropy

# Bundle the data loaders together with `c`.
data = DataBunch(*get_dls(train_ds, valid_ds, bs), c)

In [16]:
# Batch transform built by view_tfm(1, 28, 28) — presumably reshapes each input to 1x28x28; confirm in nb_07.
mnist_view = view_tfm(1, 28, 28)
# Callback classes / factories passed to get_learn_run via `cbs=`.
cbfs = [Recorder,
        partial(AvgStatsCallback, accuracy),
        CudaCallback,
        partial(BatchTransformXCallback, mnist_view)] # the batch is put on the GPU first, then transformed

In [17]:
nfs = [8, 16, 32, 64, 64] # number of filters (output channels) of each conv layer

In [18]:
class ConvLayer(nn.Module):
    """A 2-d convolution followed by a `GeneralRelu` activation.

    `bias` and `weight` are exposed as properties: `bias` reads and writes the
    negated `sub` attribute of the activation, `weight` returns the weight of
    the convolution.

    Args:
        ni: number of input channels passed to `nn.Conv2d`.
        nf: number of output channels passed to `nn.Conv2d`.
        ks: kernel size; the padding is `ks // 2`.
        stride: stride of the convolution.
        sub: `sub` value forwarded to `GeneralRelu`.
        **kwargs: further keyword arguments forwarded to `GeneralRelu`.
    """

    def __init__(self, ni, nf, ks=3, stride=2, sub=0., **kwargs):
        super().__init__()
        half_kernel = ks // 2
        self.conv = nn.Conv2d(ni, nf, ks, padding=half_kernel, stride=stride, bias=True)
        self.relu = GeneralRelu(sub=sub, **kwargs)

    def forward(self, x):
        """Run the convolution, then the activation."""
        conv_out = self.conv(x)
        return self.relu(conv_out)

    @property
    def bias(self):
        """Negated `sub` of the activation (not the convolution's own bias)."""
        return -self.relu.sub

    @bias.setter
    def bias(self, v):
        # Stored negated on the activation; the convolution's bias is left untouched.
        self.relu.sub = -v

    @property
    def weight(self):
        """Weight of the underlying convolution."""
        return self.conv.weight

In [19]:
# Build the learner and runner from the filter sizes, data, layer class and callbacks.
# NOTE(review): 0.6 is presumably the learning rate — confirm against get_learn_run.
learn, run = get_learn_run(nfs, data, 0.6, ConvLayer, cbs=cbfs)

In [20]:
# Baseline: train before any LSUV adjustment, for comparison with the run after LSUV.
run.fit(2, learn)

train: [1.17839640625, tensor(0.6076)]
valid: [0.36718818359375, tensor(0.8822)]
train: [0.157655361328125, tensor(0.9521)]
valid: [0.09827435913085937, tensor(0.9694)]


In [21]:
# Build the learner and runner again before applying LSUV
# (presumably this yields a freshly initialised model — confirm against get_learn_run).
learn, run = get_learn_run(nfs, data, 0.6, ConvLayer, cbs=cbfs)

In [22]:
#export
def get_batch(dl, run):
    """Fetch the first batch of `dl` and pass it through the runner's `begin_batch` stage.

    The batch is stored on `run.xb` / `run.yb`, every callback in `run.cbs` is
    attached to `run`, and `run("begin_batch")` is invoked so the callbacks can
    process the batch.

    Returns:
        The `(run.xb, run.yb)` pair as it stands after the `begin_batch` call.
    """
    first_batch = next(iter(dl))
    run.xb, run.yb = first_batch
    for callback in run.cbs:
        callback.set_runner(run)
    run("begin_batch")
    return run.xb, run.yb

In [25]:
# Grab one training batch (after the begin_batch callbacks have run); it drives the forward passes below.
xb, yb = get_batch(data.train_dl, run)

In [24]:
# `+` concatenates lists; an empty list contributes nothing.
[1, 2] + [] + [3, 4]

[1, 2, 3, 4]

In [27]:
# sum() with a list as start value concatenates the sub-lists — the trick used in find_modules.
sum([[1], [], [2]], [])

[1, 2]

In [28]:
# The second argument of sum() is the start value: 2 + 1 + 2.
sum([1, 2], 2)

5

In [26]:
#export
def find_modules(m, cond):
    """Return the outermost modules in the tree rooted at `m` for which `cond` is true.

    If `m` itself satisfies `cond` it is returned alone and its children are
    not searched; otherwise the matches found under each child are
    concatenated in `m.children()` order.
    """
    if cond(m):
        return [m]
    found = []
    for child in m.children():
        found.extend(find_modules(child, cond))
    return found

def is_lin_layer(l):
    """Return True when `l` is a Conv1d, Conv2d, Conv3d, Linear or ReLU module."""
    # NOTE(review): nn.ReLU is accepted here although it is not a linear layer — confirm this is intended.
    return isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.Linear, nn.ReLU))

In [29]:
# Collect every ConvLayer of the model; these are the layers passed to lsuv_module further down.
mods = find_modules(learn.model, lambda o: isinstance(o, ConvLayer))

In [30]:
# Show the collected ConvLayer modules.
mods

[ConvLayer(
   (conv): Conv2d(1, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
   (relu): GeneralRelu()
 ), ConvLayer(
   (conv): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
   (relu): GeneralRelu()
 ), ConvLayer(
   (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
   (relu): GeneralRelu()
 ), ConvLayer(
   (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
   (relu): GeneralRelu()
 ), ConvLayer(
   (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
   (relu): GeneralRelu()
 )]

In [31]:
# Show the full model for comparison with `mods`.
learn.model

Sequential(
  (0): ConvLayer(
    (conv): Conv2d(1, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (relu): GeneralRelu()
  )
  (1): ConvLayer(
    (conv): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (relu): GeneralRelu()
  )
  (2): ConvLayer(
    (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (relu): GeneralRelu()
  )
  (3): ConvLayer(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (relu): GeneralRelu()
  )
  (4): ConvLayer(
    (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (relu): GeneralRelu()
  )
  (5): AdaptiveAvgPool2d(output_size=1)
  (6): Lambda()
  (7): Linear(in_features=64, out_features=10, bias=True)
)

In [32]:
def append_stat(hook, mod, inp, outp):
    """Hook callback: store the mean and std of the module output on `hook`.

    Only `hook` and `outp` are used; `mod` and `inp` are accepted and ignored.
    Both statistics are taken over all elements of `outp.data` and stored as
    Python floats in `hook.mean` and `hook.std`.
    """
    acts = outp.data
    mean_val = acts.mean().item()
    std_val = acts.std().item()
    hook.mean, hook.std = mean_val, std_val

In [37]:
# Model used for the forward passes below: moved to the GPU when CUDA is available.
# (Fixes the `lean` typo, which raised NameError on CUDA machines.)
if torch.cuda.is_available():
    mdl = learn.model.cuda()
else:
    mdl = learn.model

In [38]:
# Run one forward pass with a stats hook on every ConvLayer, then print
# each layer's output mean and std before any LSUV adjustment.
with Hooks(mods, append_stat) as hooks:
    mdl(xb)
    for h in hooks:
        print(h.mean, h.std)

0.3906221091747284 0.7596608400344849
0.27785801887512207 0.5827661752700806
0.20774070918560028 0.43739715218544006
0.20151448249816895 0.3500889241695404
0.16141344606876373 0.22637952864170074


In [41]:
#export
def lsuv_module(m, xb, model=None):
    """Adjust layer `m` in place so its output on `xb` has roughly zero mean and unit std.

    A temporary hook on `m` records the mean and standard deviation of the
    layer's output every time the model is run on `xb`. First `m.bias` is
    decreased by the observed mean until |mean| <= 1e-3, then `m.weight.data`
    is divided by the observed std until |std - 1| <= 1e-3.

    Args:
        m: the layer to adjust; must expose a settable `bias` and a `weight` tensor.
        xb: the input batch fed to the model on every forward pass.
        model: the model that contains `m`. When omitted, the module-level
            global `mdl` is used, which is what this function always did
            before the parameter existed.

    Returns:
        The `(mean, std)` pair recorded by the hook on the last forward pass.
    """
    if model is None:
        model = mdl  # backward-compatible fallback to the notebook-level global

    h = Hook(m, append_stat)
    try:
        # `model(xb) is not None` only serves to run a forward pass, which
        # refreshes h.mean / h.std before each convergence check.
        while model(xb) is not None and abs(h.mean) > 1e-3: m.bias -= h.mean
        while model(xb) is not None and abs(h.std - 1) > 1e-3: m.weight.data /= h.std
    finally:
        # Always detach the hook, even when a forward pass raises.
        h.remove()
    return h.mean, h.std

In [42]:
# Apply LSUV to each ConvLayer in turn, printing the final (mean, std) of its output.
for m in mods: print(lsuv_module(m, xb))

(0.12358032166957855, 1.0)
(0.10398585349321365, 0.9999999403953552)
(0.16722050309181213, 1.0)
(0.13194599747657776, 1.0)
(0.3073921203613281, 1.0)


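Here we take a single batch and run it forward through the model, recording the mean and standard deviation of each layer's output. While the mean is not (approximately) 0 we subtract it from the layer's bias, and while the standard deviation is not (approximately) 1 we divide the layer's weights by it, repeating the forward pass until the output is normalized. Because the weights are rescaled after the mean has been adjusted, the final mean is no longer exactly 0, as the output above shows.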

In [43]:
# Train again after the LSUV adjustment, timing the run.
%time run.fit(2, learn)

train: [0.4182991796875, tensor(0.8669)]
valid: [0.13828924560546876, tensor(0.9550)]
train: [0.10886802734375, tensor(0.9656)]
valid: [0.10248668212890626, tensor(0.9682)]
CPU times: user 31.2 s, sys: 2.32 s, total: 33.5 s
Wall time: 11.3 s


## Export 

In [44]:
# Convert this notebook to exp/nb_07a.py (presumably only the cells marked `#export` — confirm in notebook2script.py).
!python notebook2script.py 07a_lsuv.ipynb

Converted 07a_lsuv.ipynb to exp/nb_07a.py
