In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [111]:
#export
import torch
from torch import nn
from torch import tensor
from torch.nn import init
import math

In [4]:
#export
def pp(*args, n=75):
    for arg in args:
        print(arg)
        print("-"*n)

## Does nn.Conv2d init work well?

In [113]:
#export
from pathlib import Path
import requests
import pickle, gzip

def get_data_():
    DATA_PATH = Path("data")
    PATH = DATA_PATH / "mnist"
    PATH.mkdir(parents=True, exist_ok=True)

    URL = "http://deeplearning.net/data/mnist/"
    FILENAME = "mnist.pkl.gz"

    if not (PATH/FILENAME).exists():
        content = requests.get(URL+FILENAME).content
        (PATH/FILENAME).open('wb').write(content)

    with gzip.open((PATH/FILENAME).as_posix(), 'rb') as f:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')

    return map(tensor,(x_train, y_train, x_valid, y_valid))

def normalize(x, m, s): return (x-m)/s

In [6]:
torch.nn.modules.conv._ConvNd.reset_parameters??

In [7]:
x_train, y_train, x_valid, y_valid = get_data_()
train_mean, train_std = x_train.mean(), x_train.std()
x_train = normalize(x_train, train_mean, train_std)
x_valid = normalize(x_valid, train_mean, train_std)

In [8]:
x_train.shape

torch.Size([50000, 784])

In [9]:
x_train = x_train.view(-1, 1, 28 , 28)
x_valid = x_valid.view(-1, 1, 28 , 28)
x_train.shape, x_valid.shape

(torch.Size([50000, 1, 28, 28]), torch.Size([10000, 1, 28, 28]))

In [10]:
n,*_ = x_train.shape
c = y_train.max() + 1
nh = 32
n,c

(50000, tensor(10))

In [11]:
l1 = nn.Conv2d(1, nh, 5)

In [12]:
x = x_valid[:100]

In [13]:
x.shape

torch.Size([100, 1, 28, 28])

In [14]:
def stats(x): return x.mean(), x.std()

In [15]:
l1.weight.shape

torch.Size([32, 1, 5, 5])

In [16]:
pp(stats(l1.weight), stats(l1.bias))

(tensor(0.0041, grad_fn=<MeanBackward0>), tensor(0.1169, grad_fn=<StdBackward0>))
---------------------------------------------------------------------------
(tensor(-0.0020, grad_fn=<MeanBackward0>), tensor(0.1009, grad_fn=<StdBackward0>))
---------------------------------------------------------------------------


In [17]:
t = l1(x)

In [18]:
stats(t)

(tensor(0.0081, grad_fn=<MeanBackward0>),
 tensor(0.6693, grad_fn=<StdBackward0>))

In [22]:
init.kaiming_normal_(l1.weight, a=1.)
stats(t)

(tensor(0.0081, grad_fn=<MeanBackward0>),
 tensor(0.6693, grad_fn=<StdBackward0>))

In [23]:
import torch.nn.functional as F

In [28]:
def f1(x, a=0): return F.leaky_relu(l1(x), a)

In [37]:
init.kaiming_normal_(l1.weight, a=0)
stats(f1(x))

(tensor(0.5141, grad_fn=<MeanBackward0>),
 tensor(1.0236, grad_fn=<StdBackward0>))

In [38]:
l1 = nn.Conv2d(1, nh, 5)
stats(f1(x))

(tensor(0.2069, grad_fn=<MeanBackward0>),
 tensor(0.3861, grad_fn=<StdBackward0>))

In [39]:
l1.weight.shape

torch.Size([32, 1, 5, 5])

In [51]:
rec_fs = l1.weight[0,0].shape.numel()
rec_fs

25

In [55]:
no, ni, *_ = l1.weight.shape
no, ni

(32, 1)

In [57]:
fan_in = ni * rec_fs
fan_out = no * rec_fs
fan_in, fan_out 

(25, 800)

In [58]:
def gain(a): return math.sqrt(2.0 / (1 + a**2))

In [62]:
gain(1.0), gain(0), gain(0.1), gain(0.01), gain(math.sqrt(5.))

(1.0,
 1.4142135623730951,
 1.4071950894605838,
 1.4141428569978354,
 0.5773502691896257)

In [64]:
torch.zeros(10000).uniform_(-1, 1).std()

tensor(0.5758)

In [65]:
1/math.sqrt(3)

0.5773502691896258

In [101]:
def kaiming2(x, a, use_fan_out=False):
    nof, nif, *_ = x.shape
    rec_fs = x[0][0].shape.numel()
    fan = nof*rec_fs if use_fan_out else nif*rec_fs
    std = gain(a)/math.sqrt(fan)
    bound = math.sqrt(3.) * std # sqrt(3.) because it's uniform distribution
    x.data.uniform_(-bound, bound)

In [100]:
init.kaiming_uniform_??

In [70]:
kaiming2(l1.weight, a=0)
stats(f1(x))

(tensor(0.5584, grad_fn=<MeanBackward0>),
 tensor(0.9482, grad_fn=<StdBackward0>))

In [71]:
kaiming2(l1.weight, a=math.sqrt(5))
stats(f1(x))

(tensor(0.2157, grad_fn=<MeanBackward0>),
 tensor(0.4164, grad_fn=<StdBackward0>))

In [107]:
#export
class Flatten(nn.Module):
    def forward(self, x): return x.view(-1)

In [80]:
m = nn.Sequential(
    nn.Conv2d(1,8, 5,stride=2,padding=2), nn.ReLU(),
    nn.Conv2d(8,16,3,stride=2,padding=1), nn.ReLU(),
    nn.Conv2d(16,32,3,stride=2,padding=1), nn.ReLU(),
    nn.Conv2d(32,1,3,stride=2,padding=1),
    nn.AdaptiveAvgPool2d(1),
    Flatten(),
)

In [81]:
y = y_valid[:100].float()

In [83]:
t = m(x)
stats(t)

(tensor(0.0230, grad_fn=<MeanBackward0>),
 tensor(0.0072, grad_fn=<StdBackward0>))

In [102]:
m

Sequential(
  (0): Conv2d(1, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
  (1): ReLU()
  (2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (3): ReLU()
  (4): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (5): ReLU()
  (6): Conv2d(32, 1, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (7): AdaptiveAvgPool2d(output_size=1)
  (8): Flatten()
)

In [108]:
#export
def mse(output, targ): return (output.squeeze(-1) - targ).pow(2).mean()

In [97]:
l = mse(t,y)
l.backward()

In [99]:
stats(m[0].weight.grad)

(tensor(-0.0108), tensor(0.0348))

In [103]:
for l in m:
    if isinstance(l, nn.Conv2d):
        init.kaiming_uniform_(l.weight)
        l.bias.data.zero_()

In [104]:
t = m(x)
stats(t)

(tensor(-0.4193, grad_fn=<MeanBackward0>),
 tensor(0.2953, grad_fn=<StdBackward0>))

In [105]:
l = mse(t,y)
l.backward()
stats(m[0].weight.grad)

(tensor(0.1555), tensor(0.5100))

## Export

In [114]:
!./notebook2script.py 02a_why_sqrt5-Copy1.ipynb

Converted 02a_why_sqrt5-Copy1.ipynb to exp/nb_02a.py
