In [1]:
# from clear_nbcode import *
# clear_nbcode('02b_initializing-Copy1.ipynb')

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import torch
import math

In [4]:
def stats(x): return x.mean(), x.std()

def pp(*args, n=75):
    for arg in args:
        print(arg)
        print("-"*n)

###### Why you need a good init

To understand why initialization is important in a neural net, we'll focus on the basic operation you have there: matrix multiplications. So let's just take a vector `x`, and a matrix `a` initiliazed randomly, then multiply them 100 times (as if we had 100 layers). 

_when initialization of weights is too high_

In [5]:
x = torch.randn(512)
a = torch.randn(512, 512)
pp(stats(x), stats(a))

(tensor(0.0018), tensor(0.9480))
---------------------------------------------------------------------------
(tensor(-0.0035), tensor(1.0018))
---------------------------------------------------------------------------


In [6]:
for i in range(100):
    x = a @ x
    if torch.isnan(x.std()): break
stats(x), i

((tensor(nan), tensor(nan)), 28)

_when initialization of weights is too low_

In [7]:
x = torch.randn(512)
a = torch.randn(512, 512) * 0.01
pp(stats(x), stats(a))

(tensor(-0.0280), tensor(0.9478))
---------------------------------------------------------------------------
(tensor(-1.6596e-05), tensor(0.0100))
---------------------------------------------------------------------------


In [11]:
for i in range(100):
    x = a@x
stats(x)    

(tensor(0.), tensor(0.))

In [18]:
mean, var = 0.,0.
for i in range(10000):
    x = torch.randn(512)
    a = torch.randn(512, 512)
    y = a@x
    mean += y.mean().item()
    var += y.pow(2).mean().item()
mean/10000, math.sqrt(var/10000)

(0.006866583323851228, 22.626899718929742)

In [20]:
math.sqrt(512)

22.627416997969522

In [24]:
mean, var = 0.,0.
for i in range(10000):
    x = torch.randn(1)
    a = torch.randn(1)
    y = a*x
    mean += y.mean().item()
    var += y.pow(2).mean().item()
mean/10000, math.sqrt(var/10000)

(0.0045433636779020166, 1.001365381042477)

In [29]:
mean, var = 0.,0.
for i in range(10000):
    x = torch.randn(1)
    a = torch.randn(1)*math.sqrt(1/512.)
    y = a*x
    mean += y.mean().item()
    var += y.pow(2).mean().item()
mean/10000, math.sqrt(var/10000)

(0.0006665773250747219, 0.044218858381667475)

In [28]:
1/512

0.001953125

In [30]:
mean, var = 0.,0.
for i in range(10000):
    x = torch.randn(512)
    a = torch.randn(512, 512)*math.sqrt(1/512.)
    x = a@x

x.mean(), x.std()

(tensor(0.0639), tensor(1.0590))

###### using activation functions

In [8]:
def tanh(x): return torch.tanh(x)

In [9]:
x = torch.randn(512)
a = torch.randn(512, 512)*math.sqrt(1/512.)
for i in range(100):
    x = tanh(a@x)

x.mean(), x.std()

(tensor(-0.0073), tensor(0.1213))

In [18]:
mean, var = 0.,0.
for i in range(10000):
    x = torch.randn(512)
    a = torch.randn(512, 512)*math.sqrt(1/512.)
    y = tanh(a@x)
    mean += y.mean().item()
    var += y.pow(2).mean().item()
mean/10000, math.sqrt(var/10000)

(0.0001461052545811981, 0.6276535678475743)

the std is good, but not 1. To address this issue:

Before Xavier Glorot and Yoshua Bengio published their landmark paper titled _Understanding the difficulty of training deep feedforward neural networks_, the “commonly used heuristic” to which they compared their experiments was that of _initializing weights from a uniform distribution in [-1,1]_ and then scaling by **1/√n.**

In [10]:
x = torch.randn(512)
a = torch.Tensor(512, 512).uniform_(-1, 1) * math.sqrt(1/512.)
for i in range(100):
    x = tanh(a@x)

stats(x)

(tensor(9.0958e-26), tensor(3.1799e-24))

In [22]:
x = torch.randn(512)
a = torch.Tensor(512, 512).uniform_(-1, 1) * math.sqrt(1/512.)
for i in range(1000):
    x = tanh(a@x)

stats(x)

(tensor(0.), tensor(0.))

###### _**Xavier Init:**_

Xavier initialization sets a layer’s weights to values chosen from a random uniform distribution that’s bounded between:
$$\Biggl[
-\frac{\sqrt{6}}{\sqrt{n_{i}+n_{i+1}}} , + \frac{\sqrt{6}}{\sqrt{n_{i}+n_{i+1}}}
\Biggr]$$

here, $n_{i}$ is `fan-in` i.e. `number of incoming network connections to that layer`$,\&$ <br>  $n_{i+1}$ is `fan-out` i.e. `number of outgoing network connections from that layer`

In [21]:
def xavier(m,h): return torch.Tensor(512, 512).uniform_(-1, 1) * math.sqrt(6./(m+h))

In [26]:
# tanh activation
x = torch.randn(512)
a = xavier(512, 512)
for i in range(10000):
    x = tanh(a@x)

stats(x)

(tensor(-0.0100), tensor(0.1266))

######  when using relu activations

In [11]:
def relu(x): return x.clamp_min(0.)

In [45]:
# relu activation with [xavier init]
x = torch.randn(512)
a = xavier(512, 512)

for i in range(10000):
    x = relu(a@x)
    if x.std() == 0.: break 

stats(x), i

((tensor(0.), tensor(0.)), 260)

In [47]:
# relu activation with [random init scaled by 1/sqrt(n)]
x = torch.randn(512)
a = torch.randn(512, 512)*math.sqrt(1/512.)
for i in range(100):
    x = relu(a@x)

x.mean(), x.std()

(tensor(6.5335e-16), tensor(8.8455e-16))

In [90]:
# relu activation with [random init]
mean, var = 0.,0.
x = torch.randn(512)
a = torch.randn(512, 512)
for i in range(10000):
    y = relu(a@x)
    mean += y.mean().item()
    var += y.pow(2).mean().item()
mean/10000, math.sqrt(var/10000)

(9.25283432006836, 16.087787123506143)

In [50]:
math.sqrt(512), math.sqrt(512/2)

(22.627416997969522, 16.0)

In [91]:
# relu activation with [random init scaled by sqrt(2/n)]
mean, var = 0.,0.
x = torch.randn(512)
a = torch.randn(512, 512)*math.sqrt(2./512)
for i in range(10000):
    y = relu(a@x)
    mean += y.mean().item()
    var += y.pow(2).mean().item()
mean/10000, math.sqrt(var/10000)

(0.5680713653564453, 1.0546854089786852)

keeping the standard deviation of layers’ activations around 1 will allow us to stack several more layers in a deep neural network without gradients exploding or vanishing <br>
here mean is ~= .5 because in relu we are removing nearly half of the values (with zero)

###### **Kaiming Init:**

1. _Initialize weights with **random numbers from standard normal distribution** (with appropiate weight dim at the layer)_ <br>
2. _Multiply each randomly chosen number by **√2/√n** where **n is the number of incoming connections** coming into a given layer from the previous layer’s output (also known as the **“fan-in”**)._
3. _**Bias** tensors are initialized to **zero**_

std: $$\sigma=\sqrt{\frac{2}{n_{l}+\hat{n}_{l}}}$$

here, $n_{l}$ is `fan-in` i.e. `number of incoming network connections to that layer`$,\&$ <br>  $\hat{n}_{l}$ is `fan-out` i.e. `number of outgoing network connections from that layer`

In [94]:
def kaiming(m,h): return torch.randn(m,h)*math.sqrt(2./m)

In [96]:
x = torch.randn(512)
for i in range(100):
    a = kaiming(512, 512)
    x = relu(a@x)

stats(x)

(tensor(0.4102), tensor(0.6081))

In [86]:
x = torch.randn(512)
for i in range(1000):
    a = kaiming(512, 512)
    x = relu(a@x)

stats(x)

(tensor(0.1363), tensor(0.1861))

In [99]:
x = torch.randn(512)
for i in range(100000):
    a = kaiming(512, 512)
    x = relu(a@x)
    if x.std() == 0. : break

stats(x),i

((tensor(3.2230e-44), tensor(4.6243e-44)), 99999)

don't know it must have worked, but for 100k layers its weight is too close to zero