# Numerical Stability and Initialization

In [1]:
!pip install mxnet
!pip install d2l

Traceback (most recent call last):
  File "/home/iserina/.local/bin/pip", line 7, in <module>
    from pip._internal import main
ModuleNotFoundError: No module named 'pip._internal'
Traceback (most recent call last):
  File "/home/iserina/.local/bin/pip", line 7, in <module>
    from pip._internal import main
ModuleNotFoundError: No module named 'pip._internal'


In [2]:
%matplotlib inline
import math
from mxnet import nd, autograd
from matplotlib import pyplot as plt
from IPython import display
# Request SVG as the output format for inline matplotlib figures.
# NOTE(review): IPython.display.set_matplotlib_formats is deprecated in newer
# IPython releases (moved to matplotlib_inline.backend_inline) — confirm the
# IPython version this notebook targets before changing it.
display.set_matplotlib_formats('svg')

## Product of Random Matrices

In [3]:
def prod_rand_matrices(scale, k, num_matrices=100):
    """Return the product of ``num_matrices`` random ``k`` x ``k`` matrices.

    The running product starts as the identity (``nd.diag(nd.ones(k))``).
    Each factor is drawn with ``nd.random.normal(shape=(k, k), scale=scale)``
    and multiplied onto the product from the left.

    Parameters
    ----------
    scale : float
        Passed as ``scale`` (the standard deviation) to ``nd.random.normal``.
    k : int
        Side length of every square factor.
    num_matrices : int, optional
        How many random factors to multiply together. Defaults to 100, the
        value that used to be hard-coded. With 0 the identity is returned.

    Returns
    -------
    The ``k`` x ``k`` product ``W_n ... W_2 W_1``.
    """
    Y = nd.diag(nd.ones(k))
    for _ in range(num_matrices):
        W = nd.random.normal(shape=(k, k), scale=scale)
        # Left-multiply so the newest factor ends up outermost.
        Y = nd.dot(W, Y)
    return Y

## Sensitivity to the Weight Scale

In [4]:
# Same 4x4 experiment at two nearby standard deviations. In the recorded run
# below, 0.5 leaves entries around 1e-5 while 0.7 leaves entries around 1e7.
print(prod_rand_matrices(scale=0.5, k=4))
print(prod_rand_matrices(scale=0.7, k=4))


[[-3.0383111e-05  1.0993276e-05  1.3308806e-05 -6.5295076e-06]
 [-3.8170743e-05  1.3811012e-05  1.6720047e-05 -8.2031147e-06]
 [ 3.1549363e-05 -1.1415253e-05 -1.3819666e-05  6.7801420e-06]
 [ 2.8754292e-05 -1.0403934e-05 -1.2595330e-05  6.1794649e-06]]
<NDArray 4x4 @cpu(0)>

[[ -1072696.1   15321461.    37594224.   -14354324.  ]
 [   559049.1   -7984973.   -19592708.     7480936.  ]
 [   568999.56  -8127098.   -19941440.     7614089.5 ]
 [ -2092054.5   29881096.    73319176.   -27994908.  ]]
<NDArray 4x4 @cpu(0)>


## Synthetic Gradients for MLP


In [5]:
def synthetic_grad(k, sigma, d_sigma, get_weight, num_layers=50, num_repeats=10):
    """Average magnitude of a synthetic backpropagated Jacobian product.

    For each repeat, a hidden vector ``h`` starts as ``nd.ones(k)`` and an
    accumulator ``Y`` starts as the identity. At every layer a fresh weight
    matrix ``W = get_weight((k, k))`` is drawn, ``Wh = W h`` is formed, ``Y``
    is left-multiplied by ``d_sigma(Wh) * W.T`` and ``h`` becomes
    ``sigma(Wh)``. The repeat's score is the mean absolute entry of the final
    ``Y``; the function returns the average score over all repeats.

    Parameters
    ----------
    k : int
        Width of every layer (weights are ``k`` x ``k``).
    sigma : callable
        Activation applied to the pre-activation ``Wh``.
    d_sigma : callable
        Derivative of ``sigma``, evaluated at ``Wh``.
        Assumes the result broadcasts against the ``(k, k)`` matrix ``W.T``.
    get_weight : callable
        Called with the shape tuple ``(k, k)``; must return a weight matrix.
    num_layers : int, optional
        Number of layers per repeat. Defaults to 50 (previously hard-coded).
    num_repeats : int, optional
        Number of independent repeats to average. Defaults to 10
        (previously hard-coded). Must be at least 1.

    Returns
    -------
    float
        Mean over repeats of ``Y.abs().mean()``.

    Raises
    ------
    ValueError
        If ``num_repeats`` is smaller than 1 (the average would be undefined).
    """
    if num_repeats < 1:
        raise ValueError('num_repeats must be at least 1')

    res = []
    for _ in range(num_repeats):
        h = nd.ones(k)
        Y = nd.diag(nd.ones(k))
        for _ in range(num_layers):
            W = get_weight((k, k))
            Wh = nd.dot(W, h)
            # Elementwise product scales W.T by the activation derivative
            # before it is folded into the running product.
            Y = nd.dot(d_sigma(Wh) * W.T, Y)
            h = sigma(Wh)
        res.append(Y.abs().mean().asscalar())
    return sum(res) / len(res)

## ReLU 



In [6]:
# Layer width used by the experiments in this notebook.
k = 100

# Activation under test: ReLU.
sigma = nd.relu


def d_sigma(x):
    """Derivative of ReLU, expressed as the elementwise comparison x > 0."""
    return x > 0


def get_weight(scale):
    """Build a weight initializer with a fixed standard deviation.

    The returned callable takes a shape and draws a matrix of that shape via
    ``nd.random.normal`` using the ``scale`` captured here.
    """
    def draw(shape):
        return nd.random.normal(scale=scale, shape=shape)
    return draw

# Report the synthetic gradient magnitude for a range of weight scales.
for scale in (0.1, 0.2, 0.4, 0.8):
    grad_mean = synthetic_grad(k, sigma, d_sigma, get_weight(scale))
    print('scale', scale, 'gradient mean', grad_mean)

scale 0.1 gradient mean 1.4038878531907883e-09
scale 0.2 gradient mean 1318943.45
scale 0.4 gradient mean 1.7015228207303707e+21
scale 0.8 gradient mean nan


## Xavier

In [7]:
# Xavier (Glorot) uniform initialization: sample from U(-bound, bound) with
# bound = sqrt(6 / (fan_in + fan_out)); both fans equal k for a (k, k) matrix.
scale = (6.0 / (k + k)) ** 0.5


def xavier(shape, bound=scale):
    """Draw a weight matrix of ``shape`` uniformly from ``[-bound, bound]``.

    ``bound`` is frozen at definition time through the default argument.
    Reading the global ``scale`` at call time would break as soon as another
    cell reuses ``scale`` as a loop variable, silently changing the range.
    """
    return nd.random.uniform(low=-bound, high=bound, shape=shape)


synthetic_grad(k, sigma, d_sigma, xavier)

1.1741994598857275e-09

## Sigmoid

In [8]:
# Activation under test: the logistic sigmoid.
sigma = nd.sigmoid


def d_sigma(x):
    """Derivative of the sigmoid: (1 - s) * s with s = sigmoid(x)."""
    s = nd.sigmoid(x)
    return (1 - s) * s


# Report the synthetic gradient magnitude for a range of weight scales.
for scale in (0.1, 0.2, 0.4, 0.8):
    grad_mean = synthetic_grad(k, sigma, d_sigma, get_weight(scale))
    print('scale', scale, 'gradient mean', grad_mean)

scale 0.1 gradient mean 2.3150346089870423e-33
scale 0.2 gradient mean 3.689017464558497e-21
scale 0.4 gradient mean 4.3340464572251545e-12
scale 0.8 gradient mean 5.191878954065032e-05


## Scaled Sigmoid


In [9]:
def sigma(x):
    """Scaled and shifted sigmoid: 4 * sigmoid(x) - 2."""
    return 4 * nd.sigmoid(x) - 2


def d_sigma(x):
    """Derivative of the scaled sigmoid: 4 * (1 - s) * s with s = sigmoid(x)."""
    s = nd.sigmoid(x)
    return 4 * (1 - s) * s


# Report the synthetic gradient magnitude for a range of weight scales.
for scale in (0.1, 0.2, 0.4, 0.8):
    grad_mean = synthetic_grad(k, sigma, d_sigma, get_weight(scale))
    print('scale', scale, 'gradient mean', grad_mean)

scale 0.1 gradient mean 0.011168299429118633
scale 0.2 gradient mean 236.85642356872557
scale 0.4 gradient mean 47525888.725
scale 0.8 gradient mean 43667959657267.2
