#### Author: Prakash C. Sukhwal
#### July 2021
#### Associated Lecturer & Consultant
#### Institute of Systems Science, NUS

---

## Shortcuts for Colab

<img src="https://drive.google.com/uc?id=128WKhxbyfKGM4HhhV7I9M1hOrFOXtFNo" alt="image" 
    width="550" 
    height="250" class="center">


In [None]:
## turn on the autocomplete if off by default
## (this IPython magic sets the Completer.use_jedi option to False, i.e. Jedi-based completion is switched off)
%config Completer.use_jedi = False

  


###### All the given implementations are in pytorch version: 1.7.1
- to check your version, type the commands below in your notebook
      - import torch
      - torch.__version__

In [None]:
import torch
import torch.nn as nn

## 1. Softmax function

<img src="https://drive.google.com/uc?id=1jyP4tJcjxOiKRz6opHAcDj_Q2W4TF7c6" alt="image" 
    width="400" 
    height="420" class="center">
   

<img src="https://drive.google.com/uc?id=1Cckmxg9G0m6pQ3KHlLMISZ2jxOifJcdL" alt="image" 
    width="400" 
    height="420" class="center">    

    img source: 
    https://medium.com/@shashank.1842000/the-softmax-activation-function-4a184310b3b4
    https://www.programmersought.com/article/77605196667/

<img src="https://drive.google.com/uc?id=1okpiC24EMyp45yXQKG5IuhSlnQ05sR5B" alt="image" 
    width="400" 
    height="350" class="center">
   
       note:
        - we apply softmax activation function for multi-class classification during training 
        - to get output interpreted as normalised class probabilities 
            - yi are unnormalised log probabilities; exp gives you unnormalized probabilities; finally dividing by the sum gives us the normalized probabilities that sum to 1 on which cross-entropy can be applied
        - it is used during training with cross-entropy loss
source: https://cs231n.github.io/linear-classify/#loss-function

###### how to invoke it in torch?
       - it is available as the function torch.nn.functional.softmax(input, dim=None, _stacklevel=3, dtype=None), or as the module class torch.nn.Softmax(dim=None), which is what the cell below uses
       
source: https://pytorch.org/docs/stable/nn.functional.html#torch.nn.functional.softmax

In [None]:
## Let's try the values given in the figure above in pytorch

# raw scores (unnormalised log probabilities) as a plain Python list
lg_prob = [-2.85, 0.86, 0.28]
print(type(lg_prob))

# convert to tensor
lg_prob_tens = torch.tensor(lg_prob)
print(type(lg_prob_tens))

# use softmax
# instantiate the module with an explicit dim: the input is 1-D, so the
# scores are normalised along dim 0. Omitting dim relies on a deprecated
# implicit choice and makes PyTorch emit a UserWarning.
sft = nn.Softmax(dim=0)
out_prob = sft(lg_prob_tens)

# normalised class probabilities (they sum to 1)
print(out_prob)

# negative log of every class probability; the cross-entropy loss for a
# sample is the entry at the index of its true class
-torch.log(out_prob)

<class 'list'>
<class 'torch.Tensor'>
tensor([0.0154, 0.6312, 0.3534])


  del sys.path[0]


tensor([4.1702, 0.4602, 1.0402])

## How do we initialize weights in pytorch?

source: https://pytorch.org/docs/stable/nn.init.html#torch-nn-init

torch.nn.init.calculate_gain(nonlinearity, param=None)

Where,
      
      nonlinearity – the non-linear function (nn.functional name)

      param – optional parameter for the non-linear function

## 1. Uniform weights

torch.nn.init.uniform_(tensor, a=0.0, b=1.0)
    
    tensor – an n-dimensional torch.Tensor

    a – the lower bound of the uniform distribution

    b – the upper bound of the uniform distribution


<img src="https://drive.google.com/uc?id=12oQg9-f4_5Kf3uRAysKzVu2nxd7s8bqR" alt="image" 
    width="500" 
    height="400" class="center">    

    img source: https://www.astroml.org/book_figures/chapter3/fig_uniform_distribution.html

In [None]:
# Allocate a 3x4 tensor first; torch.empty does not initialise the memory,
# so the values displayed are arbitrary until an init function fills them.
tens = torch.empty((3, 4), requires_grad=True)
tens

tensor([[2.5909e-18, 3.0848e-41, 1.5975e-43, 1.3873e-43],
        [1.4574e-43, 6.4460e-44, 1.4153e-43, 1.5274e-43],
        [1.5695e-43, 1.6255e-43, 1.6956e-43, 4.6120e+27]], requires_grad=True)

In [None]:
# Fills the input Tensor with values drawn from the uniform distribution U(a,b)
# no bounds are passed here, so the defaults a=0.0, b=1.0 apply
nn.init.uniform_(tens)

tensor([[0.5255, 0.6985, 0.0759, 0.3230],
        [0.8329, 0.9142, 0.2114, 0.1743],
        [0.5924, 0.4145, 0.0919, 0.4669]])

In [None]:
# display tens on its own: uniform_ fills the tensor it is given rather than
# creating a copy, so the sampled values should show up here
# (if the uninitialised values still appear, re-run the cells above in order)
tens

tensor([[2.5909e-18, 3.0848e-41, 1.5975e-43, 1.3873e-43],
        [1.4574e-43, 6.4460e-44, 1.4153e-43, 1.5274e-43],
        [1.5695e-43, 1.6255e-43, 1.6956e-43, 4.6120e+27]], requires_grad=True)

In [None]:
# fill the same tensor again, this time with explicit bounds: samples come from U(1, 4)
nn.init.uniform_(tens, a = 1, b= 4)

tensor([[1.2641, 1.5549, 3.6858, 3.2618],
        [1.6484, 2.9267, 3.9404, 1.4929],
        [2.2016, 3.2888, 3.9579, 3.5667]], requires_grad=True)

## 2. Normal distribution

torch.nn.init.normal_(tensor, mean=0.0, std=1.0)
   
    tensor – an n-dimensional torch.Tensor
    mean – the mean of the normal distribution
    std – the standard deviation of the normal distribution

<img src="https://drive.google.com/uc?id=11B2L7_XAFkzKimMLJQpKohW49DEYoy8n" alt="image" 
    width="480" 
    height="300" class="center"> 

    img source:https://towardsdatascience.com/understanding-the-68-95-99-7-rule-for-a-normal-distribution-b7b7cbf760c2




In [None]:
# Start again from a fresh, uninitialised 3x4 tensor
# (its contents are arbitrary until an init function overwrites them).
tens = torch.empty((3, 4), requires_grad=True)
tens

tensor([[2.5910e-18, 3.0848e-41, 1.5835e-43, 1.6395e-43],
        [1.4714e-43, 1.5975e-43, 1.4153e-43, 1.6115e-43],
        [1.3312e-43, 1.4433e-43, 1.5975e-43, 1.3593e-43]], requires_grad=True)

In [None]:
# Fills the input Tensor with values drawn from the normal distribution N(mean,std^2)
# no arguments are passed here, so the defaults mean=0.0, std=1.0 apply
nn.init.normal_(tens)

tensor([[ 2.8079, -0.8518, -0.6627, -2.0036],
        [ 0.2107,  0.1923,  0.9960,  0.1724],
        [-1.4980, -0.3428, -0.8223, -0.6332]], requires_grad=True)

In [None]:
# re-fill the same tensor with a narrower spread: mean 0, standard deviation 0.5
nn.init.normal_(tens, mean = 0, std=0.5)

tensor([[ 0.0813, -0.7933,  0.4791,  0.3848],
        [-0.3035, -0.2670, -0.4118, -0.0851],
        [-0.6429, -1.0950, -0.1813, -0.6947]], requires_grad=True)

## 3. Constant

torch.nn.init.constant_(tensor, val)
   
    tensor – an n-dimensional torch.Tensor
    val – the value to fill the tensor with

note: helpful for initialising the bias terms of a neural network


In [None]:
# Allocate an uninitialised 3x4 tensor to work on
tens = torch.empty((3, 4), requires_grad=True)
# Overwrite every element of the tensor with the constant val (here 2)
nn.init.constant_(tens, val=2)

tensor([[2., 2., 2., 2.],
        [2., 2., 2., 2.],
        [2., 2., 2., 2.]], requires_grad=True)

#### Example



In [None]:
# build a fully connected (linear) layer: 3 input features -> 4 output features

l1 = nn.Linear(in_features=3, out_features=4)
l1

Linear(in_features=3, out_features=4, bias=True)

In [None]:
# inspect the parameters the layer was created with
# weight is printed as a 4x3 tensor (out_features x in_features)
print(l1.weight)

# bias is printed as a tensor with 4 entries (one per output feature)
print(l1.bias)

Parameter containing:
tensor([[-0.1388, -0.0319, -0.4742],
        [-0.5366, -0.4949, -0.1724],
        [-0.3877, -0.3811, -0.4146],
        [ 0.0435,  0.2758, -0.4778]], requires_grad=True)
Parameter containing:
tensor([ 0.1739,  0.1613, -0.4195, -0.1027], requires_grad=True)


In [None]:
# set the weights using a normal distribution
# (mean 0, standard deviation 1.25; the values are written into l1.weight itself)
nn.init.normal_(l1.weight, mean =0, std=1.25)
# display the re-initialised weight
l1.weight

Parameter containing:
tensor([[ 1.0649, -2.7965, -0.8400],
        [-0.0875, -0.7815,  0.8116],
        [ 0.8899,  1.1292, -1.1337],
        [ 1.6192,  0.3407,  2.0254]], requires_grad=True)

In [None]:
# set the bias to constant value of 0
nn.init.constant_(l1.bias, 0)
# display the bias to confirm every entry is now 0
l1.bias

Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True)

In [None]:
# print both parameters once more to see the state of the layer
# after the weight and bias initialisation steps
print(l1.weight)

print(l1.bias)

Parameter containing:
tensor([[ 1.0649, -2.7965, -0.8400],
        [-0.0875, -0.7815,  0.8116],
        [ 0.8899,  1.1292, -1.1337],
        [ 1.6192,  0.3407,  2.0254]], requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True)
