<a href="https://colab.research.google.com/github/simplysumanth/dl-with-pytorch-d2l.ai/blob/main/dl_ch5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch import nn as nn
from torch.nn import functional as F

In [None]:
# Two-layer MLP built from stock layers: 20 inputs -> 256 hidden units (ReLU) -> 10 outputs.
net = nn.Sequential(
    nn.Linear(20,256),
    nn.ReLU(),
    nn.Linear(256,10))

In [None]:
# Sample batch: 2 examples with 20 features each, drawn uniformly from [0, 1).
X = torch.rand(2,20)
X.size()

torch.Size([2, 20])

In [None]:
# Forward pass of the Sequential model on the sample batch.
net(X)

tensor([[ 0.0682,  0.0058, -0.1413,  0.0624, -0.1441, -0.1405,  0.3156, -0.1748,
         -0.0973, -0.2093],
        [ 0.1116, -0.0235, -0.2719, -0.1638, -0.1499, -0.0076,  0.3217, -0.1536,
         -0.1913, -0.1024]], grad_fn=<AddmmBackward>)

In [None]:
class MLP(nn.Module):
  """Multilayer perceptron with a single hidden layer (20 -> 256 -> 10)."""

  def __init__(self):
    super().__init__()
    self.hidden = nn.Linear(20,256)  # hidden layer
    self.out = nn.Linear(256,10)     # output layer

  def forward(self,x):
    """Apply hidden layer, ReLU, then the output layer to `x`.

    Args:
      x: input tensor whose last dimension has 20 features.

    Returns:
      Tensor with 10 values per input row.
    """
    # Use the argument `x`. The previous version read the notebook-global `X`,
    # which silently ignored the caller's input.
    return self.out(F.relu(self.hidden(x)))

In [None]:
# Instantiate the custom MLP block and run it on the sample batch.
net = MLP()
net(X)

tensor([[ 0.0387,  0.0936,  0.1953, -0.1489, -0.0841, -0.0784, -0.1234,  0.0403,
         -0.1476, -0.0161],
        [ 0.1946,  0.1722,  0.0854, -0.0416, -0.0716,  0.0140, -0.2625,  0.0244,
         -0.1652, -0.1082]], grad_fn=<AddmmBackward>)

The same network can be implemented in a modular way, by registering an arbitrary sequence of layers and applying them in order:

In [None]:
class Network(nn.Module):
  """Hand-written sequential container that logs each layer as it is applied.

  Args:
    args: iterable of modules, applied in iteration order.
  """

  def __init__(self,args):
    super().__init__()
    # Register every layer under its position ("0", "1", ...).
    self._modules.update((str(position), layer) for position, layer in enumerate(args))

  def forward(self,X):
    """Feed `X` through each registered layer in turn, printing the layer first."""
    print('----------')
    out = X
    for layer in self._modules.values():
      print(layer)
      out = layer(out)
    print('------------')
    return out

# Build the custom container from the child layers of a stock nn.Sequential
# (Network iterates over its argument and registers each layer).
net = Network(nn.Sequential(
    nn.Linear(20,256),
    nn.ReLU(),
    nn.Linear(256,10)))


In [None]:
# Forward pass; the container prints every layer it applies.
net(X)

----------
Linear(in_features=20, out_features=256, bias=True)
ReLU()
Linear(in_features=256, out_features=10, bias=True)
------------


tensor([[-0.1256,  0.1404,  0.0114, -0.1972,  0.1033, -0.1459,  0.2155, -0.0606,
         -0.0224, -0.0783],
        [-0.1221,  0.2340,  0.0270, -0.1569,  0.0325, -0.1115,  0.3265,  0.0271,
          0.1065, -0.0804]], grad_fn=<AddmmBackward>)

In [None]:
# Implement c * W^T x: a block that mixes a constant (non-trainable) weight with a learned layer.
class Network(nn.Module):
  """Block with a fixed random weight, a reused linear layer and Python control flow."""

  def __init__(self):
    super().__init__()
    # Constant random weight that is never trained. It is registered as a
    # non-persistent buffer so it follows the module through .to()/.double()/.cuda()
    # while staying out of parameters() and state_dict() (keys are unchanged).
    self.register_buffer('rand_weight', torch.rand((20,20),requires_grad=False), persistent=False)
    self.linear = nn.Linear(20,20)

  def forward(self,x):
    """Return a scalar computed from `x` (last dimension must have 20 features).

    Steps: linear -> ReLU(x @ rand_weight + 1) -> the same linear layer again
    -> halve until the sum of absolute values is at most 1 -> sum.
    """
    x = self.linear(x)
    x = F.relu(torch.mm(x,self.rand_weight)+1)
    # Reusing self.linear means both applications share one set of parameters.
    x = self.linear(x)
    # Data-dependent control flow: rescale until the L1 norm is <= 1.
    while x.abs().sum() > 1:
      x/=2
    return x.sum()

# Instantiate the block and evaluate it on the sample batch; the result is a scalar tensor.
net = Network()
net(X) 

tensor(-0.2946, grad_fn=<SumBackward0>)

#### Quiz

In [None]:
#1.
class MySequential(nn.Module):
  """Minimal sequential container: applies the given modules one after another.

  Args:
    args: iterable of modules, applied in iteration order.
  """

  def __init__(self,args):
    super().__init__()
    # Register each layer under its position ("0", "1", ...).
    self._modules.update((str(position), layer) for position, layer in enumerate(args))

  def forward(self,X):
    """Return the result of feeding `X` through every registered layer in order."""
    out = X
    for layer in self._modules.values():
      out = layer(out)
    return out

# Build MySequential from the layers of a 20 -> 256 -> 2 MLP and run it on the sample batch.
net = MySequential(nn.Sequential(nn.Linear(20,256),nn.ReLU(),nn.Linear(256,2)))
net(X)

tensor([[ 0.0128,  0.2135],
        [-0.0061,  0.1850]], grad_fn=<AddmmBackward>)

In [None]:
#2. Take 2 blocks as arguments -> return the concatenated outputs of both
class TwoBlocks(nn.Module):
  """Parallel block: feeds the same input to two sub-networks and concatenates their outputs.

  Args:
    args1: first branch; an nn.Module, or an iterable of modules applied in order.
    args2: second branch; same conventions as `args1`.
  """

  def __init__(self,args1,args2):
    super().__init__()
    self.block1 = self._as_block(args1)
    self.block2 = self._as_block(args2)

  @staticmethod
  def _as_block(block):
    """Return `block` as one callable module, chaining an iterable of layers if needed."""
    # nn.ModuleList is a Module but has no forward(), so treat it as an iterable of layers.
    if isinstance(block, nn.Module) and not isinstance(block, nn.ModuleList):
      return block
    return nn.Sequential(*block)

  def forward(self,X):
    """Return block1(X) and block2(X) joined along the last (feature) dimension."""
    # The previous version applied block2 to the output of block1 (a plain chain),
    # which is not the concatenation the exercise asks for.
    return torch.cat((self.block1(X), self.block2(X)), dim=-1)

# Instantiate TwoBlocks with two small MLPs (20 -> 256 -> 20 and 20 -> 256 -> 2) and run it on the sample batch.
net = TwoBlocks(nn.Sequential(nn.Linear(20,256),nn.ReLU(),nn.Linear(256,20)),nn.Sequential(nn.Linear(20,256),nn.ReLU(),nn.Linear(256,2)))
net(X)

tensor([[-0.0258,  0.1388],
        [-0.0572,  0.1488]], grad_fn=<AddmmBackward>)

In [None]:
#3. Implement factory function -> generates multiple instances of same block and build a larger network from it.
class Network(nn.Module):
  """Reusable building block: 20 -> 256 (ReLU) -> 20, so instances can be stacked."""

  def __init__(self):
    super().__init__()
    self.hidden = nn.Linear(20,256)
    self.out = nn.Linear(256,20)

  def forward(self,x):
    """Apply the hidden layer, ReLU and the output layer to `x`."""
    hidden_out = self.hidden(x)
    activated = F.relu(hidden_out)
    return self.out(activated)

class Factory(nn.Module):
  """Build a larger network by chaining `k` freshly created `Network` blocks.

  Args:
    k: number of `Network` instances to stack.
  """

  def __init__(self, k):
    super().__init__()
    # Each Network() call creates a new instance, so the blocks do not share parameters.
    self.net = nn.Sequential(*[Network() for _ in range(k)])

  def forward(self,x):
    """Pass `x` through the stacked blocks in order."""
    out = self.net(x)
    return out

# Stack three Network blocks.
net = Factory(3)
# Print the module tree. `net.get_parameter` is a method (it expects a parameter
# path), so printing it only showed the repr of a bound method.
print(net)
net(X)

<bound method Module.get_parameter of Factory(
  (net): Sequential(
    (0): Network(
      (hidden): Linear(in_features=20, out_features=256, bias=True)
      (out): Linear(in_features=256, out_features=20, bias=True)
    )
    (1): Network(
      (hidden): Linear(in_features=20, out_features=256, bias=True)
      (out): Linear(in_features=256, out_features=20, bias=True)
    )
    (2): Network(
      (hidden): Linear(in_features=20, out_features=256, bias=True)
      (out): Linear(in_features=256, out_features=20, bias=True)
    )
  )
)>


tensor([[-0.1419, -0.1074,  0.0027,  0.0179,  0.0247,  0.0812,  0.0473, -0.0322,
          0.0653,  0.0856,  0.0243,  0.0763, -0.0131, -0.0041, -0.0515, -0.0164,
         -0.0075,  0.0581,  0.0924,  0.0167],
        [-0.1439, -0.1074, -0.0049,  0.0215,  0.0308,  0.0832,  0.0518, -0.0380,
          0.0618,  0.0879,  0.0202,  0.0698, -0.0149, -0.0048, -0.0578, -0.0232,
         -0.0006,  0.0574,  0.0989,  0.0064]], grad_fn=<AddmmBackward>)

## Parameter Management

In [None]:
# Small MLP used for the parameter-access examples: 4 -> 8 (ReLU) -> 1.
net = nn.Sequential(nn.Linear(4,8), nn.ReLU(), nn.Linear(8, 1))
# NOTE: this rebinds X to a (2, 4) batch; earlier cells that feed X to
# 20-feature models will fail if re-run after this cell.
X = torch.rand(size = (2,4))
net(X)

tensor([[-0.4379],
        [-0.4572]], grad_fn=<AddmmBackward>)

In [None]:
# Parameter access: the state_dict of the last layer (index 2) maps
# 'weight' and 'bias' to their tensors.
print(net[2].state_dict())

OrderedDict([('weight', tensor([[ 0.0067, -0.0484, -0.2627,  0.1842, -0.2240,  0.2429, -0.1095,  0.0324]])), ('bias', tensor([-0.3091]))])


In [None]:
# A parameter is an nn.Parameter object; `.data` exposes the underlying tensor values.
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.3091], requires_grad=True)
tensor([-0.3091])


In [None]:
# First (name, parameter) pair of the first layer.
list(net[0].named_parameters())[0]

('weight', Parameter containing:
 tensor([[ 0.4682,  0.3674, -0.1032, -0.4335],
         [-0.0809, -0.2018,  0.1894, -0.4921],
         [-0.4748,  0.3785, -0.2880, -0.0220],
         [-0.0828, -0.3051,  0.3851,  0.1888],
         [ 0.1979,  0.0236,  0.3052, -0.0218],
         [-0.4078,  0.1224, -0.0224, -0.3762],
         [ 0.2049,  0.0696, -0.2766,  0.3975],
         [-0.4378,  0.4992,  0.2125,  0.0883]], requires_grad=True))

In [None]:
# All parameters of the whole network; names are prefixed with the layer index.
print(*[(name,param) for name,param in net.named_parameters()])

('0.weight', Parameter containing:
tensor([[ 0.4682,  0.3674, -0.1032, -0.4335],
        [-0.0809, -0.2018,  0.1894, -0.4921],
        [-0.4748,  0.3785, -0.2880, -0.0220],
        [-0.0828, -0.3051,  0.3851,  0.1888],
        [ 0.1979,  0.0236,  0.3052, -0.0218],
        [-0.4078,  0.1224, -0.0224, -0.3762],
        [ 0.2049,  0.0696, -0.2766,  0.3975],
        [-0.4378,  0.4992,  0.2125,  0.0883]], requires_grad=True)) ('0.bias', Parameter containing:
tensor([ 0.0917,  0.3095, -0.1874,  0.2461,  0.3008, -0.3041,  0.0826, -0.1775],
       requires_grad=True)) ('2.weight', Parameter containing:
tensor([[ 0.0067, -0.0484, -0.2627,  0.1842, -0.2240,  0.2429, -0.1095,  0.0324]],
       requires_grad=True)) ('2.bias', Parameter containing:
tensor([-0.3091], requires_grad=True))


In [None]:
# Similarly, state_dict() returns the same tensors keyed by '<layer index>.<parameter name>'.
net.state_dict()

OrderedDict([('0.weight', tensor([[ 0.4682,  0.3674, -0.1032, -0.4335],
                      [-0.0809, -0.2018,  0.1894, -0.4921],
                      [-0.4748,  0.3785, -0.2880, -0.0220],
                      [-0.0828, -0.3051,  0.3851,  0.1888],
                      [ 0.1979,  0.0236,  0.3052, -0.0218],
                      [-0.4078,  0.1224, -0.0224, -0.3762],
                      [ 0.2049,  0.0696, -0.2766,  0.3975],
                      [-0.4378,  0.4992,  0.2125,  0.0883]])),
             ('0.bias',
              tensor([ 0.0917,  0.3095, -0.1874,  0.2461,  0.3008, -0.3041,  0.0826, -0.1775])),
             ('2.weight',
              tensor([[ 0.0067, -0.0484, -0.2627,  0.1842, -0.2240,  0.2429, -0.1095,  0.0324]])),
             ('2.bias', tensor([-0.3091]))])

In [None]:
## collecting  parameters from multiple blocks

def block1():
  """Return a fresh 4 -> 8 -> 4 sub-network with a ReLU after each linear layer."""
  layers = [
      nn.Linear(4,8),
      nn.ReLU(),
      nn.Linear(8,4),
      nn.ReLU(),
  ]
  return nn.Sequential(*layers)

def block2():
  """Chain four freshly built `block1` sub-networks, named 'block 0' to 'block 3'."""
  stacked = nn.Sequential()
  for idx in range(4):
    child_name = f'block {idx}'
    stacked.add_module(child_name, block1())
  return stacked

# Full model: the four stacked blocks followed by a final 4 -> 1 linear layer.
net = nn.Sequential(block2(),nn.Linear(4,1))
net(X)

tensor([[0.0519],
        [0.0520]], grad_fn=<AddmmBackward>)

In [None]:
# Show the nested module hierarchy.
print(net)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


## Parameter Initialization

In [None]:
def init_normal(m):
  """Initialize an `nn.Linear` with N(0, 0.1^2) weights and a zero bias.

  Intended for `module.apply(init_normal)`; any module that is not exactly
  `nn.Linear` is left untouched.
  """
  if type(m) is not nn.Linear:
    return
  nn.init.normal_(m.weight, mean=0 , std=0.1)
  # Layers created with bias=False have `bias is None`; zeroing it would raise.
  if m.bias is not None:
    nn.init.zeros_(m.bias)

# apply() calls init_normal on every submodule of net (and on net itself).
net.apply(init_normal)


Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)

In [None]:
# The first child of net is the container holding the four stacked blocks.
net[0]

Sequential(
  (block 0): Sequential(
    (0): Linear(in_features=4, out_features=8, bias=True)
    (1): ReLU()
    (2): Linear(in_features=8, out_features=4, bias=True)
    (3): ReLU()
  )
  (block 1): Sequential(
    (0): Linear(in_features=4, out_features=8, bias=True)
    (1): ReLU()
    (2): Linear(in_features=8, out_features=4, bias=True)
    (3): ReLU()
  )
  (block 2): Sequential(
    (0): Linear(in_features=4, out_features=8, bias=True)
    (1): ReLU()
    (2): Linear(in_features=8, out_features=4, bias=True)
    (3): ReLU()
  )
  (block 3): Sequential(
    (0): Linear(in_features=4, out_features=8, bias=True)
    (1): ReLU()
    (2): Linear(in_features=8, out_features=4, bias=True)
    (3): ReLU()
  )
)

## Parameter Initialization

In [None]:
def init_normal(m):
  """Initialize an `nn.Linear` with N(0, 0.1^2) weights and a zero bias.

  Intended for `module.apply(init_normal)`; any module that is not exactly
  `nn.Linear` is left untouched.
  """
  if type(m) is not nn.Linear:
    return
  nn.init.normal_(m.weight, mean=0, std=0.1)
  # Layers created with bias=False have `bias is None`; zeroing it would raise.
  if m.bias is not None:
    nn.init.zeros_(m.bias)

# Re-initialize every nn.Linear in net with the normal/zero scheme.
net.apply(init_normal)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)

In [None]:
# Inspect the parameters after initialization: small random weights, zero biases.
print(*[(name,param) for name,param in net.named_parameters()])

('0.block 0.0.weight', Parameter containing:
tensor([[-0.0080, -0.0031,  0.1112, -0.0341],
        [-0.1622,  0.0234,  0.0934, -0.1226],
        [ 0.0324, -0.0134, -0.0507, -0.1373],
        [-0.0781,  0.0732, -0.0221,  0.0852],
        [ 0.0544,  0.0263, -0.1500, -0.0398],
        [-0.0845, -0.0313,  0.1630, -0.0523],
        [-0.0050,  0.1550, -0.0698, -0.0114],
        [-0.0612,  0.0464,  0.0454,  0.0716]], requires_grad=True)) ('0.block 0.0.bias', Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)) ('0.block 0.2.weight', Parameter containing:
tensor([[-0.0200,  0.2373, -0.0473, -0.1253,  0.0606,  0.0312,  0.0175,  0.0974],
        [ 0.1306,  0.0945, -0.0287,  0.0027,  0.0463,  0.0140,  0.0730,  0.0098],
        [ 0.0924, -0.1254, -0.0833,  0.0379,  0.1290,  0.0413,  0.0792, -0.1536],
        [ 0.1352, -0.1663,  0.1186, -0.0678,  0.1215,  0.1309, -0.0029,  0.1644]],
       requires_grad=True)) ('0.block 0.2.bias', Parameter containing:
tensor([0., 0.,

In [None]:
# Let's initialize with a constant for w and zeros for b.
def init_constant(m):
  """Set every weight of an `nn.Linear` to 1 and its bias to 0.

  Intended for `module.apply(init_constant)`; any module that is not exactly
  `nn.Linear` is left untouched.
  """
  if type(m) is not nn.Linear:
    return
  nn.init.constant_(m.weight,1)
  # Layers created with bias=False have `bias is None`; zeroing it would raise.
  if m.bias is not None:
    nn.init.zeros_(m.bias)

# Apply the constant initialization to every nn.Linear, then print the parameters to verify.
net.apply(init_constant)
print(*[(name,param) for name, param in net.named_parameters()])

('0.block 0.0.weight', Parameter containing:
tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]], requires_grad=True)) ('0.block 0.0.bias', Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)) ('0.block 0.2.weight', Parameter containing:
tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.]], requires_grad=True)) ('0.block 0.2.bias', Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True)) ('0.block 1.0.weight', Parameter containing:
tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]], requires_grad=True)) ('0.block 1.0.bias', 

In [None]:
# With all-ones weights and ReLU activations the values grow at every layer,
# which explains the very large outputs.
net(X)

tensor([[2561960.2500],
        [3132997.5000]], grad_fn=<AddmmBackward>)

In [None]:
#We can apply diff init to diff blocks
def xavier(m):
  """Xavier-uniform initialization of the weight of an `nn.Linear`; the bias is not modified."""
  is_linear = type(m) == nn.Linear
  if not is_linear:
    return
  nn.init.xavier_uniform_(m.weight)

def init_42(m):
  """Fill the weight of an `nn.Linear` with the constant 42; the bias is not modified."""
  is_linear = type(m) == nn.Linear
  if not is_linear:
    return
  nn.init.constant_(m.weight, 42)

# Different schemes for different blocks: Xavier for the first stacked block,
# constant 42 for the second.
net[0][0].apply(xavier)
net[0][1].apply(init_42)

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
)

In [None]:
# Check the result of the per-block initialization.
print(*[(name,param) for name, param in net.named_parameters()])

#### **Tied Parameters**
Sharing the same parameters across multiple layers.

In [None]:
# The same nn.Linear instance is placed at two positions, so both positions
# use (and update) one shared weight and bias.
shared = nn.Linear(8,8)
net = nn.Sequential(
    nn.Linear(4,8), nn.ReLU(), shared, nn.ReLU(), shared, nn.ReLU(), nn.Linear(8,1)
)
net(X)

tensor([[0.2128],
        [0.2092]], grad_fn=<AddmmBackward>)

## Quiz

In [None]:
#1. Implement a MLP and access its parameters
import pprint
class MLP(nn.Module):
  """Three-layer perceptron: 4 -> 8 -> 8 -> 1 with ReLU after the first two layers."""

  def __init__(self):
    super().__init__()
    self.fc1 = nn.Linear(4,8)
    self.fc2 = nn.Linear(8,8)
    self.out = nn.Linear(8,1)

  def forward(self,x):
    """Apply fc1 and fc2 (each followed by ReLU), then the output layer."""
    first = F.relu(self.fc1(x))
    second = F.relu(self.fc2(first))
    return self.out(second)

p = pprint.PrettyPrinter(depth=6)

net = MLP()
# Print the module tree. `net.get_parameter` is a method (it expects a parameter
# path), so printing it only showed the repr of a bound method.
print(net)
# Pretty-print every (name, parameter) pair of the model.
p.pprint(list(net.named_parameters()))

<bound method Module.get_parameter of MLP(
  (fc1): Linear(in_features=4, out_features=8, bias=True)
  (fc2): Linear(in_features=8, out_features=8, bias=True)
  (out): Linear(in_features=8, out_features=1, bias=True)
)>
[('fc1.weight',
  Parameter containing:
tensor([[-0.1800, -0.0145,  0.0414, -0.0710],
        [ 0.1744,  0.1987, -0.2241,  0.3152],
        [-0.4376,  0.1019, -0.0557,  0.1810],
        [ 0.4192, -0.3329,  0.2662, -0.1542],
        [-0.0841,  0.3637, -0.0296, -0.1712],
        [-0.3130,  0.1165, -0.2293, -0.2028],
        [-0.2087,  0.2894,  0.4423,  0.0521],
        [-0.0128, -0.0635, -0.2364,  0.4394]], requires_grad=True)),
 ('fc1.bias',
  Parameter containing:
tensor([-0.1795, -0.3467,  0.3317,  0.3666,  0.2337,  0.3701,  0.4180,  0.2220],
       requires_grad=True)),
 ('fc2.weight',
  Parameter containing:
tensor([[ 0.0833,  0.1081,  0.2800,  0.3420, -0.1625,  0.2513,  0.0224,  0.0252],
        [-0.2637,  0.0752,  0.0621, -0.3292, -0.2233, -0.1947,  0.1703,  0.0179

## Custom Layers