# Deep Learning Computation

### imports

In [1]:
%matplotlib inline
from d2l import torch as d2l
import torch
from torch import nn
import numpy as np
import math

## Layers and Blocks

In [34]:
import torch
from torch import nn
from torch.nn import functional as F

# Two-layer perceptron assembled from stock layers:
# 20 input features -> 256 hidden units (ReLU) -> 10 outputs
net = nn.Sequential(
    nn.Linear(20, 256),
    nn.ReLU(),
    nn.Linear(256, 10),
)

# Minibatch of 2 random examples with 20 features each
X = torch.rand(2, 20)
net(X)

tensor([[ 0.0457,  0.0009,  0.3846,  0.2036, -0.0649, -0.1804,  0.0720, -0.2940,
         -0.2458,  0.0898],
        [ 0.0724,  0.0071,  0.3211,  0.2329,  0.0597, -0.1464, -0.0923, -0.1612,
         -0.1937,  0.1750]], grad_fn=<AddmmBackward>)

In [35]:
# ReLU clamps negative inputs to zero
F.relu(torch.tensor([-10.0]))

tensor([0.])

In [36]:
class MLP(nn.Module):
    """Multilayer perceptron with one hidden layer: 20 -> 256 -> 10."""

    def __init__(self):
        # Initialize the `nn.Module` base class before assigning any layers,
        # so that the layers below are tracked as submodules of this model
        super().__init__()
        self.hidden = nn.Linear(20, 256)  # Hidden layer
        self.out = nn.Linear(256, 10)  # Output layer

    def forward(self, X):
        """Compute the model output for input `X`.

        The input goes through the hidden layer, a ReLU activation, and then
        the output layer.
        """
        # The functional version of ReLU from the `nn.functional` module is
        # used here, so no separate activation layer has to be declared
        hidden_activations = F.relu(self.hidden(X))
        return self.out(hidden_activations)

In [37]:
# Instantiate the custom `MLP` and apply it to the input batch `X`
net = MLP()
net(X)

tensor([[ 0.1612, -0.1561, -0.1386, -0.1134,  0.1140,  0.1956, -0.0672, -0.0562,
          0.0247, -0.1117],
        [ 0.1673, -0.0426, -0.0120,  0.0743,  0.1640,  0.1579,  0.0759, -0.0857,
          0.1342, -0.0526]], grad_fn=<AddmmBackward>)

In [38]:
# List every (name, module) pair of `net`; the model itself appears under the
# empty name '' followed by its submodules
list(net.named_modules())

[('', MLP(
    (hidden): Linear(in_features=20, out_features=256, bias=True)
    (out): Linear(in_features=256, out_features=10, bias=True)
  )),
 ('hidden', Linear(in_features=20, out_features=256, bias=True)),
 ('out', Linear(in_features=256, out_features=10, bias=True))]

In [41]:
class MySequential(nn.Module):
    """Minimal re-implementation of `nn.Sequential`.

    The modules passed to the constructor are stored in order and applied one
    after another in `forward`.

    Args:
        *args: `nn.Module` instances, in the order they should be applied.
    """

    def __init__(self, *args):
        super().__init__()
        for idx, block in enumerate(args):
            # Here, `block` is an instance of a `Module` subclass. We save it
            # in the member variable `_modules` of the `Module` class, and its
            # type is OrderedDict. The key must be a string: `Module` builds
            # names such as '0.weight' from it, and keying by position (not
            # by the module object) keeps a module that is passed twice as
            # two separate entries
            self._modules[str(idx)] = block

    def forward(self, X):
        """Feed `X` through every stored module in insertion order."""
        # OrderedDict guarantees that members will be traversed in the order
        # they were added
        for block in self._modules.values():
            X = block(X)
        return X

In [42]:
# Build a three-layer stack (20 -> 256 -> ReLU -> 10) with `MySequential` and
# apply it to `X`
net = MySequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
net(X)

tensor([[-0.2623, -0.1557,  0.1188,  0.3653, -0.0278,  0.0661, -0.1512, -0.1238,
         -0.1801, -0.2117],
        [ 0.0800, -0.2598,  0.1512,  0.2195,  0.0289,  0.1289,  0.0140, -0.1824,
         -0.2907, -0.1895]], grad_fn=<AddmmBackward>)

In [44]:
class FixedHiddenMLP(nn.Module):
    """MLP that mixes a trainable layer, constant weights and control flow.

    `forward` applies the same linear layer twice, multiplies by a fixed random
    matrix in between, halves the result until its absolute sum is at most 1,
    and returns the sum of the entries as a scalar tensor.
    """

    def __init__(self):
        super().__init__()
        # Random weight parameters that will not compute gradients and
        # therefore keep constant during training. They are registered as a
        # buffer so that `.to()`, `.double()`, `.cuda()` etc. convert them
        # together with the rest of the model; `persistent=False` keeps them
        # out of `state_dict()`, exactly as a plain tensor attribute would be
        self.register_buffer('rand_weight', torch.rand((20, 20)),
                             persistent=False)
        self.linear = nn.Linear(20, 20)

    def forward(self, X):
        """Return a scalar tensor computed from input `X` with 20 features."""
        X = self.linear(X)
        # Use the created constant parameters, as well as the `relu` and `mm`
        # functions
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        # Reuse the fully-connected layer. This is equivalent to sharing
        # parameters with two fully-connected layers
        X = self.linear(X)
        # Control flow
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

In [45]:
# Instantiate the model and apply it to `X`; its forward pass ends with
# `X.sum()`, so the result is a single scalar
net = FixedHiddenMLP()
net(X)

tensor(0.0912, grad_fn=<SumBackward0>)

In [46]:
class NestMLP(nn.Module):
    """MLP that nests an `nn.Sequential` block: 20 -> 64 -> 32 -> 16."""

    def __init__(self):
        super().__init__()
        # Inner block: two linear layers, each followed by a ReLU
        self.net = nn.Sequential(
            nn.Linear(20, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
        )
        # Final projection from the 32 inner features to 16 outputs
        self.linear = nn.Linear(32, 16)

    def forward(self, X):
        """Run `X` through the nested block, then the final linear layer."""
        features = self.net(X)
        return self.linear(features)

# Custom and built-in blocks can be combined freely: `NestMLP` produces 16
# features, the linear layer maps them to the 20 features that
# `FixedHiddenMLP` takes as input
chimera = nn.Sequential(NestMLP(), nn.Linear(16, 20), FixedHiddenMLP())
chimera(X)

tensor(-0.2068, grad_fn=<SumBackward0>)

## Parameter Management

In [47]:
import torch
from torch import nn

# Small MLP with one hidden layer: 4 inputs -> 8 hidden units (ReLU) -> 1 output
net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
)
# Minibatch of 2 random examples with 4 features each
X = torch.rand(2, 4)
net(X)

tensor([[0.1511],
        [0.1642]], grad_fn=<AddmmBackward>)

In [53]:
# Inspect the parameters (weight and bias) of the layer at index 2, the
# output layer
print(net[2].state_dict())

OrderedDict([('weight', tensor([[ 0.2385,  0.1152,  0.2925, -0.2927,  0.3414,  0.2726, -0.0143, -0.0719]])), ('bias', tensor([-0.0878]))])


In [54]:
# The bias is stored as an `nn.Parameter` object
print(type(net[2].bias))
# Printing the parameter shows its value together with `requires_grad`
print(net[2].bias)
# `.data` exposes the underlying tensor value
print(net[2].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.0878], requires_grad=True)
tensor([-0.0878])


In [56]:
# No backward pass has been run for this network, so the gradient is still
# unset. `None` is checked by identity (`is`) rather than with `==`
net[2].weight.grad is None

True

In [57]:
# Names and shapes of the parameters of the first layer only
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
# Names and shapes of all parameters of the network; each name is prefixed
# with the index of the layer that owns the parameter
print(*[(name, param.shape) for name, param in net.named_parameters()])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [58]:
# The state dict is keyed by '<layer index>.<parameter name>', which gives
# another way of reaching a single parameter
net.state_dict()['2.bias'].data

tensor([-0.0878])

In [59]:
def block1():
    """Return a fresh block: Linear(4, 8) -> ReLU -> Linear(8, 4) -> ReLU."""
    layers = [nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU()]
    return nn.Sequential(*layers)

def block2():
    """Return a Sequential holding four `block1()` instances.

    The sub-blocks are registered under the names 'block 0' to 'block 3'.
    """
    stacked = nn.Sequential()
    for idx in range(4):
        # Each call to `block1()` creates a new nested block
        stacked.add_module(f'block {idx}', block1())
    return stacked

# Combine the block of blocks with a final linear layer that maps its
# 4 features to 1 output, then apply the network to `X`
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)

tensor([[0.1183],
        [0.1183]], grad_fn=<AddmmBackward>)

In [67]:
# Nested blocks are indexed like nested lists: first top-level module ->
# its second sub-block -> that sub-block's first layer
rgnet[0][1][0].bias.data

tensor([ 0.1149,  0.3681,  0.1269,  0.3809,  0.3604, -0.2091, -0.3527,  0.3935])

In [73]:
def init_normal(m):
    """Initialize an `nn.Linear` module: N(0, 0.01**2) weights, zero bias.

    Modules of any other type are left untouched.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.normal_(m.weight, mean=0, std=0.01)
    nn.init.zeros_(m.bias)
# Run `init_normal` over the modules of `net`
net.apply(init_normal)
# Show the first row of the first layer's weights and its first bias entry
net[0].weight.data[0], net[0].bias.data[0]

(tensor([-0.0060, -0.0010,  0.0061, -0.0088]), tensor(0.))

In [74]:
def init_constant(m):
    """Initialize an `nn.Linear` module: every weight 1, zero bias.

    Modules of any other type are left untouched.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 1)
    nn.init.zeros_(m.bias)
# Run `init_constant` over the modules of `net`
net.apply(init_constant)
# Show the first row of the first layer's weights and its first bias entry
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [75]:
def xavier(m):
    """Apply Xavier-uniform initialization to the weight of an `nn.Linear`.

    The bias is not modified; modules of any other type are left untouched.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)
def init_42(m):
    """Set every weight of an `nn.Linear` module to the constant 42.

    The bias is not modified; modules of any other type are left untouched.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 42)

# Different initializers can be used for different layers: Xavier-uniform
# for the first layer, the constant 42 for the last one
net[0].apply(xavier)
net[2].apply(init_42)
# First weight row of the first layer, then the full weight of the last layer
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([ 0.3632, -0.5208, -0.3898, -0.6506])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [79]:
def my_init(m):
    """Custom initializer for `nn.Linear` modules.

    Prints the name and shape of the module's first parameter, draws the
    weights uniformly from [-10, 10], and then zeroes every weight whose
    absolute value is below 5. Modules of any other type are left untouched.
    """
    if type(m) is not nn.Linear:
        return
    first_name, first_param = next(iter(m.named_parameters()))
    print("Init", first_name, first_param.shape)
    nn.init.uniform_(m.weight, -10, 10)
    # Boolean mask: True where the weight is kept, False where it is zeroed
    keep = m.weight.data.abs() >= 5
    m.weight.data *= keep

# Run `my_init` over the modules of `net`; it prints one "Init" line for each
# `nn.Linear` it initializes
net.apply(my_init)
# Show the first two rows of the first layer's weights
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[-9.0655, -0.0000,  0.0000,  7.7847],
        [ 9.8323,  0.0000,  6.9076,  8.4067]], grad_fn=<SliceBackward>)

In [80]:
# Parameters can also be set directly through `.data`: shift every weight of
# the first layer by 1, then overwrite a single entry with 42
net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
# Show the first weight row after both modifications
net[0].weight.data[0]

tensor([42.0000,  1.0000,  1.0000,  8.7847])

In [81]:
# We need to give the shared layer a name so that we can refer to its
# parameters
shared = nn.Linear(8, 8)
# The same `shared` instance is placed at positions 2 and 4 of the network
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
net(X)
# Check whether the parameters are the same
print(net[2].weight.data[0] == net[4].weight.data[0])
# Modify one entry through position 2 only
net[2].weight.data[0, 0] = 100
# Make sure that they are actually the same object rather than just having the
# same value: the change made via `net[2]` must also be visible via `net[4]`
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])
