In [11]:
import torch
from torch import nn

In [12]:
# A small MLP assembled from lazy layers: only the output widths are given,
# and the forward pass on X below supplies the input width.
net = nn.Sequential(
    nn.LazyLinear(8),
    nn.ReLU(),
    nn.LazyLinear(1),
)
X = torch.rand(2, 4)
net(X).shape



torch.Size([2, 1])

In [13]:
# initializing to a Gaussian distribution
def init_normal(module):
    """Draw the weights of an ``nn.Linear`` from N(0, 0.01**2) and zero its bias.

    Intended to be handed to ``Module.apply``. Any module whose exact type is
    not ``nn.Linear`` (subclasses included) is left untouched.
    """
    if type(module) is not nn.Linear:
        return
    nn.init.normal_(module.weight, mean=0, std=0.01)
    nn.init.zeros_(module.bias)

# Module.apply calls init_normal on net and on each of its submodules.
net.apply(init_normal)
# Show the first weight row and the first bias entry of the first layer.
net[0].weight.data[0], net[0].bias.data[0]

(tensor([-0.0129,  0.0074, -0.0094, -0.0046]), tensor(0.))

In [14]:
# this initializes every weight to a given constant value (here 1) and every bias to 0
def init_constant(module):
    """Fill the weights of an ``nn.Linear`` with 1 and its bias with 0.

    Intended to be handed to ``Module.apply``. Any module whose exact type is
    not ``nn.Linear`` (subclasses included) is left untouched.
    """
    if type(module) is not nn.Linear:
        return
    nn.init.constant_(module.weight, 1)
    nn.init.zeros_(module.bias)

# Re-initialize the same network, this time with the constant initializer.
net.apply(init_constant)
# Show the first weight row and the first bias entry of the first layer.
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [15]:
# we can also apply different initializers for different blocks
# we initialize the first layer below with the Xavier initializer and the second layer to a constant value
def init_xavier(module):
    """Apply Xavier (Glorot) uniform initialization to an ``nn.Linear`` weight.

    The bias is not modified. Any module whose exact type is not ``nn.Linear``
    (subclasses included) is left untouched.
    """
    if type(module) is not nn.Linear:
        return
    nn.init.xavier_uniform_(module.weight)

def init_42(module):
    """Set every weight of an ``nn.Linear`` to the constant 42.

    The bias is not modified. Any module whose exact type is not ``nn.Linear``
    (subclasses included) is left untouched.
    """
    if type(module) is not nn.Linear:
        return
    nn.init.constant_(module.weight, 42)

# Calling apply on an individual layer restricts the initializer to that layer.
net[0].apply(init_xavier)
net[2].apply(init_42)
# First weight row of the Xavier-initialized layer, then the full weight of the constant one.
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([-0.1919, -0.3528, -0.2348, -0.0273])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [16]:
# defining custom initializations
def my_init(module):
    """Custom initializer: sample weights from U(-10, 10) and zero the small ones.

    For a module whose exact type is ``nn.Linear``:

    1. print ``"Init"`` followed by the name and shape of its first
       registered parameter;
    2. fill ``module.weight`` with values drawn uniformly from (-10, 10);
    3. set every entry with absolute value below 5 to zero.

    The bias is not modified. Any other module (subclasses of ``nn.Linear``
    included) is left untouched, so the function can be passed to
    ``Module.apply``.
    """
    if type(module) is not nn.Linear:
        return
    # Only the first parameter is reported; take it directly rather than
    # building a list of all parameters.
    name, param = next(iter(module.named_parameters()))
    print("Init", name, param.shape)
    nn.init.uniform_(module.weight, -10, 10)
    # Mask in place under no_grad instead of through `.data`: nothing is
    # recorded for backprop, but autograd still sees that the tensor changed.
    with torch.no_grad():
        module.weight.mul_(module.weight.abs() >= 5)

# Run the custom initializer over the whole network; it prints one line per layer it initializes.
net.apply(my_init)
# Show the first two weight rows of the first layer.
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[-0.0000,  5.8396, -8.8401, -6.2158],
        [-0.0000,  0.0000, -0.0000,  0.0000]], grad_fn=<SliceBackward0>)

## Lazy Initialization
The framework can work out the input dimensionality of the network without being told because it *defers initialization*: parameters are not created until the first time we pass data through the model, at which point the size of each layer is inferred on the fly.

In [17]:
import torch
from torch import nn 
from d2l import torch as d2l 

In [18]:
# Only the output widths (256 and 10) are specified; each lazy layer's input
# width is left to be inferred later.
net = nn.Sequential(
    nn.LazyLinear(256),
    nn.ReLU(),
    nn.LazyLinear(10),
)



At this point the neural network does not yet know the dimensionality of its input, and therefore the shape of its first layer's weights. As a result, the weights are reported as uninitialized.

In [19]:
# No data has been passed through net yet, so the first layer's weight has no concrete shape.
net[0].weight

<UninitializedParameter>

In [20]:
# Passing data through the network finally initializes the parameters:
# the forward pass lets each lazy layer infer its input size from X.
X = torch.rand(2, 20)
net(X)  # output is discarded; the call is made only to materialize the lazy parameters

# The first layer's weight now has a concrete shape.
net[0].weight.shape

torch.Size([256, 20])

In [22]:
@d2l.add_to_class(d2l.Module)  #@save
def apply_init(self, inputs, init=None):
    """Dry-run the model on dummy inputs, then optionally apply an initializer.

    The forward pass on ``inputs`` is performed so that all parameter shapes
    can be inferred. Use this when the default random initialization is not
    desired.

    Args:
        inputs: Iterable of dummy inputs, unpacked as positional arguments to
            ``self.forward``.
        init: Optional callable passed to ``self.net.apply`` after the dry
            run. When ``None``, the parameters are left as they are.
    """
    self.forward(*inputs)
    if init is None:
        return
    self.net.apply(init)