In [1]:
import torch
import torch.nn as nn

In [2]:
class ExampleDeepNeuralNetwork(nn.Module):
    """Stack of ``Linear -> GELU`` layers with optional shortcut connections.

    Args:
        layer_size: Sequence of layer widths. Each consecutive pair
            ``(layer_size[i], layer_size[i + 1])`` gives the input and output
            features of layer ``i``, so ``len(layer_size) - 1`` layers are
            built (five layers for a six-entry list).
        use_shortcut: If true, a layer's input is added to its output whenever
            the two tensors have the same shape.

    Raises:
        ValueError: If ``layer_size`` has fewer than two entries.
    """

    def __init__(self, layer_size, use_shortcut):
        super().__init__()
        if len(layer_size) < 2:
            raise ValueError(
                "layer_size must contain at least two entries (input and output width)"
            )
        self.use_shortcut = use_shortcut
        # Layers are created in order from consecutive width pairs, so the
        # parameter names (layers.0.0.weight, ...) and the order in which the
        # random initialiser is consumed are the same as when each layer is
        # written out by hand.
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(in_features, out_features), nn.GELU())
            for in_features, out_features in zip(layer_size[:-1], layer_size[1:])
        ])

    def forward(self, x):
        """Apply every layer in order, adding the shortcut where shapes allow."""
        for layer in self.layers:
            layer_output = layer(x)
            # The shortcut is only possible when the layer keeps the shape;
            # a layer that changes the width simply replaces x.
            if self.use_shortcut and x.shape == layer_output.shape:
                x = x + layer_output
            else:
                x = layer_output

        return x

In [3]:
# Layer widths: 3 -> 3 -> 3 -> 3 -> 3 -> 1 (five Linear+GELU layers).
layer_size = [3, 3, 3, 3, 3, 1]
# Keep an explicit batch dimension (shape (1, 3)) so the model output has
# shape (1, 1). A 1-D input gives a (1,) output, which makes MSELoss warn
# about broadcasting when it is compared against a (1, 1) target.
sample_input = torch.tensor([[1., 0., -1.]])
# Seed right before building the model so its initial weights are reproducible.
torch.manual_seed(123)
model_without_shortcut = ExampleDeepNeuralNetwork(layer_size, use_shortcut=False)

In [4]:
def print_gradients(model, x):
    """Run one forward/backward pass and print mean absolute weight gradients.

    The loss is the mean squared error between ``model(x)`` and an all-zero
    target of the same shape as the output.

    Args:
        model: An ``nn.Module`` whose parameters receive gradients.
        x: Input tensor passed to ``model``.

    Side effects:
        Resets and then repopulates ``.grad`` on the model's parameters, and
        prints one line for every parameter whose name contains ``'weight'``.
    """
    # Clear gradients left behind by an earlier backward pass; otherwise
    # repeated calls on the same model accumulate and report inflated values.
    model.zero_grad()

    output = model(x)
    # Build the zero target from the output so shape, dtype and device always
    # match. A fixed (1, 1) target makes MSELoss emit a broadcasting warning
    # whenever the output has another shape, e.g. (1,) for an unbatched input.
    target = torch.zeros_like(output)

    loss_fn = nn.MSELoss()
    loss = loss_fn(output, target)

    loss.backward()

    for name, param in model.named_parameters():
        if 'weight' in name:
            print(f"{name} has gradient of {param.grad.abs().mean().item()}")
            

In [5]:
# Report the per-layer mean absolute weight gradients of the model built
# without shortcut connections.
print_gradients(model=model_without_shortcut, x=sample_input)

  return F.mse_loss(input, target, reduction=self.reduction)


layers.0.0.weight has gradient of 0.00020174118981231004
layers.1.0.weight has gradient of 0.00012011769285891205
layers.2.0.weight has gradient of 0.0007152436301112175
layers.3.0.weight has gradient of 0.00139885104727
layers.4.0.weight has gradient of 0.005049602594226599


In [6]:
# Re-seed with the same value used for the model without shortcuts so both
# models are built from the same random initialisation stream and the
# printed gradients can be compared directly.
torch.manual_seed(123)
model_with_shortcut = ExampleDeepNeuralNetwork(layer_size=layer_size, use_shortcut=True)
print_gradients(model=model_with_shortcut, x=sample_input)

layers.0.0.weight has gradient of 0.22186800837516785
layers.1.0.weight has gradient of 0.20709273219108582
layers.2.0.weight has gradient of 0.3292388319969177
layers.3.0.weight has gradient of 0.2667772173881531
layers.4.0.weight has gradient of 1.3268063068389893


  return F.mse_loss(input, target, reduction=self.reduction)
