## ----------------------------------shortcut Connections--------------------------------

In [90]:
import torch 
from torch import nn

class GELU(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        # Pass the input through the GELU activation - Approximate formula
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0) / torch.pi) * (x + 0.44715 * torch.pow(x, 3)))                                                            )

In [91]:
# Creating short-cut connection class

class ExampleDeepNeuralNetwork(nn.Module):
    def __init__(self, layer_sizes, use_shortcut: bool):
        super().__init__()
        # Variable to determine whether to use (not) shortcut 
        self.use_shortcut = use_shortcut
        # Defining the layers
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(layer_sizes[0], layer_sizes[1]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5]), GELU())
        ])
    def forward(self, x):
        for layer in self.layers:
            # Getting the output of each individual layer
            layer_output = layer(x)
            # Applying shortcut
            if self.use_shortcut and x.shape == layer_output.shape:
                x = x +  layer_output
            else:
                x = layer_output
        return x        

In [92]:
# Creating sample data
layer_sizes = [3, 3, 3, 3, 3, 1]
sample_inputs = torch.tensor([[1., 0, -1.]])

In [93]:
# Creating a model without shortcuts
torch.manual_seed(42)
model_without_shortcuts = ExampleDeepNeuralNetwork(layer_sizes, False)

In [94]:
# Creating a function to print the mean gradient for each layer

def print_gradients(model, x):
    # Forward pass
    output = model(x)
    # Creating the target
    target = torch.tensor([[0.]])
    # Getting the loss
    loss = nn.MSELoss()
    loss = loss(output, target)
    # Backward propagatioon to calculate the gradients
    loss.backward()
    # Printing the mean of gradient
    for name, param in model.named_parameters():
        if "weight" in name:
            print(f"{name} Has a gradient mean = {param.grad.abs().mean().item()}")

In [95]:
print_gradients(model_without_shortcuts, sample_inputs)

layers.0.0.weight Has a gradient mean = 0.00012371748744044453
layers.1.0.weight Has a gradient mean = 0.00022939023619983345
layers.2.0.weight Has a gradient mean = 0.0002862405963242054
layers.3.0.weight Has a gradient mean = 0.0018804551800712943
layers.4.0.weight Has a gradient mean = 0.00592834735289216


In [96]:
# Creating a model with shortcut implementation
torch.manual_seed(42)
model_with_shortcuts = ExampleDeepNeuralNetwork(layer_sizes, True)
print_gradients(model_with_shortcuts, sample_inputs)

layers.0.0.weight Has a gradient mean = 0.00684445071965456
layers.1.0.weight Has a gradient mean = 0.010904626920819283
layers.2.0.weight Has a gradient mean = 0.007769796531647444
layers.3.0.weight Has a gradient mean = 0.009208275936543941
layers.4.0.weight Has a gradient mean = 0.026989780366420746
