# 2 GELU Formulation

> We motivate our activation function by combining properties from dropout, zoneout, and ReLUs. First note that a ReLU and dropout both yield a neuron's output with the ReLU deterministically multiplying the input by zero or one and dropout stochastically multiplying by zero.

> Also, a new RNN regularizer called zoneout stochastically multiplies inputs by one (Krueger et al., 2016). We merge this functionality by multiplying the input by zero or one, but the values of this zero-one mask are stochastically determined while also dependent upon the input. Specifically, we can multiply the neuron input \(x\) by \(m \sim \operatorname{Bernoulli}(\Phi(x))\), where \(\Phi(x)=P(X \leq x),\ X \sim \mathcal{N}(0,1)\) is the cumulative distribution function of the standard normal distribution. We choose this distribution since neuron inputs tend to follow a normal distribution, especially with Batch Normalization.

So the larger the input \(x\), the higher the probability \(\Phi(x)\) that it is kept (multiplied by one) rather than dropped (multiplied by zero).


In [17]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader


class NeuralNetwork(nn.Module):
    """Two-layer perceptron: linear -> activation -> linear.

    Args:
        input_size: Number of features in each input row.
        hidden_size: Width of the hidden layer.
        activation_type: ``"gelu"`` selects ``nn.GELU``; any other value
            selects ``nn.ReLU``.
        output_size: Number of values produced per input row. Defaults to 1
            so predictions line up with ``(batch_size, 1)`` targets instead
            of being silently broadcast by the loss.
    """

    def __init__(self, input_size, hidden_size, activation_type="gelu",
                 output_size=1):
        super().__init__()
        self.l1 = nn.Linear(input_size, hidden_size)
        # The second layer consumes the hidden activations, so its input
        # width must be hidden_size; using input_size only worked when the
        # two sizes happened to be equal.
        self.l2 = nn.Linear(hidden_size, output_size)
        if activation_type == "gelu":
            self.act = nn.GELU()
        else:
            self.act = nn.ReLU()

    def forward(self, x):
        """Map ``x`` of shape (batch, input_size) to (batch, output_size)."""
        x = self.l1(x)
        x = self.act(x)
        x = self.l2(x)
        return x


def train(model, criterion, optimizer, training_set, num_epochs=1000):
    """Fit ``model`` on ``training_set`` and report the final epoch's loss.

    Args:
        model: Module to optimize; it is switched to training mode.
        criterion: Callable ``criterion(prediction, target)`` returning a
            scalar loss tensor.
        optimizer: Optimizer holding ``model``'s parameters.
        training_set: Sized iterable of ``(x, target)`` batches, e.g. a
            ``DataLoader``.
        num_epochs: Number of passes over ``training_set``.

    Returns:
        The mean per-batch loss of the last epoch as a ``float``, or
        ``None`` if no batch was processed.
    """
    model.train()
    num_batches = len(training_set)
    mean_loss = None
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for x, target in training_set:
            optimizer.zero_grad()
            prediction = model(x)
            batch_loss = criterion(prediction, target)
            # Backpropagate each batch on its own. Calling backward() on a
            # running sum would re-traverse graphs of earlier batches that
            # autograd has already freed and fail on the second batch.
            batch_loss.backward()
            optimizer.step()
            # .item() detaches the value so no graph is kept alive.
            epoch_loss += batch_loss.item()
        if num_batches:
            mean_loss = epoch_loss / num_batches

    if mean_loss is not None:
        print(f"Epoch: {epoch}, Loss: {mean_loss}")
    return mean_loss


def generate_data(batch_size, input_size):
    """Sample one random regression batch from each of two distributions.

    Returns:
        A 4-tuple ``(normal_input, normal_target, uniform_input,
        uniform_target)``. Inputs have shape ``(batch_size, input_size)``
        and targets ``(batch_size, 1)``. The normal pair comes from
        ``torch.randn``; the uniform pair is ``torch.rand`` rescaled from
        [0, 1) to [-1, 1).
    """
    feature_shape = (batch_size, input_size)
    label_shape = (batch_size, 1)

    # Standard-normal samples.
    gaussian_x = torch.randn(feature_shape)
    gaussian_y = torch.randn(label_shape)

    # Uniform samples stretched onto [-1, 1).
    flat_x = torch.rand(feature_shape) * 2 - 1
    flat_y = torch.rand(label_shape) * 2 - 1

    return gaussian_x, gaussian_y, flat_x, flat_y


# Experiment: train the same small network with GELU and with ReLU on
# normally and uniformly distributed random data, and print the final loss
# of each run.
batch_size = 10
input_size = 64
hidden_size = 64
lr = 0.001

# input: (batch_size, input_size)
# output: (batch_size, 1)

normal_input, normal_target, uniform_input, uniform_target = generate_data(
    batch_size, input_size)

datasets = [
    (normal_input, normal_target, "normal"),
    (uniform_input, uniform_target, "uniform"),
]

# `inputs` rather than `input`, so the builtin is not shadowed.
for inputs, target, data_type in datasets:
    for activation_type in ["gelu", "relu"]:
        # Kept on one line: a line break inside an f-string replacement
        # field is a SyntaxError before Python 3.12.
        print(f"Training activation: {activation_type}, Data Type: {data_type}")
        # Instantiate the model
        model = NeuralNetwork(input_size=input_size,
                              hidden_size=hidden_size,
                              activation_type=activation_type)

        # Define the loss function.
        # MSE rather than cross-entropy: the targets are real-valued
        # regression targets, not class labels, so there is no class
        # likelihood to maximize -- we only minimize the prediction error.
        criterion = nn.MSELoss()

        # Define the optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        # Define the training set; batch_size equals the dataset size, so
        # each epoch is a single full batch.
        training_set = DataLoader(
            TensorDataset(inputs, target), batch_size=batch_size)

        # Call the train function
        train(model, criterion, optimizer, training_set)

Training activation: gelu, Data Type: normal
Epoch: 999, Loss: 1.8292492853788644e-08
Training activation: relu, Data Type: normal
Epoch: 999, Loss: 2.05539754595703e-13
Training activation: gelu, Data Type: uniform
Epoch: 999, Loss: 6.628273752085079e-08
Training activation: relu, Data Type: uniform
Epoch: 999, Loss: 4.75004313660321e-10


After training for 10 epochs, the GELU activation on the normally distributed data reaches roughly half the loss of ReLU (0.1527 vs. 0.293), i.e. it performs almost 2x better.
Training for 100 epochs,
