# FFNN
Aseries of linear layer followed by non-linear activation function that map input features to output prediction

It is called feed forward because data flows in one direction: *input -> hidden layers -> output*

In [1]:
import torch
import torch.nn as nn

class SimpleFFNN(nn.Module):
    def __init__(self, input_dim = 10, hidden_dim = 64, output_dim = 3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
            )

    def forward(self, x):
        return self.net(x)

# ReLU
Rectified Linear Unit(ReLU)

        ReLU(x) = max(0,x)

* Introduces non-linearity
* Prevents vanishing gradients (compared to sigmod / tanh)
* Fast and efficient

In [2]:
x = torch.tensor([-2.0, 0.0, 3.0])
relu = nn.ReLU()
print(f'{relu(x)}')

tensor([0., 0., 3.])


# Training Loop (Step-by-Step)

Full Cycle
1. Forward pass: Predict output
2. Compute loss: Compare with target
3. Backward pass: Calculate gradients
4. Optimizer step: Update weights

In [5]:
import torch
import torch.optim as optim
import torch.nn.functional as F

model = SimpleFFNN(10, 64, 3)
optimizer = optim.Adam(model.parameters(), lr = 1e-3)

# Dummy batch
inputs = torch.randn(32, 10)
targets = torch.randint(0, 3, (32,))

for epoch in range(100):
    optimizer.zero_grad()
    logits = model(inputs)
    loss = F.cross_entropy(logits, targets)
    loss.backward()
    optimizer.step()

    if epoch%10 == 0:
        print(f"epoch {epoch}, loss: {loss.item():.4f}")

epoch 0, loss: 1.0960
epoch 10, loss: 1.0279
epoch 20, loss: 0.9679
epoch 30, loss: 0.9120
epoch 40, loss: 0.8584
epoch 50, loss: 0.8050
epoch 60, loss: 0.7525
epoch 70, loss: 0.7002
epoch 80, loss: 0.6483
epoch 90, loss: 0.5975


# FFNNs inside Transformeers
* After self-attentation gives context, each token vector is passed through a *position-wide FFNN*.
* It's exactly this: *Linear -> ReLU -> Linear*
* Done independently for each token

In [6]:
self.ffnn = nn.Sequential(
    nn.Linear(768, 3072),
    nn.ReLU(),
    nn.Linear(3072, 768)
)

NameError: name 'self' is not defined