In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
import math


In [17]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a sequence-first tensor.

    A table of shape ``(max_len, 1, d_model)`` is precomputed once and stored
    as the non-trainable buffer ``pe``. Even feature columns hold
    ``sin(pos * w_i)`` and odd feature columns hold ``cos(pos * w_i)``, with
    ``w_i = 10000 ** (-2i / d_model)``.

    Args:
        d_model: Size of the feature (last) dimension. Both even and odd
            values are supported.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even column, so
        # trim the angle table; for even d_model this slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Shape (max_len, 1, d_model): the singleton axis broadcasts over dim 1
        # of the input in forward().
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(0)`` positions.

        Args:
            x: Tensor whose dim 0 indexes sequence position and whose last
                dim has size ``d_model`` (e.g. ``(seq_len, batch, d_model)``).

        Returns:
            Tensor with the same shape as ``x``.
        """
        return x + self.pe[:x.size(0), :]



In [18]:
class TransformerModel(nn.Module):
    """Token-level Transformer encoder with a linear output head.

    Pipeline: embedding lookup (scaled by ``sqrt(model_dim)``) -> positional
    encoding -> stack of ``num_layers`` encoder layers -> linear projection
    applied independently at every position.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        model_dim: Embedding / encoder feature width.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        output_dim: Width of the per-position output.
        max_len: Number of positions precomputed by the positional encoder.
    """

    def __init__(self, input_dim, model_dim, num_heads, num_layers, output_dim, max_len=5000):
        super().__init__()
        self.model_dim = model_dim
        self.embedding = nn.Embedding(input_dim, model_dim)
        self.pos_encoder = PositionalEncoding(model_dim, max_len)
        # Feed-forward width is four times the model width. batch_first is
        # left at its default, so the encoder expects sequence-first input.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=model_dim,
            nhead=num_heads,
            dim_feedforward=model_dim * 4,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(model_dim, output_dim)

    def forward(self, src):
        """Map token indices to one ``output_dim``-sized vector per position.

        Args:
            src: Integer index tensor accepted by ``nn.Embedding``; the
                training cell passes shape ``(seq_length, batch_size)``.

        Returns:
            Tensor shaped like ``src`` with a trailing ``output_dim`` axis.
        """
        scaled = self.embedding(src) * math.sqrt(self.model_dim)
        encoded = self.transformer_encoder(self.pos_encoder(scaled))
        return self.fc_out(encoded)


In [19]:
# Hyperparameters and synthetic-data dimensions.
SEED = 42         # Fixed RNG seed so a fresh Restart & Run All is repeatable
batch_size = 32
seq_length = 10
input_dim = 1000  # Vocabulary size
output_dim = 1    # Regression or binary classification
model_dim = 512   # Embedding dimension
num_heads = 8     # Number of attention heads
num_layers = 6    # Number of encoder layers

# Seed torch's global RNG before the first random draw. Anything that draws
# from it afterwards (the data below, later weight initialisation) then follows
# the same stream on every top-to-bottom run.
torch.manual_seed(SEED)

# Randomly generated data, sequence-first:
#   X_train: integer token ids in [0, input_dim), shape (seq_length, batch_size)
#   y_train: standard-normal targets, shape (seq_length, batch_size, output_dim)
X_train = torch.randint(0, input_dim, (seq_length, batch_size))
y_train = torch.randn(seq_length, batch_size, output_dim)


In [20]:
# Build the network from the hyperparameters defined in the configuration cell.
model = TransformerModel(
    input_dim=input_dim,
    model_dim=model_dim,
    num_heads=num_heads,
    num_layers=num_layers,
    output_dim=output_dim,
)

# Mean-squared error matches the real-valued targets in y_train. For a
# classification task swap in a classification loss instead: with a single
# output unit that is nn.BCEWithLogitsLoss(); nn.CrossEntropyLoss() expects one
# output per class.
criterion = nn.MSELoss()

# Adam over every trainable parameter with a fixed learning rate.
optimizer = optim.Adam(model.parameters(), lr=0.001)



In [21]:
epochs = 100
log_every = 10  # report the loss once every this many epochs

# Full-batch training: each epoch is a single optimizer step on all of X_train.
for epoch in range(epochs):
    model.train()  # training mode (dropout active in the encoder layers)

    # Clear stale gradients, run the forward pass, then backpropagate and step.
    optimizer.zero_grad()
    output = model(X_train)
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()

    # Skip logging except on every log_every-th epoch (1-based count).
    if (epoch + 1) % log_every != 0:
        continue
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

Epoch [10/100], Loss: 1.5451
Epoch [20/100], Loss: 1.2818
Epoch [30/100], Loss: 1.1321
Epoch [40/100], Loss: 1.0988
Epoch [50/100], Loss: 1.0920
Epoch [60/100], Loss: 1.1025
Epoch [70/100], Loss: 1.0929
Epoch [80/100], Loss: 1.1016
Epoch [90/100], Loss: 1.1002
Epoch [100/100], Loss: 1.1068


In [22]:
# Evaluation pass: switch to eval mode and disable gradient tracking.
# NOTE(review): this reuses X_train / y_train, so the value printed as
# "Test Loss" is the loss on the training data, not on a held-out set.
model.eval()
with torch.no_grad():
    test_output = model(X_train)
    test_loss = criterion(test_output, y_train)

print(f'Test Loss: {test_loss.item():.4f}')


Test Loss: 1.0914
