In [None]:
import torch

import torch.nn as nn
import torch.optim as optim

class TransformerModel(nn.Module):
    """Transformer encoder over token ids with a learned positional encoding.

    Token ids are embedded, summed with a learned positional parameter,
    passed through dropout and a stack of ``nn.TransformerEncoderLayer``
    modules, then projected to ``output_dim`` by a linear layer.

    The model is batch-first: ``forward`` takes ``(batch_size, seq_len)``
    token ids and returns ``(batch_size, seq_len, output_dim)``. No attention
    mask is passed to the encoder.
    """

    def __init__(self, input_dim, emb_dim, n_heads, hidden_dim, n_layers, output_dim, dropout,
                 max_len=5000):
        """Build the embedding, positional parameter, encoder stack and output layer.

        Args:
            input_dim: Number of rows in the embedding table (vocabulary size).
            emb_dim: Embedding size, also used as the encoder's ``d_model``.
            n_heads: Number of attention heads in each encoder layer.
            hidden_dim: Feedforward width (``dim_feedforward``) of each encoder layer.
            n_layers: Number of stacked encoder layers.
            output_dim: Size of the last dimension produced by the output layer.
            dropout: Dropout probability used on the embeddings and inside the encoder layers.
            max_len: Number of positions held by the positional parameter.
                Defaults to 5000, the value that was previously hard-coded.
        """
        super().__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim)
        # Learned positional encoding, initialised to zeros; the leading 1 broadcasts over the batch.
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_len, emb_dim))

        # batch_first=True is required: forward() feeds (batch, seq, emb) tensors.
        # With the default (False) the encoder would treat dim 0 as the sequence
        # axis and attend across the samples of a batch.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=emb_dim,
            nhead=n_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        self.fc_out = nn.Linear(emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Run the model on a batch of token ids.

        Args:
            src: Integer tensor of token ids with shape ``(batch_size, seq_len)``.
                ``seq_len`` must not exceed ``max_len``; otherwise the sliced
                positional parameter no longer matches the embeddings and the
                addition fails.

        Returns:
            Tensor of shape ``(batch_size, seq_len, output_dim)``.
        """
        seq_len = src.size(1)
        # Keep only the first seq_len positions of the positional parameter.
        embedded = self.embedding(src) + self.positional_encoding[:, :seq_len, :]
        embedded = self.dropout(embedded)
        encoded = self.transformer_encoder(embedded)
        return self.fc_out(encoded)

# Model configuration
input_dim = 10000   # size of the token vocabulary
emb_dim = 512       # width of each token embedding
n_heads = 8         # attention heads in every encoder layer
hidden_dim = 2048   # width of the feedforward sub-layer
n_layers = 6        # number of stacked encoder layers
output_dim = 10000  # size of the output projection (same value as input_dim here)
dropout = 0.1       # dropout probability

# Build the model together with its loss function and optimizer
model = TransformerModel(
    input_dim=input_dim,
    emb_dim=emb_dim,
    n_heads=n_heads,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    output_dim=output_dim,
    dropout=dropout,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Random token ids shaped (batch_size, sequence_length)
example_input = torch.randint(low=0, high=input_dim, size=(32, 100))

# Run the model once and report the shape of what comes back
output = model(example_input)
print(output.shape)  # Should be (batch_size, sequence_length, output_dim)

In [None]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Only parameters whose ``requires_grad`` flag is set contribute to the total.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Count the trainable parameters of `model` and print the total with thousands separators.
num_params = count_parameters(model)
print(f'The model has {num_params:,} trainable parameters')