In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import math



In [2]:
class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal position table to a sequence-first input.

    The table is built once in ``__init__`` and stored as the non-trainable
    buffer ``pe`` with shape ``(max_len, 1, d_model)``. Even feature columns
    hold ``sin(pos * w_k)`` and odd feature columns hold ``cos(pos * w_k)``,
    where ``w_k = 10000 ** (-2k / d_model)``.

    Args:
        d_model: Size of the feature (last) dimension. Both even and odd
            values are supported.
        max_len: Number of positions to precompute; ``forward`` rejects
            inputs whose first dimension is longer than this.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Insert a singleton middle axis so the table broadcasts over dim 1 of x.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(0)`` positions.

        Args:
            x: Tensor whose dim 0 indexes positions and whose last dim is
                ``d_model``; the table is broadcast across dim 1.

        Raises:
            RuntimeError: If ``x.size(0)`` exceeds the precomputed ``max_len``.
                (Same exception type as the broadcasting failure this check
                replaces, but with an actionable message.)
        """
        seq_len = x.size(0)
        max_len = self.pe.size(0)
        if seq_len > max_len:
            raise RuntimeError(
                f"sequence length {seq_len} exceeds positional encoding max_len {max_len}"
            )
        return x + self.pe[:seq_len]

In [3]:
class TransformerModel(nn.Module):
    """Token embedding -> positional encoding -> Transformer encoder -> vocabulary logits.

    Args:
        ntoken: Number of rows in the embedding table and of output logits.
        d_model: Embedding width, also the encoder's model dimension.
        nhead: Number of attention heads in each encoder layer.
        nhid: Feed-forward width inside each encoder layer.
        nlayers: Number of stacked encoder layers.
        dropout: Dropout probability handed to the encoder layers.
    """

    def __init__(self, ntoken, d_model, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, nhid, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self):
        """Re-initialise the embedding and output projection.

        Weights are drawn uniformly from ``[-0.1, 0.1]`` and the output bias is
        zeroed. ``nn.init`` is used rather than writing through ``.data`` so the
        in-place updates go through the supported no-grad path; the draw order
        (embedding first, then projection weight) is unchanged.
        """
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src, src_mask=None):
        """Compute per-position vocabulary logits.

        Args:
            src: Integer token ids. The encoder layers are built without
                ``batch_first``, so this is presumably ``(seq_len, batch)``.
            src_mask: Optional attention mask forwarded unchanged to the
                encoder stack; ``None`` (the default) applies no mask.

        Returns:
            Tensor with the shape of ``src`` plus a trailing ``ntoken`` axis.
        """
        # Scale embeddings by sqrt(d_model) before the positional table is added.
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output


In [4]:
# Hyperparameters
# Seed PyTorch's global RNG first so weight initialisation, the random example
# batch and dropout are repeatable on every Restart Kernel -> Run All.
seed = 42
torch.manual_seed(seed)

ntokens = 10000  # size of vocabulary
emsize = 200  # embedding dimension
nhid = 200  # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # the number of heads in the multiheadattention models
dropout = 0.2  # the dropout value



In [5]:
# Instantiate the network from the hyperparameters defined above.
model = TransformerModel(
    ntoken=ntokens,
    d_model=emsize,
    nhead=nhead,
    nhid=nhid,
    nlayers=nlayers,
    dropout=dropout,
)





In [6]:
# Example batch of random token ids, laid out as (sequence_length=10, batch_size=32).
src = torch.randint(0, ntokens, size=(10, 32))
# All-False boolean (10, 10) mask, created directly with the target dtype.
src_mask = torch.zeros(10, 10, dtype=torch.bool)


In [7]:
# Run the example batch through the model once and report the logits' dimensions.
output = model(src=src, src_mask=src_mask)
print(output.size())  # expected: (10, 32, ntokens)



torch.Size([10, 32, 10000])


In [8]:
# Training objective and update rule: cross-entropy over the vocabulary logits,
# optimised with Adam at its library-default settings.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters())


In [9]:
# Training loop (example).
# The same example batch is reused for every step, and the input token ids
# double as the prediction targets.
targets = src.view(-1)  # flattened once; `src` never changes inside the loop
for epoch in range(10):
    model.train()  # make sure the model is in training mode for this epoch
    for batch in range(100):  # Assume we have 100 batches
        optimizer.zero_grad()
        output = model(src=src, src_mask=src_mask)
        logits = output.view(-1, ntokens)  # one row of vocabulary scores per token
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

    # Reports the loss of the final step of the epoch only.
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

print("Training complete!")



Epoch 1, Loss: 0.011590269394218922
Epoch 2, Loss: 0.005491065792739391
Epoch 3, Loss: 0.0032924532424658537
Epoch 4, Loss: 0.002202109433710575
Epoch 5, Loss: 0.0016091568395495415
Epoch 6, Loss: 0.001220210688188672
Epoch 7, Loss: 0.0009717493085190654
Epoch 8, Loss: 0.0007853916031308472
Epoch 9, Loss: 0.0006514849374070764
Epoch 10, Loss: 0.0005507472087629139
Training complete!
