# In [1]:
import math

import torch
from torch import nn
import torch.nn.functional as F
from torchtext.data import Dataset, Example, Field, BucketIterator

# Hyperparameters shared by the model definitions and training code below.
batch_size = 32          # number of sequences per batch
emb_dim = 512            # width of the token embedding vectors
n_head = 8               # number of attention heads
transformer_layers = 1   # layer count handed to TransformerModel
dropout_rate = 0.1       # dropout probability
lr = 1e-4                # AdamW learning rate
num_epochs = 20          # number of passes made by the training loops

class TokenEmbedding(nn.Module):
    """Map integer token ids to learned dense vectors.

    Args:
        vocab_size: Number of distinct token ids (rows of the embedding
            table).
        d_model: Width of each embedding vector. When omitted, the
            module-level ``emb_dim`` hyperparameter is used, so existing
            one-argument call sites behave as before.
    """

    def __init__(self, vocab_size, d_model=None):
        super().__init__()
        if d_model is None:
            # Fall back to the file-wide default only when the caller did
            # not choose a width explicitly.
            d_model = emb_dim
        self.token_embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Return the embedding vectors for the token ids in ``x``.

        The result has the shape of ``x`` with one trailing axis of size
        ``d_model`` appended.
        """
        return self.token_embedding(x)

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to a sequence of embeddings.

    A ``(max_len, d_model)`` table is precomputed once: even feature
    columns hold sines and odd feature columns hold cosines of the position
    scaled by geometrically decreasing frequencies. The table is stored as
    a buffer named ``pe``, so it follows the module across devices but is
    not a trainable parameter.

    Args:
        d_model: Size of the feature (last) axis of the inputs.
        max_len: Longest sequence length the table can serve.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float)
            * (-math.log(10000.0) / d_model)
        )

        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even
        # columns, so trim the frequencies to the number of cosine slots.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` with the position table added.

        Args:
            x: Tensor whose last two axes are (sequence, feature), e.g.
                ``(S, D)`` or batch-first ``(B, S, D)``.

        Raises:
            ValueError: If the sequence is longer than ``max_len``.
        """
        # The sequence axis is second-to-last; indexing it from the end
        # keeps unbatched (S, D) inputs working as well as (B, S, D).
        seq_len = x.size(-2)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(0)}"
            )
        # Buffers never require grad, so no wrapper is needed; the (S, D)
        # slice broadcasts over any leading batch axis.
        return x + self.pe[:seq_len]

def generate_subsequent_mask(seq_length):
    """Build a square upper-triangular 0/1 mask of side ``seq_length``.

    Entry ``(i, j)`` is 1 when ``j >= i`` (the main diagonal is included)
    and 0 otherwise. The result is a ``torch.uint8`` tensor.
    """
    # NOTE(review): the diagonal is set, so if 1 is meant to mark blocked
    # positions a token could not attend to itself -- confirm the intended
    # convention with whoever consumes this mask.
    index = torch.arange(seq_length)
    upper = index.unsqueeze(0) >= index.unsqueeze(1)  # column j >= row i
    return upper.to(torch.uint8)

class MultiHeadAttention(nn.Module):
    """Multi-head self-attention over batch-first sequences.

    Wraps ``nn.MultiheadAttention`` and applies dropout to its output.

    Args:
        d_model: Size of the feature axis of the input; must be divisible
            by ``n_head``.
        n_head: Number of attention heads.
        dropout_rate: Dropout probability used both inside the attention
            layer and on its output.
    """

    def __init__(self, d_model, n_head, dropout_rate=0.1):
        super().__init__()
        assert d_model % n_head == 0, "d_model must be divisible by n_head"

        self.n_head = n_head
        self.d_k = d_model // n_head
        self.d_v = d_model // n_head

        # nn.MultiheadAttention expects the full model width as embed_dim
        # and splits it across heads itself; passing the per-head size
        # makes it reject d_model-wide inputs.
        self.slf_attn = nn.MultiheadAttention(d_model, n_head, dropout=dropout_rate)
        # A Dropout module honours train()/eval() and uses this instance's
        # rate rather than a module-level global.
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Apply self-attention to ``x`` and return a tensor of equal shape.

        Args:
            x: Tensor of shape ``(B, S, D)`` with ``D == d_model``.
        """
        # x: (B, S, D). The attention layer is constructed with its default
        # sequence-first layout, so swap the first two axes on the way in
        # and back again on the way out.
        seq_first = x.transpose(0, 1)
        att_output, _ = self.slf_attn(seq_first, seq_first, seq_first)
        return self.dropout(att_output.transpose(0, 1))

class PositionWiseFFN(nn.Module):
    """Two linear layers with a ReLU in between, applied along the last axis.

    Args:
        d_model: Size of the input and output feature axis.
        hidden_dim: Size of the intermediate feature axis.
    """

    def __init__(self, d_model, hidden_dim):
        super().__init__()
        self.w1 = nn.Linear(in_features=d_model, out_features=hidden_dim)
        self.w2 = nn.Linear(in_features=hidden_dim, out_features=d_model)

    def forward(self, x):
        """Expand to ``hidden_dim``, rectify, and project back to ``d_model``."""
        hidden = self.w1(x)
        activated = torch.relu(hidden)
        return self.w2(activated)

def generate_batch_data(vocab_size=50, input_seq_length=64, num_sequences=None, seed=42):
    """Return one batch of uniformly random token ids as stand-in data.

    Called with no arguments this behaves as before: the global torch RNG
    is reseeded with 42 and a ``(batch_size, 64)`` integer tensor with
    values in ``[0, 50)`` is returned.

    Args:
        vocab_size: Exclusive upper bound of the generated token ids.
        input_seq_length: Number of tokens per sequence.
        num_sequences: Number of sequences in the batch. Defaults to the
            module-level ``batch_size``.
        seed: Value passed to ``torch.manual_seed`` before sampling. Pass
            ``None`` to leave the RNG untouched, so that successive calls
            return different batches.

    Returns:
        Integer tensor of shape ``(num_sequences, input_seq_length)``.
    """
    if num_sequences is None:
        num_sequences = batch_size
    if seed is not None:
        # Reseeding resets the *global* torch RNG, which makes every call
        # with the same seed return an identical batch.
        torch.manual_seed(seed)

    return torch.randint(vocab_size, (num_sequences, input_seq_length))

def train_model(model=None, loader=None, epochs=None, learning_rate=None, run_device=None):
    """Train a model with AdamW and cross-entropy loss, one line of output per epoch.

    Every argument is optional; an omitted argument is taken from the
    module-level configuration, so ``train_model()`` still works as a
    zero-argument call when those globals exist.

    Args:
        model: Module to train. Defaults to a new ``TransformerModel`` built
            from the module-level hyperparameters.
        loader: Iterable yielding ``(data, target)`` tensor pairs. Defaults
            to the module-level ``train_loader``.
        epochs: Number of passes over ``loader``. Defaults to ``num_epochs``.
        learning_rate: AdamW learning rate. Defaults to ``lr``.
        run_device: Device to train on. Defaults to the module-level
            ``device``.

    Returns:
        The trained model (after being moved to ``run_device``).
    """
    if run_device is None:
        run_device = device
    if loader is None:
        loader = train_loader
    if epochs is None:
        epochs = num_epochs
    if learning_rate is None:
        learning_rate = lr
    if model is None:
        # Same constructor arguments as the other call sites in this file.
        model = TransformerModel(emb_dim, n_head, transformer_layers, dropout_rate)
    model = model.to(run_device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        # Stays NaN when the loader yields nothing, so the report below
        # never touches an unbound name.
        last_loss = float("nan")

        # Iterate the loader directly: each item is one (data, target) batch.
        for data, target in loader:
            data, target = data.to(run_device), target.to(run_device)

            optimizer.zero_grad()
            loss = criterion(model(data), target)
            loss.backward()
            optimizer.step()

            last_loss = loss.item()

        print(f"Epoch: {epoch+1}, Loss: {last_loss}")

    return model

if __name__ == "__main__":
    # Script entry point: build the model and the batch iterator, then run
    # the training loop for `num_epochs` epochs.

    # Initialize transformer model
    # NOTE(review): TransformerModel is not defined in the visible part of
    # this file -- confirm where it comes from.
    transformer_model = TransformerModel(emb_dim, n_head, transformer_layers, dropout_rate)

    # Create data loader
    # NOTE(review): `dataset` is not defined in the visible part of this
    # file, and the loop below assumes each batch unpacks into a
    # (data, target) pair -- confirm both against the real data pipeline.
    train_loader = BucketIterator(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        sort_key=lambda x: len(x),
        # BucketIterator has no `sort_by_len` argument; sorting each batch
        # by `sort_key` is spelled `sort_within_batch`.
        sort_within_batch=True,
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = transformer_model.to(device)

    # The loop below needs a loss and an optimizer; create them here so the
    # first batch does not fail on an undefined name.
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    # Training loop (simplified; actual training requires proper data loading and management)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for data, target in train_loader:
            data, target = data.to(device), target.to(device)

            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # max(..., 1) keeps an empty loader from dividing by zero.
        print(f'Epoch: {epoch+1}, Average Loss: {total_loss / max(len(train_loader), 1)}')

if __name__ == "__main__":
    # Second script section: build a fresh model with its loss and
    # optimizer, then train it on `train_loader` for `num_epochs` epochs.
    # NOTE(review): this repeats the training loop of the earlier
    # `__main__` section on a new model -- confirm both are intended.

    # Prefer the GPU when one is available.
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # Build the network, then move it to the chosen device.
    model = TransformerModel(emb_dim, n_head, transformer_layers, dropout_rate)
    model = model.to(device)

    # Loss function and optimizer.
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    # One pass over the loader per epoch; the summed batch losses are
    # averaged over the number of batches for the report.
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for batch_idx, (data, target) in enumerate(train_loader):
            data = data.to(device)
            target = target.to(device)

            # Standard step: clear gradients, forward, backward, update.
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f'Epoch: {epoch+1}, Average Loss: {total_loss / len(train_loader)}')


# Recorded notebook output:
# ModuleNotFoundError: No module named 'torchtext'