In [None]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

In [20]:
class PositionalEncoding(nn.Module):
  def __init__(self, d_model, max_length=5000):
    super(PositionalEncoding, self).__init__()

    pe = torch.zeros(max_length, d_model)
    position = torch.arange(0, max_length).unsqueeze(1)
    div_term = torch.exp(torch.arange(0,d_model,2)*-(math.log(10000.0)/d_model))

    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    pe = pe.unsqueeze(0)

    self.register_buffer('pe',pe)

  def forward(self, x):
    seq_length = x.size(1)  # Get the sequence length
    x = x + self.pe[:, :seq_length, :].to(x.device)  # Match the positional encodings with the sequence length
    return x



In [21]:
class MultiHeadAttention(nn.Module):
  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

    self.d_model = d_model
    self.num_heads = num_heads
    self.d_k = d_model // num_heads

    self.W_q = nn.Linear(d_model, d_model)
    self.W_k = nn.Linear(d_model, d_model)
    self.W_v = nn.Linear(d_model, d_model)
    self.out = nn.Linear(d_model, d_model)

  def scaled_dot_product_attention(self, Q, K, V, mask=None):
    attn_scores = torch.matmul(Q, K.transpose(-2,-1)) / math.sqrt(self.d_k)
    if mask is not None:
      atth_scores = attn_scores.masked_fill(mask == 0, -1e9)
    attn_weights = torch.softmax(attn_scores, dim=-1)
    output = torch.matmul(attn_weights, V)
    return output

  def split_heads(self, x):
    batch_size, seq_length, d_model = x.size()
    return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1,2)

  def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

  def forward(self, Q, K, V, mask=None):
    Q = self.split_heads(self.W_q(Q))
    K = self.split_heads(self.W_k(K))
    V = self.split_heads(self.W_v(V))

    attn_output = self.scaled_dot_product_attention(Q,K,V,mask)
    output = self.out(self.combine_heads(attn_output))
    return output

In [22]:
class FeedForward(nn.Module):
  def __init__(self, d_model, d_ff):
    super().__init__()
    self.fc1 = nn.Linear(d_model, d_ff)
    self.fc2 = nn.Linear(d_ff, d_model)
    self.relu = nn.ReLU()

  def forward(self, x):
    return self.fc2(self.relu(self.fc1(x)))

In [23]:
class EncoderLayer(nn.Module):
  def __init__(self, d_model, num_heads, d_ff, dropout):
    super(EncoderLayer, self).__init__()
    self.self_attn = MultiHeadAttention(d_model, num_heads)
    self.feed_forward = FeedForward(d_model, d_ff)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, mask):
    attn_output = self.self_attn(x, x, x, mask)
    x = self.norm1(x + self.dropout(attn_output))
    ff_output = self.feed_forward(x)
    x = self.norm2(x + self.dropout(ff_output))
    return x

In [24]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

In [25]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_length=256)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2).to(src.device)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3).to(tgt.device)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool().to(tgt.device)
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

In [26]:
pip install datasets




In [28]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel

# Load the dataset in streaming mode
dataset = load_dataset("wmt14", "de-en", streaming=False)

# Get a subset of the dataset (e.g., 10% of the training data)
subset_percentage = 0.05  # 10%
subset_size = int(len(dataset['train']) * subset_percentage)

# Select the first 'subset_size' samples
train_data = dataset['train'].select(range(subset_size))
test_data = dataset['test']

print(f"Using {len(train_data)} samples for training and {len(test_data)} for testing.")

Using 225439 samples for training and 3003 for testing.


In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModel.from_pretrained('bert-base-multilingual-cased').to(device)



In [None]:
train_data['translation'][0:3]

[{'de': 'Wiederaufnahme der Sitzungsperiode',
  'en': 'Resumption of the session'},
 {'de': 'Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, daß Sie schöne Ferien hatten.',
  'en': 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.'},
 {'de': 'Wie Sie feststellen konnten, ist der gefürchtete "Millenium-Bug " nicht eingetreten. Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden.',
  'en': "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful."}]

In [30]:
from torch.utils.data import IterableDataset, DataLoader

def tokenize(batch):
    # Initialize lists to store tokenized input/output
    input_ids = []
    attention_masks = []
    target_ids = []
    target_attention_masks = []

    # Loop over each translation in the batch since our data contains a list of dictionaries inside the 'translation' key
    for translation in batch['translation']:
        # Tokenize the English (source) sentences
        source = tokenizer(translation['en'], padding='max_length', truncation=True, max_length=256)

        # Tokenize the German (target) sentences
        target = tokenizer(translation['de'], padding='max_length', truncation=True, max_length=256)

        input_ids.append(source['input_ids'])
        attention_masks.append(source['attention_mask'])
        target_ids.append(target['input_ids'])
        target_attention_masks.append(target['attention_mask'])

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "target_ids": target_ids,
        "target_attention_mask": target_attention_masks
    }

# Apply tokenization to the subset of the dataset
tokenized_train_data = train_data.map(tokenize, batched=True)
tokenized_test_data = test_data.map(tokenize, batched=True)

print(f"Tokenized {len(tokenized_train_data)} training samples and {len(tokenized_test_data)} testing samples.")

Map:   0%|          | 0/225439 [00:00<?, ? examples/s]

Tokenized 225439 training samples and 3003 testing samples.


In [31]:
tokenized_train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'target_ids', 'target_attention_mask'])
tokenized_test_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'target_ids', 'target_attention_mask'])

# Create PyTorch DataLoaders from the iterable datasets
train_loader = DataLoader(tokenized_train_data, batch_size=8)  # Adjust batch size as needed
test_loader = DataLoader(tokenized_test_data, batch_size=8)

In [32]:
for batch in train_loader:
    print(batch)  # This will print the entire batch
    break

{'input_ids': tensor([[  101, 32070, 94118,  ...,     0,     0,     0],
        [  101,   146, 10104,  ...,     0,     0,     0],
        [  101, 15785,   117,  ...,     0,     0,     0],
        ...,
        [  101, 47464, 28710,  ...,     0,     0,     0],
        [  101,   113, 10117,  ...,     0,     0,     0],
        [  101, 31301, 11008,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'target_ids': tensor([[  101, 23789, 24053,  ...,     0,     0,     0],
        [  101, 21023, 10163,  ...,     0,     0,     0],
        [  101, 23789, 11583,  ...,     0,     0,     0],
        ...,
        [  101, 21023, 17684,  ...,     0,     0,     0],
        [  101,   113, 10672,  ...,     0,     0,     0],
        [  101, 16783, 24571,  ...,     0,     0,     0]]), 'target_a

In [33]:
for batch in train_loader:
    print("Input IDs:", batch['input_ids'])  # Print the input_ids tensor
    print("Target IDs:", batch['target_ids'])  # Print the target_ids tensor
    print("Attention Mask:", batch['attention_mask'])  # Print the attention mask
    print("Target Attention Mask:", batch['target_attention_mask'])  # Print the target attention mask
    break

Input IDs: tensor([[  101, 32070, 94118,  ...,     0,     0,     0],
        [  101,   146, 10104,  ...,     0,     0,     0],
        [  101, 15785,   117,  ...,     0,     0,     0],
        ...,
        [  101, 47464, 28710,  ...,     0,     0,     0],
        [  101,   113, 10117,  ...,     0,     0,     0],
        [  101, 31301, 11008,  ...,     0,     0,     0]])
Target IDs: tensor([[  101, 23789, 24053,  ...,     0,     0,     0],
        [  101, 21023, 10163,  ...,     0,     0,     0],
        [  101, 23789, 11583,  ...,     0,     0,     0],
        ...,
        [  101, 21023, 17684,  ...,     0,     0,     0],
        [  101,   113, 10672,  ...,     0,     0,     0],
        [  101, 16783, 24571,  ...,     0,     0,     0]])
Attention Mask: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
Target Attention Ma

In [34]:
import time

def train_transformer(model, train_loader, optimizer, criterion, epochs=5, print_every=100):
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        start_time = time.time()  # Track time to monitor speed
        for batch_idx, batch in enumerate(train_loader):
            # Move tensors to the device
            src = batch['input_ids'].to(device)
            tgt = batch['target_ids'].to(device)

            optimizer.zero_grad()
            output = model(src, tgt[:, :-1])  # Exclude <eos> token

            # Reshape and calculate loss
            output = output.contiguous().view(-1, output.shape[-1]).to(device)
            tgt = tgt[:, 1:].contiguous().view(-1).to(device)  # Shift target by 1 to compare predictions
            loss = criterion(output, tgt)
            loss.backward()

            optimizer.step()
            epoch_loss += loss.item()

            # Print progress every 'print_every' batches
            if (batch_idx + 1) % print_every == 0:
                elapsed = time.time() - start_time
                avg_loss = epoch_loss / (batch_idx + 1)
                print(f'Epoch [{epoch+1}/{epochs}], Step [{batch_idx+1}], '
                      f'Loss: {avg_loss:.4f}, Time: {elapsed:.2f}s')

        # Print epoch loss after each epoch
        avg_epoch_loss = epoch_loss / (batch_idx + 1)  # Use (batch_idx + 1) since len(train_loader) is not available
        print(f'--- Epoch {epoch + 1} Complete. Average Loss: {avg_epoch_loss:.4f} ---')

In [None]:
# Set parameters
d_model = 256
num_heads = 8
num_layers = 4
d_ff = 2048
dropout = 0.1
#max_seq_len = 256

# Initialize model, optimizer, and loss function
model = Transformer(src_vocab_size=len(tokenizer.vocab), tgt_vocab_size=len(tokenizer.vocab), d_model=d_model, num_heads=num_heads, num_layers=num_layers, d_ff=d_ff, dropout=dropout).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss(ignore_index=0)
# Train the model
train_transformer(model, train_loader, optimizer, criterion, epochs=2)

Epoch [1/2], Step [100], Loss: 9.9323, Time: 1682.98s
Epoch [1/2], Step [200], Loss: 8.7850, Time: 3325.77s
Epoch [1/2], Step [300], Loss: 8.2314, Time: 4946.09s
Epoch [1/2], Step [400], Loss: 7.9002, Time: 6567.80s


In [None]:
from datasets import load_metric

# Load the BLEU metric
bleu = load_metric('sacrebleu')

def evaluate_model(model, test_data, tokenizer):
    model.eval()
    predictions, references = [], []

    with torch.no_grad():
        for batch in test_data:
            src = batch['input_ids'].to(device)
            tgt = batch['target_ids'].to(device)

            output = model(src, tgt[:, :-1])
            pred_ids = torch.argmax(output, dim=-1)

            predictions.extend([tokenizer.decode(pred, skip_special_tokens=True) for pred in pred_ids])
            references.extend([[tokenizer.decode(ref, skip_special_tokens=True)] for ref in tgt])

    # Calculate BLEU score
    score = bleu.compute(predictions=predictions, references=references)
    return score

# Evaluate the model
bleu_score = evaluate_model(model, test_data, tokenizer)
print(f"BLEU Score: {bleu_score['score']}")
