In [4]:
!pip install -q -y datasets sentencepiece

# Load the English-German translation dataset
from datasets import load_dataset

# Load the English-German translation dataset
dataset = load_dataset("IWSLT/iwslt2017", "iwslt2017-de-en", split={'train': 'train', 'validation': 'validation', 'test': 'test'})

import sentencepiece as spm
import os

# Create directories to store raw and processed data
os.makedirs("raw", exist_ok=True)
os.makedirs("spm", exist_ok=True)

# Save raw text for training SentencePiece
for split in ['train', 'validation', 'test']:
    with open(f"raw/{split}.en", "w", encoding="utf-8") as en_file, \
         open(f"raw/{split}.de", "w", encoding="utf-8") as de_file:
        for example in dataset[split]:
            en_file.write(example['translation']['en'].lower().strip() + "\n")
            de_file.write(example['translation']['de'].lower().strip() + "\n")

# Combine English and German training data for SentencePiece training
with open("raw/combined.txt", "w", encoding="utf-8") as combined_file:
    with open("raw/train.en", "r", encoding="utf-8") as en_file:
        combined_file.write(en_file.read())
    with open("raw/train.de", "r", encoding="utf-8") as de_file:
        combined_file.write(de_file.read())

# Train SentencePiece model
spm.SentencePieceTrainer.train(
    input="raw/combined.txt",
    model_prefix="spm/spm_model",
    vocab_size=8000,
    character_coverage=1.0,
    model_type='bpe'
)

# Load the trained SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load("spm/spm_model.model")

# Create directory to store encoded data
os.makedirs("encoded", exist_ok=True)

# Function to encode and save data
def encode_file(input_path, output_path):
    with open(input_path, "r", encoding="utf-8") as infile, \
         open(output_path, "w", encoding="utf-8") as outfile:
        for line in infile:
            token_ids = sp.encode(line.strip(), out_type=int)
            outfile.write(" ".join(map(str, token_ids)) + "\n")

# Encode train, validation, and test splits
for split in ['train', 'validation', 'test']:
    encode_file(f"raw/{split}.en", f"encoded/{split}.en.ids")
    encode_file(f"raw/{split}.de", f"encoded/{split}.de.ids")

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# Custom Dataset class
class TranslationDataset(Dataset):
    def __init__(self, src_path, tgt_path):
        self.src_data = []
        self.tgt_data = []
        with open(src_path, "r", encoding="utf-8") as src_file, \
             open(tgt_path, "r", encoding="utf-8") as tgt_file:
            for src_line, tgt_line in zip(src_file, tgt_file):
                src_ids = list(map(int, src_line.strip().split()))
                tgt_ids = list(map(int, tgt_line.strip().split()))
                self.src_data.append(torch.tensor(src_ids, dtype=torch.long))
                self.tgt_data.append(torch.tensor(tgt_ids, dtype=torch.long))

    def __len__(self):
        return len(self.src_data)

    def __getitem__(self, idx):
        return self.src_data[idx], self.tgt_data[idx]

# Collate function for DataLoader
def collate_batch(batch):
    src_batch, tgt_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, batch_first=True, padding_value=sp.pad_id())
    tgt_batch = pad_sequence(tgt_batch, batch_first=True, padding_value=sp.pad_id())
    return src_batch, tgt_batch

# Create DataLoaders
train_dataset = TranslationDataset("encoded/train.en.ids", "encoded/train.de.ids")
valid_dataset = TranslationDataset("encoded/validation.en.ids", "encoded/validation.de.ids")
test_dataset = TranslationDataset("encoded/test.en.ids", "encoded/test.de.ids")

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)



[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m430.1/491.2 kB[0m [31m12.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dep

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/18.5k [00:00<?, ?B/s]

iwslt2017.py:   0%|          | 0.00/8.17k [00:00<?, ?B/s]

The repository for IWSLT/iwslt2017 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/IWSLT/iwslt2017.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


de-en.zip:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/206112 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8079 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/888 [00:00<?, ? examples/s]

In [5]:
from torch import nn
import torch
class SingleHeadAttentionBlock(nn.Module):
    def __init__(self, embedding_dimension, num_of_heads):
        super(SingleHeadAttentionBlock, self).__init__()
        self.num_of_heads = num_of_heads
        self.embedding_dimension = embedding_dimension
        self.layers_input_dimension = self.embedding_dimension // self.num_of_heads
        self.queries_layer = nn.Linear(self.embedding_dimension, self.embedding_dimension)
        self.keys_layer = nn.Linear(self.embedding_dimension, self.embedding_dimension)
        self.values_layer = nn.Linear(self.embedding_dimension, self.embedding_dimension)
    def forward(self, x):
        queries = self.queries_layer(x)
        keys = self.keys_layer(x)
        attention = torch.matmul(queries, keys.transpose(-2, -1))
        attention *= 1 / self.embedding_dimension ** 0.5
        attention = torch.softmax(attention, dim=-1)
        values = self.values_layer(x)
        attention_valued = torch.matmul(attention, values)
        return attention_valued
class MultiHeadAttentionBlock(nn.Module):
    def __init__(self, embedding_dimension, num_of_heads):
        super(MultiHeadAttentionBlock, self).__init__()
        self.embedding_dimension = embedding_dimension
        self.num_of_heads = num_of_heads
        self.layers_input_dimension = self.embedding_dimension // self.num_of_heads
        self.queries_layer = nn.ModuleList([nn.Linear(self.embedding_dimension, self.layers_input_dimension) for i in range(0, num_of_heads)])
        self.keys_layer = nn.ModuleList([nn.Linear(self.embedding_dimension, self.layers_input_dimension) for i in range(0, num_of_heads)])
        self.values_layer = nn.ModuleList([nn.Linear(self.embedding_dimension, self.layers_input_dimension) for i in range(0, num_of_heads)])
        self.linear_layer = nn.Linear(self.embedding_dimension, self.embedding_dimension)
    def forward(self, x):
        attention_values = []
        for i in range(0, self.num_of_heads): # inefficient implementation that does not use direct matrix multiplication and thus is not parallelizable but for now we will make do. To make it parallelizable you should just make the queries and keys layers of size embedding dim instead of layers_input and such.
            queries = self.queries_layer[i](x)
            keys = self.keys_layer[i](x)
            values = self.values_layer[i](x)
            attention = torch.matmul(queries, keys.transpose(-2, -1))
            attention *= 1 / self.layers_input_dimension ** 0.5
            attention = torch.softmax(attention, dim=-1)
            attention_values.append(torch.matmul(attention, values))
        return self.linear_layer(torch.cat(attention_values, dim=-1))
class MaskedMultiHeadAttentionBlock(nn.Module):
    def __init__(self, embedding_dimension, num_of_heads):
        super(MaskedMultiHeadAttentionBlock, self).__init__()
        self.embedding_dimension = embedding_dimension
        self.num_of_heads = num_of_heads
        self.layers_input_dimension = self.embedding_dimension // self.num_of_heads
        self.queries_layer = nn.ModuleList([nn.Linear(self.embedding_dimension, self.layers_input_dimension) for i in range(0, num_of_heads)])
        self.keys_layer = nn.ModuleList([nn.Linear(self.embedding_dimension, self.layers_input_dimension) for i in range(0, num_of_heads)])
        self.values_layer = nn.ModuleList([nn.Linear(self.embedding_dimension, self.layers_input_dimension) for i in range(0, num_of_heads)])
        self.linear_layer = nn.Linear(self.embedding_dimension, self.embedding_dimension)
    def forward(self, x):
        batch_size, sequence_length, _ = x.shape
        mask = torch.tril(torch.ones(sequence_length, sequence_length)).unsqueeze(0).unsqueeze(0).to(x.device) # just noticed that docs entail static generate_square_subsequent_mask(sz, device=None, dtype=None) which is probably better but whatever.
        attention_values = []
        for i in range(0, self.num_of_heads): # inefficient implementation that does not use direct matrix multiplication and thus is not parallelizable but for now we will make do. To make it parallelizable you should just make the queries and keys layers of size embedding dim instead of layers_input and such.
            queries = self.queries_layer[i](x)
            keys = self.keys_layer[i](x)
            values = self.values_layer[i](x)
            attention = torch.matmul(queries, keys.transpose(-2, -1))
            attention *= 1 / self.layers_input_dimension ** 0.5
            attention = attention.masked_fill(mask == 0, float('-inf'))
            attention = torch.softmax(attention, dim=-1)
            attention_values.append(torch.matmul(attention, values))
        return self.linear_layer(torch.cat(attention_values, dim=-1))
class CrossMultiHeadAttention(nn.Module):
    def __init__(self, embedding_dimension, num_of_heads):
        super(CrossMultiHeadAttention, self).__init__()
        self.embedding_dimension = embedding_dimension
        self.num_of_heads = num_of_heads
        self.layers_input_dimension = self.embedding_dimension // self.num_of_heads
        self.queries_layer = nn.ModuleList([nn.Linear(self.embedding_dimension, self.layers_input_dimension) for i in range(0, num_of_heads)])
        self.keys_layer = nn.ModuleList([nn.Linear(self.embedding_dimension, self.layers_input_dimension) for i in range(0, num_of_heads)])
        self.values_layer = nn.ModuleList([nn.Linear(self.embedding_dimension, self.layers_input_dimension) for i in range(0, num_of_heads)])
        self.linear_layer = nn.Linear(self.embedding_dimension, self.embedding_dimension)
    def forward(self, encoder_output, decoder_output):
        attention_values = []
        for i in range(0, self.num_of_heads): # inefficient implementation that does not use direct matrix multiplication and thus is not parallelizable but for now we will make do. To make it parallelizable you should just make the queries and keys layers of size embedding dim instead of layers_input and such.
            queries = self.queries_layer[i](decoder_output)
            keys = self.keys_layer[i](encoder_output)
            values = self.values_layer[i](encoder_output)
            attention = torch.matmul(queries, keys.transpose(-2, -1))
            attention *= 1 / self.layers_input_dimension  ** 0.5 # might need to separate this into 2 different computations to ensure it is calculated on cude.
            attention = torch.softmax(attention, dim=-1)
            attention_values.append(torch.matmul(attention, values))
        return self.linear_layer(torch.cat(attention_values, dim=-1))

In [6]:
class EncoderBlock(nn.Module):
    def __init__(self, embedding_dimension, num_of_heads):
        super(EncoderBlock, self).__init__()
        self.multi_head_attention = MultiHeadAttentionBlock(embedding_dimension, num_of_heads)
        self.layer_norm = nn.LayerNorm(embedding_dimension)
        self.mlp_layer = nn.Sequential(
            nn.Linear(embedding_dimension, embedding_dimension * 4),
            nn.ReLU(),
            nn.Linear(embedding_dimension * 4, embedding_dimension)
        ) # This is taken directly from the paper. Honestly it feels to me kind of whack but whatever.
        self.final_layer_norm = nn.LayerNorm(embedding_dimension)
    def forward(self, x):
      x = self.multi_head_attention(x)   + x
      x = self.layer_norm(x)
      x = self.mlp_layer(x) + x
      x = self.final_layer_norm(x)
      return x

In [7]:
class DecoderBlock(nn.Module):
    def __init__(self, embedding_dimension, num_of_heads):
      super(DecoderBlock, self).__init__()
      self.masked_multi_head_attention = MaskedMultiHeadAttentionBlock(embedding_dimension, num_of_heads)
      self.layer_norm = nn.LayerNorm(embedding_dimension)
      self.multi_head_attention = CrossMultiHeadAttention(embedding_dimension, num_of_heads)
      self.layer_norm_second = nn.LayerNorm(embedding_dimension)
      self.mlp_layer = nn.Sequential(
          nn.Linear(embedding_dimension, embedding_dimension * 4),
          nn.ReLU(),
          nn.Linear(embedding_dimension * 4, embedding_dimension)
      )
      self.final_layer_norm = nn.LayerNorm(embedding_dimension)
    def forward(self, x, encoder_output):
      x = self.masked_multi_head_attention(x) + x
      x = self.layer_norm(x)
      x = self.multi_head_attention(encoder_output, x) + x
      x = self.layer_norm_second(x)
      x = self.mlp_layer(x) + x
      x = self.final_layer_norm(x)
      return x

In [8]:
class PositionalEncoding(torch.nn.Module):
    def __init__(self, max_length, embedding_dimension):
        super(PositionalEncoding, self).__init__()
        self.positional_embeddings = torch.nn.Embedding(max_length, embedding_dimension)
    def forward(self, x):
        batch_size, sequence_length, _ = x.shape
        return self.positional_embeddings(torch.arange(sequence_length, device=x.device).unsqueeze(0).repeat(batch_size, 1))
class RelativePositionalEncoding(torch.nn.Module):
    def __init__(self, embedding_dimension, max_length):
        super(RelativePositionalEncoding, self).__init__()
        self.embedding_dim = embedding_dimension
        self.max_len = max_length
        self.relative_positions = torch.nn.Embedding(2 * max_length - 1, embedding_dimension)
    def forward(self, seq_len):
        positions = torch.arange(seq_len, device=self.relative_positions.weight.device)
        relative_positions = positions.unsqueeze(0) - positions.unsqueeze(1)
        relative_positions += self.max_len - 1
        return self.relative_positions(relative_positions)

In [16]:
class EncoderDecoderTransformer(nn.Module): # need to add dropout.
  def __init__(self, embedding_dimension, num_of_heads, num_of_layers):
    super(EncoderDecoderTransformer, self).__init__()
    self.embedding_dimension = embedding_dimension
    self.num_of_heads = num_of_heads
    self.num_of_layers = num_of_layers
    self.positional_encoding_layer = PositionalEncoding(1024, embedding_dimension)
    self.encoder_blocks = nn.ModuleList([EncoderBlock(embedding_dimension, num_of_heads) for i in range(0, num_of_layers)])
    self.decoder_blocks = nn.ModuleList([DecoderBlock(embedding_dimension, num_of_heads) for i in range(0, num_of_layers)])
    self.embedding_layer_input = nn.Embedding(sp.vocab_size() + 1, embedding_dimension)
    self.embedding_layer_output = nn.Embedding(sp.vocab_size() + 1, embedding_dimension)
    self.linear_layer = nn.Linear(embedding_dimension, sp.vocab_size() + 1)
  def forward(self, x, y):
    print(x.shape)
    input_embedding = self.embedding_layer_input(x)
    output_embedding = self.embedding_layer_output(y)
    encoder_x = input_embedding + self.positional_encoding_layer(input_embedding)
    for i in range(0, self.num_of_layers):
      encoder_x = self.encoder_blocks[i](encoder_x)
    decoder_y = output_embedding + self.positional_encoding_layer(output_embedding)
    for i in range(0, self.num_of_layers):
      decoder_y = self.decoder_blocks[i](decoder_y, encoder_x)
    return self.linear_layer(decoder_y)

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocabs = [sp.id_to_piece(id) for id in range(sp.get_piece_size())]
model = EncoderDecoderTransformer(1024, 1, 1).to(device)

In [18]:
def shift_target(target_tensor):
    return target_tensor[:, :-1], target_tensor[:, 1:]

eos_token_id = 8000
optimizer = torch.optim.Adam(model.parameters(), lr=0.00005)
criterion = nn.CrossEntropyLoss(ignore_index=sp.pad_id())
model.train()
total_loss = 0
for source_batch, target_batch in train_loader:
    source_batch[source_batch == -1] = eos_token_id
    target_batch[target_batch == -1] = eos_token_id
    source_batch = source_batch.to(device)
    target_batch = target_batch.to(device)
    target_input, target_expected = shift_target(target_batch)
    optimizer.zero_grad()
    # print(source_batch[:1], target_input[:1])
    output = model(source_batch, target_input)
    loss = criterion(output.reshape(-1, output.size(-1)), target_expected.reshape(-1))
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
    print(loss)


torch.Size([32, 49])
tensor(9.0499, grad_fn=<NllLossBackward0>)
torch.Size([32, 52])
tensor(4.1445, grad_fn=<NllLossBackward0>)
torch.Size([32, 98])
tensor(2.5746, grad_fn=<NllLossBackward0>)
torch.Size([32, 53])
tensor(3.4796, grad_fn=<NllLossBackward0>)
torch.Size([32, 94])
tensor(1.7125, grad_fn=<NllLossBackward0>)
torch.Size([32, 70])
tensor(2.8485, grad_fn=<NllLossBackward0>)
torch.Size([32, 66])
tensor(2.5207, grad_fn=<NllLossBackward0>)
torch.Size([32, 69])
tensor(2.1504, grad_fn=<NllLossBackward0>)
torch.Size([32, 91])
tensor(2.1831, grad_fn=<NllLossBackward0>)
torch.Size([32, 52])
tensor(2.4593, grad_fn=<NllLossBackward0>)
torch.Size([32, 97])
tensor(2.1920, grad_fn=<NllLossBackward0>)
torch.Size([32, 59])
tensor(3.1272, grad_fn=<NllLossBackward0>)
torch.Size([32, 72])
tensor(2.1465, grad_fn=<NllLossBackward0>)
torch.Size([32, 86])
tensor(2.0604, grad_fn=<NllLossBackward0>)
torch.Size([32, 71])
tensor(2.8652, grad_fn=<NllLossBackward0>)
torch.Size([32, 61])
tensor(3.1407, grad

KeyboardInterrupt: 

In [22]:
source_batch[0]

tensor([  82, 7885, 7865,   22, 1701, 3151, 7875, 1231,  570, 2238,  167,  324,
         307,    7, 6300,  221, 7880,  614, 7879, 8000, 8000, 8000, 8000, 8000,
        8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000,
        8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000,
        8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000,
        8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000,
        8000, 8000, 8000, 8000])

In [21]:
target_batch[0]

tensor([  99,   98,   86,  165, 7868,  700, 3151, 7875, 1231,  570, 2238, 7890,
         145, 1035,  179,   50,   72, 3318,  264, 7879, 8000, 8000, 8000, 8000,
        8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000,
        8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000,
        8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000,
        8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000,
        8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000, 8000])

In [36]:
torch.argmax(model(source_batch[0].unsqueeze(0), torch.tensor([99]).unsqueeze(0)))

torch.Size([1, 76])


tensor(98)

torch.Size([1, 76])