### some library imports

In [1]:
import torch 
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm
import evaluate


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\Shiven\anaconda3\envs\agents\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\Shiven\anaconda3\envs\agents\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\Shiven\anaconda3\envs\agents\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File 

In [2]:
print(torch.__version__)

2.2.0+cpu


In [3]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [4]:
dataset = datasets.load_dataset("bentrevett/multi30k")

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

In [6]:
train_data, valid_data, test_data = (
    dataset["train"],
    dataset["validation"],
    dataset["test"]
)

In [7]:
train_data[123]

{'en': 'A man is standing on a ladder painting bricks',
 'de': 'Ein Mann steht auf einer Leiter und malt Ziegel.'}

### tokenzation

In [8]:
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")

In [9]:
def tokenize(example, en_nlp, de_nlp, max_len, lower, sos_token, eos_token):
    en_tokens = [token.text for token in en_nlp.tokenizer(example["en"])][:max_len]
    de_tokens = [token.text for token in de_nlp.tokenizer(example["de"])][:max_len]
    if lower:
        en_tokens = [token.lower() for token in en_tokens]
        de_tokens = [token.lower() for token in de_tokens]
    de_tokens = [sos_token] + de_tokens + [eos_token]
    en_tokens = [sos_token] + en_tokens + [eos_token]
    return {"en_tokens": en_tokens, "de_tokens": de_tokens}

In [10]:
fn_kwargs = {
    "en_nlp": en_nlp,
    "de_nlp": de_nlp,
    "max_len": 1000,
    "lower": True,
    "sos_token": "<sos>",
    "eos_token": "<eos>"
}

In [11]:
train_data = train_data.map(tokenize, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize, fn_kwargs=fn_kwargs)

In [12]:
train_data[12]

{'en': 'A black dog and a spotted dog are fighting',
 'de': 'Ein schwarzer Hund und ein gefleckter Hund kämpfen.',
 'en_tokens': ['<sos>',
  'a',
  'black',
  'dog',
  'and',
  'a',
  'spotted',
  'dog',
  'are',
  'fighting',
  '<eos>'],
 'de_tokens': ['<sos>',
  'ein',
  'schwarzer',
  'hund',
  'und',
  'ein',
  'gefleckter',
  'hund',
  'kämpfen',
  '.',
  '<eos>']}

In [13]:
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"
sos_token = "<sos>"
eos_token = "<eos>"

special_tokens = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]

en_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["en_tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)
en_vocab.set_default_index(en_vocab[unk_token])  # Add this line

de_vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["de_tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)
de_vocab.set_default_index(de_vocab[unk_token])  # Add this line

In [14]:
en_vocab.get_itos()[:20] #itos = integer to string

['<unk>',
 '<pad>',
 '<sos>',
 '<eos>',
 'a',
 '.',
 'in',
 'the',
 'on',
 'man',
 'is',
 'and',
 'of',
 'with',
 'woman',
 ',',
 'two',
 'are',
 'to',
 'people']

In [15]:

en_vocab.get_stoi()["nice"] #stoi = string to integer

999

In [16]:
#number of unique tokens in each vocab
len(en_vocab), len(de_vocab)

(5893, 7853)

In [17]:
tokens = ["i", "love", "watching", "all", "shows"]
en_vocab.lookup_indices(tokens)

[956, 2169, 173, 255, 821]

In [18]:
def numericalize(example, en_vocab, de_vocab):
    en_ids = en_vocab.lookup_indices(example["en_tokens"])
    de_ids = de_vocab.lookup_indices(example["de_tokens"])
    return {"en_ids": en_ids, "de_ids": de_ids}

In [19]:
fn_kwargs = {"en_vocab": en_vocab, "de_vocab": de_vocab}


train_data = train_data.map(numericalize, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(numericalize, fn_kwargs=fn_kwargs)
test_data = test_data.map(numericalize, fn_kwargs=fn_kwargs)

In [20]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>'],
 'en_ids': [2, 16, 24, 15, 25, 778, 17, 57, 80, 202, 1312, 5, 3],
 'de_ids': [2, 18, 26, 253, 30, 84, 20, 88, 7, 15, 110, 7647, 3171, 4, 3]}

In [21]:
en_vocab.lookup_tokens(train_data[0]["en_ids"])

['<sos>',
 'two',
 'young',
 ',',
 'white',
 'males',
 'are',
 'outside',
 'near',
 'many',
 'bushes',
 '.',
 '<eos>']

In [22]:
data_type = "torch"
format_columns = ["en_ids", "de_ids"]

train_data = train_data.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

valid_data = valid_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

test_data = test_data.with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

In [23]:

train_data[0]

{'en_ids': tensor([   2,   16,   24,   15,   25,  778,   17,   57,   80,  202, 1312,    5,
            3]),
 'de_ids': tensor([   2,   18,   26,  253,   30,   84,   20,   88,    7,   15,  110, 7647,
         3171,    4,    3]),
 'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.',
 'en_tokens': ['<sos>',
  'two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.',
  '<eos>'],
 'de_tokens': ['<sos>',
  'zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.',
  '<eos>']}

### data loaders

In [24]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_en_ids = [example["en_ids"] for example in batch]
        batch_de_ids = [example["de_ids"] for example in batch]
        batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
        batch_de_ids = nn.utils.rnn.pad_sequence(batch_de_ids, padding_value=pad_index)
        batch = {
            "en_ids": batch_en_ids,
            "de_ids": batch_de_ids,
        }
        return batch

    return collate_fn

In [25]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

In [26]:

batch_size = 128
pad_index = 0
train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

### encoder - decoder 

In [27]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

In [41]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embeddings = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embeddings(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

        

In [54]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dims of encoder and decoder should be equal"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Num of layers in encoder and decoder should be equal"
    
    # def forward(self, src, trg, teacher_forcing_ratio):
    #     batch_size = trg.shape[1]
    #     trg_length = trg.shape[0]
    #     trg_vocab_size = self.decoder.output_dim
    #     outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
    #     hidden, cell = self.encoder(src)
    #     input = trg[0, :]
    #     for t in range(1, trg_length):
    #         output, hidden, cell = self.decoder(input, hidden, cell)
    #         outputs[t] = output
    #         teacher_force = output.argmax(1)
    #         input = trg[t] if teacher_force else top1
    #     return outputs

    def forward(self, src, trg, teacher_forcing_ratio):
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        
        hidden, cell = self.encoder(src)
        input = trg[0, :]
        
        for t in range(1, trg_length):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            
            # Randomly decide whether to teacher force
            teacher_force = random.random() < teacher_forcing_ratio
            
            # Get the highest predicted token
            top1 = output.argmax(-1)
            
            # If teacher forcing, use actual target token. If not, use predicted token
            input = trg[t] if teacher_force else top1
        
        return outputs


In [55]:
input_dim = len(de_vocab)
output_dim = len(en_vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_dim = 512
n_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [56]:
device

device(type='cpu')

In [57]:
torch.__version__

'2.2.0+cpu'

In [58]:
encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    hidden_dim,
    n_layers,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    n_layers,
    decoder_dropout,
)

model = Seq2Seq(encoder, decoder, device).to(device)

In [59]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embeddings): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [60]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"No. of trainable params: {count_parameters(model):,}")

No. of trainable params: 13,898,501


In [61]:
optimizer = optim.Adam(model.parameters())

In [62]:
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [63]:
def train_fn(model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch["de_ids"].to(device)
        trg = batch["en_ids"].to(device)
        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss/len(data_loader)

In [64]:
def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            src = batch["de_ids"].to(device)
            trg = batch["en_ids"].to(device)
            output = model(src, trg, 0)
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [65]:
n_epochs = 10
clip = 1.0
teacher_forcing_ratio = 0.5
best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(model, train_data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device,)
    valid_loss = evaluate_fn(model, valid_data_loader, criterion, device,)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut1-model.pt")
    
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

  torch.nn.utils.clip_grad_norm(model.parameters(), clip)
 10%|█         | 1/10 [06:55<1:02:22, 415.87s/it]

	Train Loss:   5.055 | Train PPL: 156.841
	Valid Loss:   5.024 | Valid PPL: 152.043


 20%|██        | 2/10 [13:46<55:02, 412.78s/it]  

	Train Loss:   4.452 | Train PPL:  85.769
	Valid Loss:   4.730 | Valid PPL: 113.240


 30%|███       | 3/10 [20:18<47:01, 403.09s/it]

	Train Loss:   4.153 | Train PPL:  63.650
	Valid Loss:   4.554 | Valid PPL:  94.995


 40%|████      | 4/10 [26:52<39:58, 399.80s/it]

	Train Loss:   3.914 | Train PPL:  50.092
	Valid Loss:   4.464 | Valid PPL:  86.841


 50%|█████     | 5/10 [34:02<34:12, 410.57s/it]

	Train Loss:   3.747 | Train PPL:  42.396
	Valid Loss:   4.345 | Valid PPL:  77.112


 60%|██████    | 6/10 [40:54<27:24, 411.18s/it]

	Train Loss:   3.584 | Train PPL:  36.030
	Valid Loss:   4.151 | Valid PPL:  63.509


 70%|███████   | 7/10 [47:26<20:14, 404.74s/it]

	Train Loss:   3.418 | Train PPL:  30.502
	Valid Loss:   4.058 | Valid PPL:  57.836


 80%|████████  | 8/10 [53:57<13:20, 400.48s/it]

	Train Loss:   3.283 | Train PPL:  26.664
	Valid Loss:   3.932 | Valid PPL:  51.020


 90%|█████████ | 9/10 [1:00:42<06:41, 401.80s/it]

	Train Loss:   3.142 | Train PPL:  23.160
	Valid Loss:   3.888 | Valid PPL:  48.827


100%|██████████| 10/10 [1:07:11<00:00, 403.20s/it]

	Train Loss:   3.004 | Train PPL:  20.160
	Valid Loss:   3.828 | Valid PPL:  45.976





In [67]:
model.load_state_dict(torch.load("tut1-model.pt"))

test_loss = evaluate_fn(model, test_data_loader, criterion, device)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

| Test Loss: 3.789 | Test PPL:  44.203 |


In [68]:
print(1)

1


In [69]:
def translate_sentence(sentence, model, en_nlp, de_nlp, en_vocab, de_vocab, lower, sos_token, eos_token, device, max_output_length=25):
    model.eval()
    with torch.no_grad():
        if isinstance(sentence, str):
            tokens = [token.text for token in de_nlp.tokenizer(sentence)]
        else:
            tokens = [token for token in sentence]
        if lower:
            tokens = [token.lower() for token in tokens]
        tokens = [sos_token]+tokens+[eos_token]
        ids = de_vocab.lookup_indices(tokens)
        tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
        hidden, cell = model.encoder(tensor)
        inputs = en_vocab.lookup_indices([sos_token])
        for _ in range(max_output_length):
            input_tensors = torch.LongTensor([inputs[-1]]).to(device)
            output, hidden, cell = model.decoder(input_tensors, hidden, cell)
            predicted_token = output.argmax(-1).item()
            inputs.append(predicted_token)
            if predicted_token == en_vocab[eos_token]:
                break
        tokens = en_vocab.lookup_tokens(inputs)
    return tokens

In [75]:

sentence = test_data[0]["de"]
expected_translation = test_data[0]["en"]

sentence, expected_translation

('Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.',
 'A man in an orange hat starring at something.')

In [76]:
lower = True
translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)

In [77]:
translation

['<sos>', 'a', 'man', 'in', 'a', 'black', 'hat', 'is', 'working', '.', '<eos>']

In [78]:
sentence = "Ein Mann sitzt auf einer Bank."

In [79]:
translation = translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
)

In [80]:
translation

['<sos>', 'a', 'man', 'sits', 'on', 'a', 'bench', '.', '<eos>']

In [81]:
bleu = evaluate.load("bleu")

Downloading builder script: 100%|██████████| 5.94k/5.94k [00:00<00:00, 2.98MB/s]
Downloading extra modules: 4.07kB [00:00, 330kB/s]                    
Downloading extra modules: 100%|██████████| 3.34k/3.34k [00:00<?, ?B/s]


In [83]:
translations = [
    translate_sentence(
        example["de"],
        model,
        en_nlp,
        de_nlp,
        en_vocab,
        de_vocab,
        lower,
        sos_token,
        eos_token,
        device,
    )
    for example in tqdm.tqdm(test_data)
]

100%|██████████| 1000/1000 [00:42<00:00, 23.52it/s]


In [84]:
predictions = [" ".join(translation[1:-1]) for translation in translations]

references = [[example["en"]] for example in test_data]

In [85]:
def get_tokenizer_fn(nlp, lower):
    def tokenizer_fn(s):
        tokens = [token.text for token in nlp.tokenizer(s)]
        if lower:
            tokens = [token.lower() for token in tokens]
        return tokens

    return tokenizer_fn

In [86]:
tokenizer_fn = get_tokenizer_fn(en_nlp, lower)
tokenizer_fn(predictions[0]), tokenizer_fn(references[0][0])

(['a', 'man', 'in', 'a', 'black', 'hat', 'is', 'working', '.'],
 ['a', 'man', 'in', 'an', 'orange', 'hat', 'starring', 'at', 'something', '.'])

In [87]:
results = bleu.compute(
    predictions=predictions, references=references, tokenizer=tokenizer_fn
)


In [88]:
results

{'bleu': 0.1394542490454578,
 'precisions': [0.5066445182724253,
  0.20643115942028986,
  0.10009960159362549,
  0.05066371681415929],
 'brevity_penalty': 0.9189243341345873,
 'length_ratio': 0.9220401286567621,
 'translation_length': 12040,
 'reference_length': 13058}