In [None]:
!python -m pip install --upgrade pip
!python -m pip install torchtext==0.6.0
!python -m pip install einops

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0.1
Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl.metadata (6.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->torchtext==0.6.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->torchtext==0.6.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->torchtext==0.6.0)
  Downloading nvidia_cu

In [None]:
!python -m pip install spacy



In [None]:
!python -m spacy download en
!python -m spacy download de

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m93.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'de' are deprecated. Please use the
full pipeline package name 'de_core_news_sm' instead.[0m
Collecting de-core-news-sm==3.8.0
  Downloading https://gith

In [None]:
import numpy as np
import pandas
import torch
import random
from typing import Tuple

import torch
import torchtext
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.utils import download_from_url, extract_archive
import io

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch import Tensor

Implementing scaled dot product attention

In [None]:
def softmax(x, axis=0):
  """Numerically stable softmax of ``x`` along ``axis``.

  The maximum along ``axis`` is subtracted before exponentiating, so large
  inputs no longer overflow to ``inf`` (which previously produced ``nan``
  from ``inf / inf``). The result is mathematically unchanged because
  softmax is invariant to adding a constant along the normalised axis.

  Args:
    x: Array-like input; it is converted with ``np.asarray``.
    axis: Axis along which the outputs sum to 1. Defaults to 0, matching
      the previous hard-coded behaviour.

  Returns:
    ``np.ndarray`` with the same shape as ``x``.
  """
  x = np.asarray(x)
  if x.size == 0:
    # Nothing to normalise; np.max would raise on an empty reduction.
    return np.exp(x)
  # keepdims=True keeps the reduced axis so the subtraction/division
  # broadcast correctly for any choice of ``axis``.
  shifted = x - np.max(x, axis=axis, keepdims=True)
  exps = np.exp(shifted)
  return exps / np.sum(exps, axis=axis, keepdims=True)

In [None]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_k)) V``.

    Implemented with torch ops only, so it works on any device and keeps
    the autograd graph intact (inputs are not detached).

    Args:
        dropout: Dropout probability applied to the attention weights
            before they are multiplied with ``value`` (active only in
            training mode).
    """

    def __init__(self, dropout=0.1):
        super(ScaledDotProductAttention, self).__init__()
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, keys, value):
        """Attend from ``query`` over ``keys`` / ``value``.

        Args:
            query: Tensor of shape ``(..., q_len, d_k)``.
            keys: Tensor of shape ``(..., k_len, d_k)``.
            value: Tensor of shape ``(..., k_len, d_v)``.

        Returns:
            Tuple ``(out, attention)`` where ``out`` has shape
            ``(..., q_len, d_v)`` and ``attention`` has shape
            ``(..., q_len, k_len)``. ``attention`` holds the normalised
            weights before dropout; each row sums to 1 over ``k_len``.
        """
        # Scale by sqrt of the key/query feature size, not by the scores
        # themselves; this keeps the logits in a range where softmax has
        # useful gradients.
        d_k = query.size(-1)
        # transpose(-2, -1) swaps only the last two dims, so leading batch
        # dims are preserved (``.T`` would reverse every dimension).
        scores = torch.matmul(query, keys.transpose(-2, -1)) / (d_k ** 0.5)
        # Normalise over the key axis so each query gets a distribution
        # over the keys.
        attention = F.softmax(scores, dim=-1)
        out = torch.matmul(self.dropout(attention), value)
        return out, attention




Seq2seq model adapted from two tutorials in the PyTorch docs: https://pytorch.org/tutorials/beginner/torchtext_translation_tutorial.html and https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

In [None]:
class Decoder(nn.Module):
  """Single-step GRU decoder that attends over the encoder outputs.

  Tensor layouts implied by the layers below (the GRU is not batch-first):
    * ``input``:            ``[batch]`` token ids
    * ``decoder_hidden``:   ``[batch, dec_hid_dim]``
    * ``encoder_outputs``:  ``[src_len, batch, enc_hid_dim * 2]``

  The attention module is expected to accept batched
  ``(batch, len, dim)`` query/key/value tensors and return
  ``(context, weights)``.

  Args:
    output_dim: Size of the target vocabulary (also the logit width).
    embed_dim: Width of the target token embeddings.
    enc_hid_dim: Encoder hidden size; encoder outputs are expected to be
      ``enc_hid_dim * 2`` wide.
    dec_hid_dim: Hidden size of the decoder GRU.
    dropout: Dropout probability applied to the token embeddings.
  """

  def __init__(self, output_dim, embed_dim, enc_hid_dim, dec_hid_dim, dropout):
    super().__init__()
    self.output_dim = output_dim
    self.embed_dim = embed_dim
    self.enc_hid_dim = enc_hid_dim
    self.dec_hid_dim = dec_hid_dim
    self.attention = ScaledDotProductAttention()

    self.embedding = nn.Embedding(output_dim, embed_dim)

    self.rnn = nn.GRU((enc_hid_dim * 2) + embed_dim, dec_hid_dim)
    self.out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + embed_dim, output_dim)
    self.dropout = nn.Dropout(dropout)

    # A dot product needs query and keys of equal width. When the decoder
    # state and the encoder outputs differ in size, project the query;
    # otherwise this is a parameter-free no-op.
    if dec_hid_dim != enc_hid_dim * 2:
      self.query_proj = nn.Linear(dec_hid_dim, enc_hid_dim * 2, bias=False)
    else:
      self.query_proj = nn.Identity()

  def _weighted_encoder_rep(self, decoder_hidden, encoder_outputs):
    """Return the attention context as ``[1, batch, enc_hid_dim * 2]``.

    The attention module already returns the weighted sum of the values
    as its first result, so that context is used directly rather than
    being multiplied with the encoder outputs a second time.
    """
    # [batch, dec_hid] -> [batch, 1, enc_hid * 2]: one query per batch item.
    query = self.query_proj(decoder_hidden).unsqueeze(1)
    # [src_len, batch, enc_hid * 2] -> [batch, src_len, enc_hid * 2]
    memory = encoder_outputs.permute(1, 0, 2)

    context, _ = self.attention(query, memory, memory)

    # Back to sequence-first so it can be concatenated with the embedding.
    return context.permute(1, 0, 2)

  def forward(self, input, decoder_hidden, encoder_outputs):
    """Decode one target step.

    Returns:
      Tuple ``(output, decoder_hidden)``: vocabulary logits of shape
      ``[batch, output_dim]`` and the updated hidden state of shape
      ``[batch, dec_hid_dim]``.
    """
    # Add a length-1 sequence dimension: [batch] -> [1, batch].
    input = input.unsqueeze(0)
    embedded = self.dropout(self.embedding(input))
    weighted_encoder_rep = self._weighted_encoder_rep(decoder_hidden, encoder_outputs)
    rnn_input = torch.cat((embedded, weighted_encoder_rep), dim = 2)
    # The GRU expects h0 as [num_layers, batch, dec_hid].
    output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))
    # Drop the length-1 sequence dimension before the output projection.
    embedded = embedded.squeeze(0)
    output = output.squeeze(0)
    weighted_encoder_rep = weighted_encoder_rep.squeeze(0)
    output = self.out(torch.cat((output, weighted_encoder_rep, embedded), dim = 1))

    return output, decoder_hidden.squeeze(0)




English and German data preprocessing

In [None]:
class EncoderRNN(nn.Module):
    """Token encoder: embedding -> dropout -> single-layer batch-first GRU.

    Args:
        input_size: Number of rows in the embedding table (source vocabulary size).
        embed_size: Width of each token embedding.
        hidden_size: Hidden size of the GRU.
        dropout_p: Dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, embed_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        # Sub-modules are registered in the order embedding, gru, dropout.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embed_size)
        self.gru = nn.GRU(input_size=embed_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, input):
        """Encode a batch of token ids.

        Because the GRU is batch-first, dimension 0 of ``input`` is treated
        as the batch dimension.

        Returns:
            Tuple ``(output, hidden)`` exactly as produced by ``nn.GRU``:
            the per-step outputs and the final hidden state.
        """
        token_vectors = self.embedding(input)
        regularised = self.dropout(token_vectors)
        step_outputs, final_state = self.gru(regularised)
        return step_outputs, final_state


The model has 3,491,070 trainable parameters


In [None]:
class Seq2Seq(nn.Module):
    """Runs an encoder once, then unrolls a decoder step by step.

    Args:
        encoder: Module called as ``encoder(src)``; must return
            ``(encoder_outputs, hidden)``.
        decoder: Module called as ``decoder(tokens, hidden, encoder_outputs)``;
            must return ``(logits, hidden)`` and expose ``output_dim``.
        device: Device the output buffer is moved to.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Produce per-step decoder outputs for every position of ``trg``.

        ``trg`` is indexed as ``[step, batch]``. Row 0 of the result is left
        as zeros because decoding starts from ``trg[0]`` and the first
        prediction is written at step 1.

        At each step, with probability ``teacher_forcing_ratio`` the ground
        truth token ``trg[t]`` is fed to the next step; otherwise the
        decoder's own arg-max prediction is used.

        Returns:
            Tensor of shape ``[trg.shape[0], trg.shape[1], decoder.output_dim]``.
        """
        max_len, batch_size = trg.shape[:2]
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(max_len, batch_size, vocab_size)
        outputs = outputs.to(self.device)

        encoder_outputs, hidden = self.encoder(src)

        # The first decoder input is the first row of the target.
        next_input = trg[0, :]

        for t in range(1, max_len):
            logits, hidden = self.decoder(next_input, hidden, encoder_outputs)
            outputs[t] = logits

            # One random draw per step decides between ground truth and prediction.
            if random.random() < teacher_forcing_ratio:
                next_input = trg[t]
            else:
                next_input = logits.argmax(1)

        return outputs
