<a href="https://colab.research.google.com/github/pooriaazami/deep_learning_class_notebooks/blob/main/12_Machine_Translation_From_Scratch_Part_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q kaggle

In [2]:
from google.colab import files

In [3]:
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"pooriaazami","key":"91ff20621c61968b71cdbf65dc2a8612"}'}

In [4]:
!mkdir /root/.kaggle
!mv kaggle.json /root/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

In [5]:
!kaggle datasets download -d dhruvildave/en-fr-translation-dataset

Downloading en-fr-translation-dataset.zip to /content
100% 2.54G/2.54G [00:20<00:00, 261MB/s]
100% 2.54G/2.54G [00:20<00:00, 136MB/s]


In [6]:
!unzip en-fr-translation-dataset.zip
!rm en-fr-translation-dataset.zip

Archive:  en-fr-translation-dataset.zip
  inflating: en-fr.csv               


In [7]:
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_value_
from torchtext.vocab import build_vocab_from_iterator

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

In [8]:
DATASET_PATH = 'en-fr.csv'
REDUCED_DATASET_PATH = 'reduced-en-fr.csv'

PAD_TOKEN = '<pad>'
UNK_TOKEN = '<unk>'
START_TOKEN = '<start>'
END_TOKEN = '<end>'

MAX_SEQ_LENGTH = 256
EMBEDDING_DIM = 128
LATENT_DIM = 256
BATCH_SIZE = 32
EPOCHS = 10
LR = 1e-3
MAX_TOKENS = 30_000

NUM_TRAINING_PAIRS = 10_000

CLIP_VALUE = 100
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [9]:
counter = 0
with open(DATASET_PATH, 'r', encoding='utf-8') as source_file:
  with open(REDUCED_DATASET_PATH, 'w', encoding='utf-8') as destination_file:
    for i, line in enumerate(source_file):
      if 1 < i < 100:
        continue

      destination_file.write(line + '\n')
      counter += 1
      if counter == NUM_TRAINING_PAIRS:
        break

In [10]:
df = pd.read_csv(REDUCED_DATASET_PATH)

In [11]:
df.head()

Unnamed: 0,en,fr
0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...
1,Observatories The Woodstock College Observator...,Observatoires Création de l'observatoire de Wo...
2,1879 : pollution Edison makes a major improvem...,1879 : pollution Amélioration majeure de l’amp...
3,Instruments British chemist William de Wiveles...,Instruments Le chimiste britannique William de...
4,Observatories The Victoria College Observatory...,Observatoires Création de l'observatoire du co...


In [12]:
df = df.dropna()

In [13]:
df.shape

(9998, 2)

In [14]:
def tokenizer(text):
  text = text.lower().strip()
  text = re.sub(r"([.!?])", r" \1", text)
  text = re.sub(r"'", " ' ", text)
  text = re.sub(f"[^a-z!?0-9'[{START_TOKEN}][{END_TOKEN}][{PAD_TOKEN}{UNK_TOKEN}]]", r" ", text)
  text = text.strip()
  text = text.split(" ")
  return text

In [15]:
def build_iterator(iterable):
  for item in iterable:
    yield tokenizer(item)

In [16]:
en_vocab = build_vocab_from_iterator(build_iterator(df['en']), specials=[START_TOKEN, END_TOKEN, PAD_TOKEN, UNK_TOKEN])
fr_vocab = build_vocab_from_iterator(build_iterator(df['fr']), specials=[START_TOKEN, END_TOKEN, PAD_TOKEN, UNK_TOKEN])

en_vocab.set_default_index(en_vocab[UNK_TOKEN])
fr_vocab.set_default_index(fr_vocab[UNK_TOKEN])

In [17]:
class TranslationDataset(Dataset):
  def __init__(self, dataset):
    self.__dataset = dataset

  def __len__(self):
    return self.__dataset.shape[0]

  def __getitem__(self, idx):
    return self.__dataset.iloc[idx]['en'], self.__dataset.iloc[idx]['fr']

In [26]:
class Encoder(nn.Module):
  def __init__(self, num_tokens, embedding_dim, latent_dim):
    super().__init__()

    self.embedding = nn.Embedding(num_embeddings=num_tokens, embedding_dim=embedding_dim)
    self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=latent_dim, num_layers=1, batch_first=True)

    self.latent_dim = latent_dim

  def forward(self, x):
    x = self.embedding(x)

    batch_size = x.size()[0]
    h_0 = torch.zeros(1, batch_size, self.latent_dim).to(DEVICE)
    _, context_vector = self.rnn(x, h_0)

    return context_vector

In [27]:
class Decoder(nn.Module):
  def __init__(self, num_tokens, embedding_dim, latent_dim):
    super().__init__()

    self.embedding = nn.Embedding(num_embeddings=num_tokens, embedding_dim=embedding_dim)
    self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=latent_dim, num_layers=1, batch_first=True)
    self.fc = nn.Linear(in_features=latent_dim, out_features=num_tokens)
    self.softmax = nn.LogSoftmax(dim=1)

  def forward(self, x, context_vector):
    x = self.embedding(x)

    x, _ = self.rnn(x, context_vector)
    x = self.fc(x)
    x = self.softmax(x)

    return x

In [28]:
def transform_text(text, vocab):
  text = tokenizer(text)
  text = vocab(text)

  pad_length = MAX_SEQ_LENGTH - len(text)
  text = torch.tensor(text, dtype=torch.int64)
  text = F.pad(text, (0, pad_length), value=vocab[PAD_TOKEN])
  return text

def collate_fn(batch):
  en_list = []
  fr_list = []

  for en, fr in batch:
    en = f'{START_TOKEN} {en} {END_TOKEN}'
    fr = f'{START_TOKEN} {fr} {END_TOKEN}'

    en = transform_text(en, en_vocab)
    fr = transform_text(fr, fr_vocab)

    en_list.append(en)
    fr_list.append(fr)

  en_tensor = torch.vstack(en_list)
  fr_tensor = torch.vstack(fr_list)

  return en_tensor, fr_tensor

In [29]:
dataset = TranslationDataset(df)
data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [30]:
encoder = Encoder(num_tokens=len(en_vocab), embedding_dim=EMBEDDING_DIM, latent_dim=LATENT_DIM).to(DEVICE)
decoder = Decoder(num_tokens=len(fr_vocab), embedding_dim=EMBEDDING_DIM, latent_dim=LATENT_DIM).to(DEVICE)

In [31]:
encoder_optimizer = optim.Adam(encoder.parameters(), lr=LR)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=LR)

In [32]:
loss_function = nn.NLLLoss()

In [34]:
for i in range(1, EPOCHS + 1):
  print(f'Epoch {i} / {EPOCHS}')
  total_loss = .0

  for en, fr in tqdm(data_loader):
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    en = en.to(DEVICE)
    fr = fr.to(DEVICE)

    context_vector = encoder(en)
    predicted_fr = decoder(fr, context_vector)

    loss = loss_function(
      predicted_fr.view(-1, predicted_fr.size(-1)),
      fr.view(-1)
    )

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    total_loss += loss.detach().cpu().item()

  print(f'loss: {total_loss:.2f}')

Epoch 1 / 10


  0%|          | 0/313 [00:00<?, ?it/s]

loss: 1598.48
Epoch 2 / 10


  0%|          | 0/313 [00:00<?, ?it/s]

loss: 1564.32
Epoch 3 / 10


  0%|          | 0/313 [00:00<?, ?it/s]

loss: 1556.10
Epoch 4 / 10


  0%|          | 0/313 [00:00<?, ?it/s]

loss: 1553.20
Epoch 5 / 10


  0%|          | 0/313 [00:00<?, ?it/s]

loss: 1552.43
Epoch 6 / 10


  0%|          | 0/313 [00:00<?, ?it/s]

loss: 1552.21
Epoch 7 / 10


  0%|          | 0/313 [00:00<?, ?it/s]

loss: 1552.11
Epoch 8 / 10


  0%|          | 0/313 [00:00<?, ?it/s]

loss: 1552.05
Epoch 9 / 10


  0%|          | 0/313 [00:00<?, ?it/s]

loss: 1552.02
Epoch 10 / 10


  0%|          | 0/313 [00:00<?, ?it/s]

loss: 1551.99
