# Machine Translation

**English-to-French**

HELLO WORLD = BONJOUR LE MONDE

In [55]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence  # pad batch
import spacy  # for tokenizer
import random

In [2]:
# from torchtext.vocab import build_vocab_from_iterator
# from torchtext.datasets import Multi30k

In [3]:
# Fetch the small French spaCy pipeline (only the French model is downloaded in this cell).
!python -m spacy download fr_core_news_sm

# Load both pipelines; the cells below only use their `.tokenizer`.
# NOTE(review): 'en_core_web_sm' is never downloaded in this notebook -- presumably it is
# preinstalled in the runtime image; confirm before running elsewhere.
spacy_en = spacy.load('en_core_web_sm')
spacy_fr = spacy.load('fr_core_news_sm')

Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m71.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [4]:
import pandas as pd
# Read the English/French sentence pairs; the path points at a Kaggle input dataset.
df = pd.read_csv('/kaggle/input/language-translation-englishfrench/eng_-french.csv')
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


**Perform Basic Data Preprocessing**

In [5]:
import string

In [6]:
# Shorten the column names; assigning a two-element list requires the frame to have exactly two columns.
df.columns = ['english', 'french']

def preprocess_text(text):
    """Normalise a sentence by lowercasing it and dropping ASCII punctuation.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The lowercased sentence with every character listed in
        ``string.punctuation`` removed. All other characters (whitespace,
        digits, accented letters, ...) are kept unchanged.
    """
    # Translation table that maps each punctuation character to "delete".
    strip_table = str.maketrans('', '', string.punctuation)
    return text.lower().translate(strip_table)

# English: lowercased and stripped of ASCII punctuation.
df['en'] = df['english'].map(preprocess_text)
# French: only ASCII punctuation is stripped here; casing is left as-is
# (the French tokenizer lowercases every token later on).
df['fr'] = df['french'].map(
    lambda sentence: sentence.translate(str.maketrans('', '', string.punctuation))
)

In [7]:
# Preview the frame, now including the cleaned 'en' and 'fr' columns.
df.head()

Unnamed: 0,english,french,en,fr
0,Hi.,Salut!,hi,Salut
1,Run!,Cours !,run,Cours
2,Run!,Courez !,run,Courez
3,Who?,Qui ?,who,Qui
4,Wow!,Ça alors !,wow,Ça alors


In [8]:
# Build a token <-> index vocabulary from the sentences of one language

class Vocabulary:
    """Token <-> index mapping for one language.

    Indices 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. Further tokens are appended by
    :meth:`build_vocabulary` as soon as they have been seen
    ``freq_threshold`` times.

    Tokenization uses the module-level spaCy pipelines ``spacy_en`` and
    ``spacy_fr``; every token is lowercased.

    Args:
        freq_threshold: Number of occurrences a token needs before it is
            added. Values below 1 are treated as 1 (every token is added).
        en: Truthy to tokenize with the English pipeline, falsy for French.
    """

    def __init__(self, freq_threshold, en=True):
        # index -> token
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        # token -> index
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold
        self.en = en

    def __len__(self):
        """Return the number of tokens, special tokens included."""
        return len(self.itos)

    def __getitem__(self, idx):
        """Return the ``(index, token)`` pair at position ``idx`` in insertion order."""
        return list(self.itos.items())[idx]

    @staticmethod
    def tokenizer_eng(text):
        """Split ``text`` into lowercased tokens with the English spaCy tokenizer."""
        return [tok.text.lower() for tok in spacy_en.tokenizer(text)]

    @staticmethod
    def tokenizer_french(text):
        """Split ``text`` into lowercased tokens with the French spaCy tokenizer."""
        return [tok.text.lower() for tok in spacy_fr.tokenizer(text)]

    def _tokenize(self, text):
        """Tokenize ``text`` with the tokenizer selected by ``self.en``."""
        tokenizer = self.tokenizer_eng if self.en else self.tokenizer_french
        return tokenizer(text)

    def build_vocabulary(self, sentence_list):
        """Add every sufficiently frequent token of ``sentence_list`` to the vocabulary.

        Tokens receive consecutive indices in the order in which they reach
        the frequency threshold. Calling the method again extends the
        existing mapping: new tokens continue after the current last index
        and tokens that are already known keep their index.

        Args:
            sentence_list: Iterable of sentences (``str``) of this language.
        """
        frequencies = {}  # occurrence count per token, for this call only
        # A threshold below 1 could never be hit by a count that starts at 1.
        threshold = max(self.freq_threshold, 1)
        # Continue after the existing entries instead of restarting at 4, so
        # that a repeated call cannot overwrite earlier mappings.
        idx = len(self.itos)

        for sentence in sentence_list:
            for word in self._tokenize(sentence):
                count = frequencies.get(word, 0) + 1
                frequencies[word] = count

                # Register the token exactly once: the moment it reaches the threshold.
                if count == threshold and word not in self.stoi:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        """Convert ``text`` into a list of token indices.

        Tokens missing from the vocabulary are mapped to the ``<UNK>`` index.
        """
        unk_idx = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk_idx) for token in self._tokenize(text)]

In [9]:
# Translation direction: English (source) -> French (target)

# English vocabulary: a token is added once it has been seen twice in the cleaned 'en' column.
src_vocab = Vocabulary(2)
src_vocab.build_vocabulary(df['en'])

In [11]:
# Sanity check: vocabulary size, an encoded sample sentence, and the (index, token) pair at position 99.
len(src_vocab), src_vocab.numericalize("I am Running"), src_vocab[99]

(9953, [10, 74, 762], (99, 'did'))

In [12]:
# French vocabulary: same threshold of two, tokenized with the French tokenizer (en=False).
trg_vocab = Vocabulary(2, en=False)
trg_vocab.build_vocabulary(df['fr'])

In [13]:
# Sanity check: vocabulary size, an encoded sample sentence, and the (index, token) pair at position 5.
len(trg_vocab), trg_vocab.numericalize('bonjour le monde'), trg_vocab[5]

(17995, [275, 112, 1059], (5, 'ça'))

In [42]:
class MyDataset(Dataset):
    """Map-style dataset yielding (English, French) index tensors.

    Each sentence is numericalized with its vocabulary and wrapped in the
    ``<SOS>`` / ``<EOS>`` indices of that vocabulary.

    Args:
        src_vocab: Vocabulary used to encode the ``'en'`` column.
        trg_vocab: Vocabulary used to encode the ``'fr'`` column.
        df: DataFrame with the cleaned sentences in columns ``'en'`` and ``'fr'``.
    """

    def __init__(self, src_vocab, trg_vocab, df):
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.df = df

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _encode(vocab, text):
        """Return ``<SOS>`` + numericalized ``text`` + ``<EOS>`` as a 1-D integer tensor."""
        indices = [vocab.stoi['<SOS>']]
        indices.extend(vocab.numericalize(text))
        indices.append(vocab.stoi['<EOS>'])
        return torch.tensor(indices)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` tensors of the row at position ``idx``.

        Rows are addressed by position (``.iloc``), so the dataset also works
        when the frame's index is not 0..n-1 (e.g. after filtering or a split).
        """
        english = self.df['en'].iloc[idx]
        french = self.df['fr'].iloc[idx]
        return self._encode(self.src_vocab, english), self._encode(self.trg_vocab, french)

In [43]:
class MyCollate:
    """Collate function that pads a batch of (source, target) tensor pairs.

    Args:
        pad_idx: Value used to fill the shorter sequences of a batch.
    """

    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def _pad(self, sequences):
        """Stack 1-D tensors into a (max_len, batch) tensor filled with ``pad_idx``."""
        return pad_sequence(sequences, batch_first=False, padding_value=self.pad_idx)

    def __call__(self, batch):
        """Return the padded ``(source, target)`` tensors for ``batch``.

        ``batch`` is a list of pairs; element 0 of each pair goes into the
        source tensor, element 1 into the target tensor.
        """
        sources, targets = [], []
        for pair in batch:
            sources.append(pair[0])
            targets.append(pair[1])
        return self._pad(sources), self._pad(targets)

In [44]:
def get_loader(src_vocab, trg_vocab, df, batch_size=32, shuffle=True, pin_memory=True):
    """Create the translation dataset and a DataLoader over it.

    Args:
        src_vocab: Vocabulary of the source (English) sentences.
        trg_vocab: Vocabulary of the target (French) sentences.
        df: DataFrame handed to ``MyDataset``.
        batch_size: Number of sentence pairs per batch.
        shuffle: Whether the loader reshuffles the data every epoch.
        pin_memory: Forwarded to ``DataLoader``.

    Returns:
        ``(loader, dataset)``. Batches are produced by ``MyCollate``; source
        and target are both padded with the source vocabulary's ``<PAD>`` index.
    """
    dataset = MyDataset(src_vocab, trg_vocab, df)
    collate = MyCollate(pad_idx=src_vocab.stoi["<PAD>"])

    # num_workers is not set, so DataLoader uses its default.
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=collate,
    )
    return loader, dataset

In [45]:
# Build the loader with the default settings of get_loader (batch size 32, shuffled).
data_loader, ds = get_loader(src_vocab, trg_vocab, df)

In [46]:
# Number of batches per epoch.
len(data_loader)

5489

In [47]:
class Encoder(nn.Module):
    """Bidirectional LSTM encoder whose final states are projected for a unidirectional decoder.

    Args:
        input_size: Size of the source vocabulary (rows of the embedding table).
        embedding_size: Dimension of the token embeddings.
        hidden_size: Hidden size of the LSTM, per direction.
        num_layers: Number of stacked LSTM layers.
        p: Dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, bidirectional=True)

        # Project the concatenated forward/backward state down to hidden_size.
        self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)
        self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(p)

    def _merge_directions(self, state):
        """Concatenate the forward and backward state of every layer.

        Args:
            state: LSTM ``h_n`` or ``c_n`` of shape (num_layers * 2, N, hidden_size),
                ordered layer by layer with forward before backward.

        Returns:
            Tensor of shape (num_layers, N, hidden_size * 2).
        """
        per_layer = state.reshape(self.num_layers, 2, state.shape[1], self.hidden_size)
        return torch.cat((per_layer[:, 0], per_layer[:, 1]), dim=2)

    def forward(self, x):
        """Encode a batch of source sentences.

        Args:
            x: Token indices of shape (seq_length, N), N being the batch size.

        Returns:
            ``(encoder_states, hidden, cell)`` where ``encoder_states`` has shape
            (seq_length, N, hidden_size * 2) and ``hidden`` / ``cell`` have shape
            (num_layers, N, hidden_size).
        """
        embedding = self.dropout(self.embedding(x))
        # embedding shape: (seq_length, N, embedding_size)

        encoder_states, (hidden, cell) = self.rnn(embedding)
        # encoder_states shape: (seq_length, N, hidden_size * 2) because the LSTM is bidirectional

        # Merge both directions of each layer and project them so the states
        # can initialise a unidirectional decoder with the same number of layers.
        hidden = self.fc_hidden(self._merge_directions(hidden))
        cell = self.fc_cell(self._merge_directions(cell))

        return encoder_states, hidden, cell


class Decoder(nn.Module):
    """Single-step LSTM decoder with attention over the encoder states.

    Args:
        input_size: Size of the target vocabulary (rows of the embedding table).
        embedding_size: Dimension of the token embeddings.
        hidden_size: Hidden size of the LSTM; the encoder states are expected
            to have ``hidden_size * 2`` features.
        output_size: Number of scores produced per step.
        num_layers: Number of stacked LSTM layers.
        p: Dropout probability applied to the embeddings.
    """

    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
    ):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        # The LSTM input is the context vector concatenated with the token embedding.
        self.rnn = nn.LSTM(hidden_size * 2 + embedding_size, hidden_size, num_layers)

        # Scores one encoder position from [decoder hidden ; encoder state].
        self.energy = nn.Linear(hidden_size * 3, 1)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p)
        # Normalises over dim 0, i.e. over the source positions.
        self.softmax = nn.Softmax(dim=0)
        self.relu = nn.ReLU()

    def forward(self, x, encoder_states, hidden, cell):
        """Run one decoding step.

        Args:
            x: Previous target token indices, shape (N,).
            encoder_states: Encoder outputs, shape (seq_length, N, hidden_size * 2).
            hidden: Decoder hidden state, shape (num_layers, N, hidden_size).
            cell: Decoder cell state, shape (num_layers, N, hidden_size).

        Returns:
            ``(predictions, hidden, cell)`` with ``predictions`` of shape
            (N, output_size) and the updated LSTM states.
        """
        x = x.unsqueeze(0)
        # x: (1, N) where N is the batch size

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (1, N, embedding_size)

        sequence_length = encoder_states.shape[0]
        # The top layer's hidden state is the attention query; for a single
        # layer this is the whole hidden state.
        h_reshaped = hidden[-1:].repeat(sequence_length, 1, 1)
        # h_reshaped: (seq_length, N, hidden_size)

        energy = self.relu(self.energy(torch.cat((h_reshaped, encoder_states), dim=2)))
        # energy: (seq_length, N, 1)

        attention = self.softmax(energy)
        # attention: (seq_length, N, 1)

        # attention: (seq_length, N, 1), snk
        # encoder_states: (seq_length, N, hidden_size*2), snl
        # context_vector: (1, N, hidden_size*2), knl -- attention-weighted sum over s
        context_vector = torch.einsum("snk,snl->knl", attention, encoder_states)

        rnn_input = torch.cat((context_vector, embedding), dim=2)
        # rnn_input: (1, N, hidden_size*2 + embedding_size)

        outputs, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        # outputs shape: (1, N, hidden_size)

        predictions = self.fc(outputs).squeeze(0)
        # predictions: (N, output_size)

        return predictions, hidden, cell


class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that decodes a whole target sequence with teacher forcing.

    Args:
        encoder: Module returning ``(encoder_states, hidden, cell)`` for a source batch.
        decoder: Module called as ``decoder(x, encoder_states, hidden, cell)`` and
            returning ``(scores, hidden, cell)``. It must expose its output
            layer as ``decoder.fc`` (an ``nn.Linear``), which is used to size
            the score tensor.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Compute decoder scores for every target position.

        Args:
            source: Source token indices, shape (source_len, N).
            target: Target token indices, shape (target_len, N); row 0 holds ``<SOS>``.
            teacher_force_ratio: Probability of feeding the ground-truth token
                (instead of the decoder's own best guess) as the next input.

        Returns:
            Tensor of shape (target_len, N, target_vocab_size). Row 0 is left
            at zero because nothing is predicted for the ``<SOS>`` position.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Read the sizes from the model and the input rather than from
        # notebook globals, so the module does not depend on cell order.
        target_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )
        encoder_states, hidden, cell = self.encoder(source)

        # First input will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # At every time step use encoder_states and update hidden, cell
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)

            # Store prediction for current time step
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # Teacher forcing: with probability teacher_force_ratio feed the
            # actual next word, otherwise the decoder's own prediction, so the
            # model also sees its own outputs as inputs during training.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

In [48]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [49]:
# Training schedule: number of passes over the data and the learning rate given to Adam.
num_epochs = 50
learning_rate = 3e-4

In [50]:
# Model hyperparameters
# The embedding tables are sized by the vocabularies; the decoder scores the French vocabulary.
input_size_encoder = len(src_vocab)
input_size_decoder = len(trg_vocab)
output_size = len(trg_vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 1
# Dropout probability 0.0: dropout is effectively disabled in both networks.
enc_dropout = 0.0
dec_dropout = 0.0

In [51]:

# Build the encoder/decoder pair and wrap them in the seq2seq model, all on `device`.
encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout).to(device)

decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout,
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# The loss is computed on target (French) indices, so the index to ignore must
# be the target vocabulary's <PAD> (both vocabularies reserve index 0 for it).
pad_idx = trg_vocab.stoi["<PAD>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [52]:
# Fixed example sentence; the training cell translates it at the start of every epoch.
sentence = "Hello World! I winning UCL. HALA MADRID"

In [53]:
def translate_sentence(model, sentence, device, max_length=50):
    """Greedily translate an English sentence into a list of French tokens.

    Uses the module-level ``src_vocab`` / ``trg_vocab`` and ``preprocess_text``.
    The caller is responsible for putting ``model`` into eval mode.

    Args:
        model: Model exposing ``encoder`` and ``decoder`` sub-modules.
        sentence: Raw English sentence (``str``).
        device: Device on which the input tensors are created.
        max_length: Maximum number of decoding steps.

    Returns:
        The predicted tokens without the leading ``<SOS>``. The list ends with
        ``<EOS>`` if the model predicted it within ``max_length`` steps.
    """
    # The source vocabulary was built from text cleaned with preprocess_text,
    # so clean the input the same way; otherwise punctuation maps to <UNK>.
    cleaned = preprocess_text(sentence)

    # Add <SOS> and <EOS> in beginning and end respectively
    en_tokens = [src_vocab.stoi['<SOS>']]
    en_tokens += src_vocab.numericalize(cleaned)
    en_tokens.append(src_vocab.stoi['<EOS>'])

    # Shape (seq_length, 1): a batch holding the single sentence
    sentence_tensor = torch.LongTensor(en_tokens).unsqueeze(1).to(device)

    eos_idx = trg_vocab.stoi["<EOS>"]
    outputs = [trg_vocab.stoi["<SOS>"]]

    with torch.no_grad():
        # Build encoder states and the initial decoder hidden / cell state
        outputs_encoder, hiddens, cells = model.encoder(sentence_tensor)

        for _ in range(max_length):
            # Feed the most recently produced token back into the decoder
            previous_word = torch.LongTensor([outputs[-1]]).to(device)
            output, hiddens, cells = model.decoder(
                previous_word, outputs_encoder, hiddens, cells
            )
            best_guess = output.argmax(1).item()
            outputs.append(best_guess)

            # Model predicts it's the end of the sentence
            if best_guess == eos_idx:
                break

    translated_sentence = [trg_vocab.itos[idx] for idx in outputs]

    # remove start token
    return translated_sentence[1:]

In [None]:
for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    # Progress check: translate the fixed example sentence with the current weights.
    model.eval()
    translated_sentence = translate_sentence(model, sentence, device, max_length=50)
    print(f"Translated example sentence: \n {translated_sentence}")

    model.train()
    for batch_idx, batch in enumerate(data_loader):
        # Each batch is a (source, target) pair; move both to the training device.
        inp_data, target = (tensor.to(device) for tensor in batch)

        # Forward pass: scores of shape (trg_len, batch_size, output_dim)
        output = model(inp_data, target)

        # CrossEntropyLoss expects (num_items, num_classes) scores and (num_items,)
        # labels, so drop position 0 (the start token) and flatten the time and
        # batch dimensions into one.
        output = output[1:].reshape(-1, output.shape[-1])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)
        loss.backward()

        # Clip the gradient norm to 1 to guard against exploding gradients.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()


[Epoch 0 / 50]
Translated example sentence: 
 ['devina', 'devina', 'embauchés', 'impatients', 'essaim', 'eaux', 'gueules', 'encaisseront', 'mattendes', 'encaisseront', 'octobre', 'champêtre', 'ravis', 'déroba', 'dansons', 'marchâmes', 'endéans', 'évidences', 'minute', 'négocié', 'détendues', 'tom', 'part', 'laffaire', 'commencement', 'tavons', 'taider', 'seront', 'aiderait', 'mordit', 'ford', 'inacceptable', 'spatiale', '1', 'transporté', 'abstraite', 'renversé', 'habitez', 'habitez', 'fournirent', 'réalistes', 'consciencieusement', 'dieu', 'devrais', 'défense', 'suggéra', 'terrifiée', 'colère', 'vécu', 'reprendre']
[Epoch 1 / 50]
Translated example sentence: 
 ['en', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<EOS>']
[Epoch 2 / 50]
Translated example sentence: 
 ['en', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<UNK>', '<EOS>']
[Epoch 3 / 50]
Translated example sentence: 
 ['le', '<UNK>', 'que', 'je', '<UNK>', '<UNK>