In [8]:
import math
import torchtext
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab,Vocab
from torchtext.utils import download_from_url, extract_archive
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
from torchtext.transforms import ToTensor
from torch import Tensor
from torch.nn import TransformerEncoder,TransformerDecoder,TransformerEncoderLayer,TransformerDecoderLayer
import io
import time


import random
import re
import torchtext.data
import numpy as np
import pandas as pd
import string
from torchtext.vocab import FastText as ft
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vectors
from torchtext.transforms import ToTensor
from torchtext.vocab import Vectors
from pathlib import Path
import unicodedata

In [9]:
import spacy

# **Prepare Data**

In [10]:
class Vocab:
    def __init__(self, name):
        self.name = name
        self.stoi = {}
        self.word2count = {}
        self.itos = {0: "<bos>",1: "<pad>", 2:'<eos>',3:'<unk>'}
        self.n_words = 4

    def add_sentence(self, sentence):
        for w in sentence.split(' '):
            self.add_word(w)

    def add_word(self, w):
        if w not in self.stoi:
            self.itos[self.n_words] = w
            self.stoi = {w:i for (i,w) in self.itos.items()}
            self.word2count[w] = 1
            self.n_words += 1
        else:
            self.word2count[w] += 1


In [11]:
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalize_string(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [12]:

%matplotlib inline
%reload_ext autoreload
%autoreload 2
import sys
sys.path.append('../')
from fastai.text import *

In [13]:

df = pd.read_csv('/content/data/questions_easy.csv')

In [14]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
df = df.loc[df['en'].apply(lambda x: len(x.split(' '))<=20) & df['fr'].apply(lambda x: len(x.split(' '))<=20)]

In [16]:
df.head()

Unnamed: 0,en,fr
0,what is light ?,qu est ce que la lumiere ?
1,who are we ?,ou sommes nous ?
2,where did we come from ?,d ou venons nous ?
3,what would we do without it ?,que ferions nous sans elle ?
4,what is the absolute location latitude and lon...,quelle sont les coordonnees latitude et longit...


In [17]:
en_vocab = Vocab('en')
fr_vocab = Vocab('fr')

In [18]:
for sent in df['en'].values:
    en_vocab.add_sentence(sent)

for sent in df['fr'].values:
    fr_vocab.add_sentence(sent)

In [19]:
en_emb = nn.Embedding(len(en_vocab.stoi),300,padding_idx = 1)
wgts = en_emb.weight.data

In [20]:
miss = []
for w,i in en_vocab.stoi.items():
    try: wgts[i+3] = torch.tensor(vects_en.get_vecs_by_tokens(w))
    except: miss.append(w)

In [21]:
fr_emb = nn.Embedding(len(fr_vocab.stoi),300,padding_idx=1)
wgts = fr_emb.weight.data

In [22]:
miss = []
for w,i in fr_vocab.stoi.items():
    try:wgts[i+1] = torch.tensor(vects_fr.get_vecs_by_tokens(w))
    except: miss.append(w)

In [23]:
en_tokenizer = get_tokenizer(None, language='en')
fr_tokenizer = get_tokenizer(None, language='fr')

In [24]:
inputs = [[en_vocab.stoi[y] for y in en_tokenizer(x)] for x in df['en'].values]
outputs = [[fr_vocab.stoi[y] for y in fr_tokenizer(x)] for x in df['fr'].values]

In [25]:
with torch.no_grad():
    for i in range(4):
        fr_emb.weight[i] = i * torch.ones(300)
        en_emb.weight[i] = i * torch.ones(300)

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
to_tensor = ToTensor(padding_value=1)

In [28]:
en_train_data, en_test_data, fr_train_data, fr_test_data = train_test_split(inputs, outputs,test_size=0.2)

In [29]:
en_train_data, en_val_data, fr_train_data, fr_val_data = train_test_split(en_train_data, fr_train_data,test_size=0.2)

In [30]:
en_train_data = to_tensor(en_train_data)
fr_train_data = to_tensor(fr_train_data)
en_val_data = to_tensor(en_val_data)
fr_val_data = to_tensor(fr_val_data)
en_test_data = to_tensor(en_test_data)
fr_test_data = to_tensor(fr_test_data)

In [31]:
inputs_tensor = ToTensor(padding_value = 1)(inputs)
outputs_tensor = ToTensor(padding_value = 1)(outputs)

In [32]:
inputs_tensor

tensor([[   4,    5,    6,  ...,    1,    1,    1],
        [   8,    9,   10,  ...,    1,    1,    1],
        [  11,   12,   10,  ...,    1,    1,    1],
        ...,
        [   4,    5,   19,  ...,    1,    1,    1],
        [   4,    9,   19,  ...,    1,    1,    1],
        [   4,   70, 7119,  ...,    1,    1,    1]])

In [33]:
ds = TensorDataset(inputs_tensor,outputs_tensor)

# **Train Seq2Seq model**

In [34]:
en_train_data.size()

torch.Size([26588, 20])

In [35]:
fr_train_data.size()

torch.Size([26588, 20])

In [36]:
train_ds = TensorDataset(en_train_data,fr_train_data)
val_ds = TensorDataset(en_val_data,fr_val_data)
test_ds = TensorDataset(en_test_data,fr_test_data)

In [37]:
bs = 128

In [38]:
train_dl = DataLoader(train_ds,batch_size=bs,shuffle=True)
val_dl = DataLoader(val_ds,batch_size=bs,shuffle=True)
test_dl = DataLoader(test_ds,batch_size=bs,shuffle=True)

In [39]:
class Seq2SeqTransformer(nn.Module):
    def __init__(self, num_encoder_layers: int, num_decoder_layers: int,
                 emb_size: int, src_vocab_size: int, tgt_vocab_size: int,
                 dim_feedforward:int = 512, dropout:float = 0.1):
        super().__init__()
        encoder_layer = TransformerEncoderLayer(d_model=emb_size, nhead=NHEAD,
                                                dim_feedforward=dim_feedforward,batch_first=True)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        decoder_layer = TransformerDecoderLayer(d_model=emb_size, nhead=NHEAD,
                                                dim_feedforward=dim_feedforward,batch_first=True)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)

        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)



    def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor,
                tgt_mask: Tensor, src_padding_mask: Tensor,
                tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        memory = self.transformer_encoder(src_emb, src_mask, src_padding_mask)
        outs = self.transformer_decoder(tgt_emb, memory,tgt_mask, None,
                                        tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer_encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer_decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

class PositionalEncoding(nn.Module):

    def __init__(self,emb_size,dropout,max_len=5000):
        super().__init__()
        den = torch.exp(-torch.arange(0,emb_size,2) * math.log(10000)/emb_size)
        pos = torch.arange(0,max_len).reshape(max_len,1)
        pos_embedding = torch.zeros((max_len,emb_size))
        pos_embedding[:,0::2] = torch.sin(pos * den)
        pos_embedding[:,1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding',pos_embedding)

    def forward(self,token_embedding):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0),:])





In [40]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size
    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

In [41]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz,sz),device='cuda') == 1)).transpose(0,1)

    mask = mask.float().masked_fill(mask==0,float('-inf')).masked_fill(mask==1,float(0.0))
    return mask

In [42]:
def create_mask(src,tgt):
    src_seq_len = src.shape[1]
    tgt_seq_len = tgt.shape[1]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len,src_seq_len),device='cuda').type(torch.bool)

    src_padding_mask = (src == 1)
    tgt_padding_mask = (tgt == 1)

    return src_mask, tgt_mask, src_padding_mask,tgt_padding_mask

In [43]:
SRC_VOCAB_SIZE = len(en_vocab.stoi)
TGT_VOCAB_SIZE = len(fr_vocab.stoi)
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
NUM_EPOCHS = 50
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [44]:
transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS,
                                 EMB_SIZE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE,
                                 FFN_HID_DIM)


In [45]:
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

In [46]:
transformer = transformer.cuda()

In [47]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=1)

optimizer = torch.optim.Adam(transformer.parameters(),lr=0.0001,betas=(0.9,0.98),eps=1e-9)

In [48]:
def train_epoch(model, train_dl, optimizer):
    model.train()
    losses = 0
    for idx, (src, tgt) in enumerate(train_dl):
        src = src.cuda()
        tgt = tgt.cuda()

        tgt_input = tgt[:,:-1]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask,
                tgt_mask, src_padding_mask,
                tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[:, 1:]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(train_dl)



In [49]:
def evaluate(model,val_dl):
    model.eval()
    losses = 0

    for idx, (src,tgt) in enumerate(val_dl):
        src = src.cuda()
        tgt = tgt.cuda()

        tgt_input =tgt[:,:-1]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask,
                tgt_mask, src_padding_mask,
                tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[:,1:]

        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))

        losses += loss.item()

    return losses/len(val_dl)

In [50]:
for epoch in range(30):
    start_time = time.time()
    train_loss = train_epoch(transformer,train_dl,optimizer)

    end_time = time.time()

    val_loss = evaluate(transformer,val_dl)

    print(f'Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, '
         f'Epoch time = {(end_time - start_time):.3f}s ')



Epoch: 0, Train loss: 6.770, Val loss: 5.811, Epoch time = 29.675s 
Epoch: 1, Train loss: 5.346, Val loss: 4.985, Epoch time = 26.729s 
Epoch: 2, Train loss: 4.699, Val loss: 4.572, Epoch time = 26.402s 
Epoch: 3, Train loss: 4.338, Val loss: 4.381, Epoch time = 26.617s 
Epoch: 4, Train loss: 4.086, Val loss: 4.238, Epoch time = 26.660s 
Epoch: 5, Train loss: 3.880, Val loss: 4.123, Epoch time = 26.629s 
Epoch: 6, Train loss: 3.699, Val loss: 4.027, Epoch time = 26.586s 
Epoch: 7, Train loss: 3.530, Val loss: 3.959, Epoch time = 26.568s 
Epoch: 8, Train loss: 3.366, Val loss: 3.884, Epoch time = 26.661s 
Epoch: 9, Train loss: 3.206, Val loss: 3.806, Epoch time = 26.655s 
Epoch: 10, Train loss: 3.048, Val loss: 3.720, Epoch time = 26.607s 
Epoch: 11, Train loss: 2.896, Val loss: 3.686, Epoch time = 26.582s 
Epoch: 12, Train loss: 2.747, Val loss: 3.626, Epoch time = 26.575s 
Epoch: 13, Train loss: 2.609, Val loss: 3.581, Epoch time = 26.591s 
Epoch: 14, Train loss: 2.475, Val loss: 3.53

In [51]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.cuda()
    src_mask = src_mask.cuda()
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(2).type(torch.long).cuda()
    for i in range(max_len-1):

        memory = memory.cuda()
        memory_mask = torch.zeros(ys.shape[1], memory.shape[1]).cuda().type(torch.bool)
        tgt_mask = (generate_square_subsequent_mask(ys.size(1))
                                    .type(torch.bool)).cuda()
        out = model.decode(ys, memory, tgt_mask).squeeze(0)[-1]
        prob = model.generator(out)
        next_word = prob.argmax()
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
        if next_word == 3:
              break
    return ys


def translate(model, src, src_vocab, tgt_vocab, src_tokenizer):
    model.eval()
    tokens = [2]+[en_vocab.stoi[x] for x in en_tokenizer(src)]+[3]
    num_tokens = len(tokens)
    src = (torch.LongTensor(tokens).reshape(1,num_tokens))
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(model, src, src_mask, max_len=num_tokens + 5, start_symbol=2).flatten()
    return " ".join([tgt_vocab.itos[tok] for tok in tgt_tokens.cpu().detach().numpy()]).replace("<bos>", "").replace("<eos>", "")

# **Result**

In [52]:
output = translate(transformer, "what is the major aboriginal group on vancouver island ?", en_vocab, fr_vocab, en_tokenizer)
print(output)

 ? ? ? ? est il dans le groupe de l ile autochtone ? ? ?


In [53]:
output1 = translate(transformer, "who can assist me with questions relating to establishing a business in canada ?", en_vocab, fr_vocab, en_tokenizer)
print(output1)

 d affaires peut on me rendre au canada ? ? ? ? ? ? ? ? ? ? ? ?


In [54]:
output2 = translate(transformer, "where can i find advisories for food drugs medical devices natural health products and consumer products ?", en_vocab, fr_vocab, en_tokenizer)
print(output2)

 d un aliment special et des produits naturels ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
