In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from torchtext.datasets.translation import IWSLT
import spacy
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random
import sys
from torchtext.data import Field, BucketIterator
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, Dataset
import torch.nn.functional as F
from torchtext.datasets.translation import Multi30k
from tqdm import tqdm

In [2]:
def get_config():
    return {
        "batch_size": 32,
        "num_epochs": 20,
        "lr": 10**-4,
        "seq_len": 20,
        "d_model": 512,
        "src_lang": "eng",
        "trg_lang": "fra",
        "model_folder": "weights",
        "model_basename": "tmodel_",
        "preload": None,
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/tmodel"
    }
config = get_config()

In [3]:
class SelfAttention(nn.Module):
    def __init__(self, d_model, heads):
        super(SelfAttention, self).__init__()
        self.d_model = d_model
        self.heads = heads
        self.head_dim = d_model // heads

        assert (
            self.head_dim * heads == d_model
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.d_model, self.d_model, bias=False)
        self.keys = nn.Linear(self.d_model, self.d_model, bias=False)
        self.queries = nn.Linear(self.d_model, self.d_model, bias=False)
        self.fc_out = nn.Linear(self.d_model, self.d_model,bias=False)

    # values == keys == query == [batch_size,seq_len,d_model]
    
    def forward(self, values, keys, query, mask):
        N = query.shape[0]

        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]
        
        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(query)
        
        # [batch_size,seq_len,d_model] -> [batch_size,seq_len,heads,head_dim]
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = queries.reshape(N, query_len, self.heads, self.head_dim)

        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        # mat mul: energy == [batch_size,heads,seq_len,seq_len]
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))
        
        attention = torch.softmax(energy / (self.d_model ** (1 / 2)), dim=3)
        #attention == [batch_size,heads,seq_len,seq_len]
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads * self.head_dim
        )
        #out == [batch_size,seq_len,d_model]
        
        out = self.fc_out(out)
        return out

In [4]:
class TransformerBlock(nn.Module):
    def __init__(self, d_model, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(d_model, heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, forward_expansion * d_model),
            nn.ReLU(),
            nn.Linear(forward_expansion * d_model, d_model),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        # attention = [batch_size,seq_len,d_model]
        # query = [batch_size,seq_len,d_model]
        attention = self.attention(value, key, query, mask) 
        x = self.dropout(self.norm1(attention + query))
        # x == [batch_size,seq_len,d_model]
        forward = self.feed_forward(x)
        # forward == [batch_size,seq_len,d_model]
        out = self.dropout(self.norm2(forward + x))
        return out

In [5]:
class Encoder(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        d_model,
        num_layers,
        heads,
        device,
        forward_expansion,
        dropout,
        max_length,
    ):

        super(Encoder, self).__init__()
        self.d_model = d_model
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, d_model)
        self.position_embedding = nn.Embedding(max_length, d_model)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    d_model,
                    heads,
                    dropout=dropout,
                    forward_expansion=forward_expansion,
                )
                for _ in range(num_layers)
            ]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        # x == [batch_size,seq_len]
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        out = self.dropout(
            (self.word_embedding(x) + self.position_embedding(positions))
        )
        for layer in self.layers:
            out = layer(out, out, out, mask)

        return out

In [6]:
class DecoderBlock(nn.Module):
    def __init__(self, d_model, heads, forward_expansion, dropout, device):
        super(DecoderBlock, self).__init__()
        self.norm = nn.LayerNorm(d_model)
        self.attention = SelfAttention(d_model, heads=heads)
        self.transformer_block = TransformerBlock(
            d_model, heads, dropout, forward_expansion
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        attention = self.attention(x, x, x, trg_mask)
        query = self.dropout(self.norm(attention + x))
        out = self.transformer_block(value, key, query, src_mask)
        return out

In [7]:
class Decoder(nn.Module):
    def __init__(
        self,
        trg_vocab_size,
        d_model,
        num_layers,
        heads,
        forward_expansion,
        dropout,
        device,
        max_length,
    ):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, d_model)
        self.position_embedding = nn.Embedding(max_length, d_model)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(d_model, heads, forward_expansion, dropout, device)
                for _ in range(num_layers)
            ]
        )
        self.fc_out = nn.Linear(d_model, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        x = self.dropout((self.word_embedding(x) + self.position_embedding(positions)))

        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)

        out = self.fc_out(x)

        return out

In [8]:
class Transformer(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        trg_pad_idx,
        heads=8,
        d_model = 512,
        num_layers=3,
        forward_expansion=4,
        dropout=0.1,
        max_length=100,
        
        device="cpu",
    ):

        super(Transformer, self).__init__()

        self.encoder = Encoder(
            src_vocab_size,
            d_model,
            num_layers,
            heads,
            device,
            forward_expansion,
            dropout,
            max_length,
        )

        self.decoder = Decoder(
            trg_vocab_size,
            d_model,
            num_layers,
            heads,
            forward_expansion,
            dropout,
            device,
            max_length,
        )

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # (N, 1, 1, src_len)
        return src_mask.to(self.device)

    def make_trg_mask(self, trg):
        N, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
            N, 1, trg_len, trg_len
        )

        return trg_mask.to(self.device)

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out

In [9]:
SOS_token = "SOS"
EOS_token = "EOS"
PAD_token = "PAD"
UNK_token = "UNK"

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {"SOS": 0,"EOS":1,"PAD":2,"UNK":3}
        self.word2count = {"SOS": 1,"EOS":1,"PAD":1,"UNK":1}
        self.index2word = {0: "SOS", 1: "EOS",2:"PAD",3:"UNK"}
        self.n_words = 4  # Count SOS and EOS and PAD and UNK

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
        

In [10]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()

In [11]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]


In [12]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    pairs = filterPairs(pairs)
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    
    return input_lang, output_lang, pairs

In [21]:
class Data(Dataset):
    def __init__(self,src_lang,trg_lang,seq_len):
        super().__init__()
        self.seq_len = seq_len
        self.token_src, self.token_trg, self.pairs = readLangs(src_lang,trg_lang,True)
        self.sos_token = torch.tensor([self.token_src.word2index[SOS_token]],dtype=torch.int64)
        self.eos_token = torch.tensor([self.token_src.word2index[EOS_token]],dtype=torch.int64)
        self.pad_token = torch.tensor([self.token_src.word2index[PAD_token]],dtype=torch.int64)
        
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self,idx):
        src_trg_pair = self.pairs[idx]
        src_text = src_trg_pair[0]
        trg_text = src_trg_pair[1]
        
        encoder_input_tokens = [self.token_src.word2index[word] for word in src_text.split(' ')]
        decoder_input_tokens = [self.token_trg.word2index[word] for word in trg_text.split(' ')]
        
        enc_num_padding_tokens = self.seq_len - len(encoder_input_tokens) - 2
        dec_num_padding_tokens = self.seq_len - len(decoder_input_tokens) - 1
        
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("Sentence is too long")
            
        encoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(encoder_input_tokens,dtype=torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * enc_num_padding_tokens, dtype=torch.int64),
            ],
            dim = 0
        )
        
        decoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(decoder_input_tokens, dtype=torch.int64),
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64),
            ],
            dim = 0
        )
        
        label = torch.cat(
            [
                torch.tensor(decoder_input_tokens, dtype=torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64),
            ],
            dim=0,
        )
        
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len
        
        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(1).unsqueeze(2), # (1, 1, seq_len)
            "decoder_mask": (decoder_input != self.pad_token), # (1, seq_len) & (1, seq_len, seq_len),
            "label": label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": trg_text,
        }
    

In [22]:
def get_ds(config):
    train_ds = Data(config["src_lang"],config["trg_lang"],config["seq_len"])
    train_dataloader = DataLoader(train_ds,config["batch_size"],shuffle = True)
    
    return train_dataloader

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
token_src, token_trg, pairs = readLangs('eng','fra',True)

Reading lines...


In [24]:
def get_model():
    src_vocab_size = 17866
    trg_vocab_size = 10700
    model = Transformer(
        src_vocab_size,
        trg_vocab_size,
        2,
        2,
        device=device
    )
    return model

In [147]:
def translate_sentence(sentence,model):
    sentence = normalizeString(sentence)
    tokens = [token_src.word2index[word] for word in sentence.split(' ')]
    tokens.append(token_src.word2index[EOS_token])
    tokens.insert(0,token_src.word2index[SOS_token])
    for _ in range(20-len(tokens)):
        tokens.append(token_src.word2index[PAD_token])
    tokens = torch.LongTensor(tokens).unsqueeze(0).to(device)
    outputs = [token_trg.word2index[SOS_token]]
    for i in range(20):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(tokens, trg_tensor)
        best_guess = output.argmax(2)[:, -1].item()
        outputs.append(best_guess)
        # print(outputs)
    
        if best_guess == 1:
            break
    translated_sentence = [token_trg.index2word[word] for word in outputs]
    translated_sentence = translated_sentence[1:-1]
    final = " ".join([str(elem) for elem in translated_sentence])
    return final

In [120]:
model = get_model().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)
loss_fn = nn.CrossEntropyLoss(ignore_index=2, label_smoothing=0.1).to(device)
def train():
    train_ds = get_ds(config)
    initial_epoch = 0
    step = 0
    for epoch in range(initial_epoch,config["num_epochs"]):
        torch.cuda.empty_cache()
        model.train()
        batch_iterator = tqdm(train_ds, desc=f"Processing Epoch {epoch:02d}")
        
        for batch in batch_iterator:
            encoder_input = batch['encoder_input'].to(device)
            decoder_input = batch['decoder_input'].to(device)
            label = batch['label'].to(device)
            out = model(encoder_input,decoder_input)
            loss = loss_fn(out.view(-1, 10700), label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})
            loss.backward()
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)
        model.eval()
        sentence="Comment vous appelez-vous ?"
        print("SENTENCE: ",sentence)
        print("EXPECTED: ","can you stay a while?")
        print("PREDICTED: ",translate_sentence(sentence,model))
        torch.save(model.state_dict(), f'models/model_{epoch}.pth')

In [1]:
# train()

In [135]:
model = get_model().to(device)
model.load_state_dict(torch.load("models/model_19.pth"))
model.eval()

Transformer(
  (encoder): Encoder(
    (word_embedding): Embedding(17866, 512)
    (position_embedding): Embedding(100, 512)
    (layers): ModuleList(
      (0-2): 3 x TransformerBlock(
        (attention): SelfAttention(
          (values): Linear(in_features=512, out_features=512, bias=False)
          (keys): Linear(in_features=512, out_features=512, bias=False)
          (queries): Linear(in_features=512, out_features=512, bias=False)
          (fc_out): Linear(in_features=512, out_features=512, bias=False)
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): ReLU()
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): 

In [146]:
sentence = "Je peux avoir l'addition, s'il vous plaît ?"
translate_sentence(sentence,model)

'may i have the check please ? EOS'