In [1]:
import unicodedata
import numpy as np
import torch
from torch import nn
import re
import os
from tensorboardX import SummaryWriter

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [2]:
# download the needed data
if not os.path.isfile('data.zip'):
    ! curl -o data.zip https://download.pytorch.org/tutorial/data.zip && unzip data.zip 

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2814k  100 2814k    0     0   507k      0  0:00:05  0:00:05 --:--:--  631k
Archive:  data.zip
   creating: data/
  inflating: data/eng-fra.txt        
   creating: data/names/
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean.txt   
  inflating: data/names/Polish.txt   
  inflating: data/names/Portuguese.txt  
  inflating: data/names/Russian.txt  
  inflating: data/names/Scottish.txt  
  inflating: data/names/Spanish.txt  
  inflating

In [8]:
# Take a quick view of the data.
with open('data/eng-fra.txt') as f:
    f.seek(1000)
    print(f.read(200))

 de question !
Really?	Vraiment ?
Really?	Vrai ?
Really?	Ah bon ?
Thanks.	Merci !
We try.	On essaye.
We won.	Nous avons gagné.
We won.	Nous gagnâmes.
We won.	Nous l'avons emporté.
We won.	Nous l'empor


In [10]:

class Lang:
    """
    Utility class that serves as a language dictionary
    """
    def __init__(self, name):
        self.name = name
        # Count how often a word occurs in the language data.
        self.word2count = {}
        # Words are mapped to indices and vice versa
        self.index2word = {0: "SOS", 1: "EOS"}
        self.word2index = {v:k for k, v in self.index2word.items()}
        # Total word count
        self.n_words = 2  # Count SOS and EOS

    def add_sentence(self, sentence):
        """
        Process words in a sentence string.
        
        :param sentence: (str) 
        """
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        """
        Process words
        :param word: (str)
        """
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        elif word != 'SOS' and word != 'EOS':
            self.word2count[word] += 1
    
    def translate_indexes(self, idx):
        """
        Takes in a vector of indices and returns the sentence.
        """
        return [self.index2word[i] for i in idx]
    
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicode2ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    s = unicode2ascii(s.lower().strip())
    s = re.sub(r"\s?[.!?]", r" EOS", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def read_langs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')
    
    # Split every line into pairs and normalize
    pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [11]:
# Since there are a lot of example sentences and we want to train something quickly, we'll trim the data set to only relatively short and simple sentences. 
# Here the maximum length is 10 words (that includes ending punctuation) and we're filtering to sentences that translate to the form "I am" or "He is" etc. 
# (accounting for apostrophes replaced earlier).



def filter_pairs(pairs):
    MAX_LENGTH = 10
    
    eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
    )
    
    def filter_pair(p):
        return len(p[0].split(' ')) < MAX_LENGTH and \
            len(p[1].split(' ')) < MAX_LENGTH \
            and p[0].startswith(eng_prefixes)
    return [pair for pair in pairs if filter_pair(pair)]

In [12]:
class Data:
    def __init__(self, pairs, lang_1, lang_2):
        self.pairs = np.array(pairs)
        np.random.seed(9)
        np.random.shuffle(self.pairs)
        idx_1 = [[lang_1.word2index[word] for word in s.split(' ')] 
                               for s in self.pairs[:, 0]]
        idx_2 = [[lang_2.word2index[word] for word in s.split(' ')]
                               for s in self.pairs[:, 1]]
        self.idx_pairs = np.array(list(zip(idx_1, idx_2)))
                
    def __str__(self):
        return(self.pairs)
    
    def shuffle(self):
        np.random.shuffle(self.shuffle_idx)
        self.pairs = self.pairs[self.shuffle_idx]
        self.idx_pairs = self.idx_pairs[self.shuffle_idx]      
    
    

In [13]:
def prepare_data(lang1, lang2, reverse=False):
    # read_langs initialized the Lang objects (still empty) and returns the pair sentences.
    input_lang, output_lang, pairs = read_langs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    
    # Reduce data. We haven't got all day to train a model.
    pairs = filter_pairs(pairs) 
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    
    # Process the language pairs.
    for pair in pairs:
        input_lang.add_sentence(pair[0])
        output_lang.add_sentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, Data(pairs, input_lang, output_lang)


eng, fra, data = prepare_data('eng', 'fra', False)
data.pairs[0]

Reading lines...
Read 135842 sentence pairs
Trimmed to 10853 sentence pairs
Counting words...
Counted words:
eng 2922
fra 4486


array(['we are even EOS', 'nous sommes a egalite EOS'], dtype='<U60')

The Encoder

The encoder of a seq2seq network is a RNN that outputs some value for every word from the input sentence. For every input word the encoder outputs a vector and a hidden state, and uses the hidden state for the next input word.

![](img/encoder-network.png)

Every output could be seen as the context of the sentence up to that point.

![](img/training_seq2seq_many2may.svg)

In [20]:
class Encoder(nn.Module):
    def __init__(self, n_words, embedding_size, hidden_size, bidirectional=False, device=device.type):
        super(Encoder, self).__init__()
        self.bidirectional = bidirectional
        self.hidden_size = hidden_size
        # The word embeddings will also be trained
        # To freeze them --> m.embedding.weight.requires_grad = False
        self.embedding = nn.Embedding(n_words, embedding_size)  
        self.rnn = nn.GRU(embedding_size, hidden_size, bidirectional=bidirectional)
        self.device = device
        if device == 'cuda':
            self.cuda()
                    
    def forward(self, x):
        # shape (seq_length, batch_size, input_size)
        dense_vector = self.embedding(x).view(x.shape[0], 1, -1)
        
        # init hidden layer at beginning of sequence
        n = 2 if self.bidirectional else 1
        
        h = torch.zeros(n, 1, self.hidden_size, device=self.device)
            
        x, h = self.rnn(dense_vector, h)

        return x, h
        

m = Encoder(eng.n_words, 10, 2, False, 'cpu')
scentence = torch.tensor([400, 1, 2, 6, 8])
a = m(scentence)
a[0].shape

torch.Size([5, 1, 2])

# Simple Decoder

In the simplest seq2seq decoder we use only last output of the encoder. This last output is sometimes called the context vector as it encodes context from the entire sequence. This context vector is used as the initial hidden state of the decoder.

At every step of decoding, the decoder is given an input token and hidden state. The initial input token is the start-of-string <SOS> token, and the first hidden state is the context vector (the encoder’s last hidden state).
    
![](img/decoder-network.png)
    

In [24]:
class Decoder(nn.Module):
    def __init__(self, embedding_size, hidden_size, output_size, device=device.type):
        super(Decoder, self).__init__()
        self.decoder = 'simple'
        self.hidden_size = hidden_size
        # Lookup table for the last word activation.
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.relu = nn.LeakyReLU()
        self.rnn = nn.GRU(embedding_size, hidden_size)
        self.out = nn.Sequential(
            nn.LeakyReLU(),
            nn.Linear(hidden_size, output_size),
            nn.LogSoftmax(2)
        )
        self.device = device
        if device == 'cuda':
            self.cuda()
            
    def forward(self, word, h):
        """
        :param word: (tensor) Last word or start of sentence token.
        :param h: (tensor) Hidden state or context tensor.
        """
        # map from shape (seq_len, embedding_size) to (seq_len, batch, embedding_size) (Notel: seq length is the number of words in the sentence)
        word_embedding = self.embedding(word).view(h.shape[0], 1, -1)
        a = self.relu(word_embedding)
        x, h = self.rnn(a, h)

        return self.out(x), h

m = Decoder(10, 20, eng.n_words, device='cpu')
m.train(False)
m(torch.tensor([1]) ,torch.zeros(1, 1, 20))[0].sum()

torch.Size([1, 10])
torch.Size([1, 1, 10])


tensor(-23351.8633, grad_fn=<SumBackward0>)

![](img/attention-decoder-network.png)

In [120]:
class AttentionDecoder(nn.Module):
    def __init__(self, embedding_size, hidden_size, output_size, dropout=0.1, max_length=10, device='cpu'):
        super(AttentionDecoder, self).__init__()
        self.decoder = 'attention'
        self.max_length = max_length
        self.device = device
        self.embedding = nn.Sequential(
            nn.Embedding(output_size, embedding_size),
            nn.Dropout(dropout)
        )
        
        # Seperate neural network to learn the attention weights
        self.attention_weights = nn.Sequential(
            nn.Linear(embedding_size + hidden_size, max_length),
            nn.Softmax(2)
        )
        self.attention_combine = nn.Sequential(
            nn.Linear(hidden_size + embedding_size, hidden_size),
            nn.ReLU()
        )

        self.rnn = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Sequential(
            nn.Linear(hidden_size, output_size),
            nn.LogSoftmax(2)
            )
        
        if device == 'cuda':
            self.cuda()
        
    def forward(self, word, h, encoder_outputs):
        """
        :param word: (LongTensor) The word indices. This is the last activated word or 
        :param h: (tensor) The hidden state from the previous step. In the first step, the hidden state of the encoder.
        :param encoder_outputs: (tensor) Zero padded (max_length, shape, shape) outputs from the encoder.
        """
        # map from shape (seq_len, embedding_size) to (seq_len, batch, embedding_size) (Notel: seq length is the number of words in the sentence)
        word_embedding = self.embedding(word).view(1, 1, -1)
        
        # Concatenate the word embedding and the last hidden state, so that attention weights can be determined.
        x = torch.cat((word_embedding, h), 2)
        attention_weights = self.attention_weights(x)
        # attention applied
        x = torch.bmm(attention_weights, encoder_outputs.unsqueeze(0))  # could also be done with matmul
   
        # attention combined
        x = torch.cat((word_embedding, x), 2)
        x = self.attention_combine(x)
        
        x, h = self.rnn(x, h)
       
        x = self.out(x)

        return x, h


embedding_size = 256
hidden_size = 256
max_length = 10

m = Encoder(eng.n_words, embedding_size, hidden_size, bidirectional=False, device='cpu')
scentence = torch.tensor([1, 23, 9])
out, h = m(scentence)
encoder_outputs = torch.zeros(max_length, out.shape[-1], device='cpu')
encoder_outputs[:out.shape[0], :out.shape[-1]] = out.view(out.shape[0], -1)


m = AttentionDecoder(embedding_size, hidden_size, 2, device='cpu')
m(torch.tensor([1]), h, encoder_outputs)[0].shape

torch.Size([1, 1, 2])

In [122]:
def run_decoder(decoder, scentence, h, teacher_forcing=False, encoder_outputs=None):
    loss = 0
    
    for j in range(scentence.shape[0]):
        if decoder.decoder == 'Attention':
            x, h = decoder(word, h, encoder_outputs)
        else:
            x, h = decoder(word, h)

        loss += criterion(x.view(1, -1), scentence[j].view(-1))
        if teacher_forcing:
            word = eng_scentence[j]
        else:
            word = x.argmax().detach()
        if word.item() == 1: # <EOS>
            break
    return loss


In [97]:
teacher_forcing_ratio = 0.5

embedding_size = 100
context_vector_size = 256
bidirectional = False
encoder = Encoder(eng.n_words, embedding_size, context_vector_size, bidirectional)
context_vector_size = context_vector_size * 2 if bidirectional else context_vector_size 
decoder = Decoder(embedding_size, context_vector_size, fra.n_words)
writer = SummaryWriter('tb/emb-100_h256_bidirectionalwRelu')

In [61]:

def train(encoder, decoder):
    criterion = nn.NLLLoss()
    optim_encoder = torch.optim.SGD(encoder.parameters(), lr=0.01)
    optim_decoder = torch.optim.SGD(decoder.parameters(), lr=0.01)

    epochs = 4
    batch_size = 1

    encoder.train(True)
    decoder.train(True)

    for epoch in range(epochs):
        data.shuffle()

        for i in range(data.pairs.shape[0]):
            optim_decoder.zero_grad()
            optim_encoder.zero_grad()
            
            pair = data.idx_pairs[i]

            eng_scentence = torch.tensor(pair[0], device=device)
            fra_scentence = torch.tensor(pair[1], device=device)

            # Encode the input language
            out, h = encoder(fra_scentence)        
            encoder_outputs = torch.zeros(max_length, out.shape[-1], device=device)
            
            if decoder.decoder == 'attention':
                encoder_outputs[:out.shape[0], :out.shape[-1]] = out.view(out.shape[0], -1)

            word = torch.tensor([0], device=device) # <SOS>
            teacher_forcing = np.random.rand() < teacher_forcing_ratio
            loss = run_decoder(decoder, eng_scentence, h, teacher_forcing)

            loss.backward()
            writer.add_scalar('loss', loss.cpu().item() / (j + 1))

            optim_decoder.step()
            optim_encoder.step()

        print(f'epoch {epoch}')


In [None]:
def translate(start, end):
    with torch.no_grad():
        for i in range(start, end):

            pair = data.idx_pairs[i]
            eng_scentence = torch.tensor(pair[0], device=device)
            fra_scentence = torch.tensor(pair[1], device=device)

            print('English scentence:\t', ' '.join([eng.index2word[i] for i in eng_scentence.cpu().data.numpy()][:-1]))
            print('Real translation:\t', ' '.join([fra.index2word[i] for i in fra_scentence.cpu().data.numpy()][:-1]))

            h = encoder(eng_scentence)
            word = torch.tensor([0], device=device)

            translation = []
            for j in range(fra_scentence.shape[0]):
                x, h = decoder(word, h)

                word = x.argmax().detach()
                translation.append(word.cpu().data.tolist())

                if word.item() == 1: # <EOS>
                    break
            print('Model translation:\t', ' '.join([fra.index2word[i] for i in translation][:-1]), '\n')
        
translate(10, 20)

In [111]:
def translate( start, end):
    
    for i in range(start, end):
    
        pair = data.idx_pairs[i]
        eng_scentence = torch.tensor(pair[0], device=device)
        fra_scentence = torch.tensor(pair[1], device=device)

        print('English scentence:\t', ' '.join([eng.index2word[i] for i in eng_scentence.cpu().data.numpy()][:-1]))
        print('French scentence:\t', ' '.join([fra.index2word[i] for i in fra_scentence.cpu().data.numpy()][:-1]))

        # Encode the input language
        out, h = encoder(fra_scentence)        
        encoder_outputs = torch.zeros(max_length, out.shape[-1], device=device)
        encoder_outputs[:out.shape[0], :out.shape[-1]] = out.view(out.shape[0], -1)
        
        word = torch.tensor([0], device=device) # <SOS>
  
        translation = []
        for j in range(fra_scentence.shape[0]):
            x, h = decoder(word, h, encoder_outputs=encoder_outputs)
  
            word = x.argmax().detach()
            translation.append(word.cpu().data.tolist())

            if word.item() == 1: # <EOS>
                break
        print('\nModel translation:\t', ' '.join([eng.index2word[i] for i in translation][:-1]), '\n\n')
        
translate(20, 60)

English scentence:	 i m pretty busy
French scentence:	 je suis plutot occupe

Model translation:	 i m pretty busy 


English scentence:	 she sang better than him
French scentence:	 elle chanta mieux que lui

Model translation:	 she sang better than him 


English scentence:	 i m all for that
French scentence:	 je suis tout a fait pour

Model translation:	 i m all for that 


English scentence:	 they re right behind me
French scentence:	 ils se trouvent juste derriere moi

Model translation:	 they re right behind me 


English scentence:	 you re very funny
French scentence:	 vous etes fort droles

Model translation:	 you re very funny 


English scentence:	 you re very rude
French scentence:	 vous etes fort grossier

Model translation:	 you re very rude 


English scentence:	 he s the class clown
French scentence:	 c est le pitre de la classe

Model translation:	 he s the class 


English scentence:	 they re not always right
French scentence:	 elles n ont pas toujours raison

Model tran