This experiment is based on the paper [Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation](https://arxiv.org/abs/1406.1078).

In [1]:
!pip install torch==1.4

Collecting torch==1.4
[?25l  Downloading https://files.pythonhosted.org/packages/24/19/4804aea17cd136f1705a5e98a00618cb8f6ccc375ad8bfa437408e09d058/torch-1.4.0-cp36-cp36m-manylinux1_x86_64.whl (753.4MB)
[K     |████████████████████████████████| 753.4MB 21kB/s 
[31mERROR: torchvision 0.6.1+cu101 has requirement torch==1.5.1, but you'll have torch 1.4.0 which is incompatible.[0m
[?25hInstalling collected packages: torch
  Found existing installation: torch 1.5.1+cu101
    Uninstalling torch-1.5.1+cu101:
      Successfully uninstalled torch-1.5.1+cu101
Successfully installed torch-1.4.0


### Imports

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time
torch.__version__

'1.4.0'

In [3]:
# Single seed shared by every random number generator the notebook relies on.
SEED = 1234

# Ask cuDNN for deterministic kernels (independent of the seeding calls below).
torch.backends.cudnn.deterministic = True

# PyTorch generators.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# NumPy's and Python's global generators.
np.random.seed(SEED)
random.seed(SEED)

### Data Preparation

In [4]:
! python -m spacy download de

Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 367kB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907056 sha256=621c3a96876eb7ce39d402799f27fda622475c681c2237545b2a131fe9b3155e
  Stored in directory: /tmp/pip-ephem-wheel-cache-h9i1obd9/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/

In [5]:
! python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [6]:
# 'de' and 'en' are the shortcut names set up by the two `spacy download` cells above
# (their output reports "Linking successful" for de_core_news_sm and en_core_web_sm).
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [7]:
def tokenize_de(text):
  """Split German ``text`` into a list of token strings using ``spacy_de``'s tokenizer."""
  tokens = []
  for tok in spacy_de.tokenizer(text):
    tokens.append(tok.text)
  return tokens

def tokenize_en(text):
  """Split English ``text`` into a list of token strings using ``spacy_en``'s tokenizer."""
  tokens = []
  for tok in spacy_en.tokenizer(text):
    tokens.append(tok.text)
  return tokens


In [8]:
# Source (German) and target (English) fields. Per torchtext's Field options, each
# sentence is tokenized with the matching spaCy tokenizer, lower-cased, and wrapped
# in '<sos>' ... '<eos>' markers.
SRC = Field(tokenize= tokenize_de, init_token= '<sos>', eos_token= '<eos>', lower= True)
TRG = Field(tokenize = tokenize_en, init_token= '<sos>', eos_token= '<eos>', lower =True)

In [9]:
# Load the Multi30k train/validation/test splits. `exts` and `fields` are
# position-matched: '.de' files are processed by SRC, '.en' files by TRG.
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))

training.tar.gz:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:00<00:00, 4.76MB/s]
validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 1.44MB/s]

downloading validation.tar.gz
downloading mmt_task1_test2016.tar.gz



mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 1.40MB/s]


In [10]:
vars(train_data.examples[0])

{'src': ['zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.'],
 'trg': ['two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.']}

In [11]:
# Vocabularies are built from the training split only, so validation/test tokens
# never influence them. min_freq = 2 is torchtext's minimum count for a token to
# be included in the vocabulary.
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data,min_freq = 2)

In [12]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
device

device(type='cuda')

In [14]:
# Number of sentence pairs per batch, shared by all three iterators.
BATCH_SIZE = 128

# One iterator per split; batches are created on `device`.
# NOTE(review): per the torchtext docs BucketIterator batches examples of similar
# length together to reduce padding — confirm against the installed version.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

### Encoder

In [15]:
class Encoder(nn.Module):
  """Single-layer GRU encoder that returns only its final hidden state.

  Args:
    input_dim: number of rows in the source embedding table (source vocabulary size).
    emb_dim: width of each token embedding.
    hid_dim: width of the GRU hidden state.
    dropout: dropout probability applied to the embedded tokens.
  """

  def __init__(self, input_dim, emb_dim, hid_dim, dropout):
    super().__init__()
    self.hid_dim = hid_dim
    # Sub-modules are registered in the order embedding -> rnn -> dropout.
    self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
    self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hid_dim)
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, src):
    """Encode a batch of source token ids.

    Args:
      src: token ids, shape [src_len, batch_size].

    Returns:
      The GRU's final hidden state, shape [1, batch_size, hid_dim]
      (one layer, one direction).
    """
    # [src_len, batch_size] -> [src_len, batch_size, emb_dim]
    token_vectors = self.embedding(src)
    regularised = self.dropout(token_vectors)

    # The per-step outputs are not needed; only the last hidden state is returned.
    _, final_hidden = self.rnn(regularised)
    return final_hidden

### Decoder

In [16]:
class Decoder(nn.Module):
  """One-step GRU decoder that sees the encoder context at every step.

  The context vector is fed both into the GRU (concatenated with the token
  embedding) and into the output layer (concatenated with the embedding and
  the new hidden state).

  Args:
    output_dim: number of rows in the target embedding table and number of output scores.
    emb_dim: width of each token embedding.
    hid_dim: width of the GRU hidden state (and of the context vector).
    dropout: dropout probability applied to the embedded token.
  """

  def __init__(self, output_dim, emb_dim, hid_dim, dropout):
    super().__init__()
    self.hid_dim = hid_dim
    self.output_dim = output_dim

    self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
    # GRU input = token embedding + context vector.
    self.rnn = nn.GRU(input_size=emb_dim + hid_dim, hidden_size=hid_dim)
    # Output layer input = token embedding + new hidden state + context vector.
    self.fc_out = nn.Linear(in_features=emb_dim + 2 * hid_dim, out_features=output_dim)
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, input, hidden, context):
    """Decode a single time step.

    Args:
      input: current token ids, shape [batch_size].
      hidden: previous decoder hidden state, shape [1, batch_size, hid_dim].
      context: encoder context vector, shape [1, batch_size, hid_dim].

    Returns:
      (prediction, hidden): scores of shape [batch_size, output_dim] and the
      updated hidden state of shape [1, batch_size, hid_dim].
    """
    # Add a length-1 sequence axis: [batch_size] -> [1, batch_size]
    step_tokens = input.unsqueeze(0)

    # [1, batch_size, emb_dim]
    embedded = self.dropout(self.embedding(step_tokens))

    # [1, batch_size, emb_dim + hid_dim]
    rnn_input = torch.cat((embedded, context), dim=2)

    # With a single step the GRU's output sequence carries nothing beyond the
    # new hidden state, so only the latter is kept.
    _, hidden = self.rnn(rnn_input, hidden)

    # [batch_size, emb_dim + hid_dim * 2]
    features = torch.cat(
      (embedded.squeeze(0), hidden.squeeze(0), context.squeeze(0)),
      dim=1,
    )

    # [batch_size, output_dim]
    prediction = self.fc_out(features)
    return prediction, hidden


### Seq2Seq

In [17]:
class Seq2Seq(nn.Module):
    """Ties an encoder and a decoder together and unrolls the decoder over the target.

    Args:
        encoder: module mapping ``src`` to a context tensor; must expose ``hid_dim``.
        decoder: module called as ``decoder(input, hidden, context)``; must expose
            ``hid_dim`` and ``output_dim``.
        device: device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        # The context doubles as the decoder's initial hidden state, so sizes must agree.
        assert encoder.hid_dim == decoder.hid_dim

    def forward(self, src, trg, teacher_forcing_ratio = 0.75):
        """Return decoder scores for every target position.

        Args:
            src: source token ids, shape [src_len, batch_size].
            trg: target token ids, shape [trg_len, batch_size].
            teacher_forcing_ratio: probability, per step, of feeding the
                ground-truth token instead of the model's own prediction.

        Returns:
            Tensor of shape [trg_len, batch_size, output_dim]; row 0 is never
            written and stays all zeros.
        """
        trg_len = trg.shape[0]
        batch_size = trg.shape[1]
        vocab_size = self.decoder.output_dim

        # Buffer that collects the scores of every decoding step.
        outputs = torch.zeros(trg_len, batch_size, vocab_size, device=self.device)

        # The encoder's final state is both the fixed context and the initial hidden state.
        context = self.encoder(src)
        hidden = context

        # Decoding starts from the first target row (the <sos> tokens).
        decoder_input = trg[0]

        for t in range(1, trg_len):
            step_scores, hidden = self.decoder(decoder_input, hidden, context)
            outputs[t] = step_scores

            # Exactly one random draw per step decides between the gold token
            # and the greedy prediction.
            if random.random() < teacher_forcing_ratio:
                decoder_input = trg[t]
            else:
                decoder_input = step_scores.argmax(1)

        return outputs

### Training seq2seq model

In [18]:
# Vocabulary sizes determine the embedding-table sizes and the decoder's output width.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
# Embedding width for each side.
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# Shared hidden size: Seq2Seq asserts that encoder and decoder agree on it.
HID_DIM = 512
# Dropout probability applied to the embeddings on each side.
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT)

# Recomputes `device` with the same expression used in the earlier device cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move all parameters to `device`; the model also keeps `device` to allocate its output buffer.
model = Seq2Seq(enc, dec, device).to(device)

In [19]:
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (rnn): GRU(256, 512)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): GRU(768, 512)
    (fc_out): Linear(in_features=1280, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

#### Initialization of weights

We initialize all model parameters (weights and biases) from a normal distribution with mean 0 and standard deviation 0.01.

In [20]:
def init_weights(m):
    """Draw the parameters owned directly by module ``m`` from N(0, 0.01**2).

    Meant to be passed to ``nn.Module.apply`` (``model.apply(init_weights)``),
    which calls it on every submodule and then on the root module. Because
    ``apply`` already walks the module tree, only the parameters registered on
    ``m`` itself are initialised here (``recurse=False``); iterating
    recursively would re-draw every parameter once per ancestor module.

    Args:
        m: any ``nn.Module``.
    """
    for param in m.parameters(recurse=False):
        # nn.init.normal_ fills the tensor in place without recording gradients.
        nn.init.normal_(param, mean=0, std=0.01)
# nn.Module.apply calls init_weights on every submodule and on the model itself,
# then returns the model (hence the repr shown as this cell's output).
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7855, 256)
    (rnn): GRU(256, 512)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): GRU(768, 512)
    (fc_out): Linear(in_features=1280, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

#### Initialize the optimizer

In [21]:
# Adam over all model parameters; no learning rate or other options are passed,
# so the optimizer's defaults apply.
optimizer = optim.Adam(model.parameters())

In [22]:
TRG.pad_token

'<pad>'

In [23]:
# Integer index of the '<pad>' token in the target vocabulary.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# Target positions equal to the pad index are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

#### Training function

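The loss is computed on 2-D inputs of shape `[N, output_dim]` and 1-D targets of shape `[N]`, so the model outputs and the targets are flattened (after dropping the first position) before being passed to the criterion.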

In [24]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: module called as ``model(src, trg)`` that returns scores of
            shape [trg_len, batch_size, output_dim].
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss called as ``criterion([N, output_dim], [N])``.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    epoch_loss = 0

    for batch in iterator:
        src = batch.src
        trg = batch.trg

        optimizer.zero_grad()

        output = model(src, trg)

        #trg = [trg_len, batch_size]
        #output = [trg_len, batch_size, output_dim]

        output_dim = output.shape[-1]

        # Drop position 0 from both tensors and flatten. reshape (rather than
        # view) also accepts non-contiguous tensors, e.g. a transposed target.
        output = output[1:].reshape(-1, output_dim)
        trg = trg[1:].reshape(-1)

        #trg = [(trg_len - 1) * batch_size]
        #output = [(trg_len - 1) * batch_size, output_dim]

        loss = criterion(output, trg)

        loss.backward()

        # Clip the gradient norm to guard against exploding gradients in the RNN.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss/len(iterator)
    

In [25]:
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss without updating the model.

    The model is put in eval mode, gradients are disabled, and the model is
    called with a teacher forcing ratio of 0.

    Args:
        model: module called as ``model(src, trg, 0)`` that returns scores of
            shape [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        criterion: loss called as ``criterion([N, output dim], [N])``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()

    epoch_loss = 0

    with torch.no_grad():

        for batch in iterator:

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]

            # Drop position 0 from both tensors and flatten. reshape (rather
            # than view) also accepts non-contiguous tensors.
            output = output[1:].reshape(-1, output_dim)
            trg = trg[1:].reshape(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [26]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Args:
        start_time: timestamp at the start, in seconds.
        end_time: timestamp at the end, in seconds.

    Returns:
        ``(minutes, seconds)`` as ints; fractional seconds are truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [27]:
N_EPOCHS = 10
CLIP = 1  # maximum gradient norm handed to train() for clipping

# Lowest validation loss seen so far; starting at +inf means the first finite
# validation loss is always saved.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    # train() uses the model's default teacher forcing ratio; evaluate() turns
    # teacher forcing off, so the two losses are not measured under the same regime.
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    # The timed span covers both the training pass and the validation pass.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only on improvement, so the file always holds the weights of
    # the epoch with the best validation loss so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')

Epoch: 01 | Time: 0m 58s
	Train Loss: 4.876
	 Val. Loss: 5.463
Epoch: 02 | Time: 0m 58s
	Train Loss: 4.076
	 Val. Loss: 5.967
Epoch: 03 | Time: 0m 58s
	Train Loss: 3.764
	 Val. Loss: 5.388
Epoch: 04 | Time: 0m 58s
	Train Loss: 3.425
	 Val. Loss: 4.965
Epoch: 05 | Time: 0m 58s
	Train Loss: 3.128
	 Val. Loss: 4.644
Epoch: 06 | Time: 0m 58s
	Train Loss: 2.825
	 Val. Loss: 4.441
Epoch: 07 | Time: 0m 58s
	Train Loss: 2.573
	 Val. Loss: 4.252
Epoch: 08 | Time: 0m 58s
	Train Loss: 2.320
	 Val. Loss: 4.245
Epoch: 09 | Time: 0m 58s
	Train Loss: 2.100
	 Val. Loss: 4.105
Epoch: 10 | Time: 0m 58s
	Train Loss: 1.924
	 Val. Loss: 4.128
