In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import random
import torch
import re
import torch.nn as nn
import torch.optim as optim 
import spacy
from torchtext.datasets import Multi30k

In [3]:
!python -m spacy download de


Collecting de_core_news_sm==2.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.3.0/de_core_news_sm-2.3.0.tar.gz (14.9 MB)
[K     |████████████████████████████████| 14.9 MB 674 kB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l- \ | / - \ done
[?25h  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.3.0-py3-none-any.whl size=14907580 sha256=c794b1b46ddc03a595842031df6ea67d044c0b1660cb70b2627373d88bc7ef9e
  Stored in directory: /tmp/pip-ephem-wheel-cache-6sm3175g/wheels/75/30/c3/ea1c6002eede7f49c8ab017ce62a2981a87b1cd39fab6e6a65
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.3.0
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model vi

In [4]:
spacy_ger = spacy.load("de")
spacy_eng = spacy.load("en")

In [5]:
def tokenize_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]


def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]

In [6]:
t=tokenize_eng("I am Mike Johnson ")
t

['I', 'am', 'Mike', 'Johnson']

In [7]:
from torchtext.data import Field

german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")

english = Field(
    tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>"
)




In [8]:
train_data, valid_data, test_data = Multi30k.splits(
    exts=(".de", ".en"), fields=(german, english)
)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:04<00:00, 292kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 92.4kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 88.0kB/s]


In [9]:
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

In [10]:
class Encoder(nn.Module):
    def __init__(self,input_size,embedding_size,hidden_size,num_layers,p):
        super(Encoder,self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers 
        
        self.embedding = nn.Embedding(input_size , embedding_size)
        self.rnn = nn.LSTM(embedding_size , hidden_size, num_layers , bidirectional= True)
        
        self.fc_hidden = nn.Linear(hidden_size *2, hidden_size)
        self.fc_cell = nn.Linear(hidden_size *2, hidden_size)
        self.dropout = nn.Dropout(p)
        
     

    def forward(self,x):
        # x is shape of (seq_length ,N)
        embedding = self.dropout(self.embedding(x))
        

        encoder_states, (hidden, cell) = self.rnn(embedding)
        
        hidden = self.fc_hidden(torch.cat((hidden[0:1], hidden[1:2]), dim=2))
        cell = self.fc_cell(torch.cat((cell[0:1], cell[1:2]), dim=2))
        
        return encoder_states, hidden , cell 

In [11]:
class Decoder(nn.Module):
    def __init__(self , input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super(Decoder , self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(input_size , embedding_size)
        self.rnn = nn.LSTM(hidden_size * 2 + embedding_size, hidden_size ,num_layers)
        
        self.energy = nn.Linear(hidden_size * 3, 1)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p)
        self.softmax = nn.Softmax(dim=0)
        self.relu = nn.ReLU()
    
    def forward(self,  x, encoder_states, hidden, cell):
        x = x.unsqueeze(0)
        #x:(1,N)
        embedding = self.dropout(self.embedding(x))
        #(1,N,embedding_size)
        
        sequence_length = encoder_states.shape[0]
        h_reshaped = hidden.repeat(sequence_length, 1, 1)
        # h_reshaped: (seq_length, N, hidden_size*2)
        energy = self.relu(self.energy(torch.cat((h_reshaped, encoder_states), dim=2)))
        
        attention = self.softmax(energy)
        attention = attention.permute(1,2,0)
        
        encoder_states = encoder_states.permute(1,0,2)
        
        context_vector = torch.bmm( attention, encoder_states).permute(1,0,2)
        rnn_input = torch.cat((context_vector, embedding), dim=2)
        outputs, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        # outputs shape: (1, N, hidden_size)
        
        predictions = self.fc(outputs).squeeze(0)
        return predictions, hidden, cell

In [12]:
class Seq2seq(nn.Module):
    def __init__(self ,encoder ,decoder):
        super(Seq2seq,self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self ,source ,target ,teacher_force_ratio=0.5):
        batch_size =  source.shape[1]
        target_len =  target.shape[0]
        target_vocab_size = len(english.vocab)
        
        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
        encoder_states, hidden, cell = self.encoder(source)
        
        x = target[0]
        
        for t in range(1,target_len):
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)
            
            outputs[t] = output
            best_guess = output.argmax(1)
            
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs
        
        
        
        

In [13]:
from torch.utils.tensorboard import SummaryWriter
from torchtext.data import Field, BucketIterator
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_model = False
save_model = True

# Training hyperparameters
num_epochs = 50
learning_rate = 3e-4
batch_size = 32

# Model hyperparameters
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 200
decoder_embedding_size = 200
hidden_size = 1024
num_layers = 1
enc_dropout = 0.0
dec_dropout = 0.0

# Tensorboard to get nice loss plot
writer = SummaryWriter(f"runs/loss_plot")
step = 0

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)

decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout,
).to(device)



In [14]:
from torchtext.data.metrics import bleu_score
import sys

In [15]:

def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

In [16]:
def translate_sentence(model, sentence, german, english, device, max_length=50):
    # Load german tokenizer
    spacy_ger = spacy.load("de")

    # Create tokens using spacy and everything in lower case (which is what our vocab is)
    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <SOS> and <EOS> in beginning and end respectively
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # Go through each german token and convert to an index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Convert to Tensor
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # Build encoder hidden, cell state
    with torch.no_grad():
        outputs_encoder, hiddens, cells = model.encoder(sentence_tensor)

    outputs = [english.vocab.stoi["<sos>"]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hiddens, cells = model.decoder(
                previous_word, outputs_encoder, hiddens, cells
            )
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if output.argmax(1).item() == english.vocab.stoi["<eos>"]:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]

    # remove start token
    return translated_sentence[1:]


def bleu(data, model, german, english, device):
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        prediction = translate_sentence(model, src, german, english, device)
        prediction = prediction[:-1]  # remove <eos> token

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)

In [17]:
model = Seq2seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

sentence = (
    "ein boot mit mehreren männern darauf wird von einem großen"
    "pferdegespann ans ufer gezogen."
)

for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    if save_model:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        save_checkpoint(checkpoint)

    model.eval()

    translated_sentence = translate_sentence(
        model, sentence, german, english, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")

    model.train()

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and get to cuda
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # Forward prop
        output = model(inp_data, target)

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way that we have output_words * batch_size that we want to send in into
        # our cost function, so we need to do some reshapin. While we're at it
        # Let's also remove the start token while we're at it
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard
        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1

# running on entire test data takes a while
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score * 100:.2f}")

[Epoch 0 / 50]
=> Saving checkpoint
Translated example sentence: 
 ['black', 'leaving', 'valley', 'awe', 'wood', 'vegetables', 'flour', 'fair', 'approaches', 'overlooking', 'cup', 'drifts', 'sleepy', 'electronics', 'confetti', 'streets', 'barriers', 'does', 'riot', 'splash', 'bridesmaid', 'tuba', 'teeth', 'caged', 'european', 'embracing', 'planes', 'fisherman', 'heron', 'middle', 'gated', 'ninja', 'dragging', 'slow', 'own', '<pad>', 'question', 'upright', 'short', 'olympians', 'nibbling', 'ohio', 'bodysuit', 'cafeteria', 'dishes', 'dunk', 'member', 'wigs', 'sons', 'word']




[Epoch 1 / 50]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'young', 'player', 'with', 'a', '<unk>', 'is', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', '.', '<eos>']
[Epoch 2 / 50]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'man', 'with', 'a', 'yellow', 'hat', 'is', 'by', 'a', 'large', 'of', 'a', '<unk>', '.', '<eos>']
[Epoch 3 / 50]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'brown', 'and', 'white', 'is', 'being', 'pulled', 'from', 'a', '<unk>', 'by', 'a', '<unk>', '<unk>', '.', '<eos>']
[Epoch 4 / 50]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'with', 'white', '<unk>', 'is', 'being', 'pulled', 'by', 'a', '<unk>', 'by', 'a', '.', '.', '<eos>']
[Epoch 5 / 50]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'boat', 'with', 'white', '<unk>', 'is', 'being', 'pulled', 'by', 'a', 'bull', 'by', 'a', '.', '<eos>']
[Epoch 6 / 50]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'boat', 'with', 'white', '<unk>