<a href="https://colab.research.google.com/github/shivammehta007/NLPResearch/blob/master/Seq2Seq_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sequence to Sequence Machine Translation [TUT]

We will write an encoder-decoder (Seq2Seq) model in PyTorch that translates German sentences into English.

In [1]:
# Shell escapes: upgrade tqdm (the notebook imports `tqdm.notebook`) and
# download spaCy's German model through the `de` shortcut.
!pip install -U tqdm
!python -m spacy download de

Requirement already up-to-date: tqdm in /usr/local/lib/python3.6/dist-packages (4.43.0)
Collecting de_core_news_sm==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.1.0/de_core_news_sm-2.1.0.tar.gz (11.1MB)
[K     |████████████████████████████████| 11.1MB 1.8MB/s 
[?25hBuilding wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.1.0-cp36-none-any.whl size=11073065 sha256=896b80e610b854e1a41f80bcce6e91d8624485665f6d99948b887776bf2b22ff
  Stored in directory: /tmp/pip-ephem-wheel-cache-eqqkqjj8/wheels/b4/8b/5e/d2ce5d2756ca95de22f50f68299708009a4aafda2aea79c4e4
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔

In [0]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


import torchtext.data as data
from torchtext.datasets import TranslationDataset, Multi30k

import os
import spacy
import math
import random
from tqdm.notebook import tqdm

## Seeding
Seed every random number generator so that results are reproducible.

In [0]:
def seed_all(seed=1234):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device), exports ``PYTHONHASHSEED`` and configures cuDNN so that repeated
    runs select the same kernels.

    Args:
        seed (int): Value used to seed all generators. Defaults to 1234.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not just the current one; silently ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different (non-deterministic) kernels between runs.
    torch.backends.cudnn.benchmark = False

SEED = 1234
seed_all(SEED)

## PreProcessing

### Tokenizing

In [0]:
# Load the pipelines by their full package names: the 'en' / 'de' shortcut
# links were removed in spaCy 3, while the full names load on spaCy 2 as well.
# NOTE(review): assumes the 'en' shortcut pointed at en_core_web_sm -- confirm.
spacy_en = spacy.load('en_core_web_sm')
spacy_de = spacy.load('de_core_news_sm')

In [0]:
def tokenize_en(sentence):
    """Split an English sentence into a list of token strings.

    Args:
        sentence (str): Raw English text.

    Returns:
        list: Token texts produced by the ``spacy_en`` tokenizer, in order.
    """
    tokens = []
    for token in spacy_en.tokenizer(sentence):
        tokens.append(token.text)
    return tokens

def tokenize_de(sentence):
    """Split a German sentence into token strings, in reversed order.

    Args:
        sentence (str): Raw German text.

    Returns:
        list: Token texts produced by the ``spacy_de`` tokenizer, last token
        first.
    """
    tokens = [token.text for token in spacy_de.tokenizer(sentence)]
    tokens.reverse()
    return tokens

### Data Loaders
Create two torchtext `Field`s: one for the source language (German) and one for the destination language (English).

In [0]:
# German side: lower-cased and wrapped in <sos> ... <eos> markers.
source = data.Field(
    tokenize=tokenize_de,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)

# English side: same preprocessing, English tokenizer.
destination = data.Field(
    tokenize=tokenize_en,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)

In [7]:
%%time
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(source, destination))

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 667kB/s] 


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 221kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 218kB/s]


CPU times: user 17.8 s, sys: 169 ms, total: 18 s
Wall time: 22.2 s


In [8]:
# Number of examples in the train, validation and test splits.
tuple(map(len, (train_data, valid_data, test_data)))

(29000, 1014, 1000)

In [0]:
# Minimum number of occurrences in the training split for a token to be kept
# in a vocabulary; named once so both fields always use the same threshold.
MIN_FREQ = 2

# Vocabularies are built from the training split only.
source.build_vocab(train_data, min_freq=MIN_FREQ)
destination.build_vocab(train_data, min_freq=MIN_FREQ)

#### Set up Device, CPU or GPU
The iterators created below place every batch on this device.

In [10]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [0]:
# Number of sentence pairs per batch.
BATCH_SIZE = 128

# One iterator per split, each yielding batches on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    datasets=(train_data, valid_data, test_data),
    device=device,
    batch_size=BATCH_SIZE,
)

## Building the Model

For starters, the model uses uni-directional LSTMs.

#### Encoder

In [0]:
class Encoder(nn.Module):
    """Encoder RNN for the Seq2Seq Model.

    Embeds a batch of source token indices and runs the embeddings through a
    multi-layer LSTM. Only the final hidden and cell states are returned.

    Args:
        input_dim (int): Number of rows in the embedding table (source
            vocabulary size).
        embedding_dim (int): Size of each token embedding.
        hid_dim (int): Size of the LSTM hidden and cell states.
        n_layers (int): Number of stacked LSTM layers.
        dropout (float): Dropout probability, applied to the embeddings and
            between LSTM layers.
    """

    def __init__(self, input_dim, embedding_dim, hid_dim, n_layers, dropout):
        # The original `super(self, Encoder)` had its arguments swapped and
        # raised TypeError on construction.
        super().__init__()

        self.n_layers = n_layers
        self.hid_dim = hid_dim
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, source_sentence):
        """Encode a batch of source sentences.

        Args:
            source_sentence (torch.LongTensor): Token indices. The LSTM is
                sequence-first (``batch_first`` is not set), so the expected
                shape is ``(src_len, batch_size)``.

        Returns:
            tuple: ``(hidden, cell)``, each of shape
            ``(n_layers, batch_size, hid_dim)``.
        """
        # Regularise the embeddings; the dropout layer was previously created
        # but never used.
        embedded = self.dropout(self.embedding(source_sentence))
        # Per-step outputs are not needed, only the final states.
        _, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

#### Decoder

In [0]:
class Decoder(nn.Module):
    """Decoder RNN for the Seq2Seq Model.

    Decodes one target token per call: embeds the previous token, advances a
    multi-layer LSTM by one step from the given states, and projects the LSTM
    output to scores over the target vocabulary.

    Args:
        output_dim (int): Number of rows in the embedding table and size of
            the prediction vector (target vocabulary size).
        embedding_dim (int): Size of each token embedding.
        hid_dim (int): Size of the LSTM hidden and cell states.
        n_layers (int): Number of stacked LSTM layers.
        dropout (float): Dropout probability, applied to the embeddings and
            between LSTM layers.
    """

    def __init__(self, output_dim, embedding_dim, hid_dim, n_layers, dropout):
        # Must run before any submodule is assigned; it was missing.
        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        # The embedding size must match the LSTM input size; the original
        # referenced an undefined name `hidden_dim` here.
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hid_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input (torch.LongTensor): Previous target token indices, shape
                ``(batch_size,)``.
            hidden (torch.Tensor): Previous hidden state, shape
                ``(n_layers, batch_size, hid_dim)``.
            cell (torch.Tensor): Previous cell state, same shape as
                ``hidden``.

        Returns:
            tuple: ``(prediction, hidden, cell)`` where ``prediction`` has
            shape ``(batch_size, output_dim)`` and the states keep their
            input shape.
        """
        # Add a sequence dimension of length 1: (batch,) -> (1, batch).
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        # Continue from the supplied states; without them the LSTM would
        # restart from zeros on every step and ignore the encoder context.
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell
