<a href="https://colab.research.google.com/github/satyajitghana/TSAI-DeepVision-EVA4.0-Phase-2/blob/master/11-Attention%26Transformers/de_to_en_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! python -m spacy download en
! python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 2.5MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907056 sha256=efc0b83667797d1d086b121c93544e0d14655fecef64d43c00c22b8c28a9d306
  Stored in directory: /tmp/pip-ephem-wheel-cache-6ole2up6/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Inst

In [None]:
# from torchtext import data, datasets
import spacy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [None]:
class EncoderDecoder(nn.Module):
    """
    Generic sequence-to-sequence container.

    Holds an encoder, a decoder, the source/target embedding layers and the
    output generator, and wires them together in `encode` / `decode`.
    """
    def __init__(self, encoder, decoder, src_embed, trg_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.trg_embed = trg_embed
        self.generator = generator

    def forward(self, src, trg, src_mask, trg_mask, src_lengths, trg_lengths):
        """Encode the source, then decode the target conditioned on it.

        `trg_lengths` is accepted for interface compatibility but not used.
        """
        memory, final_state = self.encode(src, src_mask, src_lengths)
        return self.decode(memory, final_state, src_mask, trg, trg_mask)

    def encode(self, src, src_mask, src_lengths):
        """Embed the source tokens and run them through the encoder."""
        embedded_src = self.src_embed(src)
        return self.encoder(embedded_src, src_mask, src_lengths)

    def decode(self, encoder_hidden, encoder_final, src_mask, trg, trg_mask,
               decoder_hidden=None):
        """Embed the target tokens and run the decoder over them.

        `decoder_hidden` is forwarded to the decoder as its `hidden` argument.
        """
        embedded_trg = self.trg_embed(trg)
        return self.decoder(
            embedded_trg,
            encoder_hidden,
            encoder_final,
            src_mask,
            trg_mask,
            hidden=decoder_hidden,
        )

In [None]:
class Generator(nn.Module):
    """Output layer: bias-free linear projection to the vocabulary followed by log-softmax."""

    def __init__(self, hidden_size, vocab_size):
        super().__init__()
        self.proj = nn.Linear(hidden_size, vocab_size, bias=False)

    def forward(self, x):
        """Return log-probabilities over the vocabulary (normalised on the last dim)."""
        logits = self.proj(x)
        return F.log_softmax(logits, dim=-1)

In [None]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder over a sequence of word embeddings."""

    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.):
        super().__init__()
        self.num_layers = num_layers
        self.rnn = nn.GRU(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout,
        )

    def forward(self, x, mask, lengths):
        """
        Run the bidirectional GRU over the embeddings `x`.

        `x` is expected as [batch, time, dim] and the mini-batch must be
        sorted by length (longest first). `mask` is accepted but not used.
        Returns the per-step outputs and the final states of both directions
        concatenated along the feature dimension.
        """
        packed_input = pack_padded_sequence(x, lengths, batch_first=True)
        packed_output, last_states = self.rnn(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # The GRU interleaves directions (even index = forward, odd = backward),
        # so split them and join along the feature axis.
        forward_states = last_states[0::2]
        backward_states = last_states[1::2]
        final = torch.cat((forward_states, backward_states), dim=2)  # [num_layers, batch, 2*dim]

        return output, final

In [None]:
class Decoder(nn.Module):
    """A conditional GRU decoder with attention.

    At every step the decoder attends over the encoder states, feeds the
    previous target embedding concatenated with the context vector into the
    GRU, and produces a "pre-output" vector from (embedding, GRU output,
    context) that the generator turns into vocabulary scores.

    Args:
        emb_size: size of the target embeddings.
        hidden_size: size of the decoder GRU state; encoder states are
            expected to have size ``2 * hidden_size``.
        attention: attention module exposing ``key_layer`` and a forward of
            the form ``attention(query=, proj_key=, value=, mask=)``.
        num_layers: number of GRU layers.
        dropout: dropout probability for the GRU and the pre-output layer.
        bridge: if True, learn a projection from the final encoder state to
            the initial decoder state; if False, the decoder starts from zeros.
    """

    def __init__(self, emb_size, hidden_size, attention, num_layers=1, dropout=0.5,
                 bridge=True):
        super(Decoder, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.attention = attention
        self.dropout = dropout

        self.rnn = nn.GRU(emb_size + 2*hidden_size, hidden_size, num_layers,
                          batch_first=True, dropout=dropout)

        # to initialize from the final encoder state
        self.bridge = nn.Linear(2*hidden_size, hidden_size, bias=True) if bridge else None

        self.dropout_layer = nn.Dropout(p=dropout)
        self.pre_output_layer = nn.Linear(hidden_size + 2*hidden_size + emb_size,
                                          hidden_size, bias=False)

    def forward_step(self, prev_embed, encoder_hidden, src_mask, proj_key, hidden):
        """Perform a single decoder step (1 word).

        Returns the GRU output, the updated hidden state and the pre-output
        vector for this step.
        """

        # compute context vector using attention mechanism;
        # the query is the top layer of the current hidden state
        query = hidden[-1].unsqueeze(1)  # [#layers, B, D] -> [B, 1, D]
        context, attn_probs = self.attention(
            query=query, proj_key=proj_key,
            value=encoder_hidden, mask=src_mask)

        # update rnn hidden state
        rnn_input = torch.cat([prev_embed, context], dim=2)
        output, hidden = self.rnn(rnn_input, hidden)

        pre_output = torch.cat([prev_embed, output, context], dim=2)
        pre_output = self.dropout_layer(pre_output)
        pre_output = self.pre_output_layer(pre_output)

        return output, hidden, pre_output

    def forward(self, trg_embed, encoder_hidden, encoder_final,
                src_mask, trg_mask, hidden=None, max_len=None):
        """Unroll the decoder one step at a time.

        Returns ``(decoder_states, hidden, pre_output_vectors)`` where the
        first and last are the per-step tensors concatenated along time.
        """

        # the maximum number of steps to unroll the RNN
        if max_len is None:
            max_len = trg_mask.size(-1)

        # initialize decoder hidden state
        if hidden is None:
            hidden = self.init_hidden(encoder_final)

        # init_hidden returns None when there is nothing to condition on;
        # forward_step indexes into the hidden state, so build explicit zeros
        if hidden is None:
            hidden = trg_embed.new_zeros(
                self.num_layers, trg_embed.size(0), self.hidden_size)

        # pre-compute projected encoder hidden states
        # (the "keys" for the attention mechanism)
        # this is only done for efficiency
        proj_key = self.attention.key_layer(encoder_hidden)

        # here we store all intermediate hidden states and pre-output vectors
        decoder_states = []
        pre_output_vectors = []

        # unroll the decoder RNN for max_len steps
        for i in range(max_len):
            prev_embed = trg_embed[:, i].unsqueeze(1)
            output, hidden, pre_output = self.forward_step(
              prev_embed, encoder_hidden, src_mask, proj_key, hidden)
            decoder_states.append(output)
            pre_output_vectors.append(pre_output)

        decoder_states = torch.cat(decoder_states, dim=1)
        pre_output_vectors = torch.cat(pre_output_vectors, dim=1)
        return decoder_states, hidden, pre_output_vectors  # [B, N, D]

    def init_hidden(self, encoder_final):
        """Returns the initial decoder state, conditioned on the final
        encoder state.

        Returns None (meaning "start with zeros") when no encoder state is
        given or when the decoder was built with ``bridge=False``.
        """

        if encoder_final is None or self.bridge is None:
            return None  # start with zeros

        return torch.tanh(self.bridge(encoder_final))


In [None]:
class BahdanauAttention(nn.Module):
    """Implements Bahdanau (MLP) attention.

    Args:
        hidden_size: size of the internal projection space.
        key_size: feature size of the keys/values; defaults to
            ``2 * hidden_size`` (bi-directional encoder).
        query_size: feature size of the query; defaults to ``hidden_size``.
    """

    def __init__(self, hidden_size, key_size=None, query_size=None):
        super(BahdanauAttention, self).__init__()

        # We assume a bi-directional encoder so key_size is 2*hidden_size
        key_size = 2 * hidden_size if key_size is None else key_size
        query_size = hidden_size if query_size is None else query_size

        self.key_layer = nn.Linear(key_size, hidden_size, bias=False)
        self.query_layer = nn.Linear(query_size, hidden_size, bias=False)
        self.energy_layer = nn.Linear(hidden_size, 1, bias=False)

        # to store attention scores
        self.alphas = None

    def forward(self, query=None, proj_key=None, value=None, mask=None):
        """Compute the context vector for one decoder step.

        `proj_key` must already be projected with `self.key_layer`.
        Positions where `mask` is 0/False receive zero attention weight.
        Returns ``(context, alphas)`` and also stores `alphas` on the module.
        """
        assert mask is not None, "mask is required"

        # We first project the query (the decoder state).
        # The projected keys (the encoder states) were already pre-computed.
        query = self.query_layer(query)

        # Calculate scores.
        scores = self.energy_layer(torch.tanh(query + proj_key))
        scores = scores.squeeze(2).unsqueeze(1)

        # Mask out invalid positions.
        # The mask marks valid positions, so we select the invalid ones with
        # `mask == 0`. Use the out-of-place masked_fill so the operation is
        # tracked by autograd instead of mutating `scores.data` behind its back.
        scores = scores.masked_fill(mask == 0, -float('inf'))

        # Turn scores to probabilities.
        alphas = F.softmax(scores, dim=-1)
        self.alphas = alphas

        # The context vector is the weighted sum of the values.
        context = torch.bmm(alphas, value)

        # context shape: [B, 1, 2D], alphas shape: [B, 1, M]
        return context, alphas

In [None]:
def make_model(src_vocab, tgt_vocab, emb_size=256, hidden_size=512, num_layers=1, dropout=0.1):
    """Build an attention-based encoder-decoder from hyperparameters.

    `src_vocab` / `tgt_vocab` are the vocabulary sizes used for the source
    and target embedding tables (and the generator's output size).
    """
    attention = BahdanauAttention(hidden_size)

    # Sub-modules are created in the same order as they are passed on,
    # so parameter initialisation consumes the RNG identically.
    encoder = Encoder(emb_size, hidden_size, num_layers=num_layers, dropout=dropout)
    decoder = Decoder(emb_size, hidden_size, attention, num_layers=num_layers, dropout=dropout)
    src_embed = nn.Embedding(src_vocab, emb_size)
    trg_embed = nn.Embedding(tgt_vocab, emb_size)
    generator = Generator(hidden_size, tgt_vocab)

    return EncoderDecoder(encoder, decoder, src_embed, trg_embed, generator)

In [None]:
def greedy_decode(model, src, src_mask, src_lengths, max_len=100, sos_index=1, eos_index=None):
    """Greedily decode a single sentence (batch size 1).

    Args:
        model: object exposing `encode`, `decode` and `generator` as used below.
        src, src_mask, src_lengths: source batch passed to `model.encode`.
        max_len: maximum number of tokens to generate.
        sos_index: index of the start-of-sentence token fed at the first step.
        eos_index: index of the end-of-sentence token; when given, decoding
            stops as soon as it is produced and it is not part of the result.

    Returns:
        numpy array with the generated token indices (without EOS).

    Note: this function does not change the model's train/eval mode; call
    `model.eval()` beforehand so dropout is disabled during decoding.
    """

    output = []
    hidden = None

    with torch.no_grad():
        encoder_hidden, encoder_final = model.encode(src, src_mask, src_lengths)
        prev_y = torch.ones(1, 1).fill_(sos_index).type_as(src)
        trg_mask = torch.ones_like(prev_y)

        for _ in range(max_len):
            _, hidden, pre_output = model.decode(
                encoder_hidden, encoder_final, src_mask,
                prev_y, trg_mask, hidden)

            # we predict from the pre-output layer, which is
            # a combination of Decoder state, prev emb, and context
            prob = model.generator(pre_output[:, -1])

            next_word = torch.max(prob, dim=1)[1].item()
            output.append(next_word)

            # everything after the first </s> would be discarded below,
            # so there is no point in running further decoder steps
            if eos_index is not None and next_word == eos_index:
                break

            prev_y = torch.ones(1, 1).type_as(src).fill_(next_word)

    output = np.array(output)

    # cut off everything starting from </s>
    # (only when eos_index provided)
    if eos_index is not None:
        first_eos = np.where(output == eos_index)[0]
        if len(first_eos) > 0:
            output = output[:first_eos[0]]

    return output

In [None]:
# Load the German and English spaCy pipelines via the 'de' / 'en' shortcut
# names installed by the `python -m spacy download` cell at the top.
# Only their tokenizers are used below.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    """Split German `text` into a list of token strings with the spaCy tokenizer."""
    tokens = []
    for token in spacy_de.tokenizer(text):
        tokens.append(token.text)
    return tokens

def tokenize_en(text):
    """Split English `text` into a list of token strings with the spaCy tokenizer."""
    tokens = []
    for token in spacy_en.tokenizer(text):
        tokens.append(token.text)
    return tokens

In [None]:
from torchtext import data, datasets

# Special tokens shared by the source and target fields.
UNK_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"    
SOS_TOKEN = "<s>"
EOS_TOKEN = "</s>"
# Passed as `lower=` to both fields below.
LOWER = True

# we include lengths to provide to the RNNs
# Source (German) field: no start token (init_token=None), only an EOS token.
SRC = data.Field(tokenize=tokenize_de, 
                    batch_first=True, lower=LOWER, include_lengths=True,
                    unk_token=UNK_TOKEN, pad_token=PAD_TOKEN, init_token=None, eos_token=EOS_TOKEN)
# Target (English) field: wrapped in both SOS and EOS tokens.
TRG = data.Field(tokenize=tokenize_en, 
                    batch_first=True, lower=LOWER, include_lengths=True,
                    unk_token=UNK_TOKEN, pad_token=PAD_TOKEN, init_token=SOS_TOKEN, eos_token=EOS_TOKEN)

MAX_LEN = 25  # NOTE: we filter out a lot of sentences for speed
# Keep only sentence pairs where both sides have at most MAX_LEN tokens.
train_data, valid_data, test_data = datasets.IWSLT.splits(
    exts=('.de', '.en'), fields=(SRC, TRG), 
    filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and 
        len(vars(x)['trg']) <= MAX_LEN)
MIN_FREQ = 5  # NOTE: we limit the vocabulary to frequent words for speed
# Vocabularies are built from the training split only.
SRC.build_vocab(train_data.src, min_freq=MIN_FREQ)
TRG.build_vocab(train_data.trg, min_freq=MIN_FREQ)

# Padding index looked up in the *target* vocabulary; it is also what gets
# passed to rebatch() further down to build the source mask.
PAD_INDEX = TRG.vocab.stoi[PAD_TOKEN]

downloading de-en.tgz


de-en.tgz: 100%|██████████| 24.2M/24.2M [00:07<00:00, 3.03MB/s]


.data/iwslt/de-en/IWSLT16.TED.tst2011.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TED.tst2014.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TED.tst2010.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TED.tst2014.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TEDX.tst2013.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TED.tst2013.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TED.tst2012.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TED.tst2012.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TEDX.tst2014.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TEDX.dev2012.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TEDX.tst2014.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TED.tst2013.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TED.dev2010.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TEDX.tst2013.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TED.dev2010.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TED.tst2010.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TEDX.dev2012.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TED.tst2011.de-en.de.xml
.data/iwslt/de-en/train.tags.de-en.de
.data/iwslt/de-en/train.tags.de-en.en


In [None]:
def save_meta(meta, path):
    """Serialize `meta` to the file at `path` using dill.

    dill is imported lazily so the notebook only needs it when this helper
    is actually called. The file is opened in a `with` block so the handle
    is closed even if serialization fails.
    """
    import dill
    with open(path, 'wb') as output:
        dill.dump(meta, output)

In [None]:
import inspect

In [None]:
TRG.vocab.stoi.default_factory

<function torchtext.vocab._default_unk_index>

In [None]:
print(inspect.getsource(TRG.vocab.stoi.default_factory))

def _default_unk_index():
    return 0



In [None]:
from collections import defaultdict

In [None]:
# Copy the vocab lookups into fresh defaultdicts whose default is 0, the same
# value torchtext's `_default_unk_index` factory returns (see the cell above).
# NOTE(review): presumably done so the saved metadata does not reference
# torchtext's private factory function when it is loaded again — confirm.
TRG_vocab_stoi = defaultdict(lambda: 0, TRG.vocab.stoi)
SRC_vocab_stoi = defaultdict(lambda: 0, SRC.vocab.stoi)

In [None]:
# Persist everything inference needs besides the model weights.
# The special tokens are taken from the constants the fields were built with,
# so the saved metadata cannot drift from the vocabulary.
save_meta({
    "UNK_TOKEN": UNK_TOKEN,
    "PAD_TOKEN": PAD_TOKEN,
    "SOS_TOKEN": SOS_TOKEN,
    "EOS_TOKEN": EOS_TOKEN,
    "TRG.vocab.itos": list(TRG.vocab.itos),
    "TRG.vocab.stoi": TRG_vocab_stoi,
    "SRC.vocab.itos": list(SRC.vocab.itos),
    "SRC.vocab.stoi": SRC_vocab_stoi,
}, 'de-to-en-meta.dill.pkl')

In [None]:
def load_meta(path):
    """Load and return the object stored in the dill file at `path`.

    The file is opened in a `with` block so the handle is closed even if
    deserialization fails.

    SECURITY: dill/pickle loading can execute arbitrary code contained in
    the file; only call this on files from a trusted source.
    """
    import dill
    with open(path, "rb") as inp:
        return dill.load(inp)

In [None]:
meta = load_meta('de-to-en-meta.dill.pkl')

In [None]:
meta.keys()

dict_keys(['UNK_TOKEN', 'PAD_TOKEN', 'SOS_TOKEN', 'EOS_TOKEN', 'TRG.vocab.itos', 'TRG.vocab.stoi', 'SRC.vocab.itos', 'SRC.vocab.stoi'])

In [None]:
[type(meta[item]) for item in meta.keys()]

[str,
 str,
 str,
 str,
 list,
 collections.defaultdict,
 list,
 collections.defaultdict]

In [None]:
# Rebuild the network with vocabulary sizes taken from the saved metadata.
# NOTE(review): emb_size/hidden_size/num_layers presumably have to match the
# values used when the checkpoint loaded below was trained — confirm.
# NOTE(review): the output below looks like PyTorch's warning about a non-zero
# GRU dropout combined with num_layers=1 — confirm.
model = make_model(len(meta['SRC.vocab.itos']), len(meta['TRG.vocab.itos']),
                   emb_size=256, hidden_size=256,
                   num_layers=1, dropout=0.2)

  "num_layers={}".format(dropout, num_layers))


In [None]:
# Read the trained weights, remapping all tensors to the CPU.
# NOTE(review): the absolute '/content/...' path looks Colab-specific; the
# checkpoint has to be uploaded there before running this cell.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
model_state = torch.load('/content/annotated-encoder-decoder-de-en.pt', map_location='cpu')

In [None]:
# Switch to evaluation mode before inference: modules are created in training
# mode, which would leave the decoder's dropout active and make the greedy
# translations below non-deterministic.
model.eval()
model.load_state_dict(model_state)

<All keys matched successfully>

Actual inference begins here

In [None]:
ger = 'als ich 11 jahre alt war, wurde ich eines morgens von den heller freude geweckt.'
ger

'als ich 11 jahre alt war, wurde ich eines morgens von den heller freude geweckt.'

In [None]:
# The SRC field was built with lower=True, so the vocabulary only contains
# lower-cased tokens. Lower-case each token after tokenizing to mirror that
# preprocessing; otherwise capitalised input words would map to <unk>.
ger_tok = [tok.lower() for tok in tokenize_de(ger)]
ger_tok

['als',
 'ich',
 '11',
 'jahre',
 'alt',
 'war',
 ',',
 'wurde',
 'ich',
 'eines',
 'morgens',
 'von',
 'den',
 'heller',
 'freude',
 'geweckt',
 '.']

In [None]:
# Map every token to its source-vocabulary index and append the EOS index,
# mirroring the SRC field (eos_token set, no init_token).
# The lookup table is a defaultdict with default 0, so tokens missing from
# the vocabulary map to index 0 (and get inserted into the dict as a side effect).
src = [meta['SRC.vocab.stoi'][x] for x in ger_tok] + [meta['SRC.vocab.stoi'][meta["EOS_TOKEN"]]]
src

[41,
 9,
 1012,
 144,
 464,
 35,
 4,
 84,
 9,
 126,
 1715,
 21,
 27,
 11351,
 1117,
 8043,
 3,
 2]

In [None]:
[meta['SRC.vocab.itos'][x] for x in src]

['als',
 'ich',
 '11',
 'jahre',
 'alt',
 'war',
 ',',
 'wurde',
 'ich',
 'eines',
 'morgens',
 'von',
 'den',
 'heller',
 'freude',
 'geweckt',
 '.',
 '</s>']

In [None]:
src_length = [len(src)]
src_length

[18]

In [None]:
src_mask = torch.ones(src_length) > 0
src_mask

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True])

Checking whether the special tokens have the same indices in the source and target vocabularies

In [None]:
meta['SRC.vocab.stoi'][meta['PAD_TOKEN']], meta['TRG.vocab.stoi'][meta['PAD_TOKEN']]

(1, 1)

In [None]:
meta['SRC.vocab.stoi'][meta['SOS_TOKEN']], meta['TRG.vocab.stoi'][meta['SOS_TOKEN']]

(0, 2)

In [None]:
meta['SRC.vocab.stoi'][meta['EOS_TOKEN']], meta['TRG.vocab.stoi'][meta['EOS_TOKEN']]

(2, 3)

In [None]:
# src, src_masks, src_lengths
# Turn the index list into a LongTensor, mark every non-padding position as
# valid (unsqueeze(-2) adds the leading axis seen in the output below), and
# store the sentence length as a 0-d tensor.
src = torch.LongTensor(src)
src_mask = (src != meta['SRC.vocab.stoi'][meta['PAD_TOKEN']]).unsqueeze(-2)
src_length = torch.tensor(len(src))

In [None]:
src, src_mask, src_length

(tensor([   41,     9,  1012,   144,   464,    35,     4,    84,     9,   126,
          1715,    21,    27, 11351,  1117,  8043,     3,     2]),
 tensor([[True, True, True, True, True, True, True, True, True, True, True, True,
          True, True, True, True, True, True]]),
 tensor(18))

Unsqueeze one dimension to turn each tensor into a batch of size 1

In [None]:
# Prepend a batch axis of size 1 to each tensor.
# NOTE: this cell rebinds the same names, so running it more than once keeps
# adding leading dimensions; re-run the previous cell first if needed.
src = src.unsqueeze(0)
src_mask = src_mask.unsqueeze(0)
src_length = src_length.unsqueeze(0)

In [None]:
src.shape, src_mask.shape, src_length.shape

(torch.Size([1, 18]), torch.Size([1, 1, 18]), torch.Size([1]))

In [None]:
# Translate the single-sentence batch. The SOS/EOS indices come from the
# *target* vocabulary: decoding starts from <s>, and the returned array is
# cut at the first </s>.
output = greedy_decode(
    model, 
    src, 
    src_mask, 
    src_length, 
    max_len=100, 
    sos_index=meta['TRG.vocab.stoi'][meta['SOS_TOKEN']], 
    eos_index=meta['TRG.vocab.stoi'][meta['EOS_TOKEN']]
    )

In [None]:
output.shape

(15,)

In [None]:
print(" ".join([meta['TRG.vocab.itos'][x] for x in output ]))

when i was 11 years , i was called the morning to the morning .


## Some Experiments

This section explores how the `Batch` class works, so that I can wrap raw text given by the user into something equivalent to a `Batch`; I can then simply pass its values to `greedy_decode` and things will work.

In [None]:
 class Batch:
    """Container for one mini-batch plus the masks derived from it.

    `src` and `trg` are (tensor, lengths) pairs as produced by a torchtext
    iterator whose fields use include_lengths=True; `trg` may be None.
    Tensors are moved to the GPU when the global USE_CUDA flag is set.
    """
    def __init__(self, src, trg, pad_index=0):
        src_tokens, src_lens = src
        has_target = trg is not None

        # source side
        self.src = src_tokens
        self.src_lengths = src_lens
        self.src_mask = (src_tokens != pad_index).unsqueeze(-2)
        self.nseqs = src_tokens.size(0)

        # target side: decoder input is the target without its last token,
        # the prediction target is the same sequence shifted left by one
        if has_target:
            trg_tokens, trg_lens = trg
            self.trg = trg_tokens[:, :-1]
            self.trg_lengths = trg_lens
            self.trg_y = trg_tokens[:, 1:]
            self.trg_mask = (self.trg_y != pad_index)
            self.ntokens = (self.trg_y != pad_index).data.sum().item()
        else:
            self.trg = None
            self.trg_y = None
            self.trg_mask = None
            self.trg_lengths = None
            self.ntokens = None

        if USE_CUDA:
            self.src = self.src.cuda()
            self.src_mask = self.src_mask.cuda()

            if has_target:
                self.trg = self.trg.cuda()
                self.trg_y = self.trg_y.cuda()
                self.trg_mask = self.trg_mask.cuda()
                

In [None]:
DEVICE = torch.device('cpu')

In [None]:
# Training iterator: batches of 64 examples, sorted within each batch by
# (source length, target length).
train_iter = data.BucketIterator(train_data, batch_size=64, train=True, 
                                 sort_within_batch=True, 
                                 sort_key=lambda x: (len(x.src), len(x.trg)), repeat=False,
                                 device=DEVICE)
# Validation iterator: one sentence pair per batch, without sorting.
valid_iter = data.Iterator(valid_data, batch_size=1, train=False, sort=False, repeat=False, 
                           device=DEVICE)


def rebatch(pad_idx, batch):
    """Convert a torchtext batch into a `Batch`, using `pad_idx` to build the masks."""
    source, target = batch.src, batch.trg
    return Batch(source, target, pad_idx)

In [None]:
USE_CUDA = False

In [None]:
batch_iter = iter(valid_iter)

In [None]:
batch = next(batch_iter)
batch = rebatch(PAD_INDEX, batch)

In [None]:
batch.src.shape, batch.src_mask.shape, batch.src_lengths.shape

(torch.Size([1, 19]), torch.Size([1, 1, 19]), torch.Size([1]))

In [None]:
batch.src, batch.trg, batch.trg_y

(tensor([[   41,     9,  1012,   144,   464,    35,     4,    84,     9,   126,
           1715,    21,    27,     0, 11351,  1117,  8043,     3,     2]]),
 tensor([[   2,   70,   11,   24, 1460,    5,   11,  371, 9546,   66,   40,  690,
             8,    6,  427,   10, 1806,   16,   42,  510,    4]]),
 tensor([[  70,   11,   24, 1460,    5,   11,  371, 9546,   66,   40,  690,    8,
             6,  427,   10, 1806,   16,   42,  510,    4,    3]]))

In [None]:
" ".join([SRC.vocab.itos[x] for x in batch.src[0]])

'als ich 11 jahre alt war , wurde ich eines morgens von den <unk> heller freude geweckt . </s>'

In [None]:
(batch.src[0] != 0).unsqueeze(-2)

tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True, False,  True,  True,  True,  True,  True]])

In [None]:
batch.src_mask[0]

tensor([[True, True, True, True, True, True, True, True, True, True, True, True,
         True, True, True, True, True, True, True]])

In [None]:
batch.src_lengths.shape

torch.Size([1])

In [None]:
batch.src_lengths[0]

tensor(19)

In [None]:
" ".join([TRG.vocab.itos[x] for x in batch.trg[0]])

'<s> when i was 11 , i remember waking up one morning to the sound of joy in my house .'

In [None]:
" ".join([TRG.vocab.itos[x] for x in batch.trg_y[0]])

'when i was 11 , i remember waking up one morning to the sound of joy in my house . </s>'

In [None]:
batch.trg_mask[0]

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True])