# Outline

1. Data set and task
2. Data processing XML files
3. Why we need encoder decoder architecture
4. Basic GRU based encoder decoder
5. Adding attention
6. Evaluation

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np

# Select the compute device: the first CUDA GPU when one is available, otherwise the CPU.
# NOTE: despite its name, `device_gpu` may therefore refer to the CPU.
device_gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# Visualisation tools
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output
import random

  import pandas.util.testing as tm


# Prepare Data

### Alphabets setup

As usual, first convert the characters to numerical form so that the model can process them.

### English

In [3]:
# English character vocabulary: index 0 is reserved for the padding token and
# the capital letters A-Z occupy indices 1-26.
eng_alphabets = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
pad_char = '-PAD-'

eng_alpha2index = {pad_char: 0}
eng_alpha2index.update(
    (letter, position) for position, letter in enumerate(eng_alphabets, start=1)
)

print(eng_alpha2index)

{'-PAD-': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15, 'P': 16, 'Q': 17, 'R': 18, 'S': 19, 'T': 20, 'U': 21, 'V': 22, 'W': 23, 'X': 24, 'Y': 25, 'Z': 26}


### Hindi

In [4]:
# Devanagari Unicode block: code points 2304-2431 in decimal (U+0900-U+097F in hex)
hindi_alphabets = list(map(chr, range(2304, 2432)))
hindi_alphabet_size = len(hindi_alphabets)

# Index 0 is reserved for the padding token; the characters follow from index 1.
hindi_alpha2index = {pad_char: 0}
hindi_alpha2index.update(
    (letter, position) for position, letter in enumerate(hindi_alphabets, start=1)
)

print(hindi_alpha2index)

{'-PAD-': 0, 'ऀ': 1, 'ँ': 2, 'ं': 3, 'ः': 4, 'ऄ': 5, 'अ': 6, 'आ': 7, 'इ': 8, 'ई': 9, 'उ': 10, 'ऊ': 11, 'ऋ': 12, 'ऌ': 13, 'ऍ': 14, 'ऎ': 15, 'ए': 16, 'ऐ': 17, 'ऑ': 18, 'ऒ': 19, 'ओ': 20, 'औ': 21, 'क': 22, 'ख': 23, 'ग': 24, 'घ': 25, 'ङ': 26, 'च': 27, 'छ': 28, 'ज': 29, 'झ': 30, 'ञ': 31, 'ट': 32, 'ठ': 33, 'ड': 34, 'ढ': 35, 'ण': 36, 'त': 37, 'थ': 38, 'द': 39, 'ध': 40, 'न': 41, 'ऩ': 42, 'प': 43, 'फ': 44, 'ब': 45, 'भ': 46, 'म': 47, 'य': 48, 'र': 49, 'ऱ': 50, 'ल': 51, 'ळ': 52, 'ऴ': 53, 'व': 54, 'श': 55, 'ष': 56, 'स': 57, 'ह': 58, 'ऺ': 59, 'ऻ': 60, '़': 61, 'ऽ': 62, 'ा': 63, 'ि': 64, 'ी': 65, 'ु': 66, 'ू': 67, 'ृ': 68, 'ॄ': 69, 'ॅ': 70, 'ॆ': 71, 'े': 72, 'ै': 73, 'ॉ': 74, 'ॊ': 75, 'ो': 76, 'ौ': 77, '्': 78, 'ॎ': 79, 'ॏ': 80, 'ॐ': 81, '॑': 82, '॒': 83, '॓': 84, '॔': 85, 'ॕ': 86, 'ॖ': 87, 'ॗ': 88, 'क़': 89, 'ख़': 90, 'ग़': 91, 'ज़': 92, 'ड़': 93, 'ढ़': 94, 'फ़': 95, 'य़': 96, 'ॠ': 97, 'ॡ': 98, 'ॢ': 99, 'ॣ': 100, '।': 101, '॥': 102, '०': 103, '१': 104, '२': 105, '३': 106, '४': 107, '५': 108, '६': 109, '७': 

Note: "दे" (de) in Devanagari looks like a single character, but it is actually two Unicode characters: the consonant "द" (da) and the vowel sign "े" (e).

# Data Pre-processing helper functions

In [0]:
import re # regular expressions
# Matches every character that is neither an ASCII letter nor a space
non_eng_letters_regex = re.compile('[^a-zA-Z ]')

def cleanEnglishVocab(line):
  """Split an English line into upper-case words.

  Hyphens and commas are treated as word separators. Every remaining character
  that is not an ASCII letter or a space is removed before splitting.

  Returns a list of words (empty when nothing is left).
  """
  spaced = line.replace('-', ' ').replace(',', ' ')
  letters_only = non_eng_letters_regex.sub('', spaced.upper())
  return letters_only.split()

# Keep only known Hindi characters (and spaces), then split the line into words
def cleanHindiVocab(line):
  """Split a Hindi line into words containing only vocabulary characters.

  Hyphens and commas are treated as word separators. Any other character that
  is neither a space nor a key of `hindi_alpha2index` is dropped.

  Returns a list of words (empty when nothing is left).
  """
  spaced = line.replace('-', ' ').replace(',', ' ')
  kept_chars = [char for char in spaced if char == ' ' or char in hindi_alpha2index]
  return ''.join(kept_chars).split()

# Dataset Loading

In [0]:
from torch.utils.data import Dataset
import xml.etree.ElementTree as ET # available in Python

In [0]:
# Parse the reference corpus so that its XML structure can be explored below
tree = ET.parse('NEWS2012-Ref-EnHi-1000.xml')

In [8]:
# The root element is the entry point to every other element of the parsed tree
root = tree.getroot()
print(root)

<Element 'TransliterationCorpus' at 0x7f5982be5ea8>


In [9]:
# Name of the root tag
root.tag

'TransliterationCorpus'

The root tag is named TransliterationCorpus

In [10]:
# Show the tag and attributes of the first seven direct children of the root
for position, child in enumerate(root):
    print(child.tag, child.attrib)
    if position > 5:
        break

Name {'ID': '1'}
Name {'ID': '2'}
Name {'ID': '3'}
Name {'ID': '4'}
Name {'ID': '5'}
Name {'ID': '6'}
Name {'ID': '7'}


The next-level tags are "Name" tags, each with an "ID" attribute.  
This is similar to how an HTML `div` tag can have a "class" or "id" attribute.

In [11]:
# Walk the whole tree (root included) and show the first 13 tags encountered
for position, elem in enumerate(root.iter()):
    print(elem.tag)
    if position > 11:
        break

TransliterationCorpus
Name
SourceName
TargetName
Name
SourceName
TargetName
Name
SourceName
TargetName
Name
SourceName
TargetName


The above lists the inner elements.  
The inner structure is :  
TransliterationCorpus -> Name -> SourceName at same level as TargetName

In [12]:
# For reference: [(ele.tag, ele.attrib) for ele in root.iter("Name")] lists the attributes of every Name tag.
# SourceName tags have no attributes (attrib is an empty dict), so print their text instead.

# Show the text of the first 13 SourceName elements
for position, source_elem in enumerate(root.iter("SourceName")):
  print(source_elem.text)
  if position > 11:
    break

Aadhi
Aakash
Aap
Aayasha
Aayee
Abduh
Aberhart
Abey
Abou
Abri
Academy
Accommodation
Acorn


In [13]:
# Show the text of the first 13 TargetName elements
for position, target_elem in enumerate(root.iter("TargetName")):
  print(target_elem.text)
  if position > 11:
    break

आधी
आकाश
आप
आयशा
आई
अब्दुस
एबरहर्ट
अबेय
अबू
एब्री
अकेडमी
एकेडमी
एकोमडेशन


Thus we have extracted the English words and their corresponding Hindi words from the XML file.

In [0]:
class TransliterationLoader(Dataset):
  """Dataset of aligned (English word, Hindi word) pairs read from an XML corpus.

  Besides the usual `Dataset` protocol (`len()` and indexing) the loader can
  hand out random samples and sequential mini-batches. Batches walk through a
  shuffled permutation of the samples, which is reshuffled after every epoch.
  """

  def __init__(self, filename):
    """Read `filename` and prepare a shuffled visiting order for batching."""
    self.eng_words, self.hindi_words = self.readXMLDataset(filename)
    # Permutation of sample positions that get_batch() walks through
    self.shuffle_indices = list(range(len(self.eng_words)))
    random.shuffle(self.shuffle_indices)
    # Position inside shuffle_indices where the next batch starts
    self.shuffle_start_index = 0

  def __len__(self):
    """Number of word pairs in the dataset."""
    return len(self.eng_words)

  def __getitem__(self, idx):
    """Return the (English word, Hindi word) pair stored at position `idx`."""
    return self.eng_words[idx], self.hindi_words[idx]

  def readXMLDataset(self, filename):
    """Parse the XML corpus and return two aligned lists of words.

    Each child of the root is expected to hold the English text in its first
    sub-element and the Hindi text in its second one. Both texts are cleaned
    and split into words; an entry is skipped (and reported on stdout) when the
    two sides do not contain the same number of words.

    Returns:
      (English_words, Hindi_words): lists of equal length where position k of
      one list corresponds to position k of the other.
    """
    transliterationCorpus = ET.parse(filename).getroot()
    English_words = []
    Hindi_words = []

    for entry in transliterationCorpus:
      # An empty XML element has text None; treat it as an empty string so the
      # cleaning helpers do not crash on it.
      eng_wordlist = cleanEnglishVocab(entry[0].text or '')
      hindi_wordlist = cleanHindiVocab(entry[1].text or '')

      # Skip noisy data, i.e. entries where the number of words (not characters)
      # differs between English and Hindi, so words cannot be paired one-to-one.
      if len(eng_wordlist) != len(hindi_wordlist):
        print("Skipping: ", entry[0].text, '-', entry[1].text)
        continue

      # An entry may hold several words, e.g. ['DEOGAN', 'ROAD'] / ['देवगन', 'रोड'];
      # each word becomes a separate sample.
      English_words.extend(eng_wordlist)
      Hindi_words.extend(hindi_wordlist)

    return English_words, Hindi_words

  def get_random_sample(self):
    """Return one uniformly drawn (English word, Hindi word) pair."""
    return self[np.random.randint(len(self))]

  def get_batch_from_array(self, batch_size, array):
    """Return the current batch of `array` without advancing the batch cursor.

    `array` is either `self.eng_words` or `self.hindi_words`; calling this with
    both (before the cursor moves) yields aligned batches. When the batch runs
    past the end of the epoch it wraps around to the start of the permutation;
    the wrapped items come first in the returned list.
    """
    total = len(self)
    if total == 0:
      # Nothing to sample from; also avoids a modulo by zero below
      return []

    start = self.shuffle_start_index
    end = start + batch_size

    wrapped = []
    if end >= total:
      # Take the overflow from the beginning of the permutation
      wrapped = [array[i] for i in self.shuffle_indices[0:end % total]]
      end = total
    return wrapped + [array[i] for i in self.shuffle_indices[start:end]]

  def get_batch(self, batch_size, postprocess = True):
    """Return the next (English batch, Hindi batch) and advance the cursor.

    Args:
      batch_size: number of word pairs requested.
      postprocess: currently unused; kept for interface compatibility.
    """
    eng_batch = self.get_batch_from_array(batch_size, self.eng_words)
    hindi_batch = self.get_batch_from_array(batch_size, self.hindi_words)
    # Advance by exactly batch_size so that no sample is skipped between batches
    self.shuffle_start_index += batch_size

    # Reshuffle once an epoch is complete
    if self.shuffle_start_index >= len(self):
      random.shuffle(self.shuffle_indices)
      self.shuffle_start_index = 0

    return eng_batch, hindi_batch

In [47]:
# Load the training and test corpora; entries whose English and Hindi word counts differ are reported and skipped
train_data = TransliterationLoader('NEWS2012-Training-EnHi-13937.xml')
test_data = TransliterationLoader('NEWS2012-Ref-EnHi-1000.xml')

Skipping:  BARHARWA JUNCTION - बरहरवा
Skipping:  STATE BNK TR - स्टेट बैंक ऑफ त्रावणकोर
Skipping:  SOUTH ARLINGTON CHURCH OF CHRIST - साउथ अर्लिंग्टन
Skipping:  KING EDWARD VII - किंग एडवर्ड
Skipping:  DIBANG VALLEY - दिबंगवैली
Skipping:  ORDER OF VASA - ऑडर ऑफ़ द वासा
Skipping:  AZAMNAGAR ROAD - आज़मनगर
Skipping:  CAPE TOWN - केपटाउन
Skipping:  NEW ZEALAND - न्यूज़ीलैंड
Skipping:  SEA OF THE HEBRIDES - सी ऑफ हरब्रिड्‍स
Skipping:  RAMCOIND - राम्को इंड
Skipping:  KELVINGROVE ART GALLERY AND MUSEUM - केल्व‍िनग्रोव आर्ट एण्ड म्युज़ियम
Skipping:  AUSTRALIAN NATIONAL UNIVERSITY - ऑस्ट्रेलियननेशनल यूनिवर्सिटी
Skipping:  JAHAN AARA - जहाँआरा
Skipping:  NAVABHARAT FERRO ALLOYS - नव भारत फ़ैरो अलॉय
Skipping:  RAMA LINGESHWARA - रामालिंगेश्वर
Skipping:  FAKHRUN NISA - फखरुन्निसा
Skipping:  REDIFF.COM INDIA LIMITED - रेडिफ़ डॉट कॉम इंडिया लिमिटेड
Skipping:  OMKARNATH THAKUR - ओंकार नाथ ठाकुर
Skipping:  OPENTV - ओपन टीवी
Skipping:  ENVOY COMMUNICATIONS GROUP - एन्वॉय कम्युनिकेशंस
Skipping:  WAR OF T

# Data Visualisation

In [49]:
# Report the size of both splits
print("Train set size: ", len(train_data))
print("Test set size: ", len(test_data))

# Show ten randomly drawn training pairs as ENGLISH-hindi
print("\nSample data from train set:")
for _ in range(10):
  eng, hindi = train_data.get_random_sample()
  print('-'.join((eng, hindi)))

Train set size:  20543
Test set size:  1000

Sample data from train set:
JYOTI-ज्योति
JUNCTION-जंक्शन
ROUNDER-राउंडर
MAGEE-मगी
FORT-फोर्ट
ROCHEFORT-रॉचेफोर्ट
NOOR-नूर
MAIL-मेल
CANYON-केन्यॉन
KHIDASH-खिदाश


(Note all english converted to upper case)

# Encoding single words

In [0]:
# One-hot encoding of the input (English) word, so that the network receives numbers
def word_rep( word, letter2index, device = 'cpu'):
  """One-hot encode `word` character by character, followed by a PAD step.

  Args:
    word: sequence of characters; every character must be a key of `letter2index`.
    letter2index: mapping from character (and the PAD token) to its index.
    device: device on which the tensor is created.

  Returns:
    Float tensor of shape (len(word) + 1, 1, len(letter2index)): one time step
    per character plus a final PAD step, a batch dimension of size 1, and a
    one-hot vector over the vocabulary.

  Raises:
    KeyError: if a character of `word` is not in `letter2index`.
  """
  # Allocate directly on the target device instead of creating on CPU and copying
  rep = torch.zeros(len(word) + 1, 1, len(letter2index), device = device)
  for letter_index, letter in enumerate(word):
    rep[letter_index][0][letter2index[letter]] = 1
  # The extra last time step marks the end of the word with the PAD token.
  # Indexing with len(word) (rather than the loop variable) also works for an
  # empty word, where the loop body never runs.
  rep[len(word)][0][letter2index[pad_char]] = 1
  return rep


# Plain class labels for the output (Hindi) word, since predicting each character is a classification task
def gt_rep(word, letter2index, device = 'cpu'): # ground truth
  """Encode `word` as a column of class indices, followed by the PAD index.

  Args:
    word: sequence of characters; every character must be a key of `letter2index`.
    letter2index: mapping from character (and the PAD token) to its index.
    device: device on which the tensor is created.

  Returns:
    Long tensor of shape (len(word) + 1, 1) holding one label per character
    plus a final PAD label.

  Raises:
    KeyError: if a character of `word` is not in `letter2index`.
  """
  # Allocate directly on the target device instead of creating on CPU and copying
  labels = torch.zeros([len(word) + 1, 1], dtype = torch.long, device = device)
  for letter_index, letter in enumerate(word):
    labels[letter_index][0] = letter2index[letter]
  # Indexing with len(word) (rather than the loop variable) also works for an
  # empty word, where the loop body never runs.
  labels[len(word)][0] = letter2index[pad_char]
  return labels

Get one pair of (English word, corresponding Hindi word)

In [0]:
# Draw one random (English, Hindi) pair to illustrate the encodings below
eng, hindi = train_data.get_random_sample()

In [52]:
# One-hot representation of the English word: one row per character plus a final PAD row
eng_rep = word_rep(eng, eng_alpha2index)
print(eng, "\n", eng_rep)

KHIDASH 
 tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

8 characters including PAD

In [53]:
# Label representation of the Hindi word: one class index per character plus a final PAD index
hindi_gt = gt_rep(hindi, hindi_alpha2index)
print(hindi, "\n", hindi_gt)

खिदाश 
 tensor([[23],
        [64],
        [39],
        [63],
        [55],
        [ 0]])


6 characters including PAD (kha, i, da, aa, sha, PAD)

***NOTE: Number of characters in input need not correspond to number of characters in output. That's the reason we use encoder-decoder architecture***

# Network Architecture

Note: 3 letters in input (KHI) correspond to two letters in output (kha, i)  
Similarly 2 letters in input (SH) correspond to one in output (sha)

In such tasks, where the output size differs from the input size, we first need to encode the entire input using the encoder.  
The decoder then generates the output from that encoding.

## Encoder-Decoder Using GRU

In [0]:
# Default number of characters the decoder generates
MAX_OUTPUT_CHARS = 30
class Transliteration_EncoderDecoder(nn.Module):
  """GRU encoder-decoder that maps a one-hot encoded word to output characters.

  The encoder GRU reads the whole input word; its final hidden state initialises
  the decoder GRU, which then emits one distribution over output characters per
  step. Each step's predicted (or ground-truth) character is fed back as the
  next decoder input.
  """

  def __init__(self, input_size, hidden_size, output_size, verbose = False):
    """
    Args:
      input_size: length of each one-hot input character vector
        (27 for the English vocabulary including PAD).
      hidden_size: dimensionality of the GRU hidden state, shared by the
        encoder and the decoder (a hyperparameter; the two could differ).
      output_size: number of output classes, i.e. the size of the target
        vocabulary (129 for the Hindi vocabulary including PAD).
      verbose: if True, tensor shapes are printed during the forward pass.
    """
    super(Transliteration_EncoderDecoder, self).__init__()

    self.hidden_size = hidden_size
    self.output_size = output_size

    # The encoder consumes the one-hot input characters
    self.encoder_rnn_cell = nn.GRU(input_size, hidden_size)
    # The decoder input has size output_size, because the (one-hot) output of
    # the previous time step is fed back in at the current time step
    self.decoder_rnn_cell = nn.GRU(output_size, hidden_size)

    # Fully connected layer from hidden state to output classes
    self.h2o = nn.Linear(hidden_size, output_size)
    # Log-softmax over the class dimension, since this is a classification task
    self.softmax = nn.LogSoftmax(dim=2)

    self.verbose = verbose

  def forward(self, input, max_output_chars = MAX_OUTPUT_CHARS, device = 'cpu', ground_truth = None):
    """Encode `input` and decode `max_output_chars` output steps.

    Args:
      input: tensor of shape (sequence length, 1, input_size).
      max_output_chars: number of decoder steps to run.
      device: device on which the initial (all-zero) decoder input is created;
        it has to match the device the module lives on.
      ground_truth: optional tensor of target indices, one per decoder step.
        When given, step i feeds ground_truth[i] to the next step (teacher
        forcing) instead of the model's own prediction, so it must provide at
        least `max_output_chars` entries.

    Returns:
      List of `max_output_chars` tensors of shape (1, output_size) holding the
      log-probabilities of each output class at each step.
    """

    # encoder
    #
    # The encoder is a single GRU. A sequence can be fed to it in two ways:
    #   i)  one character at a time, i.e. an input of shape (1, 1, input_size),
    #       calling the GRU once per character; or
    #   ii) all characters at once, i.e. an input of shape
    #       (number of characters, 1, input_size). Internally the GRU still
    #       processes the sequence step by step.
    # Option ii) is used here; the shape of `out` changes accordingly.
    out, hidden = self.encoder_rnn_cell(input)

    if self.verbose:
      print('Encoder input', input.shape)
      print('Encoder ouput', out.shape)
      print('Encoder hidden', hidden.shape)

    # decoder
    decoder_state = hidden
    # The first decoder input is an all-zero vector
    decoder_input = torch.zeros(1, 1, self.output_size).to(device)
    outputs = []

    if self.verbose:
      print('Decoder state', decoder_state.shape)
      print('Decoder input', decoder_input.shape)

    for i in range(max_output_chars):

      out, decoder_state = self.decoder_rnn_cell(decoder_input, decoder_state)

      if self.verbose:
        print('Decoder intermediate output', out.shape)

      out = self.h2o(decoder_state)
      out = self.softmax(out)
      outputs.append(out.view(1, -1))

      if self.verbose:
        print('Decoder output', out.shape)
        # Shapes are only printed for the first decoder step; note that this
        # switches verbosity off for all later calls as well.
        self.verbose = False

      if ground_truth is not None:
        # Teacher forcing: feed the true character to the next step
        max_idx = ground_truth[i].reshape(1, 1, 1)
      else:
        # Otherwise feed back the most likely predicted character
        max_idx = torch.argmax(out, 2, keepdim = True)

      # One-hot encode the chosen index; zeros_like gives an initialised tensor
      # with the same shape, dtype and device as `out`
      one_hot = torch.zeros_like(out)
      one_hot.scatter_(2, max_idx, 1)

      # Detach so that no gradient flows through the fed-back input
      decoder_input = one_hot.detach()

    return outputs

In [0]:
# Use a hidden state of size 256 for both the encoder and the decoder GRU;
# verbose = True prints the tensor shapes during the first forward pass
net = Transliteration_EncoderDecoder(len(eng_alpha2index), 256, len(hindi_alpha2index), verbose = True)

In [0]:
def infer(network, word, max_chars):
  """Run `network` on a single English word.

  The network is switched to evaluation mode, the word is one-hot encoded with
  the English vocabulary (the encoding is printed), and the network is asked
  for `max_chars` decoder outputs.

  Returns whatever the network returns for the encoded word.
  """
  network.eval()
  encoded_word = word_rep(word, eng_alpha2index)
  print(encoded_word)

  return network(encoded_word, max_chars)

In [103]:
# Run inference on a sample word, requesting 30 decoder outputs
out = infer(net, "INDIA", 30)

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
Encoder input torch.Size([6, 1, 27])
Encoder ouput torch.Size([6, 1, 256])
Encoder hidden torch.Size([1, 1, 256])
Decoder state torch.Size([1, 1, 256])
Decoder input torch.Size([1, 1, 129])
Decoder intermediate out