<a href="https://colab.research.google.com/github/sindhu213/Research-Papers/blob/master/src/RNN_Encoder_Decoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [101]:
import re
import torch
from urllib.request import urlopen
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Preparing Data

In [139]:
# Location of the raw parallel corpus; each line is split on tabs below.
link = "https://raw.githubusercontent.com/suvaansh/Machine-Translation-English-to-Hindi-/master/hin.txt"

# Fetch the whole file, decode it as UTF-8 and cut it into lines.
# The response object is closed as soon as the `with` block exits.
with urlopen(link) as url:
  temp = url.read().decode('utf-8').split('\n')

# One list of tab-separated fields per line of the file.
ds = [item.split('\t') for item in temp]

In [140]:
# Keep everything except the final two rows.
# NOTE(review): presumably the trailing rows are not well-formed sentence
# pairs (e.g. the empty string after the last newline) — confirm against the file.
DATA = ds[:-2]

In [141]:
def tokenize_en(text):
  """Lower-case an English sentence and split it into word tokens.

  Every character that is neither a regex word character (letters, digits,
  underscore) nor whitespace is replaced by a space before splitting, so
  punctuation never ends up inside a token.

  Args:
    text: The sentence to tokenize.

  Returns:
    A list of lower-cased tokens; empty when ``text`` has no word characters.
  """
  # '+' must stay out of the character class: inside [...] it is a literal,
  # not a quantifier, and would let '+' survive as part of a token.
  tokenized = re.sub(r'[^\w\s]',' ',text.lower())
  return tokenized.split()

def tokenize_hi(text):
  """Split a Hindi sentence into tokens, discarding listed punctuation.

  The characters in the pattern below (including the danda '।') act as
  separators, as does any whitespace. Case is left untouched.

  Args:
    text: The sentence to tokenize.

  Returns:
    A list of tokens in their original order; empty when nothing remains.
  """
  # Cut at every punctuation character first, then split each remaining
  # piece on whitespace; empty pieces contribute no tokens.
  pieces = re.split(r'[!(),-./।:;"?[\]^_`{|}~]',text)
  return [token for piece in pieces for token in piece.split()]

In [142]:
## Sanity check: run both tokenizers on one sentence containing a comma,
## double quotes and an exclamation mark; the printed lists should hold words only.
print(tokenize_en('He said, "I will try to get this work done by tomorrow!"'))
print(tokenize_hi('उन्होंने कहा, "मैं यह काम कल तक पूरा करने की कोशिश करूंगा!"'))

['he', 'said', 'i', 'will', 'try', 'to', 'get', 'this', 'work', 'done', 'by', 'tomorrow']
['उन्होंने', 'कहा', 'मैं', 'यह', 'काम', 'कल', 'तक', 'पूरा', 'करने', 'की', 'कोशिश', 'करूंगा']


In [143]:
def yield_tokens(iter,lang):
  """Yield one token list per sentence pair for the requested language.

  Args:
    iter: Iterable of two-element (English, Hindi) sentence pairs.
    lang: 'en' selects the English side of each pair; any other value
      selects the Hindi side.

  Yields:
    The list returned by tokenize_en (for 'en') or tokenize_hi (otherwise)
    for each pair, in input order.
  """
  # Compare by value. `is` tests object identity, which only happens to work
  # for interned string literals and emits a SyntaxWarning on Python 3.8+.
  use_english = lang == 'en'
  for EN,HI in iter:
    if use_english:
      yield tokenize_en(EN)
    else:
      yield tokenize_hi(HI)

In [144]:
# English vocabulary: tokens must reach min_freq=2 to be kept, and the four
# special tokens are passed explicitly.
EN_VOCAB = build_vocab_from_iterator(
  yield_tokens(DATA,'en'),
  min_freq=2,
  specials=["<unk>","<pad>","<bos>","<eos>"],
)
# Out-of-vocabulary lookups fall back to index 0
# (expected to be "<unk>", the first special listed above).
EN_VOCAB.set_default_index(0)

# Hindi vocabulary, built with the same settings from the Hindi side.
HI_VOCAB = build_vocab_from_iterator(
  yield_tokens(DATA,'hi'),
  min_freq=2,
  specials=["<unk>","<pad>","<bos>","<eos>"],
)
HI_VOCAB.set_default_index(0)

In [145]:
# Report how many sentence pairs were kept and how large each vocabulary is
# (vocabulary sizes include the four special tokens).
print("Total no. of training examples:",len(DATA))
print("English vocab size:",len(EN_VOCAB))
print("Hindi vocab size:",len(HI_VOCAB))

Total no. of training examples: 2868
English vocab size: 1271
Hindi vocab size: 1422


In [146]:
def en_pipeline(x):
  """Map an English sentence to EN_VOCAB indices, wrapped in <bos> ... <eos>."""
  tokens = ["<bos>"] + tokenize_en(x) + ["<eos>"]
  return EN_VOCAB(tokens)


def hi_pipeline(x):
  """Map a Hindi sentence to HI_VOCAB indices, wrapped in <bos> ... <eos>."""
  tokens = ["<bos>"] + tokenize_hi(x) + ["<eos>"]
  return HI_VOCAB(tokens)

In [147]:
def collate_batch(data_iter):
  """Collate (English, Hindi) sentence pairs into two padded index tensors.

  Each sentence is encoded with en_pipeline / hi_pipeline, the encoded
  sequences are right-padded to the longest one in the batch, and the two
  padded tensors are moved to the GPU when one is available.

  Args:
    data_iter: Iterable of two-element (English, Hindi) sentence pairs,
      as handed to ``collate_fn`` by the DataLoader.

  Returns:
    A ``(src, tgt)`` tuple of int64 tensors with the batch dimension first
    (``batch_first=True``): ``src`` holds the English indices, ``tgt`` the
    Hindi indices. Both live on CUDA if available, otherwise on the CPU.
  """
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  # Encode on the CPU: allocating one small device tensor per sentence is
  # wasteful, so the batch is padded first and transferred once at the end.
  src,tgt = [],[]
  for x,y in data_iter:
    src.append(torch.tensor(en_pipeline(x),dtype=torch.int64))
    tgt.append(torch.tensor(hi_pipeline(y),dtype=torch.int64))

  # Pad with each vocabulary's own "<pad>" index instead of a hard-coded 1.
  src = pad_sequence(src,batch_first=True,padding_value=EN_VOCAB["<pad>"])
  tgt = pad_sequence(tgt,batch_first=True,padding_value=HI_VOCAB["<pad>"])

  return src.to(device), tgt.to(device)

In [148]:
# Number of sentence pairs per training batch.
BATCH_SIZE = 64

# Batches are reshuffled on every pass, the final incomplete batch is
# dropped, and collate_batch turns the raw pairs into padded index tensors.
TRAIN_DL = DataLoader(
  DATA,
  batch_size=BATCH_SIZE,
  shuffle=True,
  drop_last=True,
  collate_fn=collate_batch,
)

In [150]:
# Pull a single batch from the loader and print the shapes of the padded
# English and Hindi tensors (the second dimension varies from batch to batch).
en_tensor,hi_tensor = next(iter(TRAIN_DL))
print("[BATCH_SIZE,en_seq_length]: ",en_tensor.shape)
print("[BATCH_SIZE,hi_seq_length]: ",hi_tensor.shape)

[BATCH_SIZE,en_seq_length]:  torch.Size([64, 13])
[BATCH_SIZE,hi_seq_length]:  torch.Size([64, 18])


# Model