### First connect the colab

In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
import os
import sys

In [3]:

path = '/content/gdrive/My Drive/NLP/project/'
os.chdir(path)
print(os.getcwd())

/content/gdrive/My Drive/NLP/project


##### Step result:

Now the notebook is connected to the drive with proper directory path

### Import libraries

In [4]:

## General libraries
import io
from io import open
import unicodedata
import string
import re
import random
import codecs
import math
import time
import numpy as np
import glob

In [5]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [6]:
import tensorflow as tf

In [7]:
base_dir = "."

#### Create a single file with both languages delimited by tab delimiter

- For our convenience during the preprocessing stage

In [8]:
def create_input_file(path):
  lines = io.open(path, encoding='utf-8').read().strip().split('\n')

  return lines

In [9]:
english_path_to_file = os.path.join(base_dir, 'parallel/IITB.en-hi.en')
hindi_path_to_file = os.path.join(base_dir, 'parallel/IITB.en-hi.hi')

In [10]:
eng_lines = create_input_file(english_path_to_file)
hin_lines = create_input_file(hindi_path_to_file)

In [11]:
len(eng_lines)

1609682

In [12]:
len(hin_lines)

1609682

In [13]:
eng_lines[-2:]

['UGC Twitter Handle (@ugc_india) : https://twitter.com/ugc_india?s=12',
 'It would also be broadcast on DD News.']

In [14]:
hin_lines[-2:]

['यूजीसी ट्विटर हैंडल (@ugc_india) : https://twitter.com/ugc_india?s=12',
 'कार्यक्रम को डीडी न्यूज पर भी प्रसारित किया जाएगा।']

In [None]:
file_writer = io.open( "./en-hi.txt", mode = "w",  encoding='utf-8',)
for i in range(len(en) + 1):
    file_writer.write(eng_lines[i] + '\t' + hin_lines[i] + '\n')

file_writer.close()  

In [None]:
concatenated_lines = create_input_file("./en-hi.txt")

In [None]:
len(concatenated_lines)

1609682

In [None]:
concatenated_lines[:20]

['Give your application an accessibility workout\tअपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें',
 'Accerciser Accessibility Explorer\tएक्सेर्साइसर पहुंचनीयता अन्वेषक',
 'The default plugin layout for the bottom panel\tनिचले पटल के लिए डिफोल्ट प्लग-इन खाका',
 'The default plugin layout for the top panel\tऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका',
 'A list of plugins that are disabled by default\tउन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है',
 'Highlight duration\tअवधि को हाइलाइट रकें',
 'The duration of the highlight box when selecting accessible nodes\tपहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की अवधि',
 'Highlight border color\tसीमांत (बोर्डर) के रंग को हाइलाइट करें',
 'The color and opacity of the highlight border.\tहाइलाइट किए गए सीमांत का रंग और अपारदर्शिता। ',
 'Highlight fill color\tभराई के रंग को हाइलाइट करें',
 'The color and opacity of the highlight fill.\tहाइलाइट किया गया भराई का रंग और पारदर्शिता। ',
 'API Browser\tएपीआई विचरक',
 'Browse the various

In [18]:
#### create small file for quick iterations 
file_writer_hi = io.open( "./small_datasets/train.hi", mode = "w",  encoding='utf-8',)
file_writer_en = io.open( "./small_datasets/train.en", mode = "w",  encoding='utf-8',)

for i in range(200000):
    file_writer_hi.write(hin_lines[i] + '\n')
    file_writer_en.write(eng_lines[i] + '\n')


file_writer_hi.close()
file_writer_en.close()

#### Create the small dataset for faster iterations while implementing the code

In [None]:
#### create small file for quick iterations 
file_writer = io.open( "./en-hi-small.txt", mode = "w",  encoding='utf-8',)
for i in range(30000):
    file_writer.write(eng_lines[i] + '\t' + hin_lines[i] + '\n')

file_writer.close() 


In [None]:
concatenated_lines_small = create_input_file("./en-hi-small.txt")

In [None]:
len(concatenated_lines_small)

30000

In [None]:
concatenated_lines_small[0:50]

['Give your application an accessibility workout\tअपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें',
 'Accerciser Accessibility Explorer\tएक्सेर्साइसर पहुंचनीयता अन्वेषक',
 'The default plugin layout for the bottom panel\tनिचले पटल के लिए डिफोल्ट प्लग-इन खाका',
 'The default plugin layout for the top panel\tऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका',
 'A list of plugins that are disabled by default\tउन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है',
 'Highlight duration\tअवधि को हाइलाइट रकें',
 'The duration of the highlight box when selecting accessible nodes\tपहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की अवधि',
 'Highlight border color\tसीमांत (बोर्डर) के रंग को हाइलाइट करें',
 'The color and opacity of the highlight border.\tहाइलाइट किए गए सीमांत का रंग और अपारदर्शिता। ',
 'Highlight fill color\tभराई के रंग को हाइलाइट करें',
 'The color and opacity of the highlight fill.\tहाइलाइट किया गया भराई का रंग और पारदर्शिता। ',
 'API Browser\tएपीआई विचरक',
 'Browse the various

#### Create dev and test datasets with parallel sentences in same file

In [None]:
dev_english_path_to_file = os.path.join(base_dir, 'dev_test/dev.en')
dev_hindi_path_to_file = os.path.join(base_dir, 'dev_test/dev.hi')

In [None]:
dev_eng_lines = create_input_file(dev_english_path_to_file)
dev_hin_lines = create_input_file(dev_hindi_path_to_file)

In [None]:
print(len(dev_eng_lines))
print(len(dev_hin_lines))

520
520


In [None]:
file_writer = io.open( "./data/en-hi-dev.txt", mode = "w",  encoding='utf-8',)
for i in range(len(dev_eng_lines)):
    file_writer.write(dev_eng_lines[i] + '\t' + dev_hin_lines[i] + '\n')

file_writer.close()  

In [None]:
dev_concatenated_lines = create_input_file("./data/en-hi-dev.txt")

In [None]:
print(len(dev_concatenated_lines))

520


In [None]:
test_english_path_to_file = os.path.join(base_dir, 'dev_test/test.en')
test_hindi_path_to_file = os.path.join(base_dir, 'dev_test/test.hi')

In [None]:
test_eng_lines = create_input_file(test_english_path_to_file)
test_hin_lines = create_input_file(test_hindi_path_to_file)

In [None]:
print(len(test_eng_lines))
print(len(test_hin_lines))

2507
2507


In [None]:
file_writer = io.open( "./data/en-hi-test.txt", mode = "w",  encoding='utf-8',)
for i in range(len(test_eng_lines)):
    file_writer.write(test_eng_lines[i] + '\t' + test_hin_lines[i] + '\n')

file_writer.close()  

In [None]:
test_concatenated_lines = create_input_file("./data/en-hi-test.txt")

In [None]:
print(len(test_concatenated_lines))

2507


In [None]:
test_concatenated_lines[0:5]

['A black box in your car?\tआपकी कार में ब्लैक बॉक्स?',
 "As America's road planners struggle to find the cash to mend a crumbling highway system, many are beginning to see a solution in a little black box that fits neatly by the dashboard of your car.\tजबकि अमेरिका के सड़क योजनाकार, ध्वस्त होते हुए हाईवे सिस्टम को सुधारने के लिए धन की कमी से जूझ रहे हैं, वहीं बहुत-से लोग इसका समाधान छोटे से ब्लैक बॉक्स में देख रहे हैं, जो आपकी कार के डैशबोर्ड पर सफ़ाई से फिट हो जाता है।",
 "The devices, which track every mile a motorist drives and transmit that information to bureaucrats, are at the center of a controversial attempt in Washington and state planning offices to overhaul the outdated system for funding America's major roads.\tयह डिवाइस, जो मोटर-चालक द्वारा वाहन चलाए गए प्रत्येक मील को ट्रैक करती है तथा उस सूचना को अधिकारियों को संचारित करती है, आजकल अमेरिका की प्रमुख सड़कों का वित्त-पोषण करने के लिए पुराने हो चुके सिस्टम का जीर्णोद्धार करने के लिए वाशिंगटन और राज्य नियोजन कार्यालय के लिए

### Read and preprocess the data

In [None]:
num_examples = 30000

In [None]:
path_to_file = os.path.join(base_dir , "./en-hi-small.txt")

# english_path_to_file = os.path.join(base_dir , "parallel/IITB.en-hi.hi")
# hindi_path_to_file = os.path.join(base_dir , "parallel/IITB.en-hi.en")
# file_paths = [english_path_to_file, hindi_path_to_file]

In [None]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn')


def preprocess_sentence(w, lang_name):
  w = unicode_to_ascii(w.lower().strip())

  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,¿])", r" \1 ", w)
  w = re.sub(r" _?\s", r"", w)
  w = re.sub(r'[" "]+', " ", w)
  w = re.sub(r"\% ?[a-z]", "", w)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  if lang_name == 'en':
    w = re.sub(r"[^a-zA-Z0-9?'.!,¿\-\/]+", " ", w)
    w = re.sub(r" - ", "-", w)
  elif lang_name == 'hi':  ## In future, this can be modified to use the "Indic NLP toolkit"
    w = re.sub(r"([\-])", r"\1", w)
    w = re.sub(r"([\/])", r" \1 ", w)
    ## since there is nothing like "¿" in Hindi
    w = re.sub(r"[¿]+", "", w)
    ## since i see a extra letter from English in the open braces
    w = re.sub(r"\(.*\)", "", w)
    ## since i see a extra letters from English in the open braces
    w = re.sub(r"(\&.*lt\;.*bgt\;)", "", w)
    w = re.sub(r"(lt;.*bgt;)", "", w)
    w = re.sub(r"lt\; bgt\;", "", w)

    ### Replace the english characters
    w = re.sub(r"[a-zA-Z]+?\s", "", w)
    
    
  w = re.sub(r'[" "]+', " ", w)
  w = w.strip()

  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [None]:
en_sentence = u"May I borrow this book?"
sp_sentence = u"¿Puedo tomar prestado este libro?"
print(preprocess_sentence(en_sentence, lang_name='en'))
print(preprocess_sentence(sp_sentence, lang_name='hi').encode('utf-8'))

<start> may i borrow this book ? <end>
b'<start> ? <end>'


In [None]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [ENGLISH, SPANISH]
def create_dataset(path, num_examples, source_lang = "en" , target_lang = "hi"):
  lines = io.open(path, encoding='utf-8').read().strip().split('\n')

  source_sentences = []
  target_sentences = []

  for line in lines[:num_examples]:
      line_sents = line.split('\t')
      for i, line_sent in enumerate(line_sents):
          if i == 0:
              source_sent = preprocess_sentence(line_sent, lang_name = source_lang)
              source_sentences.append(source_sent)
          elif i == 1:
              target_sent = preprocess_sentence(line_sent, lang_name = target_lang)
              target_sentences.append(target_sent)

  # word_pairs = [[preprocess_sentence(w) for w in l.split('\t')]  for l in lines[:num_examples]]

  # return zip(*word_pairs)
  return source_sentences, target_sentences

In [None]:
en, hi = create_dataset(path_to_file, num_examples= num_examples)
print(en[-1])


<start> import session <end>


In [None]:
len(en)

30000

In [None]:
hi[-1]

'<start> सतर आयात कर <end>'

In [None]:
len(hi)

30000

In [None]:
print(en[0: 10])
print(hi[0:10])


['<start> give your application an accessibility workout <end>', '<start> accerciser accessibility explorer <end>', '<start> the default plugin layout for the bottom panel <end>', '<start> the default plugin layout for the top panel <end>', '<start> a list of plugins that are disabled by default <end>', '<start> highlight duration <end>', '<start> the duration of the highlight box when selecting accessible nodes <end>', '<start> highlight border color <end>', '<start> the color and opacity of the highlight border . <end>', '<start> highlight fill color <end>']
['<start> अपन अनपरयोग को पहचनीयता वयायाम का लाभ द <end>', '<start> एकसरसाइसर पहचनीयता अनवषक <end>', '<start> निचल पटल क लिए डिफोलट पलग-इन खाका <end>', '<start> ऊपरी पटल क लिए डिफोलट पलग-इन खाका <end>', '<start> उन पलग-इनो की सची जिनह डिफोलट रप स निषकरिय किया गया ह <end>', '<start> अवधि को हाइलाइट रक <end>', '<start> पहचनीय आसधि को चनत समय हाइलाइट बकस की अवधि <end>', '<start> सीमात क रग को हाइलाइट कर <end>', '<start> हाइलाइट किए ग

In [None]:
#### create small file for quick iterations 
file_writer = io.open( "./data/en-hi-small-preprocessed.txt", mode = "w",  encoding='utf-8',)
for i in range(30000):
    file_writer.write(en[i] + '\t' + hi[i] + '\n')

file_writer.close() 