### We will implement a Neural Machine Translation with LSTM using Teacher Forcing

In [1]:
import pandas as pd 
import numpy as np 
import tensorflow as tf
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import multiprocessing as mp
import dask.dataframe as dd
from dask.distributed import Client
from dask.diagnostics import ProgressBar
import time
import re
import string
from string import digits

In [2]:
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

In [3]:
data = pd.read_table('./Data/spa.txt',names=['source','target','comments'])
data.sample(5)

Unnamed: 0,source,target,comments
1099,He gave in.,Él se rindió.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1860,He was busy.,Él estaba ocupado.,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
26495,This cat is not ours.,Este gato no es nuestro.,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
15965,Where are my keys?,¿Dónde están mis llaves?,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
54021,Do you need a place to stay?,¿Necesitáis un lugar para alojaros?,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


In [4]:
num_partitions = 10 #number of partitions to split dataframe
num_cores = mp.cpu_count() #number of cores on your machine
Client(n_workers=4, threads_per_worker=2, memory_limit='4GB')

0,1
Client  Scheduler: tcp://127.0.0.1:49448  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 16.00 GB


### Clean Texts

In [5]:
# Remove HTML Tags
def remove_html(text):
    soup = BeautifulSoup(text,'lxml')
    html_free_text = soup.get_text()
    return html_free_text

# Remove Punctuations
special_characters= set(string.punctuation)
def punctuation_remover(text):
    punctuation_free_text = " ".join([word for word in text if word \
                                    not in special_characters])
    return punctuation_free_text


# Convert to lower case
def convert_to_lowercase(tokens):
    low = []
    for tok in tokens:
        low.append(tok.lower().strip())
    return low

# Lemmatization
def lemmatize_words(text):
    words = nlp(str(text))
    return [word.lemma_ for word in words if word.lemma_ != '-PRON-']  


num_digits= str.maketrans('','', digits)
remove_digits = lambda x: x.translate(num_digits)

In [6]:
def clean_text_for_tasks(text, for_pos_tagging = False):
    cleaned_text = remove_html(text)
    cleaned_text = word_tokenize(cleaned_text)
    cleaned_text = lemmatize_words(cleaned_text)
    cleaned_text = punctuation_remover(cleaned_text)
    cleaned_text = remove_digits(cleaned_text)
    
    return cleaned_text

In [7]:
dask_df = dd.from_pandas(data,npartitions=8)

In [8]:
dask_df

Unnamed: 0_level_0,source,target,comments
npartitions=8,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,object,object,object
15472,...,...,...
...,...,...,...
108304,...,...,...
123769,...,...,...


In [9]:
missing_values = dask_df.isnull().sum()
with ProgressBar():
    missing_percent = ((missing_values / dask_df.index.size)*100).compute()
missing_percent  

source      0.0
target      0.0
comments    0.0
dtype: float64

In [10]:
def clean_df(df):
    df['cleaned_source'] = df.source.map(clean_text_for_tasks)
    df['cleaned_target'] = df.target.map(clean_text_for_tasks)
    return df

In [11]:
data['cleaned_source'] = -1
data['cleaned_target'] = -1


In [12]:
start = time.time()
result = dask_df.map_partitions(clean_df,meta=data)

In [13]:
df = result.compute()
print(f'Time Taken for Processing {df.shape[0]} rows with Dask(4 Workers ,8 Cores) : {time.time()-start} ')

Time Taken for Processing 123770 rows with Dask(4 Workers ,8 Cores) : 962.1482224464417 


In [14]:
df.head()

Unnamed: 0,source,target,comments,cleaned_source,cleaned_target
0,Go.,Ve.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,Ve
1,Go.,Vete.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,Vete
2,Go.,Vaya.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,Vaya
3,Go.,Váyase.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,Váyase
4,Hi.,Hola.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...,hi,Hola


### Check if GPU's are available

In [15]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


### Tagginng Sentences with BOS and EOS

In [17]:
df.cleaned_target = df.cleaned_target.apply(lambda x : 'START_ '+ x + ' _END')

In [20]:
VOCAB_SIZE = 20000 # max no. of words for tokenizer , Top 5000 Words in the Vocabulary
MAX_SEQUENCE_LENGTH = 200 # max length of each entry (sentence)
EMBEDDING_DIM = 300      # embedding dimensions for word vectors
OOV_TOKEN = '<OOV>'

In [34]:
def vocab_creator(texts,VOCAB_SIZE=VOCAB_SIZE):
     tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
     tokenizer.fit_on_texts(texts)
     word_index_dictionary = tokenizer.word_index

     word2index = {}
     index2word = {}
     for key,value in word_index_dictionary.items():
         if value < VOCAB_SIZE:
             word2index[key] = value
             index2word[value] = key
         if value >= VOCAB_SIZE-1:
             continue
     return word2index ,index2word



# return word_index
# dict(list(word_index.items())[:15])

In [35]:
source_word2index ,source_index2word = vocab_creator(df.cleaned_source)

In [36]:
dict(list(source_index2word.items())[:15])

{1: 'be',
 2: 'the',
 3: 'to',
 4: 'tom',
 5: 'do',
 6: 'a',
 7: "n't",
 8: 'have',
 9: "'s",
 10: 'that',
 11: 'in',
 12: 'of',
 13: 'this',
 14: 'go',
 15: 'for'}

In [32]:
target_word2index ,target_index2word = vocab_creator(df.cleaned_target)

In [33]:
dict(list(target_word2index.items())[:15])

{'start': 1,
 'end': 2,
 'de': 3,
 'que': 4,
 'no': 5,
 'a': 6,
 'tom': 7,
 'la': 8,
 '¿': 9,
 'el': 10,
 'en': 11,
 'es': 12,
 'un': 13,
 'se': 14,
 'por': 15}