### We will implement a Neural Machine Translation model with an LSTM, trained using Teacher Forcing

In [1]:
import pandas as pd 
import numpy as np 
import tensorflow as tf
# import spacy
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from bs4 import BeautifulSoup
import multiprocessing as mp
import dask.dataframe as dd
from dask.distributed import Client
from dask.diagnostics import ProgressBar
import time
import re
import unicodedata
import string
from string import digits
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [2]:
# spacy.prefer_gpu()
# nlp = spacy.load("en_core_web_sm")

In [3]:
# Load the tab-separated sentence-pair file; column names are supplied
# explicitly through `names=` (source sentence, target sentence, attribution).
data = pd.read_table('./Data/spa.txt',names=['source','target','comments'])
# Display 5 random rows as a quick sanity check of the load.
data.sample(5)

Unnamed: 0,source,target,comments
82676,I am looking forward to seeing you.,Espero verte.,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
7495,Wash your face.,Lávate la cara.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
26685,Tom doesn't watch TV.,Tom no ve televisión.,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
113160,I don't know whether this'll be of any use to ...,No sé si esto le servirá para algo.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
88506,Don't let anyone leave this building.,No dejes salir a nadie de este edificio.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...


In [4]:
num_partitions = 10 #number of partitions to split dataframe
num_cores = mp.cpu_count() #number of cores on your machine
# Start a local Dask cluster with one worker process per core and keep the
# handle, so the cluster can be inspected or shut down (`client.close()`)
# instead of leaking a new one every time this cell is re-run.
# Client(n_workers=num_cores, threads_per_worker=2, memory_limit='4GB')
client = Client(n_workers=num_cores, threads_per_worker=2)
# Last expression: render the scheduler / dashboard summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:53970  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 12  Cores: 24  Memory: 17.10 GB


### Clean Texts

In [35]:
# Remove HTML Tags
# def remove_html(text):
#     soup = BeautifulSoup(text,'lxml')
#     html_free_text = soup.get_text()
#     return html_free_text
def unicode_to_ascii(s):
    """Return ``s`` with combining marks (accents) removed.

    The string is decomposed with NFD normalisation so that each accented
    letter becomes a base letter followed by combining marks; every
    character whose Unicode category is 'Mn' is then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Remove Punctuations
special_characters= set(string.punctuation)  # not used by punctuation_remover itself
def punctuation_remover(w, keep_punctuation=False):
    """Replace unwanted characters in ``w`` with single spaces and trim the result.

    Args:
        w: the sentence to clean (a ``str``).
        keep_punctuation: when False (the default, and the behaviour callers
            that pass a single argument get), every run of characters other
            than ``a-z`` / ``A-Z`` becomes one space, so punctuation, digits
            and quotes are all removed. When True, the marks ``? . ! , ¿`` are
            kept and padded with spaces so that they form separate tokens;
            every other non-letter is still replaced by a space.

    Returns:
        The cleaned sentence with no leading or trailing whitespace.
    """
    if keep_punctuation:
        # Pad the kept marks so that "tal?" becomes "tal ?".
        w = re.sub(r"([?.!,¿])", r" \1 ", w)
        # Collapse everything that is neither a letter nor a kept mark
        # (including the padding spaces just added) into a single space.
        w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    else:
        # One pass is enough: padding punctuation or collapsing spaces first
        # cannot change the outcome, because every non-letter run ends up as
        # exactly one space anyway.
        w = re.sub(r"[^a-zA-Z]+", " ", w)

    return w.strip()


# Convert to lower case
def convert_to_lowercase(sentence):
    """Lower-case ``sentence`` and trim leading/trailing whitespace."""
    lowered = sentence.lower()
    return lowered.strip()

# Lemmatization
# def lemmatize_words(text):
#     words = nlp(str(text))
#     return [word.lemma_ for word in words if word.lemma_ != '-PRON-']  


def remove_qoutes(x):
    """Return ``x`` with every single-quote character (') deleted."""
    return re.sub("'", '', x)


# Translation table that maps every ASCII digit to "delete".
num_digits = str.maketrans('', '', digits)


def remove_digits(x):
    """Return ``x`` with every ASCII digit deleted."""
    return x.translate(num_digits)

In [36]:
def clean_text_for_tasks(text, for_pos_tagging = False):
    """Normalise one sentence for the translation model.

    Steps, in order: lower-case and trim, strip accents, delete single
    quotes, delete digits, then replace the remaining non-letter characters
    with spaces.

    Args:
        text: the raw sentence (a ``str``).
        for_pos_tagging: currently unused; kept so existing callers that pass
            it keep working.

    Returns:
        The cleaned sentence.
    """
    cleaned_text = convert_to_lowercase(text)
    cleaned_text = unicode_to_ascii(cleaned_text)
    # Quotes and digits have to be deleted *before* punctuation_remover runs:
    # that step turns every non-letter into a space, so running these two
    # afterwards (as before) left them with nothing to match and contractions
    # were split ("don't" -> "don t") instead of joined ("dont").
    cleaned_text = remove_qoutes(cleaned_text)
    cleaned_text = remove_digits(cleaned_text)
    # Runs last so it also collapses any double spaces left by the deletions
    # above and trims the result.
    cleaned_text = punctuation_remover(cleaned_text)

    return cleaned_text

### Sequential cleanup with pandas (baseline for the Dask comparison)

In [37]:
 # Baseline: clean both text columns one row at a time with pandas `apply`
 # and report the wall-clock time for the whole frame.
 start = time.time()
 data['cleaned_source'] = data.source.apply(clean_text_for_tasks)
 data['cleaned_target'] = data.target.apply(clean_text_for_tasks)
 print(f'Time Taken for Processing {data.shape[0]} rows Sequentially : {time.time()-start} ')

Time Taken for Processing 123770 rows Sequentially : 5.920709133148193 


In [38]:
# Persist the sequentially cleaned frame; the same path is written again
# later in the notebook, after the BOS/EOS tagging step.
data.to_csv('./Data/cleaned_data.csv',index=False)

In [39]:
data.head()

Unnamed: 0,source,target,comments,cleaned_source,cleaned_target
0,Go.,Ve.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,ve
1,Go.,Vete.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,vete
2,Go.,Vaya.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,vaya
3,Go.,Váyase.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,vayase
4,Hi.,Hola.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...,hi,hola


### Check if GPUs are available

In [40]:
# Report how many GPU devices TensorFlow lists on this machine.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


### Cleanup with Dask, then Tagging Sentences with BOS and EOS

In [41]:
# Partition the frame so the cleaning can be spread across the Dask workers.
dask_df = dd.from_pandas(data,npartitions=12)

# Percentage of missing values per column.
missing_values = dask_df.isnull().sum()
with ProgressBar():
    missing_percent = ((missing_values / dask_df.index.size)*100).compute()
print(missing_percent) 

def clean_df(df):
    """Return a copy of one partition with both cleaned text columns recomputed.

    `assign` builds a new frame instead of writing into `df`, so the
    partition handed in by Dask is never mutated.
    """
    return df.assign(
        cleaned_source=df.source.map(clean_text_for_tasks),
        cleaned_target=df.target.map(clean_text_for_tasks),
    )

# Output schema for map_partitions, obtained by running clean_df on an empty
# slice. This replaces the previous trick of overwriting
# data['cleaned_source'] / data['cleaned_target'] with -1, which destroyed the
# cleaned text in `data` and declared integer columns for string output.
meta = clean_df(data.head(0))

start = time.time()
result = dask_df.map_partitions(clean_df,meta=meta)

df = result.compute()
# Report the partition count actually used rather than a hard-coded
# worker/core figure that does not match the running cluster.
print(f'Time Taken for Processing {df.shape[0]} rows with Dask({dask_df.npartitions} partitions) : {time.time()-start} ')

source            0.0
target            0.0
comments          0.0
cleaned_source    0.0
cleaned_target    0.0
dtype: float64
Time Taken for Processing 123770 rows with Dask(4 Workers ,8 Cores) : 2.065565586090088 


In [42]:
df

Unnamed: 0,source,target,comments,cleaned_source,cleaned_target
0,Go.,Ve.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,ve
1,Go.,Vete.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,vete
2,Go.,Vaya.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,vaya
3,Go.,Váyase.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,vayase
4,Hi.,Hola.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...,hi,hola
...,...,...,...,...,...
123765,There are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...,CC-BY 2.0 (France) Attribution: tatoeba.org #1...,there are four main causes of alcohol related ...,hay cuatro causas principales de muertes relac...
123766,There are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...,CC-BY 2.0 (France) Attribution: tatoeba.org #3...,there are mothers and fathers who will lie awa...,hay madres y padres que se quedan despiertos d...
123767,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...,CC-BY 2.0 (France) Attribution: tatoeba.org #1...,a carbon footprint is the amount of carbon dio...,una huella de carbono es la cantidad de contam...
123768,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...,CC-BY 2.0 (France) Attribution: tatoeba.org #9...,since there are usually multiple websites on a...,como suele haber varias paginas web sobre cual...


In [43]:
# Wrap every cleaned target sentence in begin/end-of-sequence markers.
# NOTE(review): this rewrites the column in place, so running the cell a
# second time wraps the already-tagged sentences again.
df.cleaned_target = df.cleaned_target.apply(lambda x : 'START_ '+ x + ' _END')

In [44]:
df.head()

Unnamed: 0,source,target,comments,cleaned_source,cleaned_target
0,Go.,Ve.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,START_ ve _END
1,Go.,Vete.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,START_ vete _END
2,Go.,Vaya.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,START_ vaya _END
3,Go.,Váyase.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...,go,START_ vayase _END
4,Hi.,Hola.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...,hi,START_ hola _END


In [45]:
# Save the tagged frame; this overwrites the CSV written earlier from the
# sequentially cleaned `data` frame (same path).
df.to_csv('./Data/cleaned_data.csv',index=False)

In [21]:
VOCAB_SIZE = 20000 # vocabulary cap; passed to the Keras Tokenizer as num_words in vocab_creator
MAX_SEQUENCE_LENGTH = 200 # max length of each entry (sentence)
EMBEDDING_DIM = 300      # embedding dimensions for word vectors
# Placeholder token for out-of-vocabulary words.
# NOTE(review): the Tokenizer built in vocab_creator is not given this token - confirm where it is used.
OOV_TOKEN = '<OOV>'

In [25]:
def vocab_creator(texts,VOCAB_SIZE=VOCAB_SIZE):
    """Build word<->index lookup tables from ``texts``.

    A Keras ``Tokenizer`` is fitted on ``texts`` and its ``word_index`` is
    filtered to the entries whose index is below ``VOCAB_SIZE``. Keras numbers
    words from 1 in descending frequency order, so at most ``VOCAB_SIZE - 1``
    words are kept.

    Args:
        texts: iterable of sentences (strings) to fit the tokenizer on.
        VOCAB_SIZE: exclusive upper bound on the indices that are kept; also
            passed to the tokenizer as ``num_words``.

    Returns:
        A ``(word2index, index2word)`` pair of dicts that are inverses of
        each other.
    """
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
    tokenizer.fit_on_texts(texts)

    # The original loop ended with an `if ...: continue` that had no effect;
    # the filter below is the whole selection rule.
    word2index = {
        word: index
        for word, index in tokenizer.word_index.items()
        if index < VOCAB_SIZE
    }
    index2word = {index: word for word, index in word2index.items()}
    return word2index ,index2word



# return word_index
# dict(list(word_index.items())[:15])

In [26]:
# Build the source vocabulary from `df`, the frame produced by the Dask
# cleaning step, so that every later stage reads cleaned text from one place.
source_word2index ,source_index2word = vocab_creator(df.cleaned_source)

In [27]:
dict(list(source_index2word.items())[:15])

{1: 'be',
 2: 'the',
 3: 'to',
 4: 'tom',
 5: 'do',
 6: 'a',
 7: "n't",
 8: 'have',
 9: "'s",
 10: 'that',
 11: 'in',
 12: 'of',
 13: 'this',
 14: 'go',
 15: 'for'}

In [28]:
# `df.cleaned_target` carries the START_ / _END markers, so the target
# vocabulary contains the begin/end-of-sequence tokens the decoder needs.
target_word2index ,target_index2word = vocab_creator(df.cleaned_target)

In [29]:
dict(list(target_word2index.items())[:15])

{'start': 1,
 'end': 2,
 'de': 3,
 'que': 4,
 'no': 5,
 'a': 6,
 'tom': 7,
 'la': 8,
 '¿': 9,
 'el': 10,
 'en': 11,
 'es': 12,
 'un': 13,
 'se': 14,
 'por': 15}

In [30]:
# Shuffle the tagged frame `df` (not `data`, which has no START_ / _END
# markers on its target sentences); the fixed seed keeps the order
# reproducible across kernel restarts.
df = shuffle(df, random_state=42)

In [31]:
# Hold out 10% of the sentence pairs for testing; the fixed seed makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(df.cleaned_source, df.cleaned_target, test_size = 0.1, random_state = 42)
X_train.shape, X_test.shape

((111393,), (12377,))