<a href="https://colab.research.google.com/github/riccardo1980/colab_bench/blob/master/seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab line magic requesting the TensorFlow 2.x runtime
# NOTE(review): this magic is Colab-specific; confirm it is still needed before running elsewhere
%tensorflow_version 2.x

In [2]:
# Show the installed TensorFlow version so the recorded outputs can be tied to it
import tensorflow
print(tensorflow.__version__)

2.3.0


In [3]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time

# Dataset download
Resources: http://www.manythings.org/anki/

In [4]:
# Download the Spanish-English sentence-pair archive and extract it
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip',
    origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True)

# The sentence pairs live in spa-eng/spa.txt next to the downloaded zip.
# Build the path with os.path.join instead of concatenating a hard-coded "/".
path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng", "spa.txt")

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


# Preprocessing functions

In [5]:
def unicode_to_ascii(s: str) -> str:
  """
    Strip diacritics from a unicode string

    The string is decomposed with Normalization Form D (NFD), which splits
    an accented character into its base character followed by combining
    marks. Every non-spacing mark (Mn category) is then discarded,
    see https://www.fileformat.info/info/unicode/category/Mn/list.htm

    Characters that are not non-spacing marks (e.g. "¿") are kept, so the
    result is not guaranteed to be pure ascii.

    :param s: unicode string
    :return: string without non-spacing marks
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)


def preprocess_sentence(w: str) -> str:
  """
    Clean a single sentence and wrap it in start/end tokens

    1. lowercases, trims and strips accents (unicode_to_ascii)
    2. surrounds each of "?", ".", "!", ",", "¿" with spaces
    3. replaces every run of chars other than a-z, A-Z, ".", "?", "!", ",", "¿"
       with a single space
    4. removes leading/trailing blanks
    5. adds start/end tokens

    :param w: unicode sentence
    :return: cleaned sentence of the form "<start> ... <end>"
  """
  cleaned = unicode_to_ascii(w.lower().strip())

  # isolate punctuation so it becomes its own token
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  cleaned = re.sub(r"([?.!,¿])", r" \1 ", cleaned)

  # squeeze runs of spaces / double quotes into one space
  cleaned = re.sub(r'[" "]+', " ", cleaned)

  # everything that is neither a letter nor basic punctuation becomes a space
  cleaned = re.sub(r"[^a-zA-Z?.!,¿]+", " ", cleaned)

  # the start and end tokens let the model know when to start and stop predicting
  return '<start> ' + cleaned.strip() + ' <end>'

## Test preprocessing functions

In [6]:
# Sanity check on one English and one Spanish sentence.
# The Spanish result is printed as UTF-8 bytes, so "¿" appears as \xc2\xbf in the output.
en_sentence = u"May I borrow this book?"
sp_sentence = u"¿Puedo tomar prestado este libro?"
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence).encode('utf-8'))

<start> may i borrow this book ? <end>
b'<start> \xc2\xbf puedo tomar prestado este libro ? <end>'


# Dataset functions

In [20]:
from typing import Tuple, List

def create_dataset(path: str, num_examples: int) -> Tuple[Tuple[str, ...], ...]:
  """
    Create pairs of sentences

    1. Read the tab-separated input file, one sentence pair per line
    2. Clean every sentence with preprocess_sentence
    3. Return the sentences grouped by column (language)

    :param path: path to input file
    :param num_examples: maximum number of lines to use, None uses the whole file
    :return: tuple containing one tuple of sentences for each column in input file
  """
  # each line contains two columns separated by tab character;
  # the context manager closes the file handle as soon as the content is read
  with io.open(path, encoding='UTF-8') as f:
    lines = f.read().strip().split('\n')

  # split lines, preprocess phrases, get a list of sentences for each line
  sentence_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_examples]]

  # regroup per column; materialised as a tuple so that the result can be
  # indexed and iterated more than once (a bare zip object is single-use)
  return tuple(zip(*sentence_pairs))


def tokenize(lang: List[str]) -> Tuple[np.ndarray, tf.keras.preprocessing.text.Tokenizer]:
  """
    Fit a tokenizer on a list of sentences and vectorize those sentences

    :param lang: list of sentences of same language
    :return: a tuple of:
      an array of size [NUMBER_OF_SENTENCES, SENTENCE_SIZE] containing the padded vectorizations of the sentences
      the fitted tokenizer
  """
  # empty filters string: no character is filtered out by the tokenizer
  fitted_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')

  # build the vocabulary from the given sentences
  fitted_tokenizer.fit_on_texts(lang)

  # one sequence of integers per sentence (sequences are of different lengths)
  sequences = fitted_tokenizer.texts_to_sequences(lang)

  # pad sequences at the end ('post') so they fit in a single array
  padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')

  return padded, fitted_tokenizer

def load_dataset(path, num_examples=None) -> Tuple[np.ndarray, np.ndarray, tf.keras.preprocessing.text.Tokenizer, tf.keras.preprocessing.text.Tokenizer]:
  """
    Load dataset, with preprocessing and tokenization

    :param path: path to input file
    :param num_examples: maximum number of examples
    :return: a tuple of (input tensor, target tensor, input tokenizer, target tokenizer)
  """
  # first column of the file is used as target language, second column as input language
  target_sentences, input_sentences = create_dataset(path, num_examples)

  # each language gets its own tokenizer and padded tensor
  inp_tensor, inp_tokenizer = tokenize(input_sentences)
  targ_tensor, targ_tokenizer = tokenize(target_sentences)

  return inp_tensor, targ_tensor, inp_tokenizer, targ_tokenizer

## Test dataset functions

In [25]:
# Load every sentence pair (num_examples=None) and show the last pair.
# The names unpacked here are the ones printed: the previous `en` / `sp`
# were never defined in this notebook and failed on a fresh kernel.
target_sentences, input_sentences = create_dataset(path_to_file, None)
print(target_sentences[-1])
print(input_sentences[-1])

<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>


In [26]:
# Limit the dataset to the first 30000 lines of the file,
# then build the padded tensors and one tokenizer per language
num_examples = 30000
input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer = load_dataset(path_to_file, num_examples)

In [None]:
# First cleaned sentence of the target language
target_sentences[0]

'<start> go . <end>'

In [None]:
# Look up the integer id the target tokenizer assigned to the token 'go'
# (the sentence is passed as an already split list of tokens)
targ_lang_tokenizer.texts_to_sequences([['go']])

[[36]]

In [None]:
# Decode ids 0..19 back to tokens; the output holds only 19 tokens
# because id 0 is not mapped to any word
targ_lang_tokenizer.sequences_to_texts(
    [np.arange(20)]
)

['<start> <end> . i tom you ? is a it s t the he to we me m this']

In [36]:
# padded sequence length (second axis) of the target and input tensors
max_length_targ = target_tensor.shape[1]
max_length_inp = input_tensor.shape[1]

# Configuration

In [38]:
# maximum number of sentence pairs passed to load_dataset
num_examples = 30000

# number of sequence pairs per training batch
BATCH_SIZE = 64
# NOTE(review): presumably the embedding vector size and the recurrent layer width
# of the model built later in the notebook — confirm against the model cells
embedding_dim = 256
units = 1024


# Dataset creation

In [40]:
# Build tensors and tokenizers from the first num_examples sentence pairs
input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer = load_dataset(path_to_file, num_examples)

# Hold out 20% of the pairs for validation. random_state pins the shuffle so that
# the same pairs end up in the validation set every time the notebook is re-run.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42)

In [41]:
# shuffle buffer as large as the training set
BUFFER_SIZE = len(input_tensor_train)

# (input, target) pairs -> shuffled -> batches of BATCH_SIZE, incomplete last batch dropped
dataset = (
    tf.data.Dataset
    .from_tensor_slices((input_tensor_train, target_tensor_train))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [42]:
# number of full batches in one pass over the training set
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE

# vocabulary sizes: one more than the number of entries in each tokenizer's word_index
vocab_inp_size = 1 + len(inp_lang_tokenizer.word_index)
vocab_tar_size = 1 + len(targ_lang_tokenizer.word_index)

# padded sequence length (second axis) of the target and input tensors
max_length_targ = target_tensor.shape[1]
max_length_inp = input_tensor.shape[1]