In [4]:
import torch
import torch.nn as nn
import torchtext
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import os
import pathlib
import random
import string
import re
import numpy as np

#### Download data

In [None]:
import requests
import io
import zipfile

url = "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"

response = requests.get(url)

if response.status_code == 200:
    print("Zip file downloaded.")
else:
    print(f"Unable to download zip file: {response.status_code}")
    exit()

zip_content = io.BytesIO(response.content)

with zipfile.ZipFile(zip_content, 'r') as z:
    z.extractall()
    print("Zip file extracted.")

# Get text file
if os.path.isfile("spa-eng/spa.txt"):
    with open("spa-eng/spa.txt", 'r') as f:
        text_lines = f.readlines()

Zip file downloaded.
Zip file extracted.


#### Parsing the data

In [11]:
text_pairs = []
for line in text_lines:
    eng, spa = line.split("\t")
    spa = "[start] " + spa + " [end]"
    text_pairs.append((eng, spa))

# Sample sentence pairs
for _ in range(5):
    print(random.choice(text_pairs))

('He seems to be a student.', '[start] Parece ser un estudiante.\n [end]')
('The employees are all unionized.', '[start] Todos los empleados están sindicados.\n [end]')
('My orders are absolute.', '[start] Mis órdenes son incondicionales.\n [end]')
("We're historians.", '[start] Somos historiadores.\n [end]')
("That's the reason he became angry.", '[start] Esa es la razón por la que se enfadó.\n [end]')


##### Split the sentence into training set, validation and test set

In [12]:
random.shuffle(text_pairs)

# train_split, val_split, test_split = 0.15
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

118964 total pairs
83276 training pairs
17844 validation pairs
17844 test pairs


##### Vectorization

In [14]:
strip_chars = string.punctuation + "¿"
print(strip_chars)
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")
print(strip_chars)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~¿
!"#$%&'()*+,-./:;<=>?@\^_`{|}~¿


In [69]:
train_pairs[0]

('Are you a creature of habit?',
 '[start] ¿Eres un esclavo del hábito?\n [end]')

In [97]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers import pre_tokenizers, normalizers, trainers
from tokenizers.trainers import WordLevelTrainer
from torch.nn.utils.rnn import pad_sequence

vocab_size = 20000
max_sequence_length = 20
batch_size = 64

train_eng_texts = [pair[0] for pair in train_pairs]
train_spa_texts = [pair[1] for pair in train_pairs]

# English Tokenizer
eng_tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))
eng_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
eng_tokenizer.normalizer = normalizers.Sequence([normalizers.NFD(),
                                             normalizers.Lowercase(),
                                             normalizers.StripAccents()])
trainer = WordLevelTrainer(vocab_size=vocab_size,
                            min_frequency=1,
                            special_tokens=['[UNK]'])
eng_tokenizer.train_from_iterator(train_eng_texts, trainer)

# Spanish Tokenizer
spa_tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))
spa_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
spa_tokenizer.normalizer = normalizers.Sequence([normalizers.NFD(),
                                             normalizers.Lowercase(),
                                             normalizers.StripAccents()])
trainer = WordLevelTrainer(vocab_size=vocab_size,
                            min_frequency=1,
                            special_tokens=['[UNK]'])
spa_tokenizer.train_from_iterator(train_spa_texts, trainer)

def transform_text(pair):
    x, y = pair[0], pair[1]
    x = eng_tokenizer.encode(x)
    y = spa_tokenizer.encode(y)
    # x = x.truncate(max_sequence_length)
    # y = y.truncate(max_sequence_length)
    # x = x.pad(sequence_length)
    # y = y.pad(sequence_length)
    return ({'encoder_inputs': x.ids,
            'decoder_inputs': y.ids},
            y.ids)

def make_dataset(pairs):
    # eng_texts, spa_texts = zip(*pairs)
    dataset = [transform_text(pair) for pair in pairs]
    return dataset
    # eng_texts, spa_texts = zip(*pairs)
    # dataset = torch.utils.data.Dataset((eng_texts, spa_texts))
    # dataset = dataset.map(transform_text)

train_ds = make_dataset(train_pairs)
train_ds[0]
# train_eng_texts.apply(transform_text).with_format('torch')

({'encoder_inputs': [30, 6, 8, 3711, 16, 1517, 9],
  'decoder_inputs': [1, 4, 2, 13, 124, 17, 5624, 47, 2522, 14, 1, 3, 2]},
 [1, 4, 2, 13, 124, 17, 5624, 47, 2522, 14, 1, 3, 2])

In [95]:
train_ds[10]

({'encoder_inputs': [13, 2, 15, 1334, 1],
  'decoder_inputs': [1, 4, 2, 21, 1658, 5, 1, 3, 2]},
 [1, 4, 2, 21, 1658, 5, 1, 3, 2])

In [42]:
train_eng_data = torch.utils.data.DataLoader(train_eng_texts, batch_size=batch_size, shuffle=True, drop_last=True)
train_spa_data = torch.utils.data.DataLoader(train_spa_texts, batch_size=batch_size, shuffle=True, drop_last=True)