In [4]:
import torch
import torch.nn as nn
import torchtext
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import os
import pathlib
import random
import string
import re
import numpy as np

#### Download data

In [None]:
import requests
import io
import zipfile

url = "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"

response = requests.get(url)

if response.status_code == 200:
    print("Zip file downloaded.")
else:
    print(f"Unable to download zip file: {response.status_code}")
    exit()

zip_content = io.BytesIO(response.content)

with zipfile.ZipFile(zip_content, 'r') as z:
    z.extractall()
    print("Zip file extracted.")

# Get text file
if os.path.isfile("spa-eng/spa.txt"):
    with open("spa-eng/spa.txt", 'r') as f:
        text_lines = f.readlines()

Zip file downloaded.
Zip file extracted.


#### Parsing the data

In [11]:
text_pairs = []
for line in text_lines:
    eng, spa = line.split("\t")
    spa = "[start] " + spa + " [end]"
    text_pairs.append((eng, spa))

# Sample sentence pairs
for _ in range(5):
    print(random.choice(text_pairs))

('He seems to be a student.', '[start] Parece ser un estudiante.\n [end]')
('The employees are all unionized.', '[start] Todos los empleados están sindicados.\n [end]')
('My orders are absolute.', '[start] Mis órdenes son incondicionales.\n [end]')
("We're historians.", '[start] Somos historiadores.\n [end]')
("That's the reason he became angry.", '[start] Esa es la razón por la que se enfadó.\n [end]')


##### Split the sentence into training set, validation and test set

In [12]:
random.shuffle(text_pairs)

# train_split, val_split, test_split = 0.15
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

118964 total pairs
83276 training pairs
17844 validation pairs
17844 test pairs


##### Vectorization

In [14]:
strip_chars = string.punctuation + "¿"
print(strip_chars)
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")
print(strip_chars)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~¿
!"#$%&'()*+,-./:;<=>?@\^_`{|}~¿


In [None]:
vocab_size = 20000 # we only consider the top 20k words
max_len = 200 # we only consider the first 200 words of each movie review

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer




In [34]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence

train_eng_texts = [pair[0] for pair in train_pairs]
train_spa_texts = [pair[1] for pair in train_pairs]

# Tokenization
eng_tokenizer = get_tokenizer("basic_english")
# spa_tokenizer = get_tokenizer("spanish")

tokenized_eng_data = [eng_tokenizer(text) for text in train_eng_texts]
# tokenized_spa_data = [spa_tokenizer(text) for text in train_spa_texts]

# Vocabulary building
vocab = build_vocab_from_iterator(tokenized_eng_data)#, specials=["<unk>", "<pad>"])
# vocab.set_default_index(vocab["<unk>"])

# Numericalization
numericalized_data = [torch.tensor(vocab(tokens)) for tokens in tokenized_eng_data]

# Padding and Batching
padded_sequences = pad_sequence(numericalized_data, batch_first=True, padding_value=vocab["<pad>"])

# Embedding Layer (within a PyTorch model)
embedding_dim = 100
num_embeddings = len(vocab)
embedding_layer = torch.nn.Embedding(num_embeddings, embedding_dim, padding_idx=vocab["<pad>"])

# Example of using the embedding layer
embedded_output = embedding_layer(padded_sequences)
print(embedded_output.shape) # Expected: (batch_size, sequence_length, embedding_dim)

83276lines [00:00, 596531.77lines/s]


TypeError: 'Vocab' object is not callable

In [39]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers import pre_tokenizers, normalizers, trainers
from tokenizers.trainers import WordLevelTrainer

vocab_size = 15000
sequence_length = 20
batch_size = 64

train_eng_texts = [pair[0] for pair in train_pairs]
train_spa_texts = [pair[1] for pair in train_pairs]

tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer.normalizer = normalizers.Sequence([normalizers.NFD(),
                                             normalizers.Lowercase(),
                                             normalizers.StripAccents()])

trainer = WordLevelTrainer(vocab_size=vocab_size,
                                    min_frequency=1,
                                    special_tokens=['[UNK]'])

tokenizer.train_from_iterator(train_eng_texts, trainer)

def transform_text(text):
    x = tokenizer.encode(text)
    x.truncate(sequence_length)
    x.pad(sequence_length)
    return x.ids

def apply_transform(x):
    return {'input_ids': transform_text(x)}

train_eng_texts = [apply_transform(x) for x in train_eng_texts]
train_spa_texts = [apply_transform(x) for x in train_spa_texts]
# train_eng_texts.apply(transform_text).with_format('torch')

In [42]:
train_eng_data = torch.utils.data.DataLoader(train_eng_texts, batch_size=batch_size, shuffle=True, drop_last=True)
train_spa_data = torch.utils.data.DataLoader(train_spa_texts, batch_size=batch_size, shuffle=True, drop_last=True)

In [None]:


import torchtext.
def custom_standardization(input_string):
    return input_string.lower()


# Initialize a standard tokenizer
tokenizer = torchtext.transforms.SentencePieceTokenizer()

eng_vectorization = 