In [3]:
import torch
from torchtext import data

from tqdm import tqdm

import numpy as np
from scipy import stats

import pickle
import dill
import umsgpack
import time
import json

# Import the libraries
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

**Set constants**

In [2]:
# Root folder of the dataset; the cells below read from / write to
# subfolders of it (e.g. "preprocessed/" and "tokenized/").
DATASET_PATH = "../.data/miguel"  # plain literal: the f-prefix had no placeholders

# Language codes handed to the spaCy tokenizers of the source/target fields.
SRC_LANG = "en"
TRG_LANG = "es"

# Special tokens used as init/eos tokens of every sequence.
SOS_WORD = '<sos>'
EOS_WORD = '<eos>'

# NOTE(review): not referenced in the cells visible here — presumably used
# later to bound sequence lengths; confirm.
MAX_SEQ_LENGTH = 100

## Load dataset

To speed things up, I prefer to use torchtext directly in order to read the CSV files, preprocess
them and tokenize each pair.

I'm going to use the tokenizer from spaCy, a Natural Language Processing library that is blazingly fast, suitable
for large datasets, and supports many languages and hundreds of features.

This step can take a while, but since I plan to save our tokenized datasets, it won't be a problem.

Also note that I'm converting everything to lowercase, and adding the `<sos>` and `<eos>` tokens to our pairs.

***Note:** Keep in mind that I share (by reference) the SRC/TRG fields between the train, dev and test partitions.*

In [3]:
# Source and target fields are configured identically except for the
# language handed to the spaCy tokenizer.
_common_field_kwargs = dict(tokenize='spacy', init_token=SOS_WORD, eos_token=EOS_WORD, lower=True)

SRC = data.Field(tokenizer_language=SRC_LANG, **_common_field_kwargs)
TRG = data.Field(tokenizer_language=TRG_LANG, **_common_field_kwargs)



In [4]:
start = time.time()

# The same (name, field) pairs are applied to the train, dev and test CSVs,
# so the three partitions share the SRC/TRG field objects.
data_fields = [('src', SRC), ('trg', TRG)]  # Shared fields
csv_files = dict(train='train.csv', validation='dev.csv', test='test.csv')

train_data, dev_data, test_data = data.TabularDataset.splits(
    path=f'{DATASET_PATH}/preprocessed/',
    format='csv',
    fields=data_fields,
    skip_header=True,
    **csv_files,
)

# Report the elapsed wall-clock time in seconds
end = time.time()
print(end - start)



463.37507581710815


## Saving preprocessed dataset

Since preprocessing and tokenizing a big dataset can take time, I recommend always saving it, if possible, in order
to save us some time when debugging our model.

For torchtext, we'll need to save the preprocessed examples for the train, dev and test partitions, along with their
fields. Depending on the case we can ignore the last part. Here, I'll save only the train fields with their
vocabularies.


In [5]:
def save_dataset(dataset, savepath):
    """Serialize the examples of ``dataset`` to ``savepath`` as a msgpack stream.

    The file starts with one packed integer (the number of examples) and is
    followed by one packed ``[src, trg]`` record per example.

    Args:
        dataset: object exposing an ``examples`` sequence whose items have
            ``src`` and ``trg`` attributes.
        savepath: destination file path (opened in binary write mode).

    Prints the elapsed wall-clock time in seconds when done.
    """
    t0 = time.time()

    examples = dataset.examples
    n_examples = len(examples)
    with open(savepath, 'wb') as out:
        # Header: how many records follow, so a reader knows when to stop
        umsgpack.pack(n_examples, out)

        # Body: one [src, trg] record per example
        for example in tqdm(examples, total=n_examples):
            umsgpack.pack([example.src, example.trg], out)

    print(time.time() - t0)

# Write every partition to its own msgpack file (train, dev, test — in that order)
for _split, _target in (
    (train_data, f"{DATASET_PATH}/tokenized/train.msgpack"),
    (dev_data, f"{DATASET_PATH}/tokenized/dev.msgpack"),
    (test_data, f"{DATASET_PATH}/tokenized/test.msgpack"),
):
    save_dataset(_split, _target)
print("Tokenized datasets saved!")


100%|██████████| 1960641/1960641 [02:03<00:00, 15906.52it/s]
100%|██████████| 3003/3003 [00:00<00:00, 17365.34it/s]
100%|██████████| 3003/3003 [00:00<00:00, 16957.28it/s]


123.44851589202881
0.17477941513061523
0.1789076328277588
Tokenized datasets saved!


In [6]:
def save_dataset2(dataset, savepath):
    """Serialize the examples of ``dataset`` to ``savepath`` as JSON lines.

    The first line holds the number of examples; every following line is the
    JSON encoding of one ``[src, trg]`` pair.

    Args:
        dataset: object exposing an ``examples`` sequence whose items have
            ``src`` and ``trg`` attributes.
        savepath: destination file path (opened in text write mode).

    Prints the elapsed wall-clock time in seconds when done.
    """
    t0 = time.time()

    examples = dataset.examples
    n_examples = len(examples)
    with open(savepath, 'w') as out:
        # Header line: record count
        out.write(json.dumps(n_examples) + "\n")

        # One JSON document per line
        for example in tqdm(examples, total=n_examples):
            out.write(json.dumps([example.src, example.trg]) + "\n")

    print(time.time() - t0)

# Write every partition to its own JSON-lines file (train, dev, test — in that order)
for _split, _target in (
    (train_data, f"{DATASET_PATH}/tokenized/train.json"),
    (dev_data, f"{DATASET_PATH}/tokenized/dev.json"),
    (test_data, f"{DATASET_PATH}/tokenized/test.json"),
):
    save_dataset2(_split, _target)
print("Tokenized datasets saved!")

100%|██████████| 1960641/1960641 [00:15<00:00, 125955.90it/s]
100%|██████████| 3003/3003 [00:00<00:00, 106927.25it/s]
100%|██████████| 3003/3003 [00:00<00:00, 134750.09it/s]


16.33510136604309
0.034819602966308594
0.023775815963745117
Tokenized datasets saved!


In [7]:
def save_dataset3(dataset, savepath):
    """Serialize the examples of ``dataset`` to ``savepath`` as one JSON array.

    All ``[src, trg]`` pairs are first gathered in memory and then written
    with a single ``json.dump`` call.

    Args:
        dataset: object exposing an ``examples`` sequence whose items have
            ``src`` and ``trg`` attributes.
        savepath: destination file path (opened in text write mode).

    Prints the elapsed wall-clock time in seconds when done.
    """
    t0 = time.time()

    # Gather every pair up front; the progress bar only covers this step
    source_examples = dataset.examples
    pairs = [[example.src, example.trg]
             for example in tqdm(source_examples, total=len(source_examples))]

    # Single dump of the whole list
    with open(savepath, 'w') as out:
        json.dump(pairs, out)

    print(time.time() - t0)

# Write every partition to its own JSON file (train, dev, test — in that order)
for _split, _target in (
    (train_data, f"{DATASET_PATH}/tokenized/train3.json"),
    (dev_data, f"{DATASET_PATH}/tokenized/dev3.json"),
    (test_data, f"{DATASET_PATH}/tokenized/test3.json"),
):
    save_dataset3(_split, _target)
print("Tokenized datasets saved!")

100%|██████████| 1960641/1960641 [00:05<00:00, 336883.06it/s] 
100%|██████████| 3003/3003 [00:00<00:00, 1696820.01it/s]
100%|██████████| 3003/3003 [00:00<00:00, 1602684.17it/s]


46.543951749801636
0.05698347091674805
0.05739021301269531
Tokenized datasets saved!


## Loading preprocessed dataset

Similarly, we can also load the preprocessed datasets.

In [4]:
def load_dataset(filename):
    """Read back a msgpack stream made of a record count followed by records.

    Args:
        filename: path of the file to read (opened in binary mode).

    Returns:
        A list with one unpacked record per entry, in file order.

    Prints the elapsed wall-clock time in seconds before returning.
    """
    t0 = time.time()

    with open(filename, 'rb') as src:
        # Header: number of records stored after it
        n_records = umsgpack.unpack(src)

        # Body: unpack exactly that many records, one after another
        records = [umsgpack.unpack(src) for _ in tqdm(range(n_records), total=n_records)]

    print(time.time() - t0)
    return records


# Load the three partitions in order (train, dev, test)
train_data, dev_data, test_data = (
    load_dataset(_source)
    for _source in (
        f"{DATASET_PATH}/tokenized/train.msgpack",
        f"{DATASET_PATH}/tokenized/dev.msgpack",
        f"{DATASET_PATH}/tokenized/test.msgpack",
    )
)
print("Tokenized datasets loaded!")


100%|██████████| 1960641/1960641 [02:23<00:00, 13635.20it/s]
100%|██████████| 3003/3003 [00:00<00:00, 15644.65it/s]
100%|██████████| 3003/3003 [00:00<00:00, 15745.02it/s]


143.79657816886902
0.19379568099975586
0.19238901138305664
Tokenized datasets loaded!


In [None]:
def load_dataset2(filename):
    """Read back a JSON-lines file made of a count line followed by records.

    Args:
        filename: path of the file to read. It is opened in binary mode and
            each raw line is handed to ``json.loads``.

    Returns:
        A list with one decoded record per entry, in file order.

    Prints the elapsed wall-clock time in seconds before returning.
    """
    t0 = time.time()

    with open(filename, 'rb') as src:
        # First line: number of record lines that follow
        n_records = json.loads(src.readline())

        # Remaining lines: one JSON document each
        records = [json.loads(src.readline()) for _ in tqdm(range(n_records), total=n_records)]

    print(time.time() - t0)
    return records


# Load the three partitions in order (train, dev, test)
train_data, dev_data, test_data = (
    load_dataset2(_source)
    for _source in (
        f"{DATASET_PATH}/tokenized/train.json",
        f"{DATASET_PATH}/tokenized/dev.json",
        f"{DATASET_PATH}/tokenized/test.json",
    )
)
print("Tokenized datasets loaded!")


In [3]:
def load_dataset3(filename):
    """Load a whole JSON document from ``filename`` and return it.

    Args:
        filename: path of the JSON file to read (opened in text mode).

    Returns:
        The decoded JSON content.

    Prints the elapsed wall-clock time in seconds before returning.
    """
    t0 = time.time()

    # The file is one JSON document, so it is decoded in a single call
    with open(filename, 'r') as src:
        content = json.loads(src.read())

    print(time.time() - t0)
    return content


# Load the three partitions in order (train, dev, test)
train_data, dev_data, test_data = (
    load_dataset3(_source)
    for _source in (
        f"{DATASET_PATH}/tokenized/train3.json",
        f"{DATASET_PATH}/tokenized/dev3.json",
        f"{DATASET_PATH}/tokenized/test3.json",
    )
)
print("Tokenized datasets loaded!")

27.435794591903687
0.02357649803161621
0.02375054359436035
Tokenized datasets loaded!


In [6]:
# Build dataset
fields = [('src', SRC), ('trg', TRG)]


def _as_examples(pairs, fields):
    """Turn raw ``[src_tokens, trg_tokens]`` pairs into torchtext ``Example`` objects.

    The loaders above return plain lists, whereas ``data.Dataset`` is documented
    to take a list of ``Example`` objects (attribute access such as
    ``example.src`` is what the rest of torchtext relies on).

    Items that already are ``Example`` instances are passed through untouched,
    so re-running this cell on an already-built dataset does not fail.

    Args:
        pairs: iterable of ``[src, trg]`` lists and/or ``Example`` objects.
        fields: list of ``(name, Field)`` tuples, in the same order as the
            values inside each pair.

    Returns:
        A list of ``data.Example`` objects.
    """
    return [pair if isinstance(pair, data.Example) else data.Example.fromlist(pair, fields)
            for pair in pairs]


train_data = data.Dataset(_as_examples(train_data, fields), fields)
dev_data = data.Dataset(_as_examples(dev_data, fields), fields)
test_data = data.Dataset(_as_examples(test_data, fields), fields)

print("Total pairs:")
print(f"\t- Train: {len(train_data.examples)}")
print(f"\t- Dev: {len(dev_data.examples)}")
print(f"\t- Test: {len(test_data.examples)}")

Total pairs:
	- Train: 1960641
	- Dev: 3003
	- Test: 3003
