In [14]:
import sys
sys.path.append("..")

import torch
import torch.nn as nn
import torch.optim as optim

from torchtext import data, datasets
import pandas as pd
from tqdm import tqdm

import spacy
import numpy as np

import random
import math
import time

from seq2seq import utils, helpers
import re
import os
import pickle

In [15]:
# Directory holding the parallel corpus files; the loading cell below builds
# every file name from this prefix.
DATASET_PATH = "../.data/miguel"

**Load dataset**

In [16]:
def load_dataset(filename_src, filename_trg):
    """Read a line-aligned parallel corpus from two UTF-8 text files.

    Args:
        filename_src: Path of the source-side file, one segment per line.
        filename_trg: Path of the target-side file; line ``i`` is paired
            with line ``i`` of ``filename_src``.

    Returns:
        A ``(file_src, file_trg)`` tuple of lists of strings, one string per
        line, without the ``'\\n'`` terminators.

    Raises:
        AssertionError: If the two files do not hold the same number of lines.
        OSError: If either file cannot be opened.
    """
    def read_lines(path):
        # The context manager closes the handle even if reading fails.
        with open(path, encoding='utf-8') as handle:
            lines = handle.read().split('\n')
        # A file that ends with '\n' leaves one empty string after the final
        # split. It is not a real line, so drop it; otherwise every split
        # gains a phantom empty pair (and a minimum length of 0).
        if lines[-1] == '':
            lines.pop()
        return lines

    file_src = read_lines(filename_src)
    file_trg = read_lines(filename_trg)
    assert len(file_src) == len(file_trg), (
        f"Line count mismatch: {filename_src} has {len(file_src)} lines, "
        f"{filename_trg} has {len(file_trg)}"
    )
    return file_src, file_trg

In [17]:
# Load the train / dev / test splits. Each split is a pair of line-aligned
# files under DATASET_PATH: `.en` is used as the source side and `.es` as the
# target side.
train_src, train_trg = load_dataset(
    filename_src=f"{DATASET_PATH}/europarl.en",
    filename_trg=f"{DATASET_PATH}/europarl.es",
)
dev_src, dev_trg = load_dataset(
    filename_src=f"{DATASET_PATH}/dev.en",
    filename_trg=f"{DATASET_PATH}/dev.es",
)
test_src, test_trg = load_dataset(
    filename_src=f"{DATASET_PATH}/test.en",
    filename_trg=f"{DATASET_PATH}/test.es",
)


**Preview dataset**

In [30]:
def view_raw(src_raw, trg_raw, lines=5):
    """Print the leading source/target pairs of a parallel corpus.

    Pairs are taken in order from ``zip(src_raw, trg_raw)`` and numbered
    from 1. Each pair is printed as a dashed header line followed by a
    ``src =>`` line and a ``trg =>`` line.

    Args:
        src_raw: Iterable of source-side strings.
        trg_raw: Iterable of target-side strings, aligned with ``src_raw``.
        lines: Number of pairs to show. After the pair with this number is
            printed, an empty line is emitted and printing stops. The check
            runs after printing, so a value that never equals a pair number
            (e.g. 0, or more than the available pairs) prints every pair
            with no closing empty line.
    """
    separator = "-"*20
    for number, pair in enumerate(zip(src_raw, trg_raw), start=1):
        source_text, target_text = pair
        print(f"#{number}: " + separator)
        print(f"src => {source_text}")
        print(f"trg => {target_text}")
        if number != lines:
            continue
        # Requested number of pairs reached: close with a blank line and stop.
        print("")
        return

In [31]:
# Preview the first pairs of each split, each under a starred banner line.
for banner, src_lines, trg_lines in (
    ("Train dataset: ", train_src, train_trg),
    ("Dev dataset: ", dev_src, dev_trg),
    ("Test dataset: ", test_src, test_trg),
):
    print(banner + "*"*20)
    view_raw(src_lines, trg_lines)

Train dataset: ********************
#1: --------------------
src => Resumption of the session
trg => Reanudación del período de sesiones
#2: --------------------
src => I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
trg => Declaro reanudado el período de sesiones del Parlamento Europeo, interrumpido el viernes 17 de diciembre pasado, y reitero a Sus Señorías mi deseo de que hayan tenido unas buenas vacaciones.
#3: --------------------
src => Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.
trg => Como todos han podido comprobar, el gran "efecto del año 2000" no se ha producido. En cambio, los ciudadanos de varios de nuestros países han sido víctimas de catástrofes naturales verdaderament

**Dataset stats** (line lengths, measured in characters)

In [32]:
# Per-pair lengths for each split: one row per aligned pair, column 0 is the
# source length and column 1 the target length. `len` is applied to each line
# string, so lengths are in characters, not tokens.
# (The `lenghts` spelling is kept because the following cells read these names.)
train_lenghts = np.array(list(zip(map(len, train_src), map(len, train_trg))))
dev_lenghts = np.array(list(zip(map(len, dev_src), map(len, dev_trg))))
test_lenghts = np.array(list(zip(map(len, test_src), map(len, test_trg))))


In [33]:
# Column-wise mean of the length arrays, rounded to whole numbers:
# index 0 is the source side, index 1 the target side.
train_mean_len = np.round(np.mean(train_lenghts, axis=0)).astype(int)
dev_mean_len = np.round(np.mean(dev_lenghts, axis=0)).astype(int)
test_mean_len = np.round(np.mean(test_lenghts, axis=0)).astype(int)

# One summary line per split.
for split_name, split_mean in (
    ("Train", train_mean_len),
    ("Dev", dev_mean_len),
    ("Test", test_mean_len),
):
    print(f"{split_name} => Mean length: src={split_mean[0]} | trg={split_mean[1]}")

Train => Mean length: src=149 | trg=162
Dev => Mean length: src=126 | trg=139
Test => Mean length: src=111 | trg=124


In [37]:
# Column-wise extremes of the length arrays:
# index 0 is the source side, index 1 the target side.
train_min_len = np.min(train_lenghts, axis=0).astype(int)
train_max_len = np.max(train_lenghts, axis=0).astype(int)

dev_min_len = np.min(dev_lenghts, axis=0).astype(int)
dev_max_len = np.max(dev_lenghts, axis=0).astype(int)

test_min_len = np.min(test_lenghts, axis=0).astype(int)
test_max_len = np.max(test_lenghts, axis=0).astype(int)

# One "min-max" summary line per split.
for split_name, shortest, longest in (
    ("Train", train_min_len, train_max_len),
    ("Dev", dev_min_len, dev_max_len),
    ("Test", test_min_len, test_max_len),
):
    print(f"{split_name} => Range length: src={shortest[0]}-{longest[0]} | trg={shortest[1]}-{longest[1]}")


Train => Range length: src=0-3950 | trg=0-4278
Dev => Range length: src=0-723 | trg=0-728
Test => Range length: src=0-507 | trg=0-705
