In [1]:
import torch
from data import *

config = {
        "embedding_dimension": 200,
        "ff_dimension": 200,
        "n_attention_heads": 2,
        "n_encoder_layers": 0,
        "n_decoder_layers": 2,
        "dataset": Dataset.PennTreebank.name,
        "segmentation": Segmentation.Subword.name,
        "vocab_size": 40000,
        "max_seq_len": 32,
        "batch_size": 20,
        "eval_batch_size": 10,
        "dropout": 0.2,
        "n_epochs": 3,
        "learning_rate": 0.0001,
        "adam_b1": 0.9,
        "adam_b2": 0.999,
        "adam_l2_weightdecay": 0.01,
        "loss_criterion": "CrossEntropyLoss"
    }

In [14]:
def split_dataset(config):
    dataset, batch_size, max_seq_len, segmentation = extract_config(
        config, "dataset", "batch_size", "max_seq_len", "segmentation")
    tt_dataset = getattr(datasets, dataset)
    # print(f"Fetched Data ({time.time() - ts:3f}s)")

    # # process non-ptb datasets
    # if dataset != Dataset.PennTreebank.name:
    #     print(dataset)
    #     return tt_dataset.splits(text_field=Field())
    
    location = TRAINING_DATA[dataset]['location']
    paths = list(map(lambda x: str(DATA_PATH+location+x),
                        TRAINING_DATA[dataset]['filenames']))
    lines = open(paths[0], newline='\n')
    raw_data = map(lambda x: list(open(x, newline='\n')), paths)
    text_data = []
    for item in raw_data:
        # item = filter(lambda x: x != '\n', item)
        text_data.extend(item)
    total_count = len(text_data)

    train_count = int(0.7 * total_count) 
    valid_count = int(0.2 * total_count)
    test_count = total_count - train_count - valid_count
    return (text_data[0:train_count], text_data[train_count: train_count + valid_count], text_data[train_count + valid_count:len(text_data)])
    # return torch.utils.data.random_split(text_data, (train_count, valid_count, test_count))

train_dataset, valid_dataset, test_dataset = split_dataset(config)

def load_data_subword(config):
    print("[Start Load Data]")
    ts = time.time()

    # get dataset
    dataset, batch_size, max_seq_len, segmentation = extract_config(
        config, "dataset", "batch_size", "max_seq_len", "segmentation")
    tt_dataset = getattr(datasets, dataset)
    print(f"Fetched Data ({time.time() - ts:3f}s)")

    # split dataset
    train_dataset, val_dataset, test_dataset = split_dataset(config)
    print(f"Tokenized and Split Data ({time.time() - ts:3f}s)")

    # tokenize
    print(segmentation)
    if segmentation == Segmentation.Subword.name:
        tokenizer = create_subword_tokenizer(config)
    elif segmentation == Segmentation.BBPE.name:
        tokenizer = create_bbpe_tokenizer(config)

    # get vocabulary
    vocab = tokenizer.get_vocab()

    # prep data
    def prep_data(dataset_arr):
        raw_text_iter = dataset_arr
        print(dataset_arr[0])
        data = [torch.tensor(tokenizer.encode(item).ids,
                             dtype=torch.long) for item in raw_text_iter]
        data = torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))
        return data

    # setup dataloaders
    train_dataloader = TextDataloader(prep_data(train_dataset), max_seq_len, batch_size)
    val_dataloader = TextDataloader(prep_data(val_dataset), max_seq_len, batch_size)
    test_dataloader = TextDataloader(prep_data(test_dataset), max_seq_len, batch_size)

    print(f"[End Load Data] ({time.time() - ts:3f}s)")
    return train_dataloader, val_dataloader, test_dataloader, vocab, tokenizer

train_dataloader, val_dataloader, test_dataloader, vocab, tokenizer = load_data_subword(config)

[Start Load Data]
Fetched Data (0.000008s)
Tokenized and Split Data (0.014551s)
Subword
 aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter 

 in early trading in tokyo friday the nikkei index rose N points to N 

 but the company declines to comment 

[End Load Data] (6.073602s)


In [11]:
ts = time.time()

# get dataset
dataset, batch_size, max_seq_len, segmentation = extract_config(
    config, "dataset", "batch_size", "max_seq_len", "segmentation")
tt_dataset = getattr(datasets, dataset)
tt_dataset

<function torchtext.data.dataset.Dataset.split(self, split_ratio=0.7, stratified=False, strata_field='label', random_state=None)>

In [8]:
import re
def split_dataset(config):
    dataset  = extract_config(config, "dataset")
    location = TRAINING_DATA[dataset]['location']
    paths = list(map(lambda x: str(DATA_PATH+location+x),
                     TRAINING_DATA[dataset]['filenames']))

    # train data
    train_data = []
    valid_data = []
    test_data = []
    for path in paths:
        raw_data = list(open(path, newline='\n'))
        raw_data = list(filter(lambda x: x != '\n', raw_data))
        if re.search("train", path):
           train_data = raw_data
        if re.search("valid", path):
           valid_data = raw_data
        if re.search("test", path):
           test_data = raw_data

    return train_data, valid_data, test_data


In [19]:
def load_data_subword(config):
    print("[Start Load Data]")
    ts = time.time()

    # get dataset
    dataset, batch_size, max_seq_len, segmentation = extract_config(
        config, "dataset", "batch_size", "max_seq_len", "segmentation")
    tt_dataset = getattr(datasets, dataset)
    print(f"Fetched Data ({time.time() - ts:3f}s)")

    # split dataset
    # train_dataset, val_dataset, test_dataset = tt_dataset.splits(
    #     text_field=Field())
    train_dataset, val_dataset, test_dataset = split_dataset(config)
    print(f"Tokenized and Split Data ({time.time() - ts:3f}s)")

    # tokenize
    if segmentation == Segmentation.Subword.name:
        tokenizer = create_subword_tokenizer(config)
    elif segmentation == Segmentation.BBPE.name:
        tokenizer = create_bbpe_tokenizer(config)

    # get vocabulary
    vocab = tokenizer.get_vocab()

    # prep data
    def prep_data(dataset_arr):
        data = [torch.tensor(tokenizer.encode(item).ids,
                             dtype=torch.long) for item in dataset_arr]
        data = torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))
        return data

    # setup dataloaders
    train_dataloader = TextDataloader(prep_data(train_dataset), max_seq_len, batch_size)
    val_dataloader = TextDataloader(prep_data(val_dataset), max_seq_len, batch_size)
    test_dataloader = TextDataloader(prep_data(test_dataset), max_seq_len, batch_size)

    print(f"[End Load Data] ({time.time() - ts:3f}s)")
    return train_dataloader, val_dataloader, test_dataloader, vocab, tokenizer

train_dataloader, val_dataloader, test_dataloader, vocab, tokenizer = load_data_subword(config)


[Start Load Data]
Fetched Data (0.000007s)
Tokenized and Split Data (0.018512s)
[End Load Data] (5.125093s)


In [14]:
train, test, val = split_dataset(config)
tokenizer = create_subword_tokenizer(config)


In [17]:
for batch in train_dataloader:
    pri

[63, 94, 394, 2991, 250, 1115, 88, 876, 141, 156, 282, 5252, 260, 83, 139, 82, 208, 2026, 69, 6210, 77, 7546, 3692, 3655, 6051, 17, 2489, 130, 65, 71, 199, 73, 296, 12092, 221, 65, 75, 74, 86, 3404, 70, 64, 4583, 180, 80, 307, 503, 99, 716, 10706, 410, 2186, 5255, 337, 17, 1523, 81, 4036, 801, 280, 2541, 77, 85, 335, 168]


'a er bank note ber lit z call ow ay cent rust cl u et t from stein g itan o gut erman hy dro - que be c i po k ia memo te c m l x na h b pun ts r ake reg at ta rub ens sim sn ack - food s san gy ong swap o w ach ter'