In [1]:
import torch
from data import *



In [64]:
config = {
        "embedding_dimension": 200,
        "ff_dimension": 200,
        "n_attention_heads": 2,
        "n_encoder_layers": 0,
        "n_decoder_layers": 2,
        "dataset": Dataset.PennTreebank.name,
        "segmentation": Segmentation.Subword.name,
        "vocab_size": 40000,
        "max_seq_len": 32,
        "batch_size": 20,
        "eval_batch_size": 10,
        "dropout": 0.2,
        "n_epochs": 3,
        "learning_rate": 0.0001,
        "adam_b1": 0.9,
        "adam_b2": 0.999,
        "adam_l2_weightdecay": 0.01,
        "loss_criterion": "CrossEntropyLoss",
        "torchtext_split": False,
    }

def split_dataset(config):
    dataset  = extract_config(config, "dataset")
    location = TRAINING_DATA[dataset]['location']
    paths = list(map(lambda x: str(DATA_PATH+location+x),
                     TRAINING_DATA[dataset]['filenames']))

    # train data
    train_data = []
    valid_data = []
    test_data = []
    for path in paths:
        raw_data = list(open(path, newline='\n'))
        raw_data = list(filter(lambda x: x != '\n', raw_data))
        if re.search("train", path):
           train_data = raw_data
        if re.search("valid", path):
           valid_data = raw_data
        if re.search("test", path):
           test_data = raw_data

    return train_data, valid_data, test_data

def create_subword_tokenizer(config):
    dataset, vocab_size, segmentation = extract_config(
        config, "dataset", "vocab_size", "segmentation")

    # get location
    output_location = 'tokenizer/'
    tokenizer_loc = segmentation +'_tokenizer_' + str(dataset) + '_' + str(vocab_size) + ".tokenizer.json"
    path_to_tokenizer_loc = DATA_PATH+output_location
    tokenizer_filepath = path_to_tokenizer_loc+tokenizer_loc

    # load tokenizer
    if os.path.isfile(tokenizer_filepath):
        tokenizer = Tokenizer.from_file(tokenizer_filepath)
        return tokenizer

    # build tokenizer
    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = Whitespace()

    location = TRAINING_DATA[dataset]['location']
    paths = list(map(lambda x: str(DATA_PATH+location+x),
                     TRAINING_DATA[dataset]['filenames']))
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "<unk>", "<eos>"])

    tokenizer.train(files=paths, trainer=trainer)

    # save tokenizer
    try:
        if not os.path.isdir(path_to_tokenizer_loc):
            os.makedirs(path_to_tokenizer_loc)
        tokenizer.save(str(tokenizer_filepath))
    except Exception as e:
        print("Error saving tokenizer", e)

    return tokenizer

def load_data_subword(config):
    print("[Start Load Data]")
    ts = time.time()

    # get dataset
    dataset, batch_size, max_seq_len, segmentation, torchtext_split = extract_config(
        config, "dataset", "batch_size", "max_seq_len", "segmentation", "torchtext_split")
    tt_dataset = getattr(datasets, dataset)
    print(f"Fetched Data ({time.time() - ts:3f}s)")

    # split dataset
    train_dataset, val_dataset, test_dataset = split_dataset(config)
    if torchtext_split:
        train_dataset, val_dataset, test_dataset = tt_dataset.splits(
            text_field=Field())
    print(f"Tokenized and Split Data ({time.time() - ts:3f}s)")

    # tokenize
    if segmentation == Segmentation.Subword.name:
        tokenizer = create_subword_tokenizer(config)
    elif segmentation == Segmentation.BBPE.name:
        tokenizer = create_bbpe_tokenizer(config)
    elif segmentation == Segmentation.Word.name:
        tokenizer = create_word_tokenizer(config)

    # get vocabulary
    vocab = tokenizer.get_vocab()

    # prep data
    def prep_data(dataset_arr):
        if torchtext_split:
            raw_text_iter = dataset_arr[0].text
            data = torch.tensor(tokenizer.encode(raw_text_iter, is_pretokenized=True).ids, dtype=torch.long)
            return data

        raw_text_iter = dataset_arr
        data = [torch.tensor(tokenizer.encode(item).ids,
                             dtype=torch.long) for item in raw_text_iter]
        data = torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))
        return data

    # setup dataloaders
    train_dataloader = TextDataloader(prep_data(train_dataset), max_seq_len, batch_size)
    val_dataloader = TextDataloader(prep_data(val_dataset), max_seq_len, batch_size)
    test_dataloader = TextDataloader(prep_data(test_dataset), max_seq_len, batch_size)

    print(f"[End Load Data] ({time.time() - ts:3f}s)")
    return train_dataloader, val_dataloader, test_dataloader, vocab, tokenizer

train_dataloader, val_dataloader, test_dataloader, vocab, tokenizer = load_data_subword(config)
test_string = "The funding is attached to an estimated $27.1 billion transportation bill that goes next to the Senate and carries with it a proposed permanent smoking ban on virtually all U.S. domestic airline flights. "
test_tokenizer(tokenizer, test_string)
print_examples(train_dataloader, tokenizer)

[Start Load Data]
Fetched Data (0.000007s)
Tokenized and Split Data (0.017809s)
[End Load Data] (4.230067s)
['he', 'funding', 'is', 'attached', 'to', 'an', 'estimated', '$', '2', '7', '.', '1', 'billion', 'transportation', 'bill', 'that', 'goes', 'next', 'to', 'the', 'en', 'ate', 'and', 'carries', 'with', 'it', 'a', 'proposed', 'permanent', 'smoking', 'ban', 'on', 'virtually', 'all', '.', '.', 'domestic', 'airline', 'flights', '.']
[132, 2481, 78, 5682, 72, 58, 1390, 8, 17, 22, 13, 16, 272, 2045, 683, 103, 2921, 613, 72, 57, 67, 138, 80, 4758, 144, 73, 29, 1462, 4910, 6042, 251, 61, 3721, 174, 13, 13, 1769, 1966, 4160, 13]
he funding is attached to an estimated $ 2 7 . 1 billion transportation bill that goes next to the en ate and carries with it a proposed permanent smoking ban on virtually all . . domestic airline flights .
[10, 47, 1025, 12, 1267, 558, 1372, 56, 57, 27, 582, 695, 56, 1108, 6083, 80, 7064, 71, 1511, 12, 2314, 1336, 589, 27, 27, 483, 1025, 921, 984, 5, 145, 27]
[47, 1

In [65]:
test_string = "the funding is attached to an estimated $27.1 billion transportation bill that goes next to the senate and carries with it a proposed permanent smoking ban on virtually all domestic airline flights. "
test_tokenizer(tokenizer, test_string)

['the', 'funding', 'is', 'attached', 'to', 'an', 'estimated', '$', '2', '7', '.', '1', 'billion', 'transportation', 'bill', 'that', 'goes', 'next', 'to', 'the', 'senate', 'and', 'carries', 'with', 'it', 'a', 'proposed', 'permanent', 'smoking', 'ban', 'on', 'virtually', 'all', 'domestic', 'airline', 'flights', '.']
[57, 2481, 78, 5682, 72, 58, 1390, 8, 17, 22, 13, 16, 272, 2045, 683, 103, 2921, 613, 72, 57, 1288, 80, 4758, 144, 73, 29, 1462, 4910, 6042, 251, 61, 3721, 174, 1769, 1966, 4160, 13]
the funding is attached to an estimated $ 2 7 . 1 billion transportation bill that goes next to the senate and carries with it a proposed permanent smoking ban on virtually all domestic airline flights .


In [59]:
# bbpe
config = {
        "embedding_dimension": 200,
        "ff_dimension": 200,
        "n_attention_heads": 2,
        "n_encoder_layers": 0,
        "n_decoder_layers": 2,
        "dataset": Dataset.PennTreebank.name,
        "segmentation": Segmentation.BBPE.name,
        "vocab_size": 10000,
        "max_seq_len": 32,
        "batch_size": 20,
        "eval_batch_size": 10,
        "dropout": 0.2,
        "n_epochs": 3,
        "learning_rate": 0.0001,
        "adam_b1": 0.9,
        "adam_b2": 0.999,
        "adam_l2_weightdecay": 0.01,
        "loss_criterion": "CrossEntropyLoss",
        "torchtext_split": True

    }
train_dataloader, val_dataloader, test_dataloader, vocab, tokenizer = load_data_subword(config)
test_tokenizer(tokenizer, test_string)
print_examples(train_dataloader, tokenizer)
    

[Start Load Data]
Fetched Data (0.000006s)
Tokenized and Split Data (0.020541s)
[End Load Data] (5.117110s)
['the', 'Ġfunding', 'Ġis', 'Ġatt', 'ach', 'ed', 'Ġto', 'Ġan', 'Ġestimated', 'Ġ$', '2', '7', '.', '1', 'Ġbillion', 'Ġtransportation', 'Ġbill', 'Ġthat', 'Ġgoes', 'Ġnext', 'Ġto', 'Ġthe', 'Ġsenate', 'Ġand', 'Ġcar', 'r', 'ies', 'Ġwith', 'Ġit', 'Ġa', 'Ġproposed', 'Ġper', 'man', 'ent', 'Ġsm', 'ok', 'ing', 'Ġban', 'Ġon', 'Ġvirt', 'ually', 'Ġall', 'Ġdomestic', 'Ġairline', 'Ġfl', 'ights', '.', 'Ġ']
[1850, 3145, 330, 912, 564, 285, 290, 293, 1775, 338, 18, 23, 14, 17, 523, 2440, 957, 323, 3386, 890, 290, 263, 1709, 302, 749, 82, 391, 369, 318, 258, 1817, 626, 524, 307, 889, 569, 292, 2693, 324, 3985, 1160, 549, 2141, 2361, 1002, 1461, 14, 221]
the funding is attached to an estimated $27.1 billion transportation bill that goes next to the senate and carries with it a proposed permanent smoking ban on virtually all domestic airline flights. 
[14, 1303, 313, 75, 277, 492, 514, 274, 1412, 386, 

In [70]:
from tokenizers.models import BPE, WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer, WordLevelTrainer

config = {
        "embedding_dimension": 200,
        "ff_dimension": 200,
        "n_attention_heads": 2,
        "n_encoder_layers": 0,
        "n_decoder_layers": 2,
        "dataset": Dataset.PennTreebank.name,
        "segmentation": Segmentation.Word.name,
        "vocab_size": 40000,
        "max_seq_len": 32,
        "batch_size": 20,
        "eval_batch_size": 10,
        "dropout": 0.2,
        "n_epochs": 3,
        "learning_rate": 0.0001,
        "adam_b1": 0.9,
        "adam_b2": 0.999,
        "adam_l2_weightdecay": 0.01,
        "loss_criterion": "CrossEntropyLoss"
    }

# word tokenizer
def create_word_tokenizer(config):
    dataset, vocab_size, segmentation = extract_config(
        config, "dataset", "vocab_size", "segmentation")

    # get location
    output_location = 'tokenizer/'
    tokenizer_loc = segmentation +'_tokenizer_' + str(dataset) + ".tokenizer.json"
    path_to_tokenizer_loc = DATA_PATH+output_location
    tokenizer_filepath = path_to_tokenizer_loc+tokenizer_loc

    # load tokenizer
    if os.path.isfile(tokenizer_filepath):
        tokenizer = Tokenizer.from_file(tokenizer_filepath)
        return tokenizer

    # build tokenizer
    tokenizer = Tokenizer(WordLevel())
    tokenizer.pre_tokenizer = Whitespace()

    location = TRAINING_DATA[dataset]['location']
    paths = list(map(lambda x: str(DATA_PATH+location+x),
                     TRAINING_DATA[dataset]['filenames']))
  
    trainer = WordLevelTrainer(
        min_frequency=1,
        # vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "<unk>", "<eos>"]
        )

    tokenizer.train(files=paths, trainer=trainer)

    # save tokenizer
    try:
        if not os.path.isdir(path_to_tokenizer_loc):
            os.makedirs(path_to_tokenizer_loc)
        tokenizer.save(str(tokenizer_filepath))
    except Exception as e:
        print("Error saving tokenizer", e)

    return tokenizer

tokenizer = create_word_tokenizer(config)
test_tokenizer(tokenizer, test_string)


9663
['the', 'funding', 'is', 'attached', 'to', 'an', 'estimated', '$', '27', '.', '1', 'billion', 'transportation', 'bill', 'that', 'goes', 'next', 'to', 'the', 'senate', 'and', 'carries', 'with', 'it', 'a', 'proposed', 'permanent', 'smoking', 'ban', 'on', 'virtually', 'all', 'domestic', 'airline', 'flights', '.']
[7, 1219, 23, 3258, 13, 44, 495, 22, 5, 17, 3626, 64, 829, 278, 21, 1366, 149, 13, 7, 428, 16, 2603, 32, 24, 14, 514, 2667, 3535, 1901, 27, 1876, 86, 686, 785, 2205, 17]
the funding is attached to an estimated $ . 1 billion transportation bill that goes next to the senate and carries with it a proposed permanent smoking ban on virtually all domestic airline flights .


In [46]:
# word
config = {
        "embedding_dimension": 200,
        "ff_dimension": 200,
        "n_attention_heads": 2,
        "n_encoder_layers": 0,
        "n_decoder_layers": 2,
        "dataset": Dataset.PennTreebank.name,
        "segmentation": Segmentation.Word.name,
        "vocab_size": 40000,
        "max_seq_len": 32,
        "batch_size": 20,
        "eval_batch_size": 10,
        "dropout": 0.2,
        "n_epochs": 3,
        "learning_rate": 0.0001,
        "adam_b1": 0.9,
        "adam_b2": 0.999,
        "adam_l2_weightdecay": 0.01,
        "loss_criterion": "CrossEntropyLoss"
    }

def load_data_word(config):
    print("[Start Load Data]")
    ts = time.time()

    # get dataset
    dataset, batch_size, max_seq_len = extract_config(config, "dataset", "batch_size", "max_seq_len")
    dataset = getattr(datasets, dataset)
    print(f"Fetched Data ({time.time() - ts:3f}s)")

    # # tokenize
    tokenizer = get_tokenizer('basic_english')
    field_processor = Field(tokenize=tokenizer)

    # split dataset
    train_dataset, val_dataset, test_dataset = dataset.splits(
        text_field=field_processor)
    print(f"Tokenized and Split Data ({time.time() - ts:3f}s)")

    # get vocabulary
    field_processor.build_vocab(
        train_dataset, val_dataset, test_dataset, min_freq=1)
    vocab = field_processor.vocab
    print(f"Built Vocab ({time.time() - ts:3f}s)")

    # data prep
    def data_prep(tt_dataset_split):
        raw_text_iter = tt_dataset_split[0].text
        data = [torch.tensor([vocab[token] for token in tokenizer(item)],
                                dtype=torch.long) for item in raw_text_iter]
        data = torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))
        return data

    # setup dataloaders
    train_dataloader = TextDataloader(data_prep(train_dataset), max_seq_len, batch_size)
    val_dataloader = TextDataloader(data_prep(val_dataset), max_seq_len, batch_size)
    test_dataloader = TextDataloader(data_prep(test_dataset), max_seq_len, batch_size)

    print(f"[End Load Data] ({time.time() - ts:3f}s)")
    return train_dataloader, val_dataloader, test_dataloader, vocab, tokenizer

train_dataloader, val_dataloader, test_dataloader, vocab, tokenizer = load_data_word(config)
test_tokenizer(tokenizer)
print_examples(train_dataloader, tokenizer)

[Start Load Data]
Fetched Data (0.000007s)
Tokenized and Split Data (0.658534s)
Built Vocab (0.836997s)
[End Load Data] (13.951997s)
<function _basic_english_normalize at 0x7f49ec001820>


AttributeError: 'function' object has no attribute 'encode'

In [None]:
# character
config = {
        "embedding_dimension": 200,
        "ff_dimension": 200,
        "n_attention_heads": 2,
        "n_encoder_layers": 0,
        "n_decoder_layers": 2,
        "dataset": Dataset.PennTreebank.name,
        "segmentation": Segmentation.Character.name,
        "vocab_size": 40000,
        "max_seq_len": 32,
        "batch_size": 20,
        "eval_batch_size": 10,
        "dropout": 0.2,
        "n_epochs": 3,
        "learning_rate": 0.0001,
        "adam_b1": 0.9,
        "adam_b2": 0.999,
        "adam_l2_weightdecay": 0.01,
        "loss_criterion": "CrossEntropyLoss"
    }

In [42]:

# byte
config = {
        "embedding_dimension": 200,
        "ff_dimension": 200,
        "n_attention_heads": 2,
        "n_encoder_layers": 0,
        "n_decoder_layers": 2,
        "dataset": Dataset.PennTreebank.name,
        "segmentation": Segmentation.BYTE.name,
        "vocab_size": 40000,
        "max_seq_len": 32,
        "batch_size": 20,
        "eval_batch_size": 10,
        "dropout": 0.2,
        "n_epochs": 3,
        "learning_rate": 0.0001,
        "adam_b1": 0.9,
        "adam_b2": 0.999,
        "adam_l2_weightdecay": 0.01,
        "loss_criterion": "CrossEntropyLoss"
    }

from tokenizers.models import BPE, WordLevel
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer, WordLevelTrainer

# byte tokenizer
def create_word_tokenizer(config):
    dataset, vocab_size, segmentation = extract_config(
        config, "dataset", "vocab_size", "segmentation")

    # get location
    output_location = 'tokenizer/'
    tokenizer_loc = segmentation +'_tokenizer_' + str(dataset) + '_' + str(vocab_size) + ".tokenizer.json"
    path_to_tokenizer_loc = DATA_PATH+output_location
    tokenizer_filepath = path_to_tokenizer_loc+tokenizer_loc

    # load tokenizer
    # if os.path.isfile(tokenizer_filepath):
    #     tokenizer = Tokenizer.from_file(tokenizer_filepath)
    #     return tokenizer

    # build tokenizer
    tokenizer = Tokenizer(WordLevel())
    tokenizer.pre_tokenizer = ByteLevel()

    location = TRAINING_DATA[dataset]['location']
    paths = list(map(lambda x: str(DATA_PATH+location+x),
                     TRAINING_DATA[dataset]['filenames']))
    trainer = WordLevelTrainer(
        min_frequency=1,
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "<unk>", "<eos>"]
        )

    tokenizer.train(files=paths, trainer=trainer)

    # save tokenizer
    try:
        if not os.path.isdir(path_to_tokenizer_loc):
            os.makedirs(path_to_tokenizer_loc)
        tokenizer.save(str(tokenizer_filepath))
    except Exception as e:
        print("Error saving tokenizer", e)

    return tokenizer

tokenizer = create_word_tokenizer(config)
test_tokenizer(tokenizer)


ImportError: cannot import name 'ByteLevel' from 'tokenizers.models' (/home/gbafa/miniconda3/envs/transformer/lib/python3.8/site-packages/tokenizers/models/__init__.py)

In [11]:
ts = time.time()

# get dataset
dataset, batch_size, max_seq_len, segmentation = extract_config(
    config, "dataset", "batch_size", "max_seq_len", "segmentation")
tt_dataset = getattr(datasets, dataset)
tt_dataset

<function torchtext.data.dataset.Dataset.split(self, split_ratio=0.7, stratified=False, strata_field='label', random_state=None)>

In [8]:
import re
def split_dataset(config):
    dataset  = extract_config(config, "dataset")
    location = TRAINING_DATA[dataset]['location']
    paths = list(map(lambda x: str(DATA_PATH+location+x),
                     TRAINING_DATA[dataset]['filenames']))

    # train data
    train_data = []
    valid_data = []
    test_data = []
    for path in paths:
        raw_data = list(open(path, newline='\n'))
        raw_data = list(filter(lambda x: x != '\n', raw_data))
        if re.search("train", path):
           train_data = raw_data
        if re.search("valid", path):
           valid_data = raw_data
        if re.search("test", path):
           test_data = raw_data

    return train_data, valid_data, test_data


In [19]:
def load_data_subword(config):
    print("[Start Load Data]")
    ts = time.time()

    # get dataset
    dataset, batch_size, max_seq_len, segmentation = extract_config(
        config, "dataset", "batch_size", "max_seq_len", "segmentation")
    tt_dataset = getattr(datasets, dataset)
    print(f"Fetched Data ({time.time() - ts:3f}s)")

    # split dataset
    # train_dataset, val_dataset, test_dataset = tt_dataset.splits(
    #     text_field=Field())
    train_dataset, val_dataset, test_dataset = split_dataset(config)
    print(f"Tokenized and Split Data ({time.time() - ts:3f}s)")

    # tokenize
    if segmentation == Segmentation.Subword.name:
        tokenizer = create_subword_tokenizer(config)
    elif segmentation == Segmentation.BBPE.name:
        tokenizer = create_bbpe_tokenizer(config)

    # get vocabulary
    vocab = tokenizer.get_vocab()

    # prep data
    def prep_data(dataset_arr):
        data = [torch.tensor(tokenizer.encode(item).ids,
                             dtype=torch.long) for item in dataset_arr]
        data = torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))
        return data

    # setup dataloaders
    train_dataloader = TextDataloader(prep_data(train_dataset), max_seq_len, batch_size)
    val_dataloader = TextDataloader(prep_data(val_dataset), max_seq_len, batch_size)
    test_dataloader = TextDataloader(prep_data(test_dataset), max_seq_len, batch_size)

    print(f"[End Load Data] ({time.time() - ts:3f}s)")
    return train_dataloader, val_dataloader, test_dataloader, vocab, tokenizer

train_dataloader, val_dataloader, test_dataloader, vocab, tokenizer = load_data_subword(config)


[Start Load Data]
Fetched Data (0.000007s)
Tokenized and Split Data (0.018512s)
[End Load Data] (5.125093s)


In [39]:
def print_examples(dataloader, tokenizer, print_count = 1):
    count = 0
    for batch in dataloader:
        data, targets = batch
        d1 = data[0].tolist()
        t1 = targets[0:len(data[0])].tolist()
        print(d1)
        print(t1)
        print(tokenizer.decode(d1))
        print(tokenizer.decode(t1))
        count += 1
        if count > print_count - 1:
            break

def test_tokenizer(tokenizer, test_string = "this is a test to see how the encoder is working! Are the results pleasing?"):
    output = tokenizer.encode(test_string)
    print(output.tokens)
    print(output.ids)
    print(tokenizer.decode(output.ids))

test_tokenizer(tokenizer)
print_examples(train_dataloader, tokenizer)



['this', 'is', 'a', 'test', 'to', 'see', 'how', 'the', 'encoder', 'is', 'working', '!', 'Are', 'the', 'results', 'pleasing', '?']
[52, 23, 14, 946, 13, 367, 273, 7, 5, 23, 703, 5, 5, 7, 362, 5, 5]
this is a test to see how the is working the results
[5966, 3837, 673, 271, 12211, 269, 268, 30, 284, 640, 1384, 302, 1361, 911, 287, 5317, 1491, 4945, 2709, 386, 1900, 258, 14556, 1309, 287, 466, 1331, 1462, 7119, 271, 258, 961]
[3837, 673, 271, 12211, 269, 268, 30, 284, 640, 1384, 302, 1361, 911, 287, 5317, 1491, 4945, 2709, 386, 1900, 258, 14556, 1309, 287, 466, 1331, 1462, 7119, 271, 258, 961, 287]
chaos discounts account economic common very as re face modest concern film district small hearst everything reduces massachusetts power accused past positions small demand radio factors diamonds economic past eight
discounts account economic common very as re face modest concern film district small hearst everything reduces massachusetts power accused past positions small demand radio factors 

In [74]:
# from transformers import GPT2TokenizerFast
import transformers
print(transformers.__version__)
# tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# tokenizer(test_string)['input_ids']

VersionConflict: tokenizers==0.9.4 is required for a normal functioning of this module, but found tokenizers==0.10.1.
Try: pip install transformers -U or pip install -e '.[dev]' if you're working with git master