In [1]:
from transformers import GPT2TokenizerFast

pretrained_weights = 'gpt2'
tokenizer_en = GPT2TokenizerFast.from_pretrained(pretrained_weights)
tokenizer_en.pad_token = tokenizer_en.eos_token

ByteLevelBPE_tokenizer_pt_vocab_size = tokenizer_en.vocab_size
ByteLevelBPE_tokenizer_pt_vocab_size


50257

In [2]:
from tokenizers import ByteLevelBPETokenizer
ByteLevelBPE_tokenizer_pt = ByteLevelBPETokenizer()

data_path = './.data/'
wikitext2 = 'wikitext-2/wikitext-2/'
output_location = 'tokenizer/'
vocab_size=40000
paths = list(map(lambda x: str(data_path+wikitext2+x), ['wiki.train.tokens', 'wiki.valid.tokens', 'wiki.test.tokens']))
print(paths)

ByteLevelBPE_tokenizer_pt.train(files=paths, 
                                vocab_size=vocab_size, 
                                min_frequency=2, 
                                special_tokens=["<|endoftext|>"])

ByteLevelBPE_tokenizer_pt.enable_truncation(max_length=1024)


['./.data/wikitext-2/wikitext-2/wiki.train.tokens', './.data/wikitext-2/wikitext-2/wiki.valid.tokens', './.data/wikitext-2/wikitext-2/wiki.test.tokens']


In [3]:
import os
# isdir = os.path.isdir(path)  

bpe_tokenizer_loc = 'BBPE_tokenizer_' + str(vocab_size)
path_to_bpe_tokenizer_loc = data_path+output_location+bpe_tokenizer_loc+ '/'
print(path_to_bpe_tokenizer_loc)
if not os.path.isdir(path_to_bpe_tokenizer_loc):
    os.makedirs(path_to_bpe_tokenizer_loc)
ByteLevelBPE_tokenizer_pt.save_model(str(path_to_bpe_tokenizer_loc))

./.data/tokenizer/BBPE_tokenizer_40000/


['./.data/tokenizer/BBPE_tokenizer_40000/vocab.json',
 './.data/tokenizer/BBPE_tokenizer_40000/merges.txt']

In [35]:
ByteLevelBPE_tokenizer_pt.encode("This is a test of the tokenizer").ids
# ByteLevelBPE_tokenizer_pt.get_vocab()
ByteLevelBPE_tokenizer_pt_vocab = ByteLevelBPE_tokenizer_pt.get_vocab() 

ByteLevelBPE_tokenizer_pt_vocab_ls = [k for k, v in sorted(ByteLevelBPE_tokenizer_pt_vocab.items(), key=lambda item: item[1])]
len(ByteLevelBPE_tokenizer_pt_vocab_ls)


40000

In [24]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

from tokenizers import CharBPETokenizer
tokenizer = CharBPETokenizer()


from constants import *
from utils import *

config = {
    "embedding_dimension": 200,
    "ff_dimension": 200,
    "n_attention_heads": 2,
    "n_encoder_layers": 0,
    "n_decoder_layers": 2,
    "dataset": Dataset.PennTreebank.name,
    "segmentation": Segmentation.Subword.name,
    "vocab_size": 40000,
    "max_seq_len": 35,
    "batch_size": 20,
    "eval_batch_size": 10,
    "dropout": 0.2,
    "n_epochs": 3,
    "learning_rate": 0.0001,
    "adam_b1": 0.9,
    "adam_b2": 0.999,
    "adam_l2_weightdecay": 0.01,
    "loss_criterion": "CrossEntropyLoss"
}


def create_subword_tokenizer(config):
    dataset, vocab_size = extract_config(
        config, "dataset", "vocab_size")
    
    # get location
    output_location = 'tokenizer/'
    tokenizer_loc = 'bpe_tokenizer_' + str(dataset) + '_'+ str(vocab_size) + ".tokenizer.json"
    path_to_tokenizer_loc = DATA_PATH+output_location
    tokenizer_filepath = path_to_tokenizer_loc+tokenizer_loc


    # load tokenizer
    if os.path.isfile(tokenizer_filepath):
        tokenizer = Tokenizer.from_file(tokenizer_filepath)
        return tokenizer


    # build tokenizer
    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = Whitespace()

    location = TRAINING_DATA[dataset]['location']
    paths = list(map(lambda x: str(DATA_PATH+location+x),
                     TRAINING_DATA[dataset]['filenames']))
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "<unk>"])

    tokenizer.train(files=paths, trainer=trainer)

    # save tokenizer
    try:
        if not os.path.isdir(path_to_tokenizer_loc):
            os.makedirs(path_to_tokenizer_loc)
        tokenizer.save(str(path_to_tokenizer_loc+tokenizer_loc))
    except Exception as e:
            print("Error saving tokenizer", e)

    return tokenizer

tokenizer = create_subword_tokenizer(config)

output = tokenizer.encode("Hi hi[[s, y'all! How are you 😁 ?")
print(output.tokens)
len(tokenizer.get_vocab())
print(output.ids)
tokenizer.decode(output.ids)

['i', 'el', 'lo', 'y', "'", 'all', 'ow', 'are', 'you']
[36, 366, 103, 52, 9, 173, 5140, 99, 527]


"i el lo y ' all ow are you"

In [None]:
def create_bbpe_tokenizer(config):
    # prep data
    dataset, vocab_size, max_seq_len = extract_config(
        config, "dataset", "vocab_size", "max_seq_len")
    data_path = './.data/'
    location = file_data[dataset]['location']
    paths = list(map(lambda x: str(data_path+location+x), file_data[dataset]['filenames']))
    print(paths)

    # train tokenixer
    ByteLevelBPE_tokenizer_pt.train(files=paths, 
                                    vocab_size=vocab_size, 
                                    min_frequency=2, 
                                    special_tokens=["<|endoftext|>"])
    ByteLevelBPE_tokenizer_pt.enable_truncation(max_length=1024)

    # save tokenizer
    try:
        output_location = 'tokenizer/'
        bpe_tokenizer_loc = 'BBPE_tokenizer_' + str(vocab_size)
        path_to_bpe_tokenizer_loc = data_path+output_location+bpe_tokenizer_loc+ '/'
        if not os.path.isdir(path_to_bpe_tokenizer_loc):
            os.makedirs(path_to_bpe_tokenizer_loc)
        ByteLevelBPE_tokenizer_pt.save_model(str(path_to_bpe_tokenizer_loc))
        except Exception as e:
            print("Error saving tokenizer", e)
    return ByteLevelBPE_tokenizer_pt

config = {
        "embedding_dimension": 200,
        "ff_dimension": 200,
        "n_attention_heads": 2,
        "n_encoder_layers": 0,
        "n_decoder_layers": 2,
        "dataset": Dataset.PennTreebank.name,
        "segmentation": Segmentation.Word.name,
        "vocab_size": 40000,
        "max_seq_len": 35,
        "batch_size": 20,
        "eval_batch_size": 10,
        "dropout": 0.2,
        "n_epochs": 3,
        "learning_rate": 0.0001,
        "adam_b1": 0.9,
        "adam_b2": 0.999,
        "adam_l2_weightdecay": 0.01,
        "loss_criterion": "CrossEntropyLoss"
    }
    

In [52]:
from constants import *
from utils import *
file_data = {
   'PennTreebank': {
       "location": "penn-treebank/",
       "filenames": ['ptb.train.tokens', 'ptb.valid.tokens', 'ptb.test.tokens']
   },
   'WikiText2': {
       "location": "wikitext-2/wikitext-2/",
       "filenames": ['wiki.train.tokens', 'wiki.valid.tokens', 'wiki.test.tokens']
   } ,
   'WikiText103': {
       "location": "wikitext-103/wikitext-103/",
       "filenames": ['wiki.train.tokens', 'wiki.valid.tokens', 'wiki.test.tokens']
   }  
}
config = {
        "embedding_dimension": 200,
        "ff_dimension": 200,
        "n_attention_heads": 2,
        "n_encoder_layers": 0,
        "n_decoder_layers": 2,
        "dataset": Dataset.WikiText2.name,
        "segmentation": Segmentation.Word.name,
        "vocab_size": 40000,
        "max_seq_len": 35,
        "batch_size": 20,
        "eval_batch_size": 10,
        "dropout": 0.2,
        "n_epochs": 3,
        "learning_rate": 0.0001,
        "adam_b1": 0.9,
        "adam_b2": 0.999,
        "adam_l2_weightdecay": 0.01,
        "loss_criterion": "CrossEntropyLoss"
    }
    

In [57]:
dataset, vocab_size, max_seq_len = extract_config(
        config, "dataset", "vocab_size", "max_seq_len")


In [73]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

# tokenizer = Tokenizer(BPE())


# tokenizer.pre_tokenizer = Whitespace()


# location = file_data[dataset]['location']
# paths = list(map(lambda x: str(data_path+location+x), file_data[dataset]['filenames']))
# print(paths)
# trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# tokenizer.train(files=paths, trainer=trainer)
# output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
# print(output.tokens)

def create_subword_tokenizer(config):
    dataset, vocab_size, max_seq_len = extract_config(
        config, "dataset", "vocab_size", "max_seq_len")

    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = Whitespace()

    location = file_data[dataset]['location']
    paths = list(map(lambda x: str(data_path+location+x), file_data[dataset]['filenames']))
    trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "<unk>"])

    tokenizer.train(files=paths, trainer=trainer)

    return tokenizer


tokenizer = create_subword_tokenizer(config)
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.tokens)
len(tokenizer.get_vocab())
print(output.ids)
tokenizer.decode(output.ids)


['Hel', 'lo', ',', 'y', "'", 'all', '!', 'How', 'are', 'you', '?']
[2966, 410, 17, 93, 12, 374, 6, 3193, 378, 1541, 36]


"Hel lo , y ' all ! How are you ?"

In [54]:
config = {
        "embedding_dimension": 200,
        "ff_dimension": 200,
        "n_attention_heads": 2,
        "n_encoder_layers": 0,
        "n_decoder_layers": 2,
        "dataset": Dataset.WikiText2.name,
        "segmentation": Segmentation.Word.name,
        "vocab_size": 40000,
        "max_seq_len": 35,
        "batch_size": 20,
        "eval_batch_size": 10,
        "dropout": 0.2,
        "n_epochs": 3,
        "learning_rate": 0.0001,
        "adam_b1": 0.9,
        "adam_b2": 0.999,
        "adam_l2_weightdecay": 0.01,
        "loss_criterion": "CrossEntropyLoss"
    }

In [1]:
import time
import os

import torch
# from torch.utils.data import DataLoader

from torchtext import datasets
from torchtext.data import Field
from torchtext.data.utils import get_tokenizer

# from transformers import AutoTokenizer

# from transformers import GPT2TokenizerFast
from tokenizers import ByteLevelBPETokenizer

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

from utils import extract_config
from constants import *
from utils import *

In [28]:
config = {
    "embedding_dimension": 200,
    "ff_dimension": 200,
    "n_attention_heads": 2,
    "n_encoder_layers": 0,
    "n_decoder_layers": 2,
    "dataset": Dataset.WikiText2.name,
    "segmentation": Segmentation.Subword.name,
    "vocab_size": 40000,
    "max_seq_len": 35,
    "batch_size": 20,
    "eval_batch_size": 10,
    "dropout": 0.2,
    "n_epochs": 3,
    "learning_rate": 0.0001,
    "adam_b1": 0.9,
    "adam_b2": 0.999,
    "adam_l2_weightdecay": 0.01,
    "loss_criterion": "CrossEntropyLoss"
}

def create_subword_tokenizer(config):
    dataset, vocab_size = extract_config(
        config, "dataset", "vocab_size")
    
    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = Whitespace()

    location = TRAINING_DATA[dataset]['location']
    paths = list(map(lambda x: str(DATA_PATH+location+x),
                     TRAINING_DATA[dataset]['filenames']))
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "<unk>"])

    tokenizer.train(files=paths, trainer=trainer)

    return tokenizer

class TextDataloader:
    def __init__(self, dataset, tokenizer, max_seq_len, batch_size):
        self.dataset = self.prep_data(dataset, tokenizer)
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.batch_size = batch_size
        self.dataset_len = len(self.dataset)

    def __iter__(self):
        self.index = 0
        return self

    def __next__(self):
        i = self.index

        seq_len = min(self.max_seq_len, self.dataset_len - 1 - i)
        chunk_len = seq_len * self.batch_size
        data = self.dataset[i:i+ chunk_len]
        target = self.dataset[i+1:i+1+chunk_len].reshape(-1)

        self.index += 1
        data = self.batchify(data)
        return data, target

    def prep_data(self, dataset, tokenizer):
        raw_text_iter = dataset[0].text
        data = [torch.tensor(tokenizer.encode(item).ids,
                                dtype=torch.long) for item in raw_text_iter]
        data = torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))
        return data

    def batchify(self, data):
        # Divide the dataset into batch_size parts.
        nbatch = data.size(0) // self.batch_size
        # Trim off any extra elements that wouldn't cleanly fit (remainders).
        data = data.narrow(0, 0, nbatch * self.batch_size)
        print(data[0:20], "batchify1")
        # Evenly divide the data across the batch_size batches.
        data = data.view(self.batch_size, -1).contiguous()
        print(data[0], "batchify2")
        return data
    
print("[Start Load Data]")
ts = time.time()

# get dataset
dataset, batch_size, max_seq_len, segmentation = extract_config(
    config, "dataset", "batch_size", "max_seq_len", "segmentation")
dataset = getattr(datasets, dataset)
print(f"Fetched Data ({time.time() - ts:3f}s)")

# split dataset
train_dataset, val_dataset, test_dataset = dataset.splits(
    text_field=Field())
print(f"Tokenized and Split Data ({time.time() - ts:3f}s)")

# tokenize
if segmentation == Segmentation.Subword.name:
    tokenizer = create_subword_tokenizer(config)
elif segmentation == Segmentation.BBPE.name:
    tokenizer = create_bbpe_tokenizer(config)

# get vocabulary
vocab = tokenizer.get_vocab()

# # data prep
# def data_prep(tt_dataset_split):
#     raw_text_iter = tt_dataset_split[0].text
#     data = [torch.tensor(tokenizer.encode(item).ids,
#                             dtype=torch.long) for item in raw_text_iter]
#     data = torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))
#     # TESTING, trim data
#     data = data[0:batch_size*64]
#     print(data.shape)
#     # Divide the dataset into bsz parts.
#     nbatch = data.size(0) // batch_size
#     # Trim off any extra elements that wouldn't cleanly fit (remainders).
#     data = data.narrow(0, 0, nbatch * batch_size)
#     # Evenly divide the data across the batch_size batches.
#     data = data.view(batch_size, -1).t().contiguous()
#     return data

# setup dataloaders
train_dataloader = TextDataloader(train_dataset, tokenizer, max_seq_len, batch_size)
# val_dataloader = TextDataloader(data_prep(val_dataset), max_seq_len)
# test_dataloader = TextDataloader(data_prep(test_dataset), max_seq_len)

print(f"[End Load Data] ({time.time() - ts:3f}s)")
# return train_dataloader, val_dataloader, test_dataloader, vocab, tokenizer
for batch in train_dataloader:
        data, targets = batch
        print("data", data.shape)
        print("targets", targets.shape)
        break

print(data[0])
print(targets[0:20])
print(tokenizer.decode(data[0].tolist()))
print(tokenizer.decode(targets[0:20].tolist()))

[Start Load Data]
Fetched Data (0.000123s)
Tokenized and Split Data (0.343220s)
[End Load Data] (40.578641s)
tensor([   33,    73,   687,    35,    34,  8787,  9549,  2448,    34,    33,
           73,   687,    35,    33,    73,   687,    35, 34658,   529,  8787]) data
tensor([   73,   687,    35,    34,  8787,  9549,  2448,    34,    33,    73,
          687,    35,    33,    73,   687,    35, 34658,   529,  8787,    24]) targetes
< e os > = Valkyria Chronicles III = < e os > < e os > Senjō no Valkyria
e os > = Valkyria Chronicles III = < e os > < e os > Senjō no Valkyria 3
tensor([   33,    73,   687,    35,    34,  8787,  9549,  2448,    34,    33,
           73,   687,    35,    33,    73,   687,    35, 34658,   529,  8787]) batchify1
tensor([   33,    73,   687,    35,    34,  8787,  9549,  2448,    34,    33,
           73,   687,    35,    33,    73,   687,    35, 34658,   529,  8787,
           24,    31,     5,  9549,    13,  1731,    31,   277,   275,   261,
          272,  