In [9]:
from utils import *
from constants import *
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

# DATA_PATH = './.data/'
# ptb_filenames = list(map(lambda x: "wsj_"+ format(x, '04'), range(1, 200)))
# TRAINING_DATA = {
#     'PennTreebank': {
#         "location": "penn-treebank-raw/",
#         "filenames": ptb_filenames
#     },
# }

def create_subword_tokenizer(config):
    dataset, vocab_size = extract_config(
        config, "dataset", "vocab_size")

    # get location
    output_location = 'tokenizer/'
    tokenizer_loc = 'bpe_tokenizer_' + str(dataset) + '_' + str(vocab_size) + ".tokenizer.json"
    path_to_tokenizer_loc = DATA_PATH + output_location
    tokenizer_filepath = path_to_tokenizer_loc + tokenizer_loc

    # load tokenizer
    if os.path.isfile(tokenizer_filepath):
        tokenizer = Tokenizer.from_file(tokenizer_filepath)
        return tokenizer

    # build tokenizer
    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = Whitespace()

    location = TRAINING_DATA[dataset]['location']
    paths = list(map(lambda x: str(DATA_PATH+location+x),
                     TRAINING_DATA[dataset]['filenames']))
    # print(paths)
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=[])
    # print(len(paths))
    tokenizer.train(files=paths, trainer=trainer)

    # save tokenizer
    try:
        if not os.path.isdir(path_to_tokenizer_loc):
            os.makedirs(path_to_tokenizer_loc)
        tokenizer.save(str(tokenizer_filepath))
    except Exception as e:
        print("Error saving tokenizer", e)

    return tokenizer

config = {
    "embedding_dimension": 200,
    "ff_dimension": 200,
    "n_attention_heads": 2,
    "n_encoder_layers": 0,
    "n_decoder_layers": 2,
    "dataset": Dataset.PennTreebank.name,
    "segmentation": Segmentation.Subword.name,
    "vocab_size": 40000,
    "max_seq_len": 32,
    "batch_size": 20,
    "eval_batch_size": 10,
    "dropout": 0.2,
    "n_epochs": 3,
    "learning_rate": 0.0001,
    "adam_b1": 0.9,
    "adam_b2": 0.999,
    "adam_l2_weightdecay": 0.01,
    "loss_criterion": "CrossEntropyLoss"
}

tokenizer = create_subword_tokenizer(config)

In [4]:
tokenizer.decode(tokenizer.encode("Hi! Hello! this is a test. This tastes like jello").ids)


'H i ! H ello ! this is a test . This t ast es like j ello'

In [5]:
from nltk.corpus import treebank
import torch
import nltk
nltk.download('treebank')
treebank_text = treebank.sents()
print(treebank)
print(treebank_text[0])
print(' '.join(treebank_text[0]))
total_count = len(treebank_text)
train_count = int(0.7 * total_count) 
valid_count = int(0.2 * total_count)
test_count = total_count - train_count - valid_count
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(treebank_text, (train_count, valid_count, test_count))

# tokenizer.encode(text[0])


[nltk_data] Downloading package treebank to /home/gbafa/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
<BracketParseCorpusReader in '/home/gbafa/nltk_data/corpora/treebank/combined'>
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .


In [4]:
import numpy as np
dataset, vocab_size = extract_config(
        config, "dataset", "vocab_size")
location = TRAINING_DATA[dataset]['location']
paths = list(map(lambda x: str(DATA_PATH+location+x),
                     TRAINING_DATA[dataset]['filenames']))
lines = open(paths[0], newline='\n')
raw_data = map(lambda x: list(open(x, newline='\n')), paths)
text_data = []
for item in text_data:
    item = filter(lambda x: x != '\n', item)
    text_data.extend(item)
print(len(text_data))
# return data

0


In [30]:
import torch
import nltk
from constants import *
from torchtext import datasets
from torchtext.data import Field

# DATA_PATH = './.data/'
# ptb_filenames = list(map(lambda x: "wsj_"+ format(x, '04'), range(1, 200)))
# TRAINING_DATA = {
#     'PennTreebank': {
#         "location": "penn-treebank-raw/",
#         "filenames": ptb_filenames
#     },
# }


def split_dataset(config):
    dataset, batch_size, max_seq_len, segmentation = extract_config(
        config, "dataset", "batch_size", "max_seq_len", "segmentation")
    tt_dataset = getattr(datasets, dataset)
    # print(f"Fetched Data ({time.time() - ts:3f}s)")

    # # process non-ptb datasets
    # if dataset != Dataset.PennTreebank.name:
    #     print(dataset)
    #     return tt_dataset.splits(text_field=Field())
    
    location = TRAINING_DATA[dataset]['location']
    paths = list(map(lambda x: str(DATA_PATH+location+x),
                        TRAINING_DATA[dataset]['filenames']))

    # train data
    train_data = []
    valid_data = []
    test_data = []
    for path in paths:
        raw_data = list(open(path, newline='\n'))
        raw_data = list(filter(lambda x: x != '\n', raw_data))
        if re.search("train", path):
           train_data = raw_data
        if re.search("valid", path):
           valid_data = raw_data
        if re.search("test", path):
           test_data = raw_data
    # print(test_data)
    # print(train_data)
    return train_data, valid_data, test_data
    # print(train_data)


    # train_count = int(0.7 * total_count) 
    # valid_count = int(0.2 * total_count)
    # test_count = total_count - train_count - valid_count
    # return (text_data[0:train_count], text_data[train_count: train_count + valid_count], text_data[train_count + valid_count:len(text_data)])
    # return torch.utils.data.random_split(text_data, (train_count, valid_count, test_count))

train_dataset, valid_dataset, test_dataset = split_dataset(config)

print(train_dataset[0:10])
print(valid_dataset[0:10])
print(test_dataset[0:10])


[' aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter \n', ' pierre <unk> N years old will join the board as a nonexecutive director nov. N \n', ' mr. <unk> is chairman of <unk> n.v. the dutch publishing group \n', ' rudolph <unk> N years old and former chairman of consolidated gold fields plc was named a nonexecutive director of this british industrial conglomerate \n', ' a form of asbestos once used to make kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than N years ago researchers reported \n', ' the asbestos fiber <unk> is unusually <unk> once it enters the <unk> with even brief exposures to it causing symptoms that show up decades later researchers said \n', ' <unk> inc. the unit of new york-based <unk> corp. that makes kent cigarettes stopped using <unk> in its <unk> cigarette filters in N \n'

In [9]:
print(train_dataset[0])

 aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter 



In [7]:
import re
result = re.match("train", "ptb.train.txt")
print(result)

None
