In [1]:
from data_utils import *

In [2]:
batch_size = 8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset_name = DatasetType.IWSLT.name
language_direction = LanguageDirection.G2E.name
train_token_ids_loader, val_token_ids_loader, src_field_processor, trg_field_processor = get_data_loaders(DATA_DIR_PATH, language_direction, dataset_name, batch_size, device)

# Verify that the mask logic is correct
pad_token_id = src_field_processor.vocab.stoi[PAD_TOKEN]
for batch in train_token_ids_loader:
    # Visually inspect that masks make sense
    src_padding_mask, trg_mask, num_src_tokens, num_trg_tokens = get_masks_and_count_tokens(batch.src, batch.trg, pad_token_id, device)
    break

# Check vocab size
print(f'Source vocabulary size={len(src_field_processor.vocab)}')
print(f'Target vocabulary size={len(trg_field_processor.vocab)}')

# Show text from token loader
sample_text_from_loader(src_field_processor, trg_field_processor, train_token_ids_loader)

train dataset (IWSLT) has 3634135 tokens in the source language (German) corpus.
train dataset (IWSLT) has 3937527 tokens in the target language (English) corpus.
val dataset (IWSLT) has 19540 tokens in the source language (German) corpus.
val dataset (IWSLT) has 20911 tokens in the target language (English) corpus.
Time it took to prepare the data: 3.314288 seconds.
Source vocabulary size=58945
Target vocabulary size=36322
*****
Source text:	Weil Sie bei Google etwas <unk> in die Suchmaschine eingeben können und dabei eine Antwort erwarten , richtig ? 
Target text:	<s> Because you can type , you know , any kind of thing into Google , and you expect an answer back , right ? </s> 
*****
Source text:	Wir wissen , sie können Kotpillen auf einer geraden Linie rollen , indem sie Anhaltspunkte am Himmel benutzen . 
Target text:	<s> Well , we know that they can roll balls in a straight line using celestial cues . </s> 


In [13]:
def get_data_loaders_causal(dataset_path, dataset_name=DatasetType.PennTreebank.name, batch_size=32, device="cpu"):
    dataset = getattr(datasets, dataset_name)  # should not be translation datsets
    spacy_en = spacy.load('en_core_web_sm')

    def tokenizer(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    field_processor = Field(tokenize=tokenizer, init_token=BOS_TOKEN,
                            eos_token=EOS_TOKEN, pad_token=PAD_TOKEN, batch_first=True)
    ts = time.time()

    train_token_ids_loader, val_token_ids_loader = dataset.iters(
        batch_size=batch_size, root=dataset_path, device=device)
    
    print(f'Time it took to prepare the iterator: {time.time() - ts:3f} seconds.')

    return (train_token_ids_loader, val_token_ids_loader, field_processor)

IndentationError: unexpected indent (<ipython-input-13-1ada671ff99b>, line 14)

In [12]:
batch_size = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset_name = DatasetType.PennTreebank.name
train_token_ids_loader, val_token_ids_loader, field_processor = get_data_loaders_causal(DATA_DIR_PATH, dataset_name, batch_size, device)

# Verify that the mask logic is correct
pad_token_id = src_field_processor.vocab.stoi[PAD_TOKEN]
for batch in train_token_ids_loader:
    # Visually inspect that masks make sense
    src_padding_mask, trg_mask, num_src_tokens, num_trg_tokens = get_masks_and_count_tokens(batch.src, batch.trg, pad_token_id, device)
    break

# Check vocab size
print(f'Source vocabulary size={len(field_processor.vocab)}')
print(f'Target vocabulary size={len(field_processor.vocab)}')

# Show text from token loader
sample_text_from_loader(src_field_processor, trg_field_processor, train_token_ids_loader)



ValueError: too many values to unpack (expected 2)

In [None]:
train_dataset, val_dataset, src_field_processor, trg_field_processor = get_datasets_and_vocabs(DATA_DIR_PATH, language_direction, dataset_name == DatasetType.IWSLT.name, use_caching_mechanism=False)
trained1 = train_dataset


In [None]:
print(len(trained1.examples))

print(trained1.examples[1].text)


In [None]:
len(trained2.examples[0].text)

In [None]:
train_dataset.examples

In [None]:
train_dataset, val_dataset, field_processor = get_datasets_and_vocab_causal(DATA_DIR_PATH)
trained2 = train_dataset

In [None]:
val_dataset

In [None]:
class DatasetType(enum.Enum):
    IWSLT = 0,
    WMT14 = 1,
    PennTreebank = 2,
    WikiText2 = 3,
    WikiText103 = 4

def get_datasets_and_vocab_causal(dataset_path, dataset_name= DatasetType.PennTreebank.name, use_caching_mechanism=False):
    # load data
    dataset_name= DatasetType.PennTreebank.name
    dataset = getattr(datasets, dataset_name) # should not be translation datsets
    spacy_en = spacy.load('en_core_web_sm')

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    field_processor = Field(tokenize=tokenize_en, init_token=BOS_TOKEN, eos_token=EOS_TOKEN, pad_token=PAD_TOKEN, batch_first=True)

    # fields = [('src', src_field_processor), ('trg', trg_field_processor)]
    # MAX_LEN = 100  # filter out examples that have more than MAX_LEN tokens
    # filter_pred = lambda x: len(x.src) <= MAX_LEN and len(x.trg) <= MAX_LEN

    # tokenize data
    # create datasets
    prefix = 'causal_' + dataset_name
    train_cache_path = os.path.join(dataset_path, f'{prefix}_train_cache.csv')
    val_cache_path = os.path.join(dataset_path, f'{prefix}_val_cache.csv')
    test_cache_path = os.path.join(dataset_path, f'{prefix}_test_cache.csv')

    # This simple caching mechanism gave me ~30x speedup on my machine! From ~70s -> ~2.5s!
    ts = time.time()
    if not use_caching_mechanism or not (os.path.exists(train_cache_path) and os.path.exists(val_cache_path)):
        train_dataset, val_dataset, test_dataset = dataset.splits(
            text_field=field_processor,
            root=dataset_path
        )
        train_dataset, val_dataset, test_dataset = dataset.iters()
        # save_cache(train_cache_path, train_dataset)
        # save_cache(val_cache_path, val_dataset)
        # save_cache(test_cache_path, test_dataset)
    else:
        # TODO: load from cache 
        print("did not load from cache!")
        return

    print(f'Time it took to prepare the data: {time.time() - ts:3f} seconds.')
    
    MIN_FREQ = 2
    field_processor.build_vocab(train_dataset, min_freq=MIN_FREQ)

    return train_dataset, val_dataset, field_processor


In [None]:
field_processor

In [None]:
# load data
dataset_path = DATA_DIR_PATH
dataset_name= DatasetType.PennTreebank.name
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

dataset = getattr(datasets, dataset_name) # should not be translation datsets
spacy_en = spacy.load('en_core_web_sm')

def tokenizer(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

field_processor = Field(tokenize=tokenize_en, init_token=BOS_TOKEN, eos_token=EOS_TOKEN, pad_token=PAD_TOKEN, batch_first=True)

# fields = [('src', src_field_processor), ('trg', trg_field_processor)]
# MAX_LEN = 100  # filter out examples that have more than MAX_LEN tokens
# filter_pred = lambda x: len(x.src) <= MAX_LEN and len(x.trg) <= MAX_LEN

# tokenize data
# create datasets
# prefix = 'causal_' + dataset_name
# train_cache_path = os.path.join(dataset_path, f'{prefix}_train_cache.csv')
# val_cache_path = os.path.join(dataset_path, f'{prefix}_val_cache.csv')
# test_cache_path = os.path.join(dataset_path, f'{prefix}_test_cache.csv')

# This simple caching mechanism gave me ~30x speedup on my machine! From ~70s -> ~2.5s!
ts = time.time()

train_dataset, val_dataset, test_dataset = dataset.iters(root=dataset_path, device=device)
    # save_cache(train_cache_path, train_dataset)
    # save_cache(val_cache_path, val_dataset)
    # save_cache(test_cache_path, test_dataset)
print(f'Time it took to prepare the data: {time.time() - ts:3f} seconds.')


In [None]:
test_dataset