In [87]:
from data_utils import *
from torchtext import datasets, vocab


In [45]:
batch_size = 8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset_name = DatasetType.IWSLT.name
language_direction = LanguageDirection.G2E.name
train_token_ids_loader, val_token_ids_loader, src_field_processor, trg_field_processor = get_data_loaders(DATA_DIR_PATH, language_direction, dataset_name, batch_size, device)

# Verify that the mask logic is correct
pad_token_id = src_field_processor.vocab.stoi[PAD_TOKEN]
for batch in train_token_ids_loader:
    # Visually inspect that masks make sense
    src_padding_mask, trg_mask, num_src_tokens, num_trg_tokens = get_masks_and_count_tokens(batch.src, batch.trg, pad_token_id, device)
    break

# Check vocab size
print(f'Source vocabulary size={len(src_field_processor.vocab)}')
print(f'Target vocabulary size={len(trg_field_processor.vocab)}')

# Show text from token loader
sample_text_from_loader(src_field_processor, trg_field_processor, train_token_ids_loader)

train dataset (IWSLT) has 3634135 tokens in the source language (German) corpus.
train dataset (IWSLT) has 3937527 tokens in the target language (English) corpus.
val dataset (IWSLT) has 19540 tokens in the source language (German) corpus.
val dataset (IWSLT) has 20911 tokens in the target language (English) corpus.
Time it took to prepare the data: 3.726960 seconds.
Source vocabulary size=58945
Target vocabulary size=36322
*****
Source text:	

AttributeError: 'Batch' object has no attribute 'text'

In [7]:
def sample_text_from_loader(src_field_processor, trg_field_processor, token_ids_loader, num_samples=2, sample_src=True, sample_trg=True, show_padded=False):
    assert sample_src or sample_trg, f'Either src or trg or both must be enabled.'

    for b_idx, token_ids_batch in enumerate(token_ids_loader):
        if b_idx == num_samples:  # Number of sentence samples to print
            break

        print('*' * 5)
        if sample_src:
            print("Source text:", end="\t")
            for token_id in token_ids_batch.text[0]:  # print only the first example from the batch
                src_token = src_field_processor.vocab.itos[token_id]

                if src_token == PAD_TOKEN and not show_padded:
                    continue

                print(src_token, end=" ")
            print()

        if sample_trg:
            print("Target text:", end="\t")
            for token_id in token_ids_batch.target[0]:
                trg_token = trg_field_processor.vocab.itos[token_id]

                if trg_token == PAD_TOKEN and not show_padded:
                    continue

                print(trg_token, end=" ")
            print()

In [82]:
num_samples = 2
sample_src=True
sample_trg=True
show_padded=False

for b_idx, token_ids_batch in enumerate(train_token_ids_loader):
    if b_idx == num_samples:  # Number of sentence samples to print
        break

    print('*' * 5)
    if sample_src:
        print("Source text:", end="\t")
        for token_id in token_ids_batch.text[0].tolist():  # print only the first example from the batch
            src_token = field_processor.vocab.itos[token_id]

            if src_token == PAD_TOKEN and not show_padded:
                continue

            print(src_token, end=" ")
        print()

    if sample_trg:
        print("Target text:", end="\t")
        for token_id in token_ids_batch.target[0].tolist():
            trg_token = field_processor.vocab.itos[token_id]

            if trg_token == PAD_TOKEN and not show_padded:
                continue

            print(trg_token, end=" ")
        print()

*****
Source text:	

IndexError: list index out of range

In [86]:
for token_id in token_ids_batch.text[0].tolist():  # print only the first example from the batch
    print(token_id)
    # src_token = field_processor.vocab.itos[token_id]
    # print(token_id, src_token)

9971
38
2438
11
233
540
44
168
1453
115
30
478
0
128
9
9
313
1596
1815
2
67
42
89
16
2
4
7133
331
573
8516
2392
32


In [117]:
def get_data_loaders_causal(dataset_path, dataset_name=DatasetType.PennTreebank.name, batch_size=32, device=None):
    ts = time.time()
    # prep dataset
    dataset = getattr(datasets, dataset_name)  # should not be translation datsets
    spacy_en = spacy.load('en')

    # prep field processor (vocab)
    def tokenizer(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    field_processor = Field(tokenize=tokenizer, init_token=BOS_TOKEN,
                            eos_token=EOS_TOKEN, pad_token=PAD_TOKEN, batch_first=True)
    
    train, validation, test = dataset.splits(text_field=field_processor, root=dataset_path)
    field_processor.build_vocab(train, validation, test, min_freq=1)

    
    # prep iterator
    train_token_ids_loader, val_token_ids_loader, test_token_ids_loader = dataset.iters(
        batch_size=batch_size, root=dataset_path, device=device)
    
    # get vocab
    # vocabulary = vocab.build_vocab_from_iterator(train_token_ids_loader)
    vocabulary = {}
    print(f'Time it took to prepare the iterator: {time.time() - ts:3f} seconds.')

    return train_token_ids_loader, val_token_ids_loader, field_processor, vocabulary

# test
batch_size = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset_name = DatasetType.PennTreebank.name
train_token_ids_loader, val_token_ids_loader, field_processor, vocabulary = get_data_loaders_causal(DATA_DIR_PATH, dataset_name, batch_size, device)





Time it took to prepare the iterator: 3.347126 seconds.


In [122]:
print(len(field_processor.vocab))
for batch in train_token_ids_loader:
    # Visually inspect that masks make sense
    print(batch)
    print(batch.text)
    print(batch.target)
    # src_padding_mask, trg_mask, num_src_tokens, num_trg_tokens = get_masks_and_count_tokens(batch.src, batch.trg, pad_token_id, device)
    break


9734

[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.cuda.LongTensor of size 35x32 (GPU 0)]
	[.target]:[torch.cuda.LongTensor of size 35x32 (GPU 0)]
tensor([[9971,   38, 2438,  ..., 8516, 2392,   32],
        [9972,   34,   55,  ...,  564, 2168,    0],
        [9973,  853, 2156,  ...,    3,    8,   44],
        ...,
        [   2,    2,  505,  ...,   15,   88,  184],
        [ 147, 1031,   14,  ...,   14,   33,  106],
        [  20,    6,    7,  ...,    6,  846,  402]], device='cuda:0')
tensor([[9972,   34,   55,  ...,  564, 2168,    0],
        [9973,  853, 2156,  ...,    3,    8,   44],
        [9975, 7536,    5,  ...,  225,  204,   50],
        ...,
        [ 147, 1031,   14,  ...,   14,   33,  106],
        [  20,    6,    7,  ...,    6,  846,  402],
        [   7,  183,  178,  ..., 1344,   35,    0]], device='cuda:0')


In [92]:
batch_size = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset_name = DatasetType.PennTreebank.name
train_token_ids_loader, val_token_ids_loader, field_processor, vocabulary = get_data_loaders_causal(DATA_DIR_PATH, dataset_name, batch_size, device)

# Verify that the mask logic is correct
pad_token_id = src_field_processor.vocab.stoi[PAD_TOKEN]
for batch in train_token_ids_loader:
    # Visually inspect that masks make sense
    print(batch)
    print(batch.text)
    print(batch.target)
    # src_padding_mask, trg_mask, num_src_tokens, num_trg_tokens = get_masks_and_count_tokens(batch.src, batch.trg, pad_token_id, device)
    break

# Check vocab size
print(f'Vocabulary size={len(vocabulary)}')
# print(f'Target vocabulary size={len(field_processor.vocab)}')

# Show text from token loader
sample_text_from_loader(field_processor, field_processor, train_token_ids_loader)



0lines [00:00, ?lines/s]


AttributeError: 'Batch' object has no attribute 'input_fields'

In [86]:
from torchtext import vocab

In [None]:
train_dataset, val_dataset, src_field_processor, trg_field_processor = get_datasets_and_vocabs(DATA_DIR_PATH, language_direction, dataset_name == DatasetType.IWSLT.name, use_caching_mechanism=False)
trained1 = train_dataset


In [None]:
print(len(trained1.examples))

print(trained1.examples[1].text)


In [None]:
len(trained2.examples[0].text)

In [None]:
train_dataset.examples

In [57]:
train_dataset, val_dataset, field_processor = get_datasets_and_vocab_causal(DATA_DIR_PATH)
trained2 = train_dataset

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
Time it took to prepare the data: 2.646358 seconds.


AttributeError: 'Batch' object has no attribute 'input_fields'

In [None]:
val_dataset

In [56]:
class DatasetType(enum.Enum):
    IWSLT = 0,
    WMT14 = 1,
    PennTreebank = 2,
    WikiText2 = 3,
    WikiText103 = 4

def get_datasets_and_vocab_causal(dataset_path, dataset_name= DatasetType.PennTreebank.name, use_caching_mechanism=False):
    # load data
    dataset_name= DatasetType.PennTreebank.name
    dataset = getattr(datasets, dataset_name) # should not be translation datsets
    spacy_en = spacy.load('en_core_web_sm')

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    field_processor = Field(tokenize=tokenize_en, init_token=BOS_TOKEN, eos_token=EOS_TOKEN, pad_token=PAD_TOKEN, batch_first=True)

    # fields = [('src', src_field_processor), ('trg', trg_field_processor)]
    # MAX_LEN = 100  # filter out examples that have more than MAX_LEN tokens
    # filter_pred = lambda x: len(x.src) <= MAX_LEN and len(x.trg) <= MAX_LEN

    # tokenize data
    # create datasets
    prefix = 'causal_' + dataset_name
    train_cache_path = os.path.join(dataset_path, f'{prefix}_train_cache.csv')
    val_cache_path = os.path.join(dataset_path, f'{prefix}_val_cache.csv')
    test_cache_path = os.path.join(dataset_path, f'{prefix}_test_cache.csv')

    # This simple caching mechanism gave me ~30x speedup on my machine! From ~70s -> ~2.5s!
    ts = time.time()
    if not use_caching_mechanism or not (os.path.exists(train_cache_path) and os.path.exists(val_cache_path)):
        train_dataset, val_dataset, test_dataset = dataset.splits(
            text_field=field_processor,
            root=dataset_path
        )
        train_dataset, val_dataset, test_dataset = dataset.iters()
        # save_cache(train_cache_path, train_dataset)
        # save_cache(val_cache_path, val_dataset)
        # save_cache(test_cache_path, test_dataset)
    else:
        # TODO: load from cache 
        print("did not load from cache!")
        return

    print(f'Time it took to prepare the data: {time.time() - ts:3f} seconds.')
    
    MIN_FREQ = 2
    field_processor.build_vocab(train_dataset, min_freq=MIN_FREQ)

    return train_dataset, val_dataset, field_processor


In [None]:
field_processor

In [55]:
train_dataset

NameError: name 'train_dataset' is not defined

In [130]:
# load data
dataset_path = DATA_DIR_PATH
dataset_name= DatasetType.PennTreebank.name
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

ts = time.time()
# prep dataset
dataset = getattr(datasets, dataset_name)  # should not be translation datsets
spacy_en = spacy.load('en')

# prep field processor (vocab)
def tokenizer(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

field_processor = Field(tokenize=tokenizer, init_token=BOS_TOKEN,
                    eos_token=EOS_TOKEN, pad_token=PAD_TOKEN, batch_first=True)

train, validation, test = dataset.splits(text_field=field_processor, root=dataset_path)
field_processor.build_vocab(train, validation, test, min_freq=1)


# prep iterator
train_token_ids_loader, val_token_ids_loader, test_token_ids_loader = dataset.iters(
batch_size=batch_size, root=dataset_path, device=device)

# get vocab
# vocabulary = vocab.build_vocab_from_iterator(train_token_ids_loader)
vocabulary = {}
print(f'Time it took to prepare the iterator: {time.time() - ts:3f} seconds.')


TypeError: splits() missing 1 required positional argument: 'text_field'

In [153]:
# train.examples[0].text
for batch in train_token_ids_loader:
    print(batch)
    print(batch)
    break
# vocabulary = vocab.build_vocab_from_iterator()
# 


[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.cuda.LongTensor of size 35x32 (GPU 0)]
	[.target]:[torch.cuda.LongTensor of size 35x32 (GPU 0)]


AttributeError: 'Batch' object has no attribute 'input_fields'

In [160]:
len(catch_unique(train.examples[0].text))

st
reacting
communism
damp
critic
arranging
advisers
appearances
u.s.-soviet
terrorism
tariffs
grow
kremlin
comparisons
gathering
readily
transform
loosen
grip
suggesting
marking
bnl
fiduciary
suitable
rome
outlined
dragging
activists
gandhi
violent
disobedience
passive
webster
indians
affected
truly
excuse
parks
bus
illustration
tendency
gestures
posture
apt
site
criminals
excitement
demonstration
speeding
dies
airing
politician
cameras
phil
indirectly
vietnam
hide
fabric
enemy
cambodia
morally
draft
dignity
extraordinarily
informed
probable
calm
rational
deukmejian
repairs
quake
bipartisan
discounting
audits
pigs
update
selection
specially
deductions
dependents
satisfied
overhaul
enactment
modified
backs
negligence
urges
assessment
oregon
advises
mile
ira
violates
delegation
rode
horses
conservation
sideline
horse
inspector
arbitrator
arbitration
burton
collect
acceptable
unwarranted
delicate
supervisors
ironically
achenbaum
beating
worldwide
hesitate
saatchi
hyundai
searches
invites

TypeError: object of type 'NoneType' has no len()

In [154]:
def catch_unique(list_in):
   # intilize an empty list
   unq_list = []

   # Check for elements
   for x in list_in:
      # check if exists in unq_list
      if x not in unq_list:
         unq_list.append(x)
         # print list
   return unq_list