In [1]:

from fastai.callback.core import *

from IPython.core.debugger import set_trace as bk
import os
from pathlib import Path
from functools import partial
import torch
import datasets
from transformers import ElectraTokenizerFast
# Fast ELECTRA tokenizer loaded from the "google/electra-small-generator" checkpoint;
# it is the tokenizer handed to the tokenization and dataset calls in the cells below.
hf_tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-small-generator")
from hugdatafast import *

# 1. Basics

## Simple tokenization & infer cache name
`cols` (`Dict[str, str]`): tokenize each column named by a key into a new column named by the corresponding value  
`cols` (`List[str]`): names of the columns to tokenize; each original column's data is replaced with its tokenized version

Here, we tokenize "sentence" into a new column named "text_idxs"; the original "sentence" column still exists.

In [3]:

# Load all splits of the GLUE CoLA task.
cola = datasets.load_dataset('glue', 'cola')

# Tokenize "sentence" into a new "text_idxs" column; the per-split cache file
# name is produced from the '{split}' template, and the work is split over 2 processes.
tokenize_sentence = SimpleTokenize({'sentence':'text_idxs'}, hf_tokenizer)
tokenized_cola = cola.my_map(
  tokenize_sentence,
  cache_file_names='tokenized_{split}',
  num_proc=2,
)

# Show one tokenized sample, then the first cache file backing each split.
print(tokenized_cola['train'][0])
print()
for split_dset in tokenized_cola.values():
  print(split_dset.cache_files[0]['filename'])

Reusing dataset glue (/home/yisiang/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)


HBox(children=(FloatProgress(value=0.0, description='#1', max=4275.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='#0', max=4276.0, style=ProgressStyle(description_width='i…





HBox(children=(FloatProgress(value=0.0, description='#0', max=522.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='#1', max=521.0, style=ProgressStyle(description_width='in…





HBox(children=(FloatProgress(value=0.0, description='#0', max=532.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='#1', max=531.0, style=ProgressStyle(description_width='in…



{'idx': 0, 'label': 1, 'sentence': "Our friends won't buy this analysis, let alone the next one we propose.", 'text_idxs': [2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012]}

/home/yisiang/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/tokenized_train_00000_of_00002.arrow
/home/yisiang/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/tokenized_validation_00000_of_00002.arrow
/home/yisiang/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/tokenized_test_00000_of_00002.arrow


## Create fastai `Dataloaders` and `show_batch`

`cols`: **specifies, in order, the columns whose values form an output sample**, and the semantic type used to encode/decode each column, using one of the supported signatures (see doc).

Here, `['text_idxs', 'label']` is equivalent to `{'text_idxs': TensorText, 'label': TensorCategory}`.

The progress bars show the samples being sorted by length; see `MySortedDL`.

In [3]:
# `cols` given as a plain list of column names.
# neat_show=True makes show_batch print the cleaned-up text rather than the
# raw tokenized/padded form (compare with the neat_show=False cell that follows).
cola_dsets = HF_Datasets(
  tokenized_cola,
  cols=['text_idxs', 'label'],
  hf_toker=hf_tokenizer,
  neat_show=True,
)
cola_dls = cola_dsets.dataloaders(bs=32)
cola_dls.show_batch(max_n=2)  # display no more than two rows

Set __getitem__(key) output type to torch for ['text_idxs', 'label'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Set __getitem__(key) output type to torch for ['text_idxs', 'label'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Set __getitem__(key) output type to torch for ['text_idxs'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
 98%|█████████▊| 1037/1063 [00:02<00:00, 515.45it/s]

Unnamed: 0,text_idxs,label
0,"everybody who has ever, worked in any office which contained any typewriter which had ever been used to type any letters which had to be signed by any administrator who ever worked in any department like mine will know what i mean.",1
1,"ron wanted to wear a tuxedo to the party, but wear a tuxedo to the party caspar couldn't decide whether to.",0


Alternatively, specify `neat_show=False` (the default) to show the real data, i.e. tokenized and with padding.

In [4]:
# Same datasets, this time with the semantic tensor type of each column spelled out.
# neat_show is left at its default, so show_batch prints the tokenized, padded text.
semantic_cols = {'text_idxs': TensorText, 'label': TensorCategory}
cola_dsets = HF_Datasets(tokenized_cola, cols=semantic_cols, hf_toker=hf_tokenizer)
cola_dls = cola_dsets.dataloaders(bs=32)
cola_dls.show_batch(max_n=2)

Set __getitem__(key) output type to torch for ['text_idxs', 'label'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Set __getitem__(key) output type to torch for ['text_idxs', 'label'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Set __getitem__(key) output type to torch for ['text_idxs'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
 98%|█████████▊| 1040/1063 [00:02<00:00, 516.29it/s]

Unnamed: 0,text_idxs,label
0,"everybody who has ever , worked in any office which contained any type ##writer which had ever been used to type any letters which had to be signed by any administrator who ever worked in any department like mine will know what i mean .",1
1,"will put a picture of bill on your desk before tomorrow , this girl in the red coat will put a picture of bill on your desk before tomorrow . [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]",0


`test_with_label` is `False` by default, so in the test set a sample is formed from only the first `n_inp` columns specified, i.e. x.

This lets you apply the same configuration to all splits when the test set comes with no y or a fake y.

## Multiple columns (> 2) in sample
Some points to notice:
- the titles of the columns shown are the column names, in the order of `cols` specified in `HF_Datasets`
- sequences are automatically padded to the max length in the batch, for all columns
- If a fastai semantic tensor type is not specified, it looks at the dtype and shape of the tensor and decides how to decode it automatically 

In [5]:
# SuperGLUE "wsc.fixed": used here to build samples from more than two columns.
wsc = datasets.load_dataset('super_glue', 'wsc.fixed')
print(wsc['train'][0])

# Tokenize the three text columns with the same SimpleTokenize helper used for CoLA.
# `cols` is a list here, so each listed column is replaced by its tokenized version.
tokenized_wsc = wsc.my_map(SimpleTokenize(['text', 'span1_text', 'span2_text'], hf_tokenizer))

# The order of the keys is the order of the columns in each sample / in show_batch.
wsc_cols = {
  'text': TensorText,
  'span1_index': noop,  # no semantic type given: decoding is left to be inferred from the tensor
  'span1_text': TensorText,
  'span2_index': noop,
  'span2_text': TensorText,
  'label': lambda t: t.bool(),  # convert label (int) to (bool), just to test its ability to show tensor(bool)
}
wsc_dsets = HF_Datasets(tokenized_wsc, cols=wsc_cols, hf_toker=hf_tokenizer)
dls = wsc_dsets.dataloaders(bs=3, srtkey_fc=False, shuffle_train=False)  # don't sort samples, don't shuffle trainset
dls.show_batch()

https://raw.githubusercontent.com/huggingface/datasets/master/datasets/super_glue/super_glue.py not found in cache or force_download set to True, downloading to /home/yisiang/.cache/huggingface/datasets/tmp4vauussu


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=10474.0, style=ProgressStyle(descriptio…


storing https://raw.githubusercontent.com/huggingface/datasets/master/datasets/super_glue/super_glue.py in cache at /home/yisiang/.cache/huggingface/datasets/17727f4c5312e09bd16ee8581466c4f74b1802efd416965b4cfd523c12fad94d.ab72d3ffcbe0d0e93a4595f2a810b3988c20d7836ae0bdb5ff4bdccf6bd92a36.py
creating metadata file for /home/yisiang/.cache/huggingface/datasets/17727f4c5312e09bd16ee8581466c4f74b1802efd416965b4cfd523c12fad94d.ab72d3ffcbe0d0e93a4595f2a810b3988c20d7836ae0bdb5ff4bdccf6bd92a36.py
https://raw.githubusercontent.com/huggingface/datasets/master/datasets/super_glue/dataset_infos.json not found in cache or force_download set to True, downloading to /home/yisiang/.cache/huggingface/datasets/tmpfw9s15jy


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=9253.0, style=ProgressStyle(description…

storing https://raw.githubusercontent.com/huggingface/datasets/master/datasets/super_glue/dataset_infos.json in cache at /home/yisiang/.cache/huggingface/datasets/be2c3836d8078b3465c52eebc3e437eeb18adabce99af20c98422a53acc7d3d4.9fa45241690c27df567c8014a4bf461a4ba1e82bd4358961888c6bf59769c3b5
creating metadata file for /home/yisiang/.cache/huggingface/datasets/be2c3836d8078b3465c52eebc3e437eeb18adabce99af20c98422a53acc7d3d4.9fa45241690c27df567c8014a4bf461a4ba1e82bd4358961888c6bf59769c3b5
Checking /home/yisiang/.cache/huggingface/datasets/17727f4c5312e09bd16ee8581466c4f74b1802efd416965b4cfd523c12fad94d.ab72d3ffcbe0d0e93a4595f2a810b3988c20d7836ae0bdb5ff4bdccf6bd92a36.py for additional imports.
Found main folder for dataset https://raw.githubusercontent.com/huggingface/datasets/master/datasets/super_glue/super_glue.py at /home/yisiang/.cache/huggingface/modules/datasets_modules/datasets/super_glue
Found specific version folder for dataset https://raw.githubusercontent.com/huggingface/datas

Unnamed: 0,text,span1_index,span1_text,span2_index,span2_text,label
0,"mark told pete many lies about himself , which pete included in his book . he should have been more skeptical . [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]",0,mark [PAD] [PAD],13,he,0
1,"the mothers of arthur and celeste have come to the town to fetch them . they are very happy to have them back , but they sc ##old them just the same because they ran away . [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]",1,mothers [PAD] [PAD],25,them,0
2,"mark was close to mr . singer ' s heels . he heard him calling for the captain , promising him , in the jar ##gon everyone talked that night , that not one thing should be damaged on the ship except only the ammunition , but the captain and all his crew had best stay in the cabin until the work was over",4,mr . singer,8,he,0


# 2. Aggregate Dataset
A sample in the transformed dataset is aggregated/accumulated from multiple original samples.

- Besides using `LMTransform`, you can implement your own logic by creating a class that inherits from `AggregateTransform` and implements the `accumulate` and `create_example` methods

- Note that you should pass **tokenized** dataset(s)

## Make dataset(s) for a (traditional) language model

In [6]:
# Turn the tokenized validation split into language-model samples (max_len=20).
cola_val = tokenized_cola['validation']
lm_cola_val = LMTransform(cola_val, max_len=20, text_col='text_idxs').map()

# Tail of the untouched validation split, for comparison with the LM samples.
raw_val = cola['validation']
print('Original dataset:')
print('num of samples:', len(raw_val))
print('second to last sentence:', raw_val[-2]['sentence'])
print('          last sentence:', raw_val[-1]['sentence'])

# The LM dataset: several original sentences are packed into each sample,
# so there are fewer samples than original sentences.
print('LM dataset:')
print('num of sampels:', len(lm_cola_val))
assert len(lm_cola_val) == 481
last_lm_sample = lm_cola_val[-1]
print('last text (x):', hf_tokenizer.decode(last_lm_sample['x_text']))
print('last text (y):', hf_tokenizer.decode(last_lm_sample['y_text']))

Set __getitem__(key) output type to python objects for ['text_idxs'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Testing the mapped function outputs
Testing finished, running the mapping function on the dataset
Caching processed dataset at /home/yisiang/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-e6acb88170f61c6d.arrow


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Done writing 481 examples in 157576 bytes /home/yisiang/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/tmppum5q2sm.
Set __getitem__(key) output type to python objects for [] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Set __getitem__(key) output type to python objects for ['x_text', 'y_text'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Original dataset:
num of samples: 1043
second to last sentence: John arranged for himself to get the prize.
          last sentence: John talked to Bill about himself.
LM dataset:
num of sampels: 481
last text (x): . john talked to bill about himself
last text (y): john talked to bill about himself.


In [13]:
# Apply the LM transform to all splits at once; the result is indexed by split name.
lm_transform = LMTransform(tokenized_cola, max_len=50, text_col='text_idxs')
lm_cola = lm_transform.map()

# test single dataset
lm_val_cols = {'x_text':LMTensorText, 'y_text':TensorText}
lm_ds = HF_Dataset(lm_cola['validation'], cols=lm_val_cols, hf_toker=hf_tokenizer)
lm_dl = MySortedDL(lm_ds, srtkey_fc=False)
lm_dl.show_batch(max_n=2)

Set __getitem__(key) output type to python objects for ['text_idxs'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Set __getitem__(key) output type to python objects for ['text_idxs'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Set __getitem__(key) output type to python objects for ['text_idxs'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Testing the mapped function outputs
Testing finished, running the mapping function on the dataset
Caching processed dataset at /home/yisiang/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-2509c56d4d553502.arrow


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))


Done writing 1564 examples in 1263672 bytes /home/yisiang/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/tmp6hips0lj.
Set __getitem__(key) output type to python objects for [] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Set __getitem__(key) output type to python objects for ['x_text', 'y_text'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Testing the mapped function outputs
Testing finished, running the mapping function on the dataset
Caching processed dataset at /home/yisiang/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-b25d35199b6c7232.arrow


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

Done writing 198 examples in 159840 bytes /home/yisiang/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/tmplm69lznx.
Set __getitem__(key) output type to python objects for [] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Set __getitem__(key) output type to python objects for ['x_text', 'y_text'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Testing the mapped function outputs
Testing finished, running the mapping function on the dataset
Caching processed dataset at /home/yisiang/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-888ff239e1f71e93.arrow



HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

Done writing 200 examples in 161056 bytes /home/yisiang/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/tmpigpvusur.
Set __getitem__(key) output type to python objects for [] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Set __getitem__(key) output type to python objects for ['x_text', 'y_text'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Set __getitem__(key) output type to torch for ['x_text', 'y_text'] columns  (when key is int or slice) and don't output other (un-formatted) columns.



Unnamed: 0,x_text,y_text
0,"the sailors rode the breeze clear of the rocks . the weights made the rope stretch over the pull ##ey . the mechanical doll wr ##ig ##gled itself loose . if you had eaten more , you would want less . as you eat the most , you want the","sailors rode the breeze clear of the rocks . the weights made the rope stretch over the pull ##ey . the mechanical doll wr ##ig ##gled itself loose . if you had eaten more , you would want less . as you eat the most , you want the least"
1,". the more you would want , the less you would eat . i demand that the more john eat , the more he pays . mary listen ##s to the grateful dead , she gets depressed . the ang ##rier mary got , the more she looked at pictures","the more you would want , the less you would eat . i demand that the more john eat , the more he pays . mary listen ##s to the grateful dead , she gets depressed . the ang ##rier mary got , the more she looked at pictures ."


## Test ELECTRA data creation

In [14]:
# Build ELECTRA-style samples from the raw (untokenized) validation sentences.
electra_transform = ELECTRADataTransform(
  cola['validation'],
  is_docs=False,
  text_col='sentence',
  max_length=128,
  hf_toker=hf_tokenizer,
)
proc_dset = electra_transform.map()

# Expose the single processed dataset as the 'train' split and show two samples.
e_dsets = HF_Datasets(
  {'train':proc_dset},
  cols={'input_ids':TensorText,'sentA_lenth':noop},
  hf_toker=hf_tokenizer,
)
e_dls = e_dsets.dataloaders(srtkey_fc=False)
e_dls.show_batch(max_n=2)

Set __getitem__(key) output type to python objects for ['sentence'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Testing the mapped function outputs
Testing finished, running the mapping function on the dataset
Caching processed dataset at /home/yisiang/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-7db22ab214e91040.arrow


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

Done writing 78 examples in 78488 bytes /home/yisiang/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/tmpi_20vusq.

Set __getitem__(key) output type to python objects for [] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Set __getitem__(key) output type to python objects for ['input_ids', 'sentA_lenth'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Set __getitem__(key) output type to torch for ['input_ids', 'sentA_lenth'] columns  (when key is int or slice) and don't output other (un-formatted) columns.


Unnamed: 0,input_ids,sentA_lenth
0,"[CLS] no writer , nor any playwright , meets in vienna . that you will marry any student is not certain . felicia kicked the ball off the bench . i sent the package halfway around the world . sam gave the ball out of the basket . sam offered the ball out of the basket . park square has a fest ##ive air . [SEP] the worker will have a job . no one can forgive that comment to you . we launched the rocket to the moon , but it blew up before it got there . sarah promised catherine her old car , but then gave it to her son instead . i lent the book part ##way to tony . the farmer loaded [SEP]",66
1,"[CLS] i borrowed fred ' s diagram of a snake ' s eye because steve ' s had been stolen . jerry attempted to blow up the pentagon . so fast did he run that nobody could catch him . bill bought a red house , and max bought one too . who always drinks milk ? the book which inspired them was very long . [SEP] the book what inspired them was very long . i know the person whose mother died . the person whose mother ' s dog we were all fond of . i wonder whose mother died . i wonder whose mother ' s dog died . i wonder to whom they dedicated the building . give me the phone number of [SEP]",67


# 3. Test filtering feature
Note that the filter is not applied to splits other than train, because the validation/test sets are for fair comparison and you can't take samples out of them at will.

In [9]:
# Count, per split, the samples whose tokenized length is exactly `l`.
# These counts are what the filtering test in the next cell expects to be removed
# from the train split.
l = 23
# Built-in sum over a generator: no dependence on `reduce` being star-imported,
# and no lambda parameter shadowing the built-in `sum`.
num = {
  split: sum(1 for sample in tokenized_cola[split] if len(sample['text_idxs'])==l)
  for split in tokenized_cola
}
print(num)

{'train': 26, 'validation': 2, 'test': 6}


In [10]:
# Build dataloaders that drop every sample whose token length equals `l`.
ccola_dsets = HF_Datasets(tokenized_cola, cols=['text_idxs', 'label'], hf_toker=hf_tokenizer)
keep_sample = lambda text_idxs, label: len(text_idxs)!=l
ccola_dls = ccola_dsets.dataloaders(filter_fc=keep_sample)

# Dataloaders are indexed in the same order as the splits of the dataset.
for i, split in enumerate(tokenized_cola):
  if split != 'train':
    # validation/test must keep every sample
    assert ccola_dls[i].n == len(tokenized_cola[split]), f"{split}: accidentally filtered: {ccola_dls[i].n}, unfiltered: {len(tokenized_cola[split])}"
    continue
  # train must have lost exactly the samples counted in `num`
  assert ccola_dls[i].n == len(tokenized_cola[split])-num[split],f"{split}: filtered: {ccola_dls[i].n}, unfiltered: {len(tokenized_cola[split])}, should be filtered: {num[split]}"
print("Test passed")

Set __getitem__(key) output type to torch for ['text_idxs', 'label'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Set __getitem__(key) output type to torch for ['text_idxs', 'label'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
Set __getitem__(key) output type to torch for ['text_idxs'] columns  (when key is int or slice) and don't output other (un-formatted) columns.
 99%|█████████▉| 1054/1063 [00:02<00:00, 433.55it/s]Test passed


# 4. Cache dataloader
If sorting or filtering is applied, the dataloader needs to create some records internally; to do this only once, we can cache the records. 

If `cache_dir` is not specified, it defaults to the cache_dir of the `dsets` passed to `HF_Datasets`.

In [11]:
# Start from a clean slate so this run really has to build (and write) the caches.
# The file names are derived from the same '{split}' template that is passed to
# `dataloaders`, filled with the dataset's own split names.
# NOTE(review): presumably the validation cache is written as 'cached_validation.json';
# the previous hard-coded list only covered 'cached_val.json', so that legacy name
# is still removed as well. Confirm against the `dataloaders` implementation.
cache_dir = '/tmp'
cache_name = 'cached_{split}.json'
for split in [*tokenized_cola, 'val']:
  cache_file = Path(cache_dir)/cache_name.format(split=split)
  if cache_file.exists(): os.remove(cache_file)

ccola_dls = ccola_dsets.dataloaders(cache_dir=cache_dir, cache_name=cache_name)

99%|█████████▉| 1054/1063 [00:02<00:00, 420.38it/s]

This time we load the caches, so it should be fast and progress bars shouldn't appear.

In [12]:
# Identical call to the previous cell; the cache files written there are expected to be reused here.
ccola_dls = ccola_dsets.dataloaders(cache_dir='/tmp', cache_name='cached_{split}.json')