In [1]:
# Debugging helper: `bk()` is IPython's `set_trace`, i.e. it opens the debugger at the call site.
from IPython.core.debugger import set_trace as bk
import os
from pathlib import Path
from functools import partial
import torch
import nlp
from transformers import ElectraTokenizerFast
# Tokenizer shared by the cells of this notebook, loaded from the
# "google/electra-small-generator" pretrained checkpoint.
hf_tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-small-generator")
# NOTE(review): star imports -- the HF_* / *Transform helpers used in this notebook
# presumably come from `_utils.huggingface`, and the Tensor* types from fastai2; confirm.
from fastai2.text.all import *
from _utils.huggingface import *

# 1.Basics

## Tokenize all splits of dataset at once
`cols` (`Dict[str, str]`): tokenize each column named by a key into a new column named by the corresponding value  
`cols` (`List[str]`): names of the columns to tokenize; the original columns' data is replaced with the tokenized data

Here, we tokenize "sentence" into a new column named "text_idxs"; the "sentence" column still exists.

In [2]:
# `cola` maps split name -> nlp.Dataset:
# {'train': nlp.Dataset, 'validation': nlp.Dataset, 'test': nlp.Dataset}
cola = nlp.load_dataset('glue', 'cola')

# Dict form of `cols`: tokenize "sentence" into the new column "text_idxs"
# (the original "sentence" column is kept).
cola_tokenize_tfm = HF_TokenizeTfm(cola, {'sentence':'text_idxs'}, hf_tokenizer)
tokenized_cola = cola_tokenize_tfm.map()

# Peek at the first tokenized training sample.
tokenized_cola['train'][0]

{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.",
 'label': 1,
 'idx': 0,
 'text_idxs': [2256,
  2814,
  2180,
  1005,
  1056,
  4965,
  2023,
  4106,
  1010,
  2292,
  2894,
  1996,
  2279,
  2028,
  2057,
  16599,
  1012]}

## Apply a custom function to all splits of a dataset at once
The `func` argument of `HF_Transform` plays the same role as the `function` argument of `nlp.Dataset.map`, except that it is applied to every split individually. 

In [3]:
# `rte` maps split name -> nlp.Dataset (it includes a 'validation' split, inspected below)
rte = nlp.load_dataset('glue', 'rte')

def custom_tokenize(example):
  "Encode the ('sentence1', 'sentence2') pair of `example` and store the ids under 'tok_ids'."
  pair_ids = hf_tokenizer.encode(example['sentence1'], example['sentence2'])
  example['tok_ids'] = pair_ids
  return example

# Wrap the per-example function in a transform, then run it on the dataset.
rte_tfm = HF_Transform(rte, custom_tokenize)
tokenized_rte = rte_tfm.map()
tokenized_rte['validation'][0]

{'sentence1': 'Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation.',
 'sentence2': 'Christopher Reeve had an accident.',
 'label': 1,
 'idx': 0,
 'tok_ids': [101,
  11271,
  20726,
  1010,
  1996,
  7794,
  1997,
  1996,
  3364,
  5696,
  20726,
  1010,
  2038,
  2351,
  1997,
  11192,
  4456,
  2012,
  2287,
  4008,
  1010,
  2429,
  2000,
  1996,
  5696,
  20726,
  3192,
  1012,
  102,
  5696,
  20726,
  2018,
  2019,
  4926,
  1012,
  102]}

## Create fastai `Dataloaders` and `show_batch`

`cols`: **specifies the columns whose values form an output sample, in order**, and the semantic type of each column used to encode/decode it, using one of the following signatures (see doc).

Here, `['text_idxs', 'label']` is equivalent to `{'text_idxs': TensorText, 'label': TensorCategory}`.

In [4]:
# List form of `cols`: the two columns, in order, that make up a sample.
cola_dsets = HF_Datasets(
  tokenized_cola,
  cols=['text_idxs', 'label'],
  hf_toker=hf_tokenizer,
  neat_show=True,
)
cola_dls = cola_dsets.dataloaders(bs=32)
# show at most two rows
cola_dls.show_batch(max_n=2)

48%|████▊     | 514/1063 [00:00<00:00, 5134.17it/s]

Unnamed: 0,text_idxs,label
0,"everybody who has ever, worked in any office which contained any typewriter which had ever been used to type any letters which had to be signed by any administrator who ever worked in any department like mine will know what i mean.",1
1,"playing with matches is ; lots of fun, but doing, so and emptying gasoline from one can to another at the same time is a sport best reserved for arsons.",1


Alternatively, you can specify `neat_show=False` (the default) to show the actual data, i.e. the tokenized text including padding.

In [5]:
# Dict form of `cols`: column name -> semantic tensor type; `neat_show` is left at its default.
cola_dsets = HF_Datasets(
  tokenized_cola,
  cols={'text_idxs': TensorText, 'label': TensorCategory},
  hf_toker=hf_tokenizer,
)
cola_dls = cola_dsets.dataloaders(bs=32)
# show at most two rows
cola_dls.show_batch(max_n=2)

48%|████▊     | 505/1063 [00:00<00:00, 5049.32it/s]

Unnamed: 0,text_idxs,label
0,"everybody who has ever , worked in any office which contained any type ##writer which had ever been used to type any letters which had to be signed by any administrator who ever worked in any department like mine will know what i mean .",1
1,"playing with matches is ; lots of fun , but doing , so and empty ##ing gasoline from one can to another at the same time is a sport best reserved for arson ##s . [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]",1


`test_with_label` is `False` by default, so in the test set a sample is formed from only the first `n_inp` specified columns, i.e. x.

This lets you apply the same settings to all splits even when the test set comes with no y or a fake y.

## Multiple columns (> 2) in sample
Some points to notice:
- the title of each displayed column is its column name, and the columns appear in the order of `cols` specified in `HF_Datasets`
- sequences are automatically padded to the max length in the batch, for all columns
- If a fastai semantic tensor type is not specified, it looks at the dtype and shape of the tensor and decides how to decode it automatically 

In [6]:
wsc = nlp.load_dataset('super_glue', 'wsc.fixed')
print(wsc['train'][0])

# List form of `cols`: these three text columns are tokenized.
tokenized_wsc = HF_TokenizeTfm(wsc, ['text', 'span1_text', 'span2_text'], hf_tokenizer).map()

# Insertion order of this dict is the order of the fields in a sample.
wsc_cols = {
  'text': TensorText,
  'span1_index': noop,
  'span1_text': TensorText,
  'span2_index': noop,
  'span2_text': TensorText,
  # convert label (int) to (bool), just to test its ability to show tensor(bool)
  'label': lambda t: t.bool(),
}
wsc_dsets = HF_Datasets(tokenized_wsc, cols=wsc_cols, hf_toker=hf_tokenizer)

# don't sort samples, don't shuffle trainset
dls = wsc_dsets.dataloaders(bs=3, srtkey_fc=False, shuffle_train=False)
dls.show_batch()

{'text': 'Mark told Pete many lies about himself, which Pete included in his book. He should have been more skeptical.', 'span1_index': 0, 'span2_index': 13, 'span1_text': 'Mark', 'span2_text': 'He', 'idx': 0, 'label': 0}


Unnamed: 0,text,span1_index,span1_text,span2_index,span2_text,label
0,"mark told pete many lies about himself , which pete included in his book . he should have been more skeptical . [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]",0,mark [PAD] [PAD],13,he,False
1,"the mothers of arthur and celeste have come to the town to fetch them . they are very happy to have them back , but they sc ##old them just the same because they ran away . [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]",1,mothers [PAD] [PAD],25,them,False
2,"mark was close to mr . singer ' s heels . he heard him calling for the captain , promising him , in the jar ##gon everyone talked that night , that not one thing should be damaged on the ship except only the ammunition , but the captain and all his crew had best stay in the cabin until the work was over",4,mr . singer,8,he,False


# 2. Aggregate Dataset
A sample in the transformed dataset is aggregated/accumulated from multiple original samples.

- Besides using `LMTransform`, you can implement your own logic by creating a class that inherits from `AggregateTransform` and implements the `accumulate` and `create_example` methods

- Note that you should pass **tokenized** dataset(s)

## Make  dataset for (traditional) language model
You can always pass either a dict of `nlp.Dataset` or a single `nlp.Dataset` to any transform class. We've tested passing a dict; now we test passing a single `nlp.Dataset`.

In [7]:
# Build the LM dataset from a single (tokenized) nlp.Dataset instead of a dict of splits.
cola_val = tokenized_cola['validation']
lm_dataset = LMTransform(cola_val, max_len=20, text_col='text_idxs').map()

# Untokenized validation split, for comparison with the aggregated LM samples.
raw_val = cola['validation']
print('Original dataset:')
print('num of samples:', len(raw_val))
print('second to last sentence:', raw_val[-2]['sentence'])
print('          last sentence:', raw_val[-1]['sentence'])

# Decode the input/target ids of the last LM sample back to text.
last_lm_sample = lm_dataset[-1]
print('LM dataset:')
print('num of sampels:', len(lm_dataset))
print('last text (x):', hf_tokenizer.decode(last_lm_sample['x_text']))
print('last text (y):', hf_tokenizer.decode(last_lm_sample['y_text']))

Original dataset:
num of samples: 1043
second to last sentence: John arranged for himself to get the prize.
          last sentence: John talked to Bill about himself.
LM dataset:
num of sampels: 481
last text (x): . john talked to bill about himself
last text (y): john talked to bill about himself.


In [8]:
# Wrap the single LM dataset: x is shown as LM text, y as plain text.
lm_ds = HF_Dataset(
  lm_dataset,
  cols={'x_text':LMTensorText, 'y_text':TensorText},
  hf_toker=hf_tokenizer,
)
# srtkey_fc=False: no sort key is used for this dataloader.
lm_dl = MySortedDL(lm_ds, srtkey_fc=False)
lm_dl.show_batch(max_n=2)

Unnamed: 0,x_text,y_text
0,the sailors rode the breeze clear of the rocks . the weights made the rope stretch over the pull ##ey,sailors rode the breeze clear of the rocks . the weights made the rope stretch over the pull ##ey .
1,"the mechanical doll wr ##ig ##gled itself loose . if you had eaten more , you would want less .","mechanical doll wr ##ig ##gled itself loose . if you had eaten more , you would want less . as"


## Test ELECTRA data creating

In [9]:
# Dict form of `text_col`: read ids from "text_idxs", write processed ids to "inpids".
electra_tfm = ELECTRADataTransform(
  tokenized_cola,
  text_col={'text_idxs':'inpids'},
  max_length=128,
  cls_idx=hf_tokenizer.cls_token_id,
  sep_idx=hf_tokenizer.sep_token_id,
)
proc_dsets = electra_tfm.map()

# Single-column samples; no sort key for the dataloaders.
e_dsets = HF_Datasets(proc_dsets, cols=['inpids'], hf_toker=hf_tokenizer)
e_dls = e_dsets.dataloaders(srtkey_fc=False)
e_dls.show_batch(max_n=2)

Unnamed: 0,inpids
0,[CLS] tess was knocking at the door . tess knocked at the door . frank churchill was crossing the street . jane is visiting emma . jane visits emma . tess is knocking at the door . tess knocks at the door . frank churchill is crossing the street . frank churchill crosses the street . real play valencia next sunday . i leave for paris next week . [SEP] the volcano er ##upt ##s on tuesday . the minister has arrived . i ' ve been at work for six hours . have you ever visited doubtful sound ? there was an attack yesterday . emma and harriet were attacked by those bandits . those bandits attacked emma and harriet yesterday . the vase was smashed [SEP]
1,[CLS] who do you guess will be here ? who do you think borrowed my book ? which city does fred think that you believe that john lives in ? i wonder on which shelf john will put the book ? what proof that he has implicated have you found ? joseph has forgotten how many matches he has won . fred will warn martha that she should claim that her brother is patriotic . [SEP] that bill tried to discover which drawer alice put the money in made us realize that we should have left him in seoul . jasper wonders which book he should attempt to persuade his students to buy . i wonder if on which she ##l ##ve john will put the book [SEP]


# 3. Test filtering feature
Note that the filter won't be applied to splits other than train: validation/test sets are for fair comparison, so you can't take out samples at will. 

In [10]:
# Token length whose samples are counted here.
l = 23
# For every split, count the samples whose tokenized text has exactly `l` ids.
# The builtin `sum` over a generator replaces a `reduce` call whose lambda
# parameter was itself named `sum` (shadowing the builtin); the counts are the same.
num = {
  split: sum(1 for sample in tokenized_cola[split] if len(sample['text_idxs']) == l)
  for split in tokenized_cola
}
print(num)

{'train': 26, 'validation': 2, 'test': 6}


In [11]:
ccola_dsets = HF_Datasets(tokenized_cola, cols=['text_idxs', 'label'], hf_toker=hf_tokenizer)
# Keep only the samples whose token length differs from `l`.
ccola_dls = ccola_dsets.dataloaders(
  filter_fc=lambda text_idxs, label: len(text_idxs)!=l,
)

# Only the train split may shrink (by the number of length-`l` samples); the others must keep every sample.
for i, split in enumerate(tokenized_cola):
  n_kept = ccola_dls[i].n
  n_total = len(tokenized_cola[split])
  if split != 'train':
    assert n_kept == n_total, f"{split}: accidentally filtered: {n_kept}, unfiltered: {n_total}"
  else:
    assert n_kept == n_total-num[split],f"{split}: filtered: {n_kept}, unfiltered: {n_total}, should be filtered: {num[split]}"
print("Test passed")

54%|█████▍    | 575/1063 [00:00<00:00, 5742.42it/s]Test passed


# 4. Cache dataloader
If sorting or filtering is applied, the dataloader needs to create some records internally. To do this only once, we can cache the records. 

If `cache_dir` is not specified, it defaults to the cache_dir of the `dsets` passed to `HF_Datasets`.

In [12]:
# Remove cache files left by a previous run so the caches are rebuilt below.
# NOTE(review): the dataset's splits are named 'train'/'validation'/'test'; if `{split}` in
# `cache_name` expands to the split name, the second file would be
# '/tmp/cached_validation.json' rather than '/tmp/cached_val.json' -- confirm.
for stale in ('/tmp/cached_train.json', '/tmp/cached_val.json', '/tmp/cached_test.json'):
  if os.path.exists(stale):
    os.remove(stale)

# First call with caching enabled.
ccola_dls = ccola_dsets.dataloaders(
  cache_dir='/tmp',
  cache_name='cached_{split}.json',
)

54%|█████▍    | 574/1063 [00:00<00:00, 5734.54it/s]

This time we load the caches, so it should be fast and progress bars shouldn't appear.

In [13]:
# Same call, same cache arguments as the previous cell.
ccola_dls = ccola_dsets.dataloaders(cache_dir='/tmp', cache_name='cached_{split}.json')