In [1]:
from IPython.core.debugger import set_trace as bk
import os
from pathlib import Path
from functools import partial
import torch
import nlp
from transformers import ElectraTokenizerFast
# Load the pretrained fast ELECTRA tokenizer once; the cells below reuse it for
# tokenizing text and for its special-token ids (pad / cls / sep).
hf_tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-small-generator")
from fastai2.text.all import *
from _utils.huggingface import *

# 1. Basics

## Tokenize all splits of dataset at once
- `cols` (`Dict[str]`): tokenize each column named by a key into a new column named by the corresponding value
- `cols` (`List[str]`): the names of the columns to tokenize; the original columns' data is replaced with the tokenized data

Here, we tokenize "sentence" into a new column named "text_idxs"; the original "sentence" column still exists.

In [2]:
# `cola` maps each split name to its dataset:
# {'train': nlp.Dataset, 'validation': nlp.Dataset, 'test': nlp.Dataset}
cola = nlp.load_dataset('glue', 'cola')

# Tokenize the "sentence" column into a new "text_idxs" column for all splits at once.
tokenized_cola = HF_TokenizeTfm(
    cola,
    {'sentence': 'text_idxs'},
    hf_tokenizer,
).map()

# Display the first training sample to inspect the resulting columns.
tokenized_cola['train'][0]

{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.",
 'label': 1,
 'idx': 0,
 'text_idxs': [2256,
  2814,
  2180,
  1005,
  1056,
  4965,
  2023,
  4106,
  1010,
  2292,
  2894,
  1996,
  2279,
  2028,
  2057,
  16599,
  1012]}

## Apply a custom function to all splits of a dataset at once
The `func` argument of `HF_Transform` plays the same role as `function` in `nlp.Dataset.map`, but it is applied to every split individually.

In [3]:
# `rte` maps split names to nlp.Dataset objects (the 'validation' split is inspected below).
rte = nlp.load_dataset('glue', 'rte')


def custom_tokenize(example):
    """Encode the 'sentence1'/'sentence2' pair of `example` into a 'tok_ids' entry.

    The example dict is updated in place and returned, which is the shape of
    function this cell hands to `HF_Transform`.
    """
    first, second = example['sentence1'], example['sentence2']
    example['tok_ids'] = hf_tokenizer.encode(first, second)
    return example


# Run the custom function over the dataset's splits, then look at one result.
tokenized_rte = HF_Transform(rte, custom_tokenize).map()
tokenized_rte['validation'][0]

{'sentence1': 'Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation.',
 'sentence2': 'Christopher Reeve had an accident.',
 'label': 1,
 'idx': 0,
 'tok_ids': [101,
  11271,
  20726,
  1010,
  1996,
  7794,
  1997,
  1996,
  3364,
  5696,
  20726,
  1010,
  2038,
  2351,
  1997,
  11192,
  4456,
  2012,
  2287,
  4008,
  1010,
  2429,
  2000,
  1996,
  5696,
  20726,
  3192,
  1012,
  102,
  5696,
  20726,
  2018,
  2019,
  4926,
  1012,
  102]}

## Create fastai `Dataloaders` and `show_batch`

`cols`: **specifies, in order, the columns whose values form an output sample**, and the semantic type used to encode/decode each column, using one of the supported signatures (see doc).

Here, `['text_idxs', 'label']` is equivalent to `{'text_idxs': TensorText, 'label': TensorCategory}`

In [4]:
# Columns given as a plain list; neat_show=True requests the cleaned-up display
# seen in this cell's output.
cola_dsets = HF_Datasets(
    tokenized_cola,
    cols=['text_idxs', 'label'],
    hf_toker=hf_tokenizer,
    neat_show=True,
)

# Batches of 32, padded with the tokenizer's pad token id.
cola_dls = cola_dsets.dataloaders(
    bs=32,
    pad_idx=hf_tokenizer.pad_token_id,
)
cola_dls.show_batch(max_n=2)

51%|█████     | 544/1063 [00:00<00:00, 5432.45it/s]

Unnamed: 0,text_idxs,label
0,"everybody who has ever, worked in any office which contained any typewriter which had ever been used to type any letters which had to be signed by any administrator who ever worked in any department like mine will know what i mean.",1
1,"hank plays the guitar and finds arrangements for all the old folk songs which are still sung in these hills, and ernie writes down all the old folk songs which are still sung in these hills.",1


You can also specify `neat_show=False` (the default) to show the real data, i.e. tokenized and with padding.

In [5]:
# Same columns as a list would give, but with each column's semantic type spelled out.
# `neat_show` is not passed here, so its default applies.
cola_dsets = HF_Datasets(
    tokenized_cola,
    cols={
        'text_idxs': TensorText,
        'label': TensorCategory,
    },
    hf_toker=hf_tokenizer,
)

# Batches of 32, padded with the tokenizer's pad token id.
cola_dls = cola_dsets.dataloaders(
    bs=32,
    pad_idx=hf_tokenizer.pad_token_id,
)
cola_dls.show_batch(max_n=2)

54%|█████▍    | 572/1063 [00:00<00:00, 5714.26it/s]

Unnamed: 0,text_idxs,label
0,"everybody who has ever , worked in any office which contained any type ##writer which had ever been used to type any letters which had to be signed by any administrator who ever worked in any department like mine will know what i mean .",1
1,"in january 2002 , a dull star in an obscure constellation suddenly became 600 , 000 times more luminous than our sun , temporarily making it the brightest star in our galaxy . [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]",1


`test_with_label` is `False` by default, so in the test set a sample is formed from only the first `n_inp` specified columns, i.e. x.

This lets you apply the same settings to all splits when the test set comes with no y or a fake y.

In [6]:
# Show a batch from the dataloader at index 2 — presumably the test split, given
# the train/validation/test ordering of `cola`; TODO confirm.
cola_dls[2].show_batch(max_n=2)

Unnamed: 0,text_idxs
0,"cultural commissioner megan smith said that the five ` ` sounds ##cape ' ' pieces would ` ` give a fest ##ive air to park square , they ' re fun and interesting ' ' ."
1,"wendy is eager to sail around the world and bruce is eager to climb ki ##lim ##an ##jar ##o , but neither of them can because money is too tight . [PAD] [PAD] [PAD] [PAD] [PAD]"


# 2. Aggregate Dataset
A sample in the transformed dataset is aggregated/accumulated from multiple original samples.

- Besides `LMTransform`, you can implement your own logic: create a class that inherits from `AggregateTransform` and implements the `accumulate` and `create_example` methods.

- Note that you should pass **tokenized** dataset(s).

## Make a dataset for a (traditional) language model
You can pass either a dict of `nlp.Dataset` or a single `nlp.Dataset` to any transform class. We've tested passing a dict above; now we test passing a single `nlp.Dataset`.

In [7]:
# Feed a single nlp.Dataset (the tokenized validation split) to the LM transform.
cola_val = tokenized_cola['validation']
lm_dataset = LMTransform(cola_val, max_len=20, text_col='text_idxs').map()

# Untokenized validation split, used to compare against the aggregated LM samples.
raw_cola_val = cola['validation']
print('Original dataset:')
print('num of samples:', len(raw_cola_val))
print('second to last sentence:', raw_cola_val[-2]['sentence'])
print('          last sentence:', raw_cola_val[-1]['sentence'])

# Decode the x/y token ids of the final LM sample back to text.
last_lm_sample = lm_dataset[-1]
print('LM dataset:')
print('num of sampels:', len(lm_dataset))
print('last text (x):', hf_tokenizer.decode(last_lm_sample['x_text']))
print('last text (y):', hf_tokenizer.decode(last_lm_sample['y_text']))

Original dataset:
num of samples: 1043
second to last sentence: John arranged for himself to get the prize.
          last sentence: John talked to Bill about himself.
LM dataset:
num of sampels: 481
last text (x): . john talked to bill about himself
last text (y): john talked to bill about himself.


In [8]:
# Wrap the LM dataset: 'x_text' is typed as LMTensorText and 'y_text' as TensorText.
lm_ds = HF_Dataset(
    lm_dataset,
    cols={
        'x_text': LMTensorText,
        'y_text': TensorText,
    },
    hf_toker=hf_tokenizer,
)

# srtkey_fc=False presumably turns off length-based sorting — TODO confirm against MySortedDL.
lm_dl = MySortedDL(
    lm_ds,
    srtkey_fc=False,
    pad_idx=hf_tokenizer.pad_token_id,
)
lm_dl.show_batch(max_n=2)

Unnamed: 0,y_text,y_text_
0,the sailors rode the breeze clear of the rocks . the weights made the rope stretch over the pull ##ey,sailors rode the breeze clear of the rocks . the weights made the rope stretch over the pull ##ey .
1,"the mechanical doll wr ##ig ##gled itself loose . if you had eaten more , you would want less .","mechanical doll wr ##ig ##gled itself loose . if you had eaten more , you would want less . as"


## Test ELECTRA data creation

In [9]:
# Build ELECTRA-style inputs from the tokenized text: 'text_idxs' is transformed
# into a new 'inpids' column, using the tokenizer's [CLS]/[SEP] ids.
proc_dsets = ELECTRADataTransform(
    tokenized_cola,
    text_col={'text_idxs': 'inpids'},
    max_length=128,
    cls_idx=hf_tokenizer.cls_token_id,
    sep_idx=hf_tokenizer.sep_token_id,
).map()

# Only the 'inpids' column forms a sample here.
e_dsets = HF_Datasets(
    proc_dsets,
    cols=['inpids'],
    hf_toker=hf_tokenizer,
)
e_dls = e_dsets.dataloaders(
    srtkey_fc=False,
    pad_idx=hf_tokenizer.pad_token_id,
)
e_dls.show_batch(max_n=2)

Unnamed: 0,inpids
0,"[CLS] owners of a pig love to eat tr ##uf ##fles . that whether the world is round is unknown bothered athena . no one expected ag ##ame ##m ##non to to win eu ##cl ##id was interested in plato ' s description of geometry . every reading shakespeare satisfied me can will he do it ? med ##ea poisoned who ? he looked up it [SEP] who guy did you see . we kicked myself who would pose ##idon run away , if the execution ##er murdered ? anson kissed him which city the claim that philip would invade . i haven ' t left yet i am eating a mango and gillian has too . letter is on the table who ate the cake ? [SEP]"
1,[CLS] john had an error in the proof he presented . john had an error in the proof sarah presented . fred had a snake behind the car joe was sitting in . fred had a snake behind the car he was sitting in . there was a yellow collar on the dog which the car injured . [SEP] there was a snake behind the car the time bomb was sitting in . the car had a yellow collar on the dog which it injured . that stone has a hole in the tar ##pa ##ulin which it is holding down . the time bomb had a snake behind the car which it was sitting in . there were several hundred people yelling for me to put [SEP]


# 3. Test filtering feature
Note that the filter won't be applied to splits other than train, because the validation/test sets are for fair comparison, and you can't take out samples at will.

In [10]:
# Count, for every split, how many samples have exactly `l` token ids in 'text_idxs'.
# `l` and `num` are read again by the filtering test in the following cell.
l = 23
num = {}
for split in tokenized_cola:
    # Built-in `sum` over a generator replaces the `reduce` + lambda fold: it avoids
    # shadowing `sum` with a lambda parameter and does not depend on `reduce`
    # happening to be in scope through a star import.
    num[split] = sum(1 for sample in tokenized_cola[split]
                     if len(sample['text_idxs']) == l)
print(num)

{'train': 26, 'validation': 2, 'test': 6}


In [11]:
# Build dataloaders with a filter that keeps only samples whose text length differs from `l`.
ccola_dsets = HF_Datasets(
    tokenized_cola,
    cols=['text_idxs', 'label'],
    hf_toker=hf_tokenizer,
)
ccola_dls = ccola_dsets.dataloaders(
    pad_idx=hf_tokenizer.pad_token_id,
    filter_fc=lambda text_idxs, label: len(text_idxs)!=l,
)

# Dataloaders are indexed in the same order as the splits of `tokenized_cola`.
for i, split in enumerate(tokenized_cola):
    if split != 'train':
        # Non-train splits must keep every sample.
        assert ccola_dls[i].n == len(tokenized_cola[split]), f"{split}: accidentally filtered: {ccola_dls[i].n}, unfiltered: {len(tokenized_cola[split])}"
        continue
    # The train split must lose exactly the samples counted in `num`.
    assert ccola_dls[i].n == len(tokenized_cola[split])-num[split],f"{split}: filtered: {ccola_dls[i].n}, unfiltered: {len(tokenized_cola[split])}, should be filtered: {num[split]}"
print("Test passed")

51%|█████▏    | 546/1063 [00:00<00:00, 5453.76it/s]Test passed


# 4. Cache dataloader
If sorting or filtering is applied, the dataloader needs to create some records internally. To do this only once, we can cache the records.

If `cache_dir` is not specified, it defaults to the cache_dir of the `dsets` passed to `HF_Datasets`.

In [12]:
# Start from a clean slate: delete cache files left over from a previous run.
# NOTE(review): this list assumes '{split}' expands to train/val/test, but the
# validation split is keyed 'validation' elsewhere in this notebook — confirm the
# actual cache file name, otherwise a stale validation cache is never removed.
stale_caches = ['/tmp/cached_train.json', '/tmp/cached_val.json', '/tmp/cached_test.json']
for cache_file in map(Path, stale_caches):
    if cache_file.exists():
        cache_file.unlink()

# First call with caching enabled: the records are computed and written under /tmp.
ccola_dls = ccola_dsets.dataloaders(
    bs=32,
    pad_idx=hf_tokenizer.pad_token_id,
    cache_dir='/tmp',
    cache_name='cached_{split}.json',
)

56%|█████▋    | 598/1063 [00:00<00:00, 5973.88it/s]

This time we load the caches, so it should be fast and progress bars shouldn't appear.

In [13]:
# Same cache location and name pattern as the previous call, so the cached
# records can be picked up instead of being recomputed.
ccola_dls = ccola_dsets.dataloaders(
    bs=32,
    pad_idx=hf_tokenizer.pad_token_id,
    cache_dir='/tmp',
    cache_name='cached_{split}.json',
)