From 380084ada9d0c0953480a1a87558f204d9f9cdbf Mon Sep 17 00:00:00 2001
From: richardyy1188
Date: Sun, 30 Aug 2020 18:48:10 +0900
Subject: [PATCH] some doc

---
 README.md                         |  2 +-
 docs/source/start.rst             | 17 +++++++---
 hugdatafast/fastai.py             | 34 ++++++++++++++++++-
 hugdatafast/transform.py          |  9 +++--
 tests/hf_nlp_extension_test.ipynb | 55 ++++++++++++++++++++++---------
 5 files changed, 93 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index bdcb5fc..8f39122 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # hugdatafast
 The elegant integration of huggingface/nlp and fastai, and some handy data transformation for huggingface/nlp.
 
-πŸŽ“ ** Documentation ** : https://hugdatafast.readthedocs.io/en/latest/
+πŸŽ“ **Documentation** : https://hugdatafast.readthedocs.io/en/latest/
 
 # Install
 `pip install hugdatafast`
diff --git a/docs/source/start.rst b/docs/source/start.rst
index de9e169..ba0a22e 100644
--- a/docs/source/start.rst
+++ b/docs/source/start.rst
@@ -62,9 +62,11 @@ Even you don't use fastai to train, you can still use as a normal DataLoader
 
 Other use cases
 ----------------
-1. Use your own dataset ? \n
-- `nlp.Dataset s from local structured files (csv, json, ...) `_
-- `nlp.Dataset s from custom loading script `_
+1. Use your own dataset ?
+
+* `nlp.Dataset s from local structured files (csv, json, ...) `_
+
+* `nlp.Dataset s from custom loading script `_
 
 2. Use custom tokenization or custom processing function ?
 use :class:`HF_Transform`
@@ -74,4 +76,11 @@ use :class:`HF_Transform`
     >>> def custom_tokenize(example):
     ...     example['tok_ids'] = hf_tokenizer.encode(example['sentence1'], example['sentence2'])
     ...     return example
-    >>> tokenized_rte = HF_Transform(rte, custom_tokenize).map()
\ No newline at end of file
+    >>> tokenized_rte = HF_Transform(rte, custom_tokenize).map()
+
+``hugdatafast`` in practice
+---------------------------
+Here you can see how ``hugdatafast`` is used in real projects. You are also welcome to share how you use
+``hugdatafast`` in your own project; contact me via GitHub or Twitter to have your project linked here.
+
+* `electra_pytorch `_ : Pretrain ELECTRA and fine-tune it on the GLUE benchmark
\ No newline at end of file
diff --git a/hugdatafast/fastai.py b/hugdatafast/fastai.py
index ec4e826..76c771d 100644
--- a/hugdatafast/fastai.py
+++ b/hugdatafast/fastai.py
@@ -404,6 +404,8 @@ def dataloaders(self, device='cpu', cache_dir=None, cache_name=None, dl_kwargs=N
              neither of them can because money is too tight.
""" if dl_kwargs is None: dl_kwargs = [{} for _ in range(len(self.hf_dsets))] + elif isinstance(dl_kwargs, dict): + dl_kwargs = [ dl_kwargs[split] if split in dl_kwargs else {} for split in self.hf_dsets] # infer cache file names for each dataloader if needed dl_type = kwargs.pop('dl_type', self._dl_type) if dl_type==MySortedDL and cache_name: @@ -412,7 +414,9 @@ def dataloaders(self, device='cpu', cache_dir=None, cache_name=None, dl_kwargs=N cache_dir.mkdir(exist_ok=True) if not cache_name.endswith('.json'): cache_name += '.json' for i, split in enumerate(self.hf_dsets): - dl_kwargs[i]['cache_file'] = cache_dir/cache_name.format(split=split) + filled_cache_name = dl_kwargs[i].pop('cache_name', cache_name.format(split=split)) + if 'cache_file' not in dl_kwargs[i]: + dl_kwargs[i]['cache_file'] = cache_dir/filled_cache_name # change default to not drop last kwargs['drop_last'] = kwargs.pop('drop_last', False) # when corpus like glue/ax has only testset, set it to non-train setting @@ -421,9 +425,37 @@ def dataloaders(self, device='cpu', cache_dir=None, cache_name=None, dl_kwargs=N kwargs['drop_last'] = False return super().dataloaders(dl_kwargs=dl_kwargs, device=device, **kwargs) +def hf_merge_datasets(*datasets_s): + """ + Args: + *datasets_s: multiple dicts that contains :class:`nlp.Dataset`, each dict must have the same keys (split names), all datasets should have some columns with the same name. + + Returns + Dict[ str : :class:`HF_MergeDataset` ] + + Example: + # Just for example, you may not concates rte and wnli datasets in real. + >>> rte, wnli = nlp.load_dataset('glue', 'rte'), nlp.load_dataset('glue', 'wnli') + # rte: {'train':Dataset(schema:{...,'sentence1':...,'sentence2':...}),'validation':...}, wnli: {'train':Dataset(schema:{...,'sentence1':...,'sentence2':...}),'validation':... + >>> merge_dsets = hf_merge_datasets(rte, wnli) + {'train': HF_MergedDataset, 'validation': HF_MergedDataset, 'test': HF_MergedDataset} + """ + keys_s = [ list(dsets.keys()) for dsets in datasets_s ] + for keys in keys_s: assert keys == keys_s[0] + merged_dsets = {} + for split in keys: + merged_dsets[split] = HF_MergedDataset(*[ dsets[split] for dsets in datasets_s]) + return merged_dsets + class HF_MergedDataset(): """Merge multiple :class:`nlp.Dataset` s to be a fake :class:`nlp.Dataset` be able to passed to :class:`HF_Dataset` + Args: + *datasets : multiple :class:`nlp.Dataset` s, that all of these have some columns with the same names. + + Returns: + HF_MergedDataset: a :class:`nlp.Dataset` like object that concats passed datasets, with basic functions to be turned into :class:`HF_Dataset`. + Example: >>> tokenized_wiki_train, tokenized_bookcorpus_train Dataset(schema: {...., 'input_ids': 'list', ...), Dataset(schema: {...., 'input_ids': 'list', ...) diff --git a/hugdatafast/transform.py b/hugdatafast/transform.py index 2f125c9..0f1ca73 100644 --- a/hugdatafast/transform.py +++ b/hugdatafast/transform.py @@ -14,7 +14,8 @@ def __init__(self, hf_dsets, remove_original=False, out_cols=None): out_cols (List[str]): output column names. If specified, it will assure they are not in the columns to be removed. 
""" # check arguments - if isinstance(hf_dsets, nlp.Dataset): hf_dsets = {'Single': hf_dsets} + if isinstance(hf_dsets, nlp.Dataset): hf_dsets = {'Single': hf_dsets}; self.single = True + else: self.single = False assert isinstance(hf_dsets, dict) # save attributes self.hf_dsets = hf_dsets @@ -49,12 +50,14 @@ def map(self, split_kwargs=None, cache_dir=None, cache_name=None, **kwargs): new_dsets = {} for split, dset in self.hf_dsets.items(): if self.remove_original: kwargs['remove_columns'] = dset.column_names - if cache_name: kwargs['cache_file_name'] = str(cache_dir/cache_name.format(split=split)) + if cache_name: + if self.single: kwargs['cache_file_name'] = str(cache_dir/cache_name) + else: kwargs['cache_file_name'] = str(cache_dir/cache_name.format(split=split)) kwargs.update(split_kwargs[split]) if hasattr(kwargs, 'remove_columns'): self._check_outcols(kwargs['remove_columns'], split) new_dsets[split] = self._map(dset, split, **kwargs) # return - if len(new_dsets)==1 and 'Single' in new_dsets: return new_dsets['Single'] + if self.single: return new_dsets['Single'] else: return new_dsets def _check_outcols(self, out_cols, rm_cols, split): diff --git a/tests/hf_nlp_extension_test.ipynb b/tests/hf_nlp_extension_test.ipynb index 09f7c56..5d97b8a 100644 --- a/tests/hf_nlp_extension_test.ipynb +++ b/tests/hf_nlp_extension_test.ipynb @@ -42,6 +42,11 @@ "tags": [] }, "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": "8551it [00:00, 9465.16it/s]\n1043it [00:00, 10052.16it/s]\n1063it [00:00, 8299.48it/s]\n" + }, { "output_type": "execute_result", "data": { @@ -73,6 +78,11 @@ "tags": [] }, "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": "2490it [00:01, 2350.48it/s]\n277it [00:00, 2447.21it/s]\n3000it [00:00, 3372.38it/s]\n" + }, { "output_type": "execute_result", "data": { @@ -113,13 +123,13 @@ { "output_type": "stream", "name": "stderr", - "text": "60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 642/1063 [00:00<00:00, 6415.30it/s]" + "text": "58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 619/1063 [00:00<00:00, 6189.99it/s]" }, { "output_type": "display_data", "data": { "text/plain": "", - "text/html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
text_idxslabel
0everybody who has ever, worked in any office which contained any typewriter which had ever been used to type any letters which had to be signed by any administrator who ever worked in any department like mine will know what i mean.1
1the dumplings which sasha is gobbling down faster than i can reheat the meatballs are extremely tasty, if i do say so.1
" + "text/html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
text_idxslabel
0everybody who has ever, worked in any office which contained any typewriter which had ever been used to type any letters which had to be signed by any administrator who ever worked in any department like mine will know what i mean.1
1playing with matches is ; lots of fun, but doing, so and emptying gasoline from one can to another at the same time is a sport best reserved for arsons.1
" }, "metadata": {} } @@ -147,13 +157,13 @@ { "output_type": "stream", "name": "stderr", - "text": "55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 583/1063 [00:00<00:00, 5826.69it/s]" + "text": "46%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 490/1063 [00:00<00:00, 4894.75it/s]" }, { "output_type": "display_data", "data": { "text/plain": "", - "text/html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
text_idxslabel
0everybody who has ever , worked in any office which contained any type ##writer which had ever been used to type any letters which had to be signed by any administrator who ever worked in any department like mine will know what i mean .1
1one of the jewish children is a spun ##ky girl , who gave a black eye to the kid with the german roots before the start of the war . [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]1
" + "text/html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
text_idxslabel
0everybody who has ever , worked in any office which contained any type ##writer which had ever been used to type any letters which had to be signed by any administrator who ever worked in any department like mine will know what i mean .1
1in january 2002 , a dull star in an obscure constellation suddenly became 600 , 000 times more luminous than our sun , temporarily making it the brightest star in our galaxy . [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]1
" }, "metadata": {} } @@ -191,10 +201,22 @@ "tags": [] }, "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29413.0, style=ProgressStyle(descriptio…", + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "cad85ddb77924b4182c01dbbfbc3fda7" + } + }, + "metadata": {} + }, { "output_type": "stream", "name": "stdout", - "text": "{'text': 'Mark told Pete many lies about himself, which Pete included in his book. He should have been more skeptical.', 'span1_index': 0, 'span2_index': 13, 'span1_text': 'Mark', 'span2_text': 'He', 'idx': 0, 'label': 0}\n" + "text": "\n554it [00:00, 3446.54it/s]{'text': 'Mark told Pete many lies about himself, which Pete included in his book. He should have been more skeptical.', 'span1_index': 0, 'span2_index': 13, 'span1_text': 'Mark', 'span2_text': 'He', 'idx': 0, 'label': 0}\n\n104it [00:00, 3162.25it/s]\n146it [00:00, 3802.30it/s]\n" }, { "output_type": "display_data", @@ -245,8 +267,8 @@ "outputs": [ { "output_type": "stream", - "name": "stdout", - "text": "Original dataset:\nnum of samples: 1043\nsecond to last sentence: John arranged for himself to get the prize.\n last sentence: John talked to Bill about himself.\nLM dataset:\nnum of sampels: 481\nlast text (x): . john talked to bill about himself\nlast text (y): john talked to bill about himself.\n" + "name": "stderr", + "text": "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 102.43it/s]Original dataset:\nnum of samples: 1043\nsecond to last sentence: John arranged for himself to get the prize.\n last sentence: John talked to Bill about himself.\nLM dataset:\nnum of sampels: 481\nlast text (x): . john talked to bill about himself\nlast text (y): john talked to bill about himself.\n\n" } ], "source": [ @@ -300,12 +322,15 @@ }, "outputs": [ { - "output_type": "display_data", - "data": { - "text/plain": "", - "text/html": "\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
input_idsattention_masktoken_type_ids
0[CLS] each author whose contribution is written in any language other than english will provide a summary in english . i ' m sure we even got these tickets ! i ' m even sure we got these tickets ! it ' s not because i have any sympathy for urban gu ##eri ##llas that i helped him . it isn ' t because sue said anything bad about me that i ' m angry . [SEP] that he was hungry , john w ##hine ##d . 1 gave mary after the party a book . because she ' s so pleasant , as for mary i really like her . though he may hate those that critic ##ize carter , it doesn ' t matter . [SEP](1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
1[CLS] this dye will run . she can run an accelerator . these stockings will run . we need another run to win . lee saw the student with a telescope . i forgot how good beer tastes . visiting relatives can be boring . if only superman would stop flying planes ! that ' s a new car dealers ##hip . i know you like the back of my hand . [SEP] max is on the phone now . i saw her duck . i ' m creating a committee . kim – you ' re in charge . lights go out at ten . there will be no talking afterwards . they found the book on the atom . which experts testified against defendants who [SEP](1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
" - }, - "metadata": {} + "output_type": "error", + "ename": "NameError", + "evalue": "name 'ELECTRADataTransform' is not defined", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mproc_dset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mELECTRADataTransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcola\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'validation'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mis_docs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtext_col\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'sentence'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_length\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m128\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhf_toker\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mhf_tokenizer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0me_dsets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mHF_Datasets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'train'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mproc_dset\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcols\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'input_ids'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mTensorText\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'attention_mask'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mnoop\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'token_type_ids'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mnoop\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhf_toker\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mhf_tokenizer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0me_dls\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0me_dsets\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdataloaders\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrtkey_fc\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0me_dls\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshow_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmax_n\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'ELECTRADataTransform' is not defined" + ] } ], "source": [ @@ -355,7 +380,7 @@ { "output_type": "stream", "name": "stderr", - "text": "92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 977/1063 [00:00<00:00, 5087.53it/s]Test passed\n" + "text": "78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 830/1063 [00:00<00:00, 8297.06it/s]Test passed\n" } ], "source": [ @@ -390,7 +415,7 @@ { "output_type": "stream", "name": "stderr", - "text": "83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 882/1063 [00:00<00:00, 4539.60it/s]" + "text": "80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 850/1063 [00:00<00:00, 8490.94it/s]" } ], "source": [