This repository has been archived by the owner on May 5, 2023. It is now read-only.

Commit 0af1177: 0.4.5
richarddwang committed Sep 25, 2020
1 parent 5626db1
Showing 8 changed files with 300 additions and 135 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -1,17 +1,17 @@
# hugdatafast
The elegant integration of huggingface/nlp and fastai, and some handy data transformation for huggingface/nlp.
The elegant integration of huggingface/datasets and fastai, and some handy data transformations for huggingface/datasets.

🎓 **Documentation** : https://hugdatafast.readthedocs.io/en/latest/

# Install
`pip install hugdatafast`

# Future Plan
- I would like to merge this library to fastai and huggingface/nlp respectively. But I may have no time for it. You're welcome to pr this library to the two libraries.
- I would like to merge this library into fastai and huggingface/datasets respectively, but I may not have time for it. You're welcome to open PRs contributing this library to those two projects.

- The implementation of `CombineTransform` works but might be too complex to extend; I hope Hugging Face or someone else comes up with better ideas.

- Currently, it is designed to work with the dataset part of huggingface/nlp, I may also integrate the part of metric or not.
- Currently, it is designed to work with the dataset part of huggingface/datasets; I may or may not also integrate the metrics part.

# Quick Intro
![hugdatafast_fastai](https://user-images.githubusercontent.com/17963619/92091020-be672f00-ee02-11ea-84c0-d54b4855ff4b.png)
4 changes: 2 additions & 2 deletions docs/source/index.rst
@@ -1,7 +1,7 @@
hugdatafast
=============

This package is to provide a elegant bridge between fastai and huggingface/nlp and some handy data transforms
This package provides an elegant bridge between fastai and huggingface/datasets, along with some handy data transforms
for NLPers.

Author: Richard Wang
@@ -15,7 +15,7 @@ Installation

pip install hugdatafast

This will install also the lastest ``fastai`` and ``nlp``.
This will also install the latest ``fastai`` and ``datasets``.

.. toctree::
:maxdepth: 2
8 changes: 4 additions & 4 deletions docs/source/start.rst
@@ -8,7 +8,7 @@ Base use case

::

>>> from nlp import load_dataset
>>> from datasets import load_dataset
>>> from hugdatafast import *

.. note::
@@ -19,7 +19,7 @@ Can you turn your data pipeline into only 3 lines ?
::

>>> datasets = load_dataset('glue', 'cola')
-> {'train': nlp.Dataset, 'validation': nlp.Dataset, 'test': nlp.Dataset}
-> {'train': datasets.Dataset, 'validation': datasets.Dataset, 'test': datasets.Dataset}
>>> tokenized_datasets = datasets.map(simple_tokenize_func({'sentence':'text_idxs'}, hf_tokenizer))
>>> dls = HF_Datasets(tokenized_datasets, cols=['text_idxs', 'label'], hf_toker=hf_tokenizer).dataloaders(bs=64)

@@ -67,9 +67,9 @@ Other use cases

1. Use your own dataset ?

* `nlp.Dataset s from local structured files (csv, json, ...) <https://huggingface.co/nlp/loading_datasets.html#from-local-files>`_
* `datasets.Dataset s from local structured files (csv, json, ...) <https://huggingface.co/datasets/loading_datasets.html#from-local-files>`_

* `nlp.Dataset s from custom loading script <https://huggingface.co/nlp/add_dataset.html>`_
* `datasets.Dataset s from custom loading script <https://huggingface.co/datasets/add_dataset.html>`_

2. Need to combine examples to generate a new example ? (e.g. traditional language model)

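The following is a minimal end-to-end sketch of the pipeline documented above. Only `load_dataset`, `simple_tokenize_func`, `HF_Datasets` and `dataloaders` come from the snippets above; the `AutoTokenizer` import and the 'bert-base-uncased' checkpoint are assumptions for illustration, and this commit also adds an equivalent `SimpleTokenize` class in transform.py.

```python
# A runnable sketch of the 3-line pipeline from start.rst.
# Assumption: the tokenizer source and checkpoint are illustrative, not mandated by hugdatafast.
from datasets import load_dataset
from transformers import AutoTokenizer
from hugdatafast import *

hf_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

cola = load_dataset('glue', 'cola')
# -> {'train': datasets.Dataset, 'validation': datasets.Dataset, 'test': datasets.Dataset}
tokenized = cola.map(simple_tokenize_func({'sentence': 'text_idxs'}, hf_tokenizer))
dls = HF_Datasets(tokenized, cols=['text_idxs', 'label'],
                  hf_toker=hf_tokenizer).dataloaders(bs=64)
dls.show_batch(max_n=2)  # decoded text and labels, fastai-style
```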
2 changes: 1 addition & 1 deletion hugdatafast/__init__.py
@@ -1,3 +1,3 @@
__version__ = "0.4.4"
__version__ = "0.4.5"
from .fastai import *
from .transform import *
82 changes: 12 additions & 70 deletions hugdatafast/fastai.py
@@ -3,7 +3,7 @@
import json
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence
import nlp
import datasets
from fastai.text.all import *


@@ -207,7 +207,7 @@ def show(self, ctx=None, sep=';', color='black', **kwargs):
These two functions are imperfect.
But they cope with the multiple input columns problem (n_inp > 1), which causes no df printing but just sequential print.
This will be a problem when you are doing a non-text problem with n_inp > 1 (multiple input columns),
which shouldn't be the case of huggingface/nlp user.
which shouldn't be the case for huggingface/datasets users.
And I hope fastai comes up with a good solution to the show_batch multiple-inputs problem for text/non-text.
"""
@typedispatch
@@ -225,14 +225,14 @@ def show_results(x: tuple, y, samples, outs, ctxs=None, max_n=10, trunc_at=150,
return ctxs

class HF_Dataset():
"""A wrapper for :class:`nlp.Dataset`. It will behavior like original :class:`nlp.Dataset`,
"""A wrapper for :class:`datasets.Dataset`. It will behavior like original :class:`datasets.Dataset`,
but also function as a :class:`fastai.data.core.datasets` that provides samples and decodes."""

def __init__(self, hf_dset, cols=None, hf_toker=None, neat_show=False, n_inp=1):
"""
Args:
hf_dset (:class:`nlp.Dataset`): Prerocessed Hugging Face dataset to be wrapped.
cols (dict, optional): columns of :class:`nlp.Dataset` to be used to construct samples, and (optionally) semantic tensor type for each of those columns to decode.\n
hf_dset (:class:`datasets.Dataset`): Preprocessed Hugging Face dataset to be wrapped.
cols (dict, optional): columns of :class:`datasets.Dataset` to be used to construct samples, and (optionally) semantic tensor type for each of those columns to decode.\n
- cols(``Dict[Fastai Semantic Tensor]``): encode/decode column(key) with semantic tensor type(value). If {value} is ``noop``, semantic tensor of the column is by default `TensorTuple`.
- cols(``list[str]``): specify only columns and take default setting for semantic tensor type of them.\n
- if length is 1, regard the 1st element as `TensorText`
@@ -326,22 +326,22 @@ def _decode(self, t:TensorCategory, title): return self._decode_title(t.item(),
def _decode(self, t:TensorMultiCategory, title): return self._decode_title(t.tolist(), _MultiCategory, title)

def __getattr__(self, name):
"If not defined, let the nlp.Dataset in it act for us."
"If not defined, let the datasets.Dataset in it act for us."
if name in HF_Dataset.__dict__: return HF_Dataset.__dict__[name]
elif name in self.__dict__: return self.__dict__[name]
elif hasattr(self.hf_dset, name): return getattr(self.hf_dset, name)
raise AttributeError(f"Both 'HF_Dataset' object and 'nlp.Dataset' object have no '{name}' attribute ")
raise AttributeError(f"Neither 'HF_Dataset' object nor 'datasets.Dataset' object has a '{name}' attribute")

class HF_Datasets(FilteredBase):
"""Function as :class:`fastai.data.core.Datasets` to create :class:`fastai.data.core.Dataloaders` from a group of :class:`nlp.Dataset`s"""
"""Function as :class:`fastai.data.core.Datasets` to create :class:`fastai.data.core.Dataloaders` from a group of :class:`datasets.Dataset`s"""

_dl_type,_dbunch_type = MySortedDL,DataLoaders

@delegates(HF_Dataset.__init__)
def __init__(self, hf_dsets: dict, test_with_y=False, **kwargs):
"""
Args:
hf_dsets (`Dict[nlp.Dataset]`): Prerocessed Hugging Face Datasets, {key} is split name, {value} is :class:`nlp.Dataset`, order will become the order in :class:`fastai.data.core.Dataloaders`.
hf_dsets (`Dict[datasets.Dataset]`): Preprocessed Hugging Face Datasets; {key} is the split name, {value} is a :class:`datasets.Dataset`, and the order will become the order in :class:`fastai.data.core.Dataloaders`.
test_with_y (bool, optional): Whether the test set comes with real y (answers) rather than fake y (e.g. all -1 labels).
If ``False``, tell only the test set to construct samples from the first ``n_inp`` columns (do not output fake y).
And all datasets passed in ``hf_dsets`` whose names start with "test" will be regarded as test sets.
@@ -369,14 +369,14 @@ def dataloaders(self, device='cpu', cache_dir=None, cache_name=None, dl_kwargs=N
"""
Args:
device (str): device where output batches will be on. Because a batch is loaded to test when creating a :class:`fastai.data.core.Dataloaders`, to prevent always leaving a batch of tensors on cuda:0, it is suggested to use the default value cpu and then ``dls.to(other device)`` when you actually need it.
cache_dir (str, optional): directory to store caches of :class:`MySortedDL`. if ``None``, use cache directory of the first :class:`nlp.Dataset` in ``hf_dsets`` that passed to :method:`HF_Datasets.__init__`.
cache_dir (str, optional): directory to store caches of :class:`MySortedDL`. If ``None``, use the cache directory of the first :class:`datasets.Dataset` in ``hf_dsets`` passed to :method:`HF_Datasets.__init__`.
cache_name (str, optional): format string containing one param "{split}", which will be replaced with the split name to form the cache file name under `cache_dir` for each split. If ``None``, tell :class:`MySortedDL` not to cache.
dl_kwargs (list[dict], optional): the ith item is additional kwargs passed to the initialization of the ith dataloader for the ith split
kwargs: Passed to :func:`fastai.data.core.FilteredBase.dataloaders`
Example:
>>> tokenized_cola
{'train': nlp.Dataset, 'validation': nlp.Dataset, 'test': nlp.Dataset}
{'train': datasets.Dataset, 'validation': datasets.Dataset, 'test': datasets.Dataset}
>>> tokenized_cola['test'][0]
{'sentence': 'Bill whistled past the house.',
'label': -1, # Fake label. True labels are not open to the public.
@@ -423,62 +423,4 @@ def dataloaders(self, device='cpu', cache_dir=None, cache_name=None, dl_kwargs=N
if list(self.hf_dsets.keys())[0].startswith('test'):
kwargs['shuffle_train'] = False
kwargs['drop_last'] = False
return super().dataloaders(dl_kwargs=dl_kwargs, device=device, **kwargs)
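As a hedged illustration of the caching arguments documented above, the snippet below reuses ``tokenized_cola`` and ``hf_tokenizer`` from the earlier docstring example; the cache file name pattern is arbitrary and only for illustration.

```python
# Cache MySortedDL's per-split state under the first dataset's cache directory,
# one file per split via the '{split}' placeholder; the file name is illustrative.
dls = HF_Datasets(tokenized_cola, cols=['text_idxs', 'label'],
                  hf_toker=hf_tokenizer).dataloaders(bs=64, cache_name='dl_{split}.json')

# Batches stay on CPU by default; move them to another device only when needed.
dls = dls.to('cuda')
```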

def hf_merge_datasets(*datasets_s):
"""
Args:
*datasets_s: multiple dicts that contain :class:`nlp.Dataset`; each dict must have the same keys (split names), and all datasets should have some columns with the same names.
Returns
:class:`nlp.DatasetDict`
Example:
>>> rte, wnli = nlp.load_dataset('glue', 'rte'), nlp.load_dataset('glue', 'wnli') # Just for example; you might not concatenate rte and wnli datasets in practice.
# rte: {'train':Dataset(schema:{...,'sentence1':...,'sentence2':...}),'validation':...}, wnli: {'train':Dataset(schema:{...,'sentence1':...,'sentence2':...}),'validation':...
>>> merge_dsets = hf_merge_datasets(rte, wnli)
{'train': HF_MergedDataset, 'validation': HF_MergedDataset, 'test': HF_MergedDataset}
"""
keys_s = [ list(dsets.keys()) for dsets in datasets_s ]
for keys in keys_s: assert keys == keys_s[0]
merged_dsets = {}
for split in keys:
merged_dsets[split] = HF_MergedDataset(*[ dsets[split] for dsets in datasets_s])
return nlp.DatasetDict(merged_dsets)

class HF_MergedDataset():
"""Merge multiple :class:`nlp.Dataset` s to be a fake :class:`nlp.Dataset` be able to passed to :class:`HF_Dataset`
Args:
*datasets : multiple :class:`nlp.Dataset` s, that all of these have some columns with the same names.
Returns:
HF_MergedDataset: a :class:`nlp.Dataset` like object that concats passed datasets, with basic functions to be turned into :class:`HF_Dataset`.
Example:
>>> tokenized_wiki_train, tokenized_bookcorpus_train
Dataset(schema: {...., 'input_ids': 'list<item: int64>', ...), Dataset(schema: {...., 'input_ids': 'list<item: int64>', ...)
>>> merged_dset = HF_MergedDataset(tokenized_wiki_train, tokenized_bookcorpus_train)
>>> dls = HF_Datasets({'train': merged_dset}, cols=['input_ids'], hf_toker=hf_tokenizer).dataloaders(bs=128)
"""
def __init__(self, *datasets):
self.dsets = datasets
self.len = reduce(lambda a,d: a+len(d), self.dsets, 0)
def __len__(self):
return self.len
def __getitem__(self, i):
for dset in self.dsets:
if i < len(dset): return dset[i]
else: i -= len(dset)
raise IndexError
def set_format(self, type, columns):
for dset in self.dsets: dset.set_format(type, columns)
@property
def format(self):
form = self.dsets[0].format
for dset in self.dsets:
assert form == dset.format
return form
@property
def cache_files(self):
return concat(*[ds.cache_files for ds in self.dsets])
return super().dataloaders(dl_kwargs=dl_kwargs, device=device, **kwargs)
66 changes: 35 additions & 31 deletions hugdatafast/transform.py
@@ -1,16 +1,16 @@
from pathlib import Path
import pyarrow as pa
import nlp
import datasets
from fastai.text.all import *

@patch
def cache_directory(self: nlp.arrow_dataset.Dataset):
def cache_directory(self: datasets.arrow_dataset.Dataset):
return os.path.abspath(os.path.dirname(self.cache_files[0]['filename']))

@patch
def my_map(self: nlp.arrow_dataset.Dataset, *args, **kwargs):
def my_map(self: datasets.arrow_dataset.Dataset, *args, **kwargs):
"""
The same as :class:`nlp.arrow_dataset.Dataset` , but it can add cache directory and .arrow to cache_file_name autmomatically for us.
The same as :func:`datasets.arrow_dataset.Dataset.map`, but it adds the cache directory and ``.arrow`` to ``cache_file_name`` automatically for us.
Example:
>>> dataset.map(a_func, cache_file_name='processed')
@@ -23,9 +23,9 @@ def my_map(self: nlp.arrow_dataset.Dataset, *args, **kwargs):
return self.map(*args, cache_file_name=cache_file_name, **kwargs)

@patch
def my_map(self: nlp.dataset_dict.DatasetDict, *args, **kwargs):
def my_map(self: datasets.dataset_dict.DatasetDict, *args, **kwargs):
"""
The same as :class:`nlp.dataset_dict.DatasetDict` , but it can infer cache names for us.
The same as :func:`datasets.dataset_dict.DatasetDict.map`, but it can infer cache file names for us.
Example:
>>> datasets.map(a_func, cache_file_names='processed_{split}')
@@ -35,20 +35,20 @@ def my_map(self: nlp.dataset_dict.DatasetDict, *args, **kwargs):
self._check_values_type()
if cache_file_names is None: cache_file_names = {k: None for k in self}
if isinstance(cache_file_names, str): cache_file_names = {k: cache_file_names.format(split=k) for k in self}
return nlp.dataset_dict.DatasetDict({k: dataset.my_map(*args, cache_file_name=cache_file_names[k], **kwargs) for k, dataset in self.items()})

def simple_tokenize_func(cols, hf_tokenizer):
if isinstance(cols, list): cols = {c:c for c in cols}
elif isinstance(cols, str): cols = {cols:cols}
assert isinstance(cols, dict)

def _tokenize(example):
for in_col, out_col in cols.items():
example[out_col] = hf_tokenizer.convert_tokens_to_ids(hf_tokenizer.tokenize(example[in_col]))
return datasets.dataset_dict.DatasetDict({k: dataset.my_map(*args, cache_file_name=cache_file_names[k], **kwargs) for k, dataset in self.items()})

class SimpleTokenize():
def __init__(self, cols, hf_toker):
if isinstance(cols, list): cols = {c:c for c in cols}
elif isinstance(cols, str): cols = {cols:cols}
assert isinstance(cols, dict)
self.cols = cols
self.hf_toker = hf_toker
def __call__(self, example):
for in_col, out_col in self.cols.items():
example[out_col] = self.hf_toker.convert_tokens_to_ids(self.hf_toker.tokenize(example[in_col]))
return example

return _tokenize
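A brief usage sketch for the new ``SimpleTokenize`` callable together with the ``my_map`` patch above; the tokenizer source/checkpoint and the cache file name pattern are assumptions for illustration.

```python
# SimpleTokenize is a plain callable class (presumably easier to cache than a closure);
# here it writes token ids for the 'sentence' column into a new 'text_idxs' column.
from datasets import load_dataset
from transformers import AutoTokenizer  # assumed tokenizer source

hf_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # illustrative checkpoint
cola = load_dataset('glue', 'cola')

# my_map infers one cache file per split from the '{split}' placeholder.
tokenized = cola.my_map(SimpleTokenize({'sentence': 'text_idxs'}, hf_tokenizer),
                        cache_file_names='tokenized_{split}')
```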

class CombineTransform():
"""
Base class for Transforms that combine multiple original samples into a new sample.
@@ -62,7 +62,7 @@ def __init__(self, hf_dset, in_cols, out_cols, drop_last=False):
drop_last (`Optional[bool]`, default: `False`): whether to drop the last accumulated sample.
"""
# Always do the case of multiple datasets for the convenience of coding
if isinstance(hf_dset, nlp.arrow_dataset.Dataset): self.dsets = {'Single': hf_dset}; self.single=True
if isinstance(hf_dset, datasets.arrow_dataset.Dataset): self.dsets = {'Single': hf_dset}; self.single=True
else: self.dsets = hf_dset; self.single=False

# check column names
@@ -73,16 +73,17 @@ def __init__(self, hf_dset, in_cols, out_cols, drop_last=False):
for dset in self.dsets.values(): dset.set_format(type=None, columns=in_cols)

# dealing with last sample
self.last_idx = None
self.last_idx = len(hf_dset) - 1
self.drop_last = drop_last

def __call__(self, b, indices):
# If first batch, `nlp.Dataset.map` first test with several samples which affects our internal states, so we need to reinitialize.
# If first batch, `datasets.Dataset.map` first tests with several samples, which affects our internal state, so we need to reinitialize.
if 0 in indices:
self.reset_states()

self.new_b = { c:[] for c in self.out_cols }
for z in zip(*b.values()):
values = [ b[c] for c in self.in_cols ]
for z in zip(*values):
self.accumulate(*z)

# If last batch, decide whether to commit the last incomplete example
@@ -124,9 +125,9 @@ def create_example(self):
def map(self, batch_size=1000, cache_file_name=None, **kwargs):
"""
Args:
batch_size(int): See :class:`nlp.Dataset.map`, shouldn't be None here
batch_size(int): See :class:`datasets.Dataset.map`, shouldn't be None here
cache_file_name: The same with the one of :func:`my_map`
kwargs: passed to :class:`nlp.Dataset.map`
kwargs: passed to :class:`datasets.Dataset.map`
"""

# check
@@ -145,15 +146,18 @@ def map(self, batch_size=1000, cache_file_name=None, **kwargs):
mapped_dsets = {}
for k, dset in self.dsets.items():
self.last_idx = len(dset) - 1
mapped_dsets[k] = dset.map(function=self,
batched=True, batch_size=batch_size,
with_indices=True,
cache_file_name=cache_names[k],
remove_columns=self.in_cols, # Cuz output column has less rows (combined) than orginal column
**kwargs)
mapped_dset = dset.map(function=self,
batched=True, batch_size=batch_size,
with_indices=True,
num_proc=1,
cache_file_name=cache_names[k],
remove_columns=self.in_cols, # Because output columns have fewer rows (combined) than the original columns
**kwargs)
mapped_dset.set_format(None, columns=self.out_cols)
mapped_dsets[k] = mapped_dset

if self.single: return mapped_dsets['Single']
else: return nlp.DatasetDict(mapped_dsets)
else: return datasets.DatasetDict(mapped_dsets)

@delegates(CombineTransform, but=["inp_cols", "out_cols", "init_attrs"])
class LMTransform(CombineTransform):
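The subclass contract of ``CombineTransform`` (``reset_states``, ``accumulate``, ``create_example``, plus ``map``) is easiest to see with a toy, library-free sketch. Only those three method names are taken from the code in this diff; the ``commit_example`` helper and everything else below are assumptions for illustration.

```python
# A standalone toy mirroring the CombineTransform contract: accumulate() consumes
# one original sample at a time, create_example() emits the (possibly short) final
# combined sample, and reset_states() clears accumulation state.
# commit_example() is an assumed helper, not taken from the diff above.

class FixedLenBlocks:
    """Pack lists of token ids into fixed-length blocks (an LM-style combine)."""
    def __init__(self, block_len=8):
        self.block_len = block_len
        self.out = []              # combined examples collected so far
        self.reset_states()

    def reset_states(self):
        self.buffer = []           # token ids waiting for the next full block

    def commit_example(self, example):
        if example is not None:
            self.out.append(example)

    def accumulate(self, input_ids):
        self.buffer.extend(input_ids)
        while len(self.buffer) >= self.block_len:
            block, self.buffer = self.buffer[:self.block_len], self.buffer[self.block_len:]
            self.commit_example({'input_ids': block})

    def create_example(self):
        return {'input_ids': self.buffer} if self.buffer else None


t = FixedLenBlocks(block_len=8)
for ids in [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11], [12, 13]]:
    t.accumulate(ids)
t.commit_example(t.create_example())  # keep the short tail (drop_last=False behaviour)
print(t.out)  # [{'input_ids': [1, 2, 3, 4, 5, 6, 7, 8]}, {'input_ids': [9, 10, 11, 12, 13]}]
```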
4 changes: 2 additions & 2 deletions setup.py
@@ -7,7 +7,7 @@
REQUIRED_PKGS = [
'fastai>=2.0.8',
'fastcore>=1.0.1', # change of store_attr api
'nlp>=0.4.0',
'datasets',
]

setuptools.setup(
@@ -32,5 +32,5 @@
],
python_requires='>=3.6',
install_requires=REQUIRED_PKGS,
keywords='nlp machine learning datasets metrics fastai huggingface',
keywords='datasets machine learning datasets metrics fastai huggingface',
)
