This repository has been archived by the owner on May 5, 2023. It is now read-only.

Commit 4.0.2
richarddwang committed Aug 26, 2020
1 parent a570d19 commit 52063f2
Showing 7 changed files with 27 additions and 127 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -1,12 +1,12 @@
# hugdatafast
- The elegant integration of huggingface/nlp and fastai2, and some handy data transformation for huggingface/nlp.
+ The elegant integration of huggingface/nlp and fastai, and some handy data transformation for huggingface/nlp.

🎓 **Documentation**: https://hugdatafast.readthedocs.io/en/latest/

# Install
`pip install hugdatafast`

# Future Plan
- - I will try to merge this library to fastai2 and huggingface/nlp respectively. But to not introduce bugs into the two great libraries and for fast development, I may try it after my personal project which hugdatafast is created for, is completed (few months later I think), to somewhat assure it is mature enough to be merged into two libraries.
+ - I would like to merge this library into fastai and huggingface/nlp respectively, but I may not have time for it. You're welcome to help get the library merged into the two.
- The implementation of `ConcatTransform` works but might be too complex to extend, so I may discuss with huggingface how to improve it, if I have time.
- Currently it is designed to work with the dataset part of huggingface/nlp; I may also integrate its metric part.
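For orientation, the pipeline this README describes can be sketched in a few lines. `HF_Datasets` and `dataloaders` come from this package; the GLUE/CoLA dataset, column names, and tokenizer choice below are assumptions, not part of this commit:

```python
# A minimal sketch, assuming GLUE/CoLA and a BERT tokenizer (both assumptions).
import nlp
from transformers import AutoTokenizer
from hugdatafast import HF_Datasets

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
cola = nlp.load_dataset('glue', 'cola')
tokenized = {split: dset.map(lambda e: tokenizer(e['sentence']))
             for split, dset in cola.items()}
dsets = HF_Datasets(tokenized, cols=['input_ids', 'label'], hf_toker=tokenizer)
dls = dsets.dataloaders(bs=64)  # fastai DataLoaders; try dls.show_batch()
```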
2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -15,7 +15,7 @@ Installation

pip install hugdatafast

- This will install also the lastest ``fastai2`` and ``nlp``.
+ This will also install the latest ``fastai`` and ``nlp``.

.. toctree::
:maxdepth: 2
6 changes: 3 additions & 3 deletions docs/source/start.rst
@@ -10,7 +10,7 @@ Base use case
>>> from hugdatafast import *
.. note::
- This will also implicitly do ``from fastai2.text.all import *``
+ This will also implicitly do ``from fastai.text.all import *``

Can you turn your data pipeline into only 3 lines?

@@ -23,7 +23,7 @@ Can you turn your data pipeline into only 3 lines ?
Now you can enjoy

- 1. :func:`show_batch` of fastai2 \n
+ 1. :func:`show_batch` of fastai \n
Even if you don't use fastai to train, you can still use it as a normal DataLoader

.. code-block::
@@ -52,7 +52,7 @@ Even you don't use fastai to train, you can still use as a normal DataLoader
1 [CLS] as a teacher , you have to deal simultaneously with the administration ' s pressure 0 1
on you to succeed , and the children ' s to be a nice guy . [SEP] [PAD] [PAD]
- 3. Use it as normal Dataloaders if you don't use fastai . (Try fastai !)
+ 3. Use it as normal Dataloaders if you don't use fastai.

.. code-block::
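As a hedged companion to point 3 above: the resulting `DataLoaders` can be iterated like any PyTorch-style loader (`dls` is assumed to come from a pipeline like the one sketched in the README):

```python
# Sketch: iterate the fastai DataLoaders as a plain batch iterator.
for input_ids, labels in dls.train:
    print(input_ids.shape, labels.shape)  # e.g. torch.Size([64, 93]) torch.Size([64])
    break
```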
2 changes: 1 addition & 1 deletion hugdatafast/__init__.py
@@ -1,3 +1,3 @@
__version__ = "0.4.0"
__version__ = "0.4.2"
from .fastai import *
from .transform import *
24 changes: 15 additions & 9 deletions hugdatafast/fastai.py
@@ -4,7 +4,7 @@
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence
import nlp
- from fastai2.text.all import *
+ from fastai.text.all import *


@delegates()
@@ -20,9 +20,11 @@ def __init__(self, dataset, srtkey_fc=None, filter_fc=False, pad_idx=None, cache
- If ``False``, do not sort.
filter_fc (``*args->bool``, optional): Return ``True`` to keep the sample.
pad_idx (``int``, optional): pad each attribute of samples to the max length of that attribute within the batch.\n
+ - If ``List[int]``, specify a pad_idx for each attribute of a sample. e.g. a sample is a tuple (masked_inputs, labels); `pad_idx=[0, -100]` pads masked_inputs with 0 and labels with -100.
- If ``False``, do no padding.
- If ``None``, try ``dataset.pad_idx``; do no padding if there is no such attribute.
cache_file (``str``, optional): Path of a json file to cache info for sorting and filtering.
kwargs: keyword arguments for `TfmdDL` or `DataLoader`
Example:
>>> samples = [ (torch.tensor([1]), torch.tensor([7,8]), torch.tensor(1)),
@@ -45,12 +47,14 @@ def __init__(self, dataset, srtkey_fc=None, filter_fc=False, pad_idx=None, cache
# Defaults
if srtkey_fc is not False: srtkey_fc = lambda *x: len(x[0])
if pad_idx is None: pad_idx = getattr(dataset, 'pad_idx', False)
+ if isinstance(pad_idx, int): pad_idxs = [pad_idx] * len(dataset[0])
+ elif isinstance(pad_idx, (list, tuple)): pad_idxs = pad_idx
cache_file = Path(cache_file) if cache_file else None
idmap = list(range(len(dataset)))

# Save attributes
super().__init__(dataset, **kwargs)
- store_attr(self, 'pad_idx,srtkey_fc,filter_fc,cache_file,idmap')
+ store_attr(self, 'pad_idxs,srtkey_fc,filter_fc,cache_file,idmap')

# Prepare records for sorting / filtered samples
if srtkey_fc or filter_fc:
@@ -88,7 +92,7 @@ def create_item(self, i): return self.dataset[self.idmap[i]]

def create_batch(self, samples):
if self.pad_idx is False: return super().create_batch(samples)
- return tuple( pad_sequence(attr, batch_first=True, padding_value=self.pad_idx) if attr[0].shape else torch.stack(attr) for i, attr in enumerate(zip(*samples)))
+ return tuple( pad_sequence(attr, batch_first=True, padding_value=self.pad_idxs[i]) if attr[0].shape and isinstance(self.pad_idxs[i], int) else torch.stack(attr) for i, attr in enumerate(zip(*samples)))
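The effect of the new per-attribute padding can be checked with a standalone sketch (sample values assumed; `pad_idxs` as documented in `__init__` above):

```python
# Standalone illustration of per-attribute padding values.
import torch
from torch.nn.utils.rnn import pad_sequence

samples = [(torch.tensor([1, 2]), torch.tensor([5])),
           (torch.tensor([3]),    torch.tensor([6, 7, 8]))]
pad_idxs = [0, -100]  # pad inputs with 0, labels with -100

batch = tuple(pad_sequence(attr, batch_first=True, padding_value=pad_idxs[i])
              for i, attr in enumerate(zip(*samples)))
# batch[0] -> tensor([[1, 2], [3, 0]])
# batch[1] -> tensor([[5, -100, -100], [6, 7, 8]])
```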

def get_idxs(self):
idxs = super().get_idxs()
@@ -114,6 +118,8 @@ def shuffle_fn(self,idxs):

@delegates(TfmdDL.new)
def new(self, dataset=None, **kwargs):
+ if 'get_idxs' in kwargs: # when `Learner.get_preds` clones the dataloader, it passes `get_idxs`; reuse the cache file so we don't sort again
+     kwargs['cache_file'] = self.cache_file
# We don't pass filter_fc here, because we must not drop samples from the dev/test set.
return super().new(dataset=dataset, pad_idx=self.pad_idx, srtkey_fc=self.srtkey_fc, filter_fc=False, **kwargs)

@@ -139,7 +145,7 @@ def __new__(cls, *args, **kwargs):
for n,v in kwargs.items(): setattr(item, n, v)
return item

- class _Tuple(Tuple, ShowPrint):
+ class _Tuple(fastuple, ShowPrint):
def __new__(cls, *args, **kwargs):
item = super().__new__(cls, *args)
for n,v in kwargs.items(): setattr(item, n, v)
@@ -220,7 +226,7 @@ def show_results(x: tuple, y, samples, outs, ctxs=None, max_n=10, trunc_at=150,

class HF_Dataset():
"""A wrapper for :class:`nlp.Dataset`. It will behavior like original :class:`nlp.Dataset`,
but also function as a :class:`fastai2.data.core.datasets` that provides samples and decodes."""
but also function as a :class:`fastai.data.core.datasets` that provides samples and decodes."""

def __init__(self, hf_dset, cols=None, hf_toker=None, neat_show=False, n_inp=1):
"""
@@ -327,15 +333,15 @@ def __getattr__(self, name):
raise AttributeError(f"Neither 'HF_Dataset' object nor 'nlp.Dataset' object has a '{name}' attribute")
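The wrapper pattern behind this `__getattr__` can be sketched on its own (simplified; not the class's exact code):

```python
# Simplified sketch of attribute delegation: anything the wrapper doesn't
# define is looked up on the wrapped nlp.Dataset.
class DatasetWrapper:
    def __init__(self, hf_dset):
        self.hf_dset = hf_dset

    def __getattr__(self, name):  # called only when normal lookup fails
        try:
            return getattr(self.hf_dset, name)
        except AttributeError:
            raise AttributeError(
                f"Neither the wrapper nor the wrapped dataset has '{name}'")
```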

class HF_Datasets(FilteredBase):
"""Function as :class:`fastai2.data.core.Datasets` to create :class:`fastai2.data.core.Dataloaders` from a group of :class:`nlp.Dataset`s"""
"""Function as :class:`fastai.data.core.Datasets` to create :class:`fastai.data.core.Dataloaders` from a group of :class:`nlp.Dataset`s"""

_dl_type,_dbunch_type = MySortedDL,DataLoaders

@delegates(HF_Dataset.__init__)
def __init__(self, hf_dsets: dict, test_with_y=False, **kwargs):
"""
Args:
- hf_dsets (`Dict[nlp.Dataset]`): Prerocessed Hugging Face Datasets, {key} is split name, {value} is :class:`nlp.Dataset`, order will become the order in :class:`fastai2.data.core.Dataloaders`.
+ hf_dsets (`Dict[nlp.Dataset]`): Preprocessed Hugging Face Datasets; {key} is the split name, {value} is a :class:`nlp.Dataset`; their order becomes the order in :class:`fastai.data.core.Dataloaders`.
test_with_y (bool, optional): Whether the test set comes with real y (answers) rather than fake y (e.g. an all -1 label).
If ``False``, tell only the test set to construct samples from the first ``n_inp`` columns (do not output the fake y),
and all datasets passed in ``hf_dsets`` whose names start with "test" will be regarded as test sets.
@@ -362,11 +368,11 @@ def cache_dir(self): return Path(next(iter(self.hf_dsets.values())).cache_files[
def dataloaders(self, device='cpu', cache_dir=None, cache_name=None, dl_kwargs=None, **kwargs):
"""
Args:
- device (str): device where outputed batch will be on. Because a batch will be loaded to test when creating :class: `fastai2.data.core.Dataloaders`, to prevent always leaving a batch of tensor in cuda:0, using default value cpu and then ``dls.to(other device)`` at the time you want is suggested.
+ device (str): device the output batches will be on. Because a test batch is loaded when creating :class:`fastai.data.core.Dataloaders`, keeping the default ``cpu`` and calling ``dls.to(device)`` when you need it is suggested, to avoid always leaving a batch of tensors on cuda:0.
cache_dir (str, optional): directory to store caches of :class:`MySortedDL`. If ``None``, use the cache directory of the first :class:`nlp.Dataset` in ``hf_dsets`` passed to :method:`HF_Datasets.__init__`.
cache_name (str, optional): format string including one param "{split}", which will be replaced with the split name, used as the cache file name under ``cache_dir`` for each split. If ``None``, tell :class:`MySortedDL` not to cache.
dl_kwargs (list[dict], optional): the ith item is additional kwargs passed to the initialization of the ith dataloader (for the ith split).
- kwargs: Passed to :func:`fastai2.data.core.FilteredBase.dataloaders`
+ kwargs: Passed to :func:`fastai.data.core.FilteredBase.dataloaders`
Example:
>>> tokenized_cola
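A hedged usage sketch of these arguments (`dsets` and the cache file name are assumptions; the semantics are as documented above):

```python
# Sketch: cache sorting info per split, keep batches on cpu by default.
dls = dsets.dataloaders(
    bs=32,
    cache_name="dl_{split}.json",          # "{split}" is filled in per split
    dl_kwargs=[{}, {'drop_last': False}],  # extra kwargs for the 2nd dataloader
)
dls = dls.to('cuda')  # move to GPU only when you actually want it
```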
112 changes: 3 additions & 109 deletions hugdatafast/transform.py
@@ -1,7 +1,7 @@
from pathlib import Path
import pyarrow as pa
import nlp
- from fastai2.text.all import *
+ from fastai.text.all import *

class HF_BaseTransform():
"The base class of HuggingFace/nlp transform. Inherit it to get the ability to do :func:`map` on not only a :class:`nlp.Dataset` but also :class:`nlp.Dataset` s at once."
@@ -263,7 +263,7 @@ def accumulate(self, text): # *inp_cols
usable_len -= use_len
cursor += use_len
if self.residual_len == 0:
-       self.commit_example(self.create_example())
+     self.commit_example(self.create_example())

def create_example(self):
""" Implement the abstract method"""
Expand All @@ -276,110 +276,4 @@ def create_example(self):
self.new_text = []
self.residual_len = self._max_len

- return example
-
- @delegates(CombineTransform, but=["inp_cols", "out_cols", "init_attrs"])
- class ELECTRADataTransform(CombineTransform):
-   "Process any text corpus for ELECTRA's use"
-   def __init__(self, hf_dset, is_docs, text_col, max_length, hf_toker, delimiter='\n', **kwargs):
-     """
-     Args:
-       hf_dset (:class:`nlp.Dataset` or Dict[:class:`nlp.Dataset`]): **untokenized** Hugging Face dataset(s) to do the transform
-       is_docs (bool): whether each sample of this dataset is a doc
-       text_col (str): the name of the column of the dataset that contains text
-       max_length (int): max length of each sentence
-       hf_toker (:class:`transformers.PreTrainedTokenizer`): Hugging Face tokenizer
-       delimiter (str): the delimiter used to segment sentences in the input text
-       kwargs: passed to :class:`CombineTransform`
-     """
-     self.is_docs = is_docs
-     self.in_col = text_col
-     self._current_sentences = []
-     self._current_length = 0
-     self._max_length = max_length
-     self._target_length = max_length
-     self.cls_idx, self.sep_idx = hf_toker.cls_token_id, hf_toker.sep_token_id
-     self.hf_toker = hf_toker
-     self.delimiter = delimiter
-     super().__init__(hf_dset, inp_cols=[self.in_col], out_cols=['input_ids','attention_mask','token_type_ids'],
-                      init_attrs=['_current_sentences', '_current_length', '_target_length'], **kwargs)
-
-   """
-   These two main functions adapt the official source code that creates the pretraining dataset to CombineTransform
-   """
-   def accumulate(self, text):
-     sentences = text.split(self.delimiter)
-     for sentence in sentences:
-       if not sentence: continue # skip empty
-       tokids = self.hf_toker.convert_tokens_to_ids(self.hf_toker.tokenize(sentence))
-       self.add_line(tokids)
-     # end of doc
-     if self.is_docs and self._current_length > 0:
-       self.commit_example(self.create_example())
-
-   def create_example(self):
-     input_ids, token_type = self._create_example() # this line resets _current_sentences and _current_length at the end
-     return {'input_ids': input_ids, 'attention_mask': [1]*len(input_ids), 'token_type_ids': token_type}
-   # ...................................................
-
-   def add_line(self, tokids):
-     """Adds a line of text to the current example being built."""
-     self._current_sentences.append(tokids)
-     self._current_length += len(tokids)
-     if self._current_length >= self._target_length:
-       self.commit_example(self.create_example())
-
-   def _create_example(self):
-     """Creates a pre-training example from the current list of sentences."""
-     # small chance to only have one segment as in classification tasks
-     if random.random() < 0.1:
-       first_segment_target_length = 100000
-     else:
-       # -3 due to not yet having [CLS]/[SEP] tokens in the input text
-       first_segment_target_length = (self._target_length - 3) // 2
-
-     first_segment = []
-     second_segment = []
-     for sentence in self._current_sentences:
-       # the sentence goes to the first segment if (1) the first segment is
-       # empty, (2) the sentence doesn't put the first segment over length or
-       # (3) 50% of the time when it does put the first segment over length
-       if (len(first_segment) == 0 or
-           len(first_segment) + len(sentence) < first_segment_target_length or
-           (len(second_segment) == 0 and
-            len(first_segment) < first_segment_target_length and
-            random.random() < 0.5)):
-         first_segment += sentence
-       else:
-         second_segment += sentence
-
-     # trim to max_length while accounting for not-yet-added [CLS]/[SEP] tokens
-     first_segment = first_segment[:self._max_length - 2]
-     second_segment = second_segment[:max(0, self._max_length - len(first_segment) - 3)]
-
-     # prepare to start building the next example
-     self._current_sentences = []
-     self._current_length = 0
-     # small chance for random-length instead of max_length-length example
-     if random.random() < 0.05:
-       self._target_length = random.randint(5, self._max_length)
-     else:
-       self._target_length = self._max_length
-
-     return self._make_example(first_segment, second_segment)
-
-   def _make_example(self, first_segment, second_segment):
-     """Converts two "segments" of text into a single pre-training example."""
-     input_ids = [self.cls_idx] + first_segment + [self.sep_idx]
-     token_type = [0]*len(input_ids)
-     if second_segment:
-       input_ids += second_segment + [self.sep_idx]
-       token_type += [1]*(len(second_segment)+1)
-     return input_ids, token_type
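A worked illustration of what the deleted `_make_example` produced (token ids assumed):

```python
# Assumed ids: cls_idx=101, sep_idx=102
# first_segment = [10, 11], second_segment = [12]
# input_ids  = [101, 10, 11, 102, 12, 102]   # [CLS] A [SEP] B [SEP]
# token_type = [0,   0,  0,  0,   1,  1]     # 0 -> first segment, 1 -> second
```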

-   def __getstate__(self):
-     "specify anything you don't want pickled here; remember to use copy so as not to modify the original instance"
-     state = self.__dict__.copy()
-     state['hf_toker'] = None
-     return state
+ return example
4 changes: 2 additions & 2 deletions setup.py
@@ -5,7 +5,7 @@
long_description = fh.read()

REQUIRED_PKGS = [
- 'fastai2',
+ 'fastai',
'nlp',
]

@@ -14,7 +14,7 @@
version=__version__,
author="Richard Wang",
author_email="richardyy1188@gmail.com",
description="The elegant bridge between hugginface data and fastai2",
description="The elegant bridge between hugginface data and fastai",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/richarddwang/hugdatafast",
