feat(#422): introducing the rb.Dataset* classes (#1109)
* feat: add Dataset class

* test: add dataset tests

* feat: add meaningful error message to TaskType

* feat: add to/from datasets for text classification

* test: add dataset fixtures

* feat: implement pandas to text classification

* feat: add token classification support

* test: add token classification tests

* test: use singlelabel_textclassification_records

* chore: small improvements

* refactor: switch to class implementation

* chore: import in init

* test: add missing tests plus minor fixes

* chore: add future warning about as_pandas

* chore: more integrations in the library

* fix: wrong import

* chore: add new test dependency

* test: add test for tasktype

* feat: ignore not supported columns

* test: add tests for read_pandas/datasets

* docs: put type hints only in description, as it becomes too messy

* test: improve tests

* docs: curate docstrings

* docs: add datasets to python reference

* fix: return None's instead of empty dicts for metrics

* docs: add dataset guide

* docs: increase contrast

* Update docs/guides/datasets_in_the_client.ipynb

Co-authored-by: Daniel Vila Suero <daniel@recogn.ai>

* Update docs/guides/datasets_in_the_client.ipynb

Co-authored-by: Daniel Vila Suero <daniel@recogn.ai>

* Update docs/guides/datasets_in_the_client.ipynb

Co-authored-by: Daniel Vila Suero <daniel@recogn.ai>

* Update docs/guides/datasets_in_the_client.ipynb

Co-authored-by: Daniel Vila Suero <daniel@recogn.ai>

* Update docs/guides/datasets_in_the_client.ipynb

Co-authored-by: Daniel Vila Suero <daniel@recogn.ai>

* test: add general log/load tests for all allowed input types of rb.load

* fix: remove append

Co-authored-by: Daniel Vila Suero <daniel@recogn.ai>
(cherry picked from commit 14c8087)
David Fidalgo authored and frascuchon committed Feb 17, 2022
1 parent a374204 commit b5bbca6
Showing 19 changed files with 1,769 additions and 84 deletions.
27 changes: 14 additions & 13 deletions docs/_static/css/custom.css
@@ -12,6 +12,7 @@
--sidebarColor: #484848;
--sidebarTitleColor: #404040;
--noteHeader: #4C4EA3;
--noteBackground: #4c4ea31c;
--white: #ffffff;
--line: #e9eaed;
}
@@ -135,13 +136,13 @@ p {
line-height: 1.8em;
}
.rst-content .note, .rst-content .seealso, .rst-content .wy-alert-info.admonition, .rst-content .wy-alert-info.admonition-todo, .rst-content .wy-alert-info.attention, .rst-content .wy-alert-info.caution, .rst-content .wy-alert-info.danger, .rst-content .wy-alert-info.error, .rst-content .wy-alert-info.hint, .rst-content .wy-alert-info.important, .rst-content .wy-alert-info.tip, .rst-content .wy-alert-info.warning, .wy-alert.wy-alert-info {
background: var(--sidebarBackgroundDark);
background: var(--noteBackground);
}
.rst-content .note .admonition-title, .rst-content .note .wy-alert-title, .rst-content .seealso .admonition-title, .rst-content .seealso .wy-alert-title, .rst-content .wy-alert-info.admonition-todo .admonition-title, .rst-content .wy-alert-info.admonition-todo .wy-alert-title, .rst-content .wy-alert-info.admonition .admonition-title, .rst-content .wy-alert-info.admonition .wy-alert-title, .rst-content .wy-alert-info.attention .admonition-title, .rst-content .wy-alert-info.attention .wy-alert-title, .rst-content .wy-alert-info.caution .admonition-title, .rst-content .wy-alert-info.caution .wy-alert-title, .rst-content .wy-alert-info.danger .admonition-title, .rst-content .wy-alert-info.danger .wy-alert-title, .rst-content .wy-alert-info.error .admonition-title, .rst-content .wy-alert-info.error .wy-alert-title, .rst-content .wy-alert-info.hint .admonition-title, .rst-content .wy-alert-info.hint .wy-alert-title, .rst-content .wy-alert-info.important .admonition-title, .rst-content .wy-alert-info.important .wy-alert-title, .rst-content .wy-alert-info.tip .admonition-title, .rst-content .wy-alert-info.tip .wy-alert-title, .rst-content .wy-alert-info.warning .admonition-title, .rst-content .wy-alert-info.warning .wy-alert-title, .rst-content .wy-alert.wy-alert-info .admonition-title, .wy-alert.wy-alert-info .rst-content .admonition-title, .wy-alert.wy-alert-info .wy-alert-title {
background: var(--noteHeader);
}
html.writer-html4 .rst-content dl:not(.docutils)>dt, html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple)>dt {
background: var(--sidebarBackgroundDark);
background: var(--noteBackground);
color: #484848;
border-top: 3px solid var(--noteHeader);
font-weight: 600;
@@ -296,7 +297,7 @@ a:active {
.highlight .vc { color: #d5abe0 } /* Name.Variable.Class */
.highlight .vg { color: #d5abe0 } /* Name.Variable.Global */
.highlight .vi { color: #d5abe0 } /* Name.Variable.Instance */
.highlight .il { color: #fbffc2 }
.highlight .il { color: #fbffc2 }

@font-face {
font-family: 'Futura Medium';
font-weight: normal;
src: local('Futura Medium Condensed'), url('fonts/futura-medium-condensed.woff2') format('woff2');
}


@font-face {
font-family: 'Futura Bold';
font-style: normal;
font-weight: normal;
src: local('Futura Bold'), url('fonts/futura-bold.woff2') format('woff2');
}


@font-face {
font-family: 'Futura Light';
font-style: normal;
@@ -356,18 +357,18 @@ a:active {
}
.wy-menu-vertical li.toctree-l2.current>a {
padding-left: 34px;
padding-right: 2em;
padding-right: 2em;
};
.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a,
.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a,
.wy-menu-vertical li.toctree-l3.current>a {
padding-left: 4em;
padding-right: 2em;
padding-right: 2em;
}
}
/* Change the white to any color */
input:-webkit-autofill,
input:-webkit-autofill:hover,
input:-webkit-autofill:focus,
input:-webkit-autofill:hover,
input:-webkit-autofill:focus,
input:-webkit-autofill:active
{
-webkit-box-shadow: 0 0 0 30px white inset !important;
.rst-content .highlighted {
background: #c5c6e0;
box-shadow: 0 0 0 2px #c5c6e0;
}
}
3 changes: 1 addition & 2 deletions docs/conf.py
@@ -80,8 +80,7 @@
</style>
"""

# TODO: Change this to "both" once Sphinx 4.1 is out
autodoc_typehints = "both"
autodoc_typehints = "description"

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
176 changes: 176 additions & 0 deletions docs/guides/datasets_in_the_client.ipynb
@@ -0,0 +1,176 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "d09c1533-97fd-43f7-83ea-f3a56edd1d5e",
"metadata": {},
"source": [
"# Datasets\n",
"\n",
"This guide showcases some features of the `Dataset` classes in the Rubrix client.\n",
"The Dataset classes are lightweight containers for Rubrix records. These classes facilitate importing from and exporting to different formats (e.g., `pandas.DataFrame`, `datasets.Dataset`) as well as sharing and versioning Rubrix datasets using the Hugging Face Hub.\n",
"\n",
"For each record type there's a corresponding Dataset class called `DatasetFor<RecordType>`.\n",
"You can look up their API in the [reference section](../reference/python/python_client.rst#module-rubrix.client.datasets)."
]
},
{
"cell_type": "markdown",
"id": "ae9d4c9e-24a1-4a59-8a17-3b6ac1a39c88",
"metadata": {},
"source": [
"## Working with a Dataset\n",
"\n",
"Under the hood, the Dataset classes store the records in a simple Python list.\n",
"Therefore, working with a Dataset class is not very different from working with a simple list of records:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "edbf8c7f-463d-48ee-944a-3adb57edb159",
"metadata": {},
"outputs": [],
"source": [
"import rubrix as rb\n",
"\n",
"# Start with a list of Rubrix records\n",
"dataset_rb = rb.DatasetForTextClassification(my_records)\n",
"\n",
"# Loop over the dataset\n",
"for record in dataset_rb:\n",
" print(record)\n",
" \n",
"# Index into the dataset\n",
"dataset_rb[0] = rb.TextClassificationRecord(inputs=\"replace record\")\n",
"\n",
"# log a dataset to the Rubrix web app\n",
"rb.log(dataset_rb, \"my_dataset\")"
]
},
{
"cell_type": "markdown",
"id": "2d3e9fc0-8563-4727-abca-62205a4de385",
"metadata": {},
"source": [
"The Dataset classes do some extra checks for you to make sure you do not mix record types when appending or indexing into a dataset."
]
},
{
"cell_type": "markdown",
"id": "df88889b-12f4-472f-bcbe-fb47be475d02",
"metadata": {},
"source": [
"## Importing from other formats\n",
"\n",
"When you have your data in a [_pandas DataFrame_](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) or a [_datasets Dataset_](https://huggingface.co/docs/datasets/access.html), we provide some neat shortcuts to import this data into a Rubrix Dataset. \n",
"You have to make sure that the data follows the record model of a specific task; otherwise you will get validation errors. \n",
"Columns in your DataFrame/Dataset that are not supported or recognized will simply be ignored.\n",
"\n",
"The record models of the tasks are explained in the [reference section](../reference/python/python_client.rst#module-rubrix.client.models). \n",
"\n",
"<div class=\"alert alert-info\">\n",
"\n",
"Note\n",
"\n",
"Due to its pyarrow nature, data in a `datasets.Dataset` has to follow a slightly different model, which you can look up in the examples of the `Dataset*.from_datasets` [docstrings](../reference/python/python_client.rst#rubrix.client.datasets.DatasetForTokenClassification.from_datasets). \n",
" \n",
"</div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "62ca56d4-2bb5-4c77-a069-7a50ee78b415",
"metadata": {},
"outputs": [],
"source": [
"import rubrix as rb\n",
"\n",
"# import data from a pandas DataFrame\n",
"dataset_rb = rb.read_pandas(my_dataframe, task=\"TextClassification\")\n",
"\n",
"# import data from a datasets Dataset\n",
"dataset_rb = rb.read_datasets(my_dataset, task=\"TextClassification\")"
]
},
{
"cell_type": "markdown",
"id": "eb290a71-ad07-496f-b167-ad80d91fa633",
"metadata": {},
"source": [
"## Sharing via the Hugging Face Hub\n",
"\n",
"You can easily share your Rubrix dataset with your community via the Hugging Face Hub.\n",
"For this, you just need to export your Rubrix Dataset to a `datasets.Dataset` and [push it to the hub](https://huggingface.co/docs/datasets/upload_dataset.html?highlight=push_to_hub#upload-from-python):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4d6d70b-0b91-4efb-94b6-6b7710c105c2",
"metadata": {},
"outputs": [],
"source": [
"import rubrix as rb\n",
"\n",
"# load your annotated dataset from the Rubrix web app\n",
"dataset_rb = rb.load(\"my_dataset\", as_pandas=False)\n",
"\n",
"# export your Rubrix Dataset to a datasets Dataset\n",
"dataset_ds = dataset_rb.to_datasets()\n",
"\n",
"# push the dataset to the Hugging Face Hub\n",
"dataset_ds.push_to_hub(\"my_dataset\")"
]
},
{
"cell_type": "markdown",
"id": "696605dd-be87-4ae6-b367-b0cdabfaf39f",
"metadata": {},
"source": [
"Afterward, your community can easily access your annotated dataset and log it directly to the Rubrix web app:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e70a4792-bc91-4d64-8465-b2bccf23502f",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"# download the dataset from the Hugging Face Hub\n",
"dataset_ds = load_dataset(\"user/my_dataset\", split=\"train\")\n",
"\n",
"# read in dataset, assuming it's a dataset for text classification\n",
"dataset_rb = rb.read_datasets(dataset_ds, task=\"TextClassification\")\n",
"\n",
"# log the dataset to the Rubrix web app\n",
"rb.log(dataset_rb, \"dataset_by_user\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
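The notebook above describes the Dataset classes as list-backed containers that refuse to mix record types when appending or indexing. A minimal sketch of that design, with hypothetical names — this is an illustration, not the actual Rubrix implementation:

```python
# Sketch of a list-backed dataset container with record-type checks,
# mirroring the behavior described in the guide (hypothetical names).

class TextClassificationRecord:
    def __init__(self, inputs):
        self.inputs = inputs


class DatasetForTextClassification:
    _RECORD_TYPE = TextClassificationRecord

    def __init__(self, records=None):
        # under the hood, records live in a plain Python list
        self._records = []
        for record in records or []:
            self.append(record)

    def _check(self, record):
        # the "extra check": refuse to mix record types
        if not isinstance(record, self._RECORD_TYPE):
            raise TypeError(
                f"Expected {self._RECORD_TYPE.__name__}, "
                f"got {type(record).__name__}"
            )

    def append(self, record):
        self._check(record)
        self._records.append(record)

    def __getitem__(self, key):
        return self._records[key]

    def __setitem__(self, key, record):
        self._check(record)
        self._records[key] = record

    def __iter__(self):
        return iter(self._records)

    def __len__(self):
        return len(self._records)


dataset = DatasetForTextClassification([TextClassificationRecord("hello")])
dataset[0] = TextClassificationRecord("replace record")  # type-checked replace
```

Because iteration, indexing, and `len()` all delegate to the inner list, the class behaves like a list of records everywhere except where type safety matters.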
1 change: 1 addition & 0 deletions docs/index.rst
@@ -180,6 +180,7 @@ You can join the conversation on our Github page and our Github forum.
guides/weak-supervision
guides/monitoring
guides/metrics
guides/datasets_in_the_client

.. toctree::
:maxdepth: 3
6 changes: 6 additions & 0 deletions docs/reference/python/python_client.rst
@@ -20,3 +20,9 @@ Models
.. automodule:: rubrix.client.models
:members:
:exclude-members: BaseRecord, BulkResponse

Datasets
--------

.. automodule:: rubrix.client.datasets
:members: DatasetForTextClassification, DatasetForTokenClassification, DatasetForText2Text, read_datasets, read_pandas
1 change: 1 addition & 0 deletions environment_dev.yml
@@ -29,6 +29,7 @@ dependencies:
- pre-commit==2.15.0
# extra test dependencies
- cleanlab
- datasets>1.17.0
- https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0.tar.gz
- flair==0.10
- flyingsquid
18 changes: 13 additions & 5 deletions src/rubrix/__init__.py
@@ -28,6 +28,14 @@

from rubrix._constants import DEFAULT_API_KEY
from rubrix.client import RubrixClient
from rubrix.client.datasets import (
Dataset,
DatasetForText2Text,
DatasetForTextClassification,
DatasetForTokenClassification,
read_datasets,
read_pandas,
)
from rubrix.client.models import (
BulkResponse,
Record,
@@ -143,7 +151,7 @@ def set_workspace(ws: str) -> None:


def log(
records: Union[Record, Iterable[Record]],
records: Union[Record, Iterable[Record], Dataset],
name: str,
tags: Optional[Dict[str, str]] = None,
metadata: Optional[Dict[str, Any]] = None,
@@ -206,19 +214,19 @@ def load(
ids: Optional[List[Union[str, int]]] = None,
limit: Optional[int] = None,
as_pandas: bool = True,
) -> Union[pandas.DataFrame, List[Record]]:
"""Loads a dataset as a pandas DataFrame or a list of records.
) -> Union[pandas.DataFrame, Dataset]:
"""Loads a dataset as a pandas DataFrame or a Dataset.
Args:
name: The dataset name.
query: An ElasticSearch query with the
`query string syntax <https://rubrix.readthedocs.io/en/stable/reference/webapp/search_records.html>`_
ids: If provided, load dataset records with given ids.
limit: The number of records to retrieve.
as_pandas: If True, return a pandas DataFrame. If False, return a list of records.
as_pandas: If True, return a pandas DataFrame. If False, return a Dataset.
Returns:
The dataset as a pandas Dataframe or a list of records.
The dataset as a pandas Dataframe or a Dataset.
Examples:
>>> import rubrix as rb
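The diff above changes `rb.load` to return a `Dataset` when `as_pandas=False`, and the commit list mentions adding a FutureWarning about `as_pandas`. A minimal, hypothetical sketch of that deprecation pattern — the function body and return values are stand-ins, not the actual Rubrix implementation:

```python
import warnings


def load(name, as_pandas=True):
    """Sketch of warning callers about a default that will change."""
    if as_pandas:
        # nudge callers relying on the pandas default toward as_pandas=False
        warnings.warn(
            "Returning a pandas DataFrame from load() may change in a future "
            "release; pass as_pandas=False to get a Dataset instead.",
            FutureWarning,
        )
        return {"name": name, "format": "pandas"}  # stand-in for a DataFrame
    return {"name": name, "format": "dataset"}     # stand-in for a Dataset


result = load("my_dataset", as_pandas=False)  # no warning on the new path
```

Using `FutureWarning` (rather than `DeprecationWarning`) makes the message visible to end users by default, which suits a change in a public API's return type.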
