From 08cd7b43bf2c1f690c38264910e4c5697f48a52e Mon Sep 17 00:00:00 2001 From: Daniel Peng Date: Mon, 12 May 2025 16:48:30 -0400 Subject: [PATCH 1/5] refactor/chore: remove dataset and benchmark 'upload...', 'upload_to_hub' functions from Python client --- polaris/benchmark/_base.py | 24 --- polaris/dataset/_base.py | 10 - polaris/dataset/_dataset.py | 15 -- polaris/dataset/_dataset_v2.py | 15 -- polaris/hub/client.py | 329 --------------------------------- 5 files changed, 393 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 67d73ba8..3014e753 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -167,30 +167,6 @@ def _get_subset(self, indices, hide_targets=True, featurization_fn=None) -> Subs featurization_fn=featurization_fn, ) - def upload_to_hub( - self, - settings: PolarisHubSettings | None = None, - cache_auth_token: bool = True, - access: AccessType = "private", - owner: HubOwner | str | None = None, - parent_artifact_id: str | None = None, - **kwargs: dict, - ): - """ - Very light, convenient wrapper around the - [`PolarisHubClient.upload_benchmark`][polaris.hub.client.PolarisHubClient.upload_benchmark] method. - """ - from polaris.hub.client import PolarisHubClient - - with PolarisHubClient( - settings=settings, - cache_auth_token=cache_auth_token, - **kwargs, - ) as client: - return client.upload_benchmark( - self, access=access, owner=owner, parent_artifact_id=parent_artifact_id - ) - def to_json(self, destination: str) -> str: """Save the benchmark to a destination directory as a JSON file. diff --git a/polaris/dataset/_base.py b/polaris/dataset/_base.py index 8ecccf98..115c38ba 100644 --- a/polaris/dataset/_base.py +++ b/polaris/dataset/_base.py @@ -301,16 +301,6 @@ def get_data( """ raise NotImplementedError - @abc.abstractmethod - def upload_to_hub( - self, - access: AccessType = "private", - owner: HubOwner | str | None = None, - parent_artifact_id: str | None = None, - ): - """Uploads the dataset to the Polaris Hub.""" - raise NotImplementedError - @classmethod @abc.abstractmethod def from_json(cls, path: str): diff --git a/polaris/dataset/_dataset.py b/polaris/dataset/_dataset.py index df2325c1..a2b4fa8a 100644 --- a/polaris/dataset/_dataset.py +++ b/polaris/dataset/_dataset.py @@ -218,21 +218,6 @@ def get_data( return arr - def upload_to_hub( - self, - access: AccessType = "private", - owner: HubOwner | str | None = None, - parent_artifact_id: str | None = None, - ): - """ - Very light, convenient wrapper around the - [`PolarisHubClient.upload_dataset`][polaris.hub.client.PolarisHubClient.upload_dataset] method. - """ - from polaris.hub.client import PolarisHubClient - - with PolarisHubClient() as client: - client.upload_dataset(self, owner=owner, access=access, parent_artifact_id=parent_artifact_id) - @classmethod def from_json(cls, path: str): """Loads a dataset from a JSON file. diff --git a/polaris/dataset/_dataset_v2.py b/polaris/dataset/_dataset_v2.py index 40ba3ba5..69cf3d5c 100644 --- a/polaris/dataset/_dataset_v2.py +++ b/polaris/dataset/_dataset_v2.py @@ -192,21 +192,6 @@ def get_data(self, row: int, col: str, adapters: dict[str, Adapter] | None = Non return arr - def upload_to_hub( - self, - access: AccessType = "private", - owner: HubOwner | str | None = None, - parent_artifact_id: str | None = None, - ): - """ - Uploads the dataset to the Polaris Hub. 
- """ - - from polaris.hub.client import PolarisHubClient - - with PolarisHubClient() as client: - client.upload_dataset(self, owner=owner, access=access, parent_artifact_id=parent_artifact_id) - @classmethod def from_json(cls, path: str): """ diff --git a/polaris/hub/client.py b/polaris/hub/client.py index 30627312..c4b6ee74 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -535,335 +535,6 @@ def upload_results( f"[green]Your result has been successfully uploaded to the Hub. View it here: {result_url}" ) - def upload_dataset( - self, - dataset: DatasetV1 | DatasetV2, - access: AccessType = "private", - timeout: TimeoutTypes = (10, 200), - owner: HubOwner | str | None = None, - if_exists: ZarrConflictResolution = "replace", - parent_artifact_id: str | None = None, - ): - """Upload a dataset to the Polaris Hub. - - Info: Owner - You have to manually specify the owner in the dataset data model. Because the owner could - be a user or an organization, we cannot automatically infer this from just the logged-in user. - - Note: Required metadata - The Polaris client and Hub maintain different requirements as to which metadata is required. - The requirements by the Hub are stricter, so when uploading to the Hub you might - get some errors on missing metadata. Make sure to fill-in as much of the metadata as possible - before uploading. - - Args: - dataset: The dataset to upload. - access: Grant public or private access to result - timeout: Request timeout values. User can modify the value when uploading large dataset as needed. - This can be a single value with the timeout in seconds for all IO operations, or a more granular - tuple with (connect_timeout, write_timeout). The type of the the timout parameter comes from `httpx`. - Since datasets can get large, it might be needed to increase the write timeout for larger datasets. - See also: https://www.python-httpx.org/advanced/#timeout-configuration - owner: Which Hub user or organization owns the artifact. Takes precedence over `dataset.owner`. - if_exists: Action for handling existing files in the Zarr archive. Options are 'raise' to throw - an error, 'replace' to overwrite, or 'skip' to proceed without altering the existing files. - parent_artifact_id: The `owner/slug` of the parent dataset, if uploading a new version of a dataset. - """ - # Normalize timeout - if timeout is None: - timeout = self.settings.default_timeout - - # Check if a dataset license was specified prior to upload - if not dataset.license: - raise InvalidDatasetError( - f"\nPlease specify a supported license for this dataset prior to uploading to the Polaris Hub.\nOnly some licenses are supported - {get_args(SupportedLicenseType)}." - ) - - if isinstance(dataset, DatasetV1): - self._upload_v1_dataset(dataset, timeout, access, owner, if_exists, parent_artifact_id) - elif isinstance(dataset, DatasetV2): - self._upload_v2_dataset(dataset, timeout, access, owner, if_exists, parent_artifact_id) - - def _upload_v1_dataset( - self, - dataset: DatasetV1, - timeout: TimeoutTypes, - access: AccessType, - owner: HubOwner | str | None, - if_exists: ZarrConflictResolution, - parent_artifact_id: str | None, - ): - """ - Upload a V1 dataset to the Polaris Hub. 
- """ - - with track_progress(description="Uploading dataset", total=1) as (progress, task): - # Get the serialized data-model - # We exclude the table as it handled separately - dataset.owner = HubOwner.normalize(owner or dataset.owner) - dataset_json = dataset.model_dump(exclude={"table"}, exclude_none=True, by_alias=True) - - # If the dataset uses Zarr, we will save the Zarr archive to the Hub as well - if dataset.uses_zarr: - dataset_json["zarrRootPath"] = f"{StorageSession.polaris_protocol}://data.zarr" - - # Uploading a dataset is a three-step process. - # 1. Upload the dataset metadata to the Hub and prepare the Hub to receive the data - # 2. Upload the parquet file to Hub storage - # 3. Upload the associated Zarr archive to Hub storage - - # Prepare the parquet file - in_memory_parquet = BytesIO() - dataset.table.to_parquet(in_memory_parquet) - parquet_size = len(in_memory_parquet.getbuffer()) - parquet_md5 = md5(in_memory_parquet.getbuffer()).hexdigest() - - # Step 1: Upload metadata - # Instead of directly uploading the data, we announce to the Hub that we intend to upload it. - # We do so separately for the Zarr archive and Parquet file. - url = f"/v1/dataset/{dataset.artifact_id}" - response = self._base_request_to_hub( - url=url, - method="PUT", - json={ - "tableContent": { - "size": parquet_size, - "fileType": "parquet", - "md5Sum": parquet_md5, - }, - "zarrContent": [md5sum.model_dump() for md5sum in dataset._zarr_md5sum_manifest], - "access": access, - "parentArtifactId": parent_artifact_id, - **dataset_json, - }, - timeout=timeout, - ) - - inserted_dataset = response.json() - - # We modify the slug in the server - # Update dataset.slug here so dataset.urn is constructed correctly - dataset.slug = inserted_dataset["slug"] - - with StorageSession(self, "write", dataset.urn) as storage: - with track_progress(description="Copying Parquet file", total=1) as (progress, task): - # Step 2: Upload the parquet file - progress.log("[yellow]This may take a while.") - storage.set_file("root", in_memory_parquet.getvalue()) - - # Step 3: Upload any associated Zarr archive - if dataset.uses_zarr: - with track_progress(description="Copying Zarr archive", total=1): - destination = storage.store("extension") - - # Locally consolidate Zarr archive metadata. Future updates on handling consolidated - # metadata based on Zarr developers' recommendations can be tracked at: - # https://github.com/zarr-developers/zarr-python/issues/1731 - zarr.consolidate_metadata(dataset.zarr_root.store.store) - zmetadata_content = dataset.zarr_root.store.store[".zmetadata"] - destination[".zmetadata"] = zmetadata_content - - # Copy the Zarr archive to the Hub - destination.copy_from_source( - dataset.zarr_root.store.store, if_exists=if_exists, log=logger.info - ) - - dataset_url = urljoin(self.settings.hub_url, response.headers.get("Content-Location")) - progress.log( - f"[green]Your dataset has been successfully uploaded to the Hub.\nView it here: {dataset_url}" - ) - - def _upload_v2_dataset( - self, - dataset: DatasetV2, - timeout: TimeoutTypes, - access: AccessType, - owner: HubOwner | str | None, - if_exists: ZarrConflictResolution, - parent_artifact_id: str | None, - ): - """ - Upload a V2 dataset to the Polaris Hub. 
- """ - - with track_progress(description="Uploading dataset", total=1) as (progress, task): - # Get the serialized data-model - dataset.owner = HubOwner.normalize(owner or dataset.owner) - dataset_json = dataset.model_dump(exclude_none=True, by_alias=True) - - # Step 1: Upload dataset metadata - url = f"/v2/dataset/{dataset.artifact_id}" - response = self._base_request_to_hub( - url=url, - method="PUT", - json={ - "zarrManifestFileContent": { - "md5Sum": dataset.zarr_manifest_md5sum, - }, - "access": access, - "parentArtifactId": parent_artifact_id, - **dataset_json, - }, - timeout=timeout, - ) - - inserted_dataset = response.json() - - # We modify the slug in the server - # Update dataset.slug here so dataset.urn is constructed correctly - dataset.slug = inserted_dataset["slug"] - - with StorageSession(self, "write", dataset.urn) as storage: - # Step 2: Upload the manifest file - with track_progress(description="Copying manifest file", total=1): - with open(dataset.zarr_manifest_path, "rb") as manifest_file: - storage.set_file("manifest", manifest_file.read()) - - # Step 3: Upload the Zarr archive - with track_progress(description="Copying Zarr archive", total=1) as ( - progress_zarr, - task_zarr, - ): - progress_zarr.log("[yellow]This may take a while.") - - destination = storage.store("root") - - # Locally consolidate Zarr archive metadata. Future updates on handling consolidated - # metadata based on Zarr developers' recommendations can be tracked at: - # https://github.com/zarr-developers/zarr-python/issues/1731 - zarr.consolidate_metadata(dataset.zarr_root.store.store) - zmetadata_content = dataset.zarr_root.store.store[".zmetadata"] - destination[".zmetadata"] = zmetadata_content - - # Copy the Zarr archive to the Hub - destination.copy_from_source( - dataset.zarr_root.store.store, if_exists=if_exists, log=logger.info - ) - - dataset_url = urljoin(self.settings.hub_url, response.headers.get("Content-Location")) - progress.log( - f"[green]Your V2 dataset has been successfully uploaded to the Hub.\nView it here: {dataset_url}" - ) - - def upload_benchmark( - self, - benchmark: BenchmarkV1Specification | BenchmarkV2Specification, - access: AccessType = "private", - owner: HubOwner | str | None = None, - parent_artifact_id: str | None = None, - ): - """Upload a benchmark to the Polaris Hub. - - Info: Owner - You have to manually specify the owner in the benchmark data model. Because the owner could - be a user or an organization, we cannot automatically infer this from the logged-in user. - - Note: Required metadata - The Polaris client and Hub maintain different requirements as to which metadata is required. - The requirements by the Hub are stricter, so when uploading to the Hub you might - get some errors on missing metadata. Make sure to fill-in as much of the metadata as possible - before uploading. - - Note: Non-existent datasets - The client will _not_ upload the associated dataset to the Hub if it does not yet exist. - Make sure to specify an existing dataset or upload the dataset first. - - Args: - benchmark: The benchmark to upload. - access: Grant public or private access to result - owner: Which Hub user or organization owns the artifact. Takes precedence over `benchmark.owner`. - parent_artifact_id: The `owner/slug` of the parent benchmark, if uploading a new version of a benchmark. 
- """ - match benchmark: - case BenchmarkV1Specification(): - self._upload_v1_benchmark(benchmark, access, owner, parent_artifact_id) - case BenchmarkV2Specification(): - self._upload_v2_benchmark(benchmark, access, owner, parent_artifact_id) - - def _upload_v1_benchmark( - self, - benchmark: BenchmarkV1Specification, - access: AccessType = "private", - owner: HubOwner | str | None = None, - parent_artifact_id: str | None = None, - ): - """ - Upload a V1 benchmark to the Polaris Hub. - """ - with track_progress(description="Uploading benchmark", total=1) as (progress, task): - # Get the serialized data-model - # We exclude the dataset as we expect it to exist on the Hub already. - benchmark.owner = HubOwner.normalize(owner or benchmark.owner) - benchmark_json = benchmark.model_dump(exclude={"dataset"}, exclude_none=True, by_alias=True) - benchmark_json["datasetArtifactId"] = benchmark.dataset.artifact_id - benchmark_json["access"] = access - - url = f"/v1/benchmark/{benchmark.artifact_id}" - response = self._base_request_to_hub( - url=url, method="PUT", json={"parentArtifactId": parent_artifact_id, **benchmark_json} - ) - - benchmark_url = urljoin(self.settings.hub_url, response.headers.get("Content-Location")) - progress.log( - f"[green]Your benchmark has been successfully uploaded to the Hub.\nView it here: {benchmark_url}" - ) - - def _upload_v2_benchmark( - self, - benchmark: BenchmarkV2Specification, - access: AccessType = "private", - owner: HubOwner | str | None = None, - parent_artifact_id: str | None = None, - ): - """ - Upload a V2 benchmark to the Polaris Hub. - """ - with track_progress(description="Uploading benchmark", total=1) as (progress, task): - # Get the serialized data-model - # We exclude the dataset as we expect it to exist on the Hub already. - benchmark.owner = HubOwner.normalize(owner or benchmark.owner) - benchmark_json = benchmark.model_dump(exclude_none=True, by_alias=True) - - # Uploading a V2 benchmark is a multistep process. - # 1. Upload the benchmark metadata to the Hub and prepare the Hub to receive the data - # 2. Upload each index set bitmap to the Hub storage - - # Step 1: Upload metadata - url = f"/v2/benchmark/{benchmark.artifact_id}" - response = self._base_request_to_hub( - url=url, - method="PUT", - json={ - "access": access, - "datasetArtifactId": benchmark.dataset.artifact_id, - "parentArtifactId": parent_artifact_id, - **benchmark_json, - }, - ) - - inserted_benchmark = response.json() - - # We modify the slug in the server - # Update benchmark.slug here so benchmark.urn is constructed correctly - benchmark.slug = inserted_benchmark["slug"] - - with StorageSession(self, "write", benchmark.urn) as storage: - logger.info("Copying the benchmark split to the Hub. This may take a while.") - - # 2. Upload each index set bitmap - with track_progress( - description="Copying index sets", total=benchmark.split.n_test_sets + 1 - ) as (progress_index_sets, task_index_sets): - for label, index_set in benchmark.split: - logger.info(f"Copying index set {label} to the Hub.") - storage.set_file(label, index_set.serialize()) - progress_index_sets.update(task_index_sets, advance=1, refresh=True) - - benchmark_url = urljoin(self.settings.hub_url, response.headers.get("Content-Location")) - progress.log( - f"[green]Your benchmark has been successfully uploaded to the Hub.\nView it here: {benchmark_url}" - ) - def get_competition(self, artifact_id: str) -> CompetitionSpecification: """Load a competition from the Polaris Hub. 
From 3a6ae1175c0e35aef5b8d604e82cdfe7757a7af6 Mon Sep 17 00:00:00 2001 From: Daniel Peng Date: Tue, 13 May 2025 10:16:11 -0400 Subject: [PATCH 2/5] chore: update README and Polaris documentation following dataset/benchmark upload function removals --- README.md | 1 - docs/tutorials/create_a_benchmark.ipynb | 288 ----------------- docs/tutorials/create_a_dataset.ipynb | 392 ------------------------ mkdocs.yml | 3 - 4 files changed, 684 deletions(-) delete mode 100644 docs/tutorials/create_a_benchmark.ipynb delete mode 100644 docs/tutorials/create_a_dataset.ipynb diff --git a/README.md b/README.md index 2f21f2b3..0f470ea4 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,6 @@ This library is a Python client to interact with the [Polaris Hub](https://polar - Download Polaris datasets and benchmarks. - Evaluate a custom method against a Polaris benchmark. -- Create and upload new datasets and benchmarks. ## Quick API Tour diff --git a/docs/tutorials/create_a_benchmark.ipynb b/docs/tutorials/create_a_benchmark.ipynb deleted file mode 100644 index cbce3e15..00000000 --- a/docs/tutorials/create_a_benchmark.ipynb +++ /dev/null @@ -1,288 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Polaris explicitly distinguished datasets from benchmarks. A benchmark defines the ML task and evaluation logic (e.g. split and metrics) for a dataset. Because of this, a single dataset can be the basis of multiple benchmarks.\n", - "\n", - "## Create a Benchmark\n", - "\n", - "To create a benchmark, you need to instantiate the `BenchmarkV2Specification` class. This requires you to specify: \n", - "\n", - "1. The **dataset**, which can be stored either locally or on the Hub.\n", - "1. The **task**, where a task is defined by input and target columns.\n", - "2. The **split**, where a split is defined by a bunch of indices.\n", - "3. The **metric**, where a metric needs to be officially supported by Polaris.\n", - "4. The **metadata** to contextualize your benchmark.\n", - "\n", - "### Define the dataset\n", - "To learn how to create a dataset, see [this tutorial](./create_a_dataset.html). \n", - "\n", - "Alternatively, we can also load an existing dataset from the Hub.\n", - "\n", - "
\n", - "

Not all Hub datasets are supported

\n", - "

You can only create benchmarks for DatasetV2 instances, not for DatasetV1 instances. Some of the datasets stored on the Hub are still V1 datasets.

\n", - "
\n", - "\n", - "### Define the task\n", - "Currently, Polaris only supports predictive tasks. Specifying a predictive task is simply done by specifying the input and target columns." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "input_columns = [\"SMILES\"]\n", - "target_columns = [\"LOG_SOLUBILITY\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this case, we specified just a single input and target column, but a benchmark can have multiple (e.g. a multi-task benchmark).\n", - "\n", - "### Define the split\n", - "\n", - "To ensure reproducible results, Polaris represents a split through a bunch of sets of indices.\n", - "\n", - "_But there is a catch_: We want Polaris to scale to extra large datasets. If we are to naively store millions of indices as lists of integers, this would impose a significant memory footprint. We therefore use bitmaps, more specifically [roaring bitmaps](https://roaringbitmap.org/) to store the splits in a memory efficient way." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from polaris.benchmark._split_v2 import IndexSet\n", - "\n", - "# To specify a set of integers, you can directly pass in a list of integers\n", - "# This will automatically convert the indices to a BitMap\n", - "training = IndexSet(indices=[0, 1])\n", - "test = IndexSet(indices=[2])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyroaring import BitMap\n", - "\n", - "# Or you can create the BitMap manually and iteratively\n", - "indices = BitMap()\n", - "indices.add(0)\n", - "indices.add(1)\n", - "\n", - "training = IndexSet(indices=indices)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from polaris.benchmark._split_v2 import SplitV2\n", - "\n", - "# Finally, we create the actual split object\n", - "split = SplitV2(training=training, test=test)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define the metrics\n", - "Even something as widely used as Mean Absolute Error (MAE) can be implemented in subtly different ways. Some people apply a log transform first, others might clip outliers, and sometimes an off-by-one or a bug creeps in. Over time, these variations add up. We decided to codify each metric for a Polaris benchmark in a single, transparent implementation. Our priority here is eliminating “mystery differences” that have nothing to do with actual model performance. Learn more [here](https://polarishub.io/blog/reproducible-machine-learning-in-drug-discovery-how-polaris-serves-as-a-single-source-of-truth).\n", - "\n", - "Specifying a metric is easy. You can simply specify its label." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "metrics = [\"mean_absolute_error\", \"mean_squared_error\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also specify a main metric, which will be the metric used to rank the leaderboard." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "main_metric = \"mean_absolute_error\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To get a list of all support metrics, you can use:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from polaris.evaluate._metric import DEFAULT_METRICS\n", - "\n", - "DEFAULT_METRICS.keys()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also create more complex metrics that wrap these base metrics." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from polaris.evaluate import Metric\n", - "\n", - "mae_agg = Metric(label=\"mean_absolute_error\", config={\"group_by\": \"UNIQUE_ID\", \"on_error\": \"ignore\", \"aggregation\": \"mean\"})\n", - "metrics.append(mae_agg)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "

What if my metric isn't supported yet?

\n", - "

Using a metric that's not supported yet, currently requires adding it to the Polaris codebase. We're always looking to improve support. Reach out to us over Github and we're happy to help!

\n", - "
\n", - "\n", - "### Bringing it all together\n", - "Now we can create the `BenchmarkV2Specification` instance." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "type(dataset)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from polaris.benchmark._benchmark_v2 import BenchmarkV2Specification\n", - "\n", - "benchmark = BenchmarkV2Specification(\n", - " # 1. The dataset\n", - " dataset=dataset,\n", - " # 2. The task\n", - " input_cols=input_columns,\n", - " target_cols=target_columns,\n", - " # 3. The split\n", - " split=split,\n", - " # 4. The metrics\n", - " metrics=metrics,\n", - " main_metric=main_metric,\n", - " # 5. The metadata\n", - " name=\"my-first-benchmark\",\n", - " owner=\"your-username\", \n", - " description=\"Created using the Polaris tutorial\",\n", - " tags=[\"tutorial\"], \n", - " user_attributes={\"Key\": \"Value\"}\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Share your benchmark\n", - "Want to share your benchmark with the community? Upload it to the Polaris Hub!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "benchmark.upload_to_hub(owner=\"your-username\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you want to upload a new version of your benchmark, you can specify its previous version with the `parent_artifact_id` parameter. Don't forget to add a changelog describing your updates!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "benchmark.artifact_changelog = \"In this version, I added...\"\n", - "\n", - "benchmark.upload_to_hub(\n", - " owner=\"your-username\",\n", - " parent_artifact_id=\"your-username/my-first-benchmark\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "The End." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/tutorials/create_a_dataset.ipynb b/docs/tutorials/create_a_dataset.ipynb deleted file mode 100644 index 620b5657..00000000 --- a/docs/tutorials/create_a_dataset.ipynb +++ /dev/null @@ -1,392 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "On the surface, a dataset in Polaris is simply a tabular collection of data, storing datapoints in a row-wise manner. However, as you try create your own, you'll realize that there is some additional complexity under the hood.\n", - "\n", - "## Create a Dataset\n", - "\n", - "To create a dataset, you need to instantiate the `DatasetV2` class. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from polaris.dataset import DatasetV2, ColumnAnnotation\n", - "\n", - "dataset = DatasetV2(\n", - " \n", - " # Specify metadata on the dataset level\n", - " name=\"tutorial-example\",\n", - " owner=\"your-username\",\n", - " tags=[\"small-molecules\", \"predictive\", \"admet\"],\n", - " source=\"https://example.com\",\n", - " license=\"CC-BY-4.0\",\n", - " \n", - " # Specify metadata on the column level\n", - " annotations = {\n", - " \"Ligand Pose\": ColumnAnnotation(\n", - " description=\"The 3D pose of the ligand\", \n", - " user_attributes={\"Object Type\": \"rdkit.Chem.Mol\"}, \n", - " modality=\"MOLECULE_3D\"\n", - " ),\n", - " \"Ligand SMILES\": ColumnAnnotation(\n", - " description=\"The 2D graph structure of the ligand, as SMILES\", \n", - " user_attributes={\"Object Type\": \"str\"}, \n", - " modality=\"MOLECULE\"\n", - " ),\n", - " \"Permeability\": ColumnAnnotation(\n", - " description=\"MDR1-MDCK efflux ratio (B-A/A-B)\", \n", - " user_attributes={\"Unit\": \"mL/min/kg\"}\n", - " )\n", - " },\n", - " \n", - " # Specify the actual data\n", - " zarr_root_path=\"path/to/root.zarr\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For the rest of this tutorial, we will take a deeper look at the `zarr_root_path` parameter.\n", - "\n", - "First, some context.\n", - "\n", - "## Universal and ML-ready\n", - "\n", - "![image](../images/zarr.png)\n", - "_An illustration of Zarr, which is core to Polaris its datamodel_\n", - "\n", - "With the Polaris Hub we set out to design a universal data format for ML scientists in drug discovery. Whether you’re working with phenomics, small molecules, or protein structures, you shouldn’t have to spend time learning about domain-specific file formats, APIs, and software tools to be able to run some ML experiments. Beyond modalities, drug discovery datasets also come in different sizes, from kilobytes to terabytes.\n", - "
\n", - "\n", - "We found such a universal data format in [Zarr](https://zarr.readthedocs.io/). Zarr is a powerful library for storage of n-dimensional arrays, supporting chunking, compression, and various backends, making it a versatile choice for scientific and large-scale data. It's similar to HDF5, if you're familiar with that. \n", - "\n", - "Want to learn more? \n", - "- Learn about the motivation of our dataset implementation [here](https://polarishub.io/blog/dataset-v2-built-to-scale).\n", - "- Learn what we mean by ML-ready [here](https://polarishub.io/blog/dataset-v2-built-to-scale).\n", - "\n", - "## Zarr basics\n", - "Zarr is well [documented](https://zarr.readthedocs.io/en/stable/index.html) and before continuing this tutorial, we recommend you to at least read through the [Quickstart](https://zarr.readthedocs.io/en/stable/quickstart.html).\n", - "\n", - "## Converting to Zarr\n", - "In its most basic form, a Polaris compatible Zarr archive is a single Zarr group (the _root_) with equal length Zarr arrays for each of the columns in the dataset.\n", - "\n", - "Chances are that your dataset is currently not stored in a Zarr archive. We will show you how to convert a few common formats to a Polaris compatible Zarr archive.\n", - "\n", - "### From a Numpy Array\n", - "The most simple case is if you have your data in a NumPy array." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "data = np.random.random(2048)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import zarr\n", - "\n", - "# Create an empty Zarr group\n", - "root = zarr.open(path, \"w\")\n", - "\n", - "# Populate it with the array\n", - "root.array(\"column_name\", data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### From a DataFrame\n", - "Since Pandas DataFrames can be thought of as labeled NumPy arrays, converting a DataFrame is straight-forward too." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "df = pd.DataFrame({\n", - " \"A\": np.random.random(2048),\n", - " \"B\": np.random.random(2048)\n", - "})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Converting it to Zarr is as simple as creating equally named Zarr Arrays." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import zarr\n", - "\n", - "# Create an empty Zarr group\n", - "root = zarr.open(zarr_root_path, \"w\")\n", - "\n", - "# Populate it with the arrays\n", - "for col in set(df.columns):\n", - " root.array(col, data=df[col].values)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Things get a little more tricky if you have columns with the `object` dtype, for example text." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"C\"] = [\"test\"] * 2048" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In that case you need to tell Zarr how to encode the Python object." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numcodecs\n", - "\n", - "root.array(\"C\", data=df[\"C\"].values, dtype=object, object_codec=numcodecs.VLenUTF8())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### From RDKit (e.g. SDF)\n", - "\n", - "The ability to encode custom Python objects is powerful. \n", - "\n", - "Using custom object codecs that Polaris provides, we can for example also store RDKit [`Chem.Mol`](https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html#rdkit.Chem.rdchem.Mol) objects in a Zarr array." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create an exemplary molecule\n", - "mol = Chem.MolFromSmiles('Cc1ccccc1')\n", - "mol" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from polaris.dataset.zarr.codecs import RDKitMolCodec\n", - "\n", - "# Write it to a Zarr array\n", - "root = zarr.open(zarr_root_path, \"w\")\n", - "root.array(\"molecules\", data=[mol] * 100, dtype=object, object_codec=RDKitMolCodec())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A common use case of this is to convert a number of **SDF files** to a Zarr array.\n", - "\n", - "1. Load the SDF files using RDKit to `Chem.Mol` objects.\n", - "2. Create a Zarr array with the `RDKitMolCodec`.\n", - "3. Store all RDKit objects in the Zarr array.\n", - "\n", - "### From Biotite (e.g. mmCIF)\n", - "Similarly, we can also store entire protein structures, as represented by the Biotite [`AtomArray`](https://www.biotite-python.org/latest/apidoc/biotite.structure.AtomArray.html) class." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from tempfile import TemporaryDirectory\n", - "\n", - "import biotite.database.rcsb as rcsb\n", - "from biotite.structure.io import load_structure\n", - "\n", - "# Load an exemplary structure\n", - "with TemporaryDirectory() as tmpdir: \n", - " path = rcsb.fetch(\"1l2y\", \"pdb\", tmpdir)\n", - " struct = load_structure(path, model=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from polaris.dataset.zarr.codecs import AtomArrayCodec\n", - "\n", - "# Write it to a Zarr array\n", - "root = zarr.open(zarr_root_path, \"w\")\n", - "root.array(\"molecules\", data=[struct] * 100, dtype=object, object_codec=AtomArrayCodec())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### From Images (e.g. PNG)\n", - "For more convential formats, such as images, codecs likely exist already.\n", - "\n", - "For images for example, these codecs are bundled in [`imagecodecs`](https://github.com/cgohlke/imagecodecs), which is an optional dependency of Polaris.\n", - "\n", - "An image is commonly represented as a 3D array (i.e. width x height x channels). It's therefore not needed to use object_codecs here. Instead, we specify the _compressor_ Zarr should use to compress its _chunks_." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from imagecodecs.numcodecs import Jpeg2k\n", - "\n", - "# You need to explicitly register the codec\n", - "numcodecs.register_codec(Jpeg2k)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "root = zarr.open(zarr_root_path, \"w\")\n", - "\n", - "# Array with a single 3 channel image\n", - "arr = root.zeros(\n", - " \"image\",\n", - " shape=(1, 512, 512, 3),\n", - " chunks=(1, 512, 512, 3),\n", - " dtype='u1',\n", - " compressor=Jpeg2k(level=52, reversible=True),\n", - ")\n", - "\n", - "arr[0] = img" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Share your dataset\n", - "Want to share your dataset with the community? Upload it to the Polaris Hub!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataset.upload_to_hub(owner=\"your-username\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you want to upload a new version of your dataset, you can specify its previous version with the `parent_artifact_id` parameter. Don't forget to add a changelog describing your updates!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataset.artifact_changelog = \"In this version, I added...\"\n", - "\n", - "dataset.upload_to_hub(\n", - " owner=\"your-username\",\n", - " parent_artifact_id=\"your-username/tutorial-example\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Advanced: Optimization\n", - "In this tutorial, we only briefly touched on the high-level concepts that need to be understood to create a Polaris compatible dataset using Zarr. However, Zarr has a lot more to offer and tweaking the settings **can drastically improve storage or data access efficiency.**\n", - "\n", - "If you would like to learn more, please see the [Zarr documentation](https://zarr.readthedocs.io/en/stable/user-guide/performance.html#changing-chunk-shapes-rechunking).\n", - "\n", - "---\n", - "\n", - "The End." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/mkdocs.yml b/mkdocs.yml index 91da959c..08f62438 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -21,9 +21,6 @@ nav: - Submit: - Submit to a Benchmark: tutorials/submit_to_benchmark.ipynb - Submit to a Competition: tutorials/submit_to_competition.ipynb - - Create: - - Create a Dataset: tutorials/create_a_dataset.ipynb - - Create a Benchmark: tutorials/create_a_benchmark.ipynb - API Reference: - Load: api/load.md - Core: From 151bb26baa85ceb31fb35dc4c03e90431a862a01 Mon Sep 17 00:00:00 2001 From: Daniel Peng Date: Tue, 13 May 2025 10:58:44 -0400 Subject: [PATCH 3/5] chore: removed unused imports --- polaris/benchmark/_base.py | 3 --- polaris/dataset/_base.py | 2 -- polaris/dataset/_dataset.py | 2 -- polaris/dataset/_dataset_v2.py | 2 +- polaris/hub/client.py | 7 ------- 5 files changed, 1 insertion(+), 15 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 3014e753..a2fa6c9f 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -24,13 +24,10 @@ from polaris.dataset._base import BaseDataset from polaris.evaluate import BenchmarkResultsV1, BenchmarkResultsV2 from polaris.evaluate.utils import evaluate_benchmark -from polaris.hub.settings import PolarisHubSettings from polaris.mixins import ChecksumMixin from polaris.utils.dict2html import dict2html from polaris.utils.errors import InvalidBenchmarkError from polaris.utils.types import ( - AccessType, - HubOwner, IncomingPredictionsType, TargetType, ) diff --git a/polaris/dataset/_base.py b/polaris/dataset/_base.py index 115c38ba..86d60bed 100644 --- a/polaris/dataset/_base.py +++ b/polaris/dataset/_base.py @@ -29,11 +29,9 @@ from polaris.utils.dict2html import dict2html from polaris.utils.errors import InvalidDatasetError from polaris.utils.types import ( - AccessType, ChecksumStrategy, DatasetIndex, HttpUrlString, - HubOwner, SupportedLicenseType, ZarrConflictResolution, ) diff --git a/polaris/dataset/_dataset.py b/polaris/dataset/_dataset.py index a2b4fa8a..df097d66 100644 --- a/polaris/dataset/_dataset.py +++ b/polaris/dataset/_dataset.py @@ -18,9 +18,7 @@ from polaris.mixins._checksum import ChecksumMixin from polaris.utils.errors import InvalidDatasetError from polaris.utils.types import ( - AccessType, ChecksumStrategy, - HubOwner, ZarrConflictResolution, ) diff --git a/polaris/dataset/_dataset_v2.py b/polaris/dataset/_dataset_v2.py index 69cf3d5c..98a817bf 100644 --- a/polaris/dataset/_dataset_v2.py +++ b/polaris/dataset/_dataset_v2.py @@ -15,7 +15,7 @@ from polaris.dataset._base import BaseDataset from polaris.dataset.zarr._manifest import calculate_file_md5, generate_zarr_manifest from polaris.utils.errors import InvalidDatasetError -from polaris.utils.types import AccessType, ChecksumStrategy, HubOwner, ZarrConflictResolution +from polaris.utils.types import ChecksumStrategy, ZarrConflictResolution logger = logging.getLogger(__name__) diff --git a/polaris/hub/client.py b/polaris/hub/client.py index c4b6ee74..f5a72fab 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -1,13 +1,10 @@ import json import logging -from hashlib 
import md5 from io import BytesIO -from typing import get_args from urllib.parse import urljoin import httpx import pandas as pd -import zarr from authlib.integrations.base_client.errors import InvalidTokenError, MissingTokenError from authlib.integrations.httpx_client import OAuth2Client, OAuthError from authlib.oauth2 import OAuth2Error, TokenAuth @@ -31,7 +28,6 @@ from polaris.hub.storage import StorageSession from polaris.utils.context import track_progress from polaris.utils.errors import ( - InvalidDatasetError, PolarisCreateArtifactError, PolarisHubError, PolarisRetrieveArtifactError, @@ -42,9 +38,6 @@ AccessType, ChecksumStrategy, HubOwner, - SupportedLicenseType, - TimeoutTypes, - ZarrConflictResolution, ) logger = logging.getLogger(__name__) From fcc78cfedaf07bfb2b29055080d8e1f386cff32e Mon Sep 17 00:00:00 2001 From: Daniel Peng Date: Thu, 15 May 2025 12:38:22 -0400 Subject: [PATCH 4/5] restored artifact-specific upload_to_hub functions, raised PolarisDeprecatedError for client.py upload_to_hub functions --- polaris/benchmark/_base.py | 27 +++++++++++++++++++++++++++ polaris/dataset/_base.py | 12 ++++++++++++ polaris/dataset/_dataset.py | 17 +++++++++++++++++ polaris/dataset/_dataset_v2.py | 17 ++++++++++++++++- polaris/hub/client.py | 27 +++++++++++++++++++++++++++ polaris/utils/errors.py | 9 +++++++++ 6 files changed, 108 insertions(+), 1 deletion(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index a2fa6c9f..67d73ba8 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -24,10 +24,13 @@ from polaris.dataset._base import BaseDataset from polaris.evaluate import BenchmarkResultsV1, BenchmarkResultsV2 from polaris.evaluate.utils import evaluate_benchmark +from polaris.hub.settings import PolarisHubSettings from polaris.mixins import ChecksumMixin from polaris.utils.dict2html import dict2html from polaris.utils.errors import InvalidBenchmarkError from polaris.utils.types import ( + AccessType, + HubOwner, IncomingPredictionsType, TargetType, ) @@ -164,6 +167,30 @@ def _get_subset(self, indices, hide_targets=True, featurization_fn=None) -> Subs featurization_fn=featurization_fn, ) + def upload_to_hub( + self, + settings: PolarisHubSettings | None = None, + cache_auth_token: bool = True, + access: AccessType = "private", + owner: HubOwner | str | None = None, + parent_artifact_id: str | None = None, + **kwargs: dict, + ): + """ + Very light, convenient wrapper around the + [`PolarisHubClient.upload_benchmark`][polaris.hub.client.PolarisHubClient.upload_benchmark] method. + """ + from polaris.hub.client import PolarisHubClient + + with PolarisHubClient( + settings=settings, + cache_auth_token=cache_auth_token, + **kwargs, + ) as client: + return client.upload_benchmark( + self, access=access, owner=owner, parent_artifact_id=parent_artifact_id + ) + def to_json(self, destination: str) -> str: """Save the benchmark to a destination directory as a JSON file. 
diff --git a/polaris/dataset/_base.py b/polaris/dataset/_base.py index 86d60bed..8ecccf98 100644 --- a/polaris/dataset/_base.py +++ b/polaris/dataset/_base.py @@ -29,9 +29,11 @@ from polaris.utils.dict2html import dict2html from polaris.utils.errors import InvalidDatasetError from polaris.utils.types import ( + AccessType, ChecksumStrategy, DatasetIndex, HttpUrlString, + HubOwner, SupportedLicenseType, ZarrConflictResolution, ) @@ -299,6 +301,16 @@ def get_data( """ raise NotImplementedError + @abc.abstractmethod + def upload_to_hub( + self, + access: AccessType = "private", + owner: HubOwner | str | None = None, + parent_artifact_id: str | None = None, + ): + """Uploads the dataset to the Polaris Hub.""" + raise NotImplementedError + @classmethod @abc.abstractmethod def from_json(cls, path: str): diff --git a/polaris/dataset/_dataset.py b/polaris/dataset/_dataset.py index df097d66..df2325c1 100644 --- a/polaris/dataset/_dataset.py +++ b/polaris/dataset/_dataset.py @@ -18,7 +18,9 @@ from polaris.mixins._checksum import ChecksumMixin from polaris.utils.errors import InvalidDatasetError from polaris.utils.types import ( + AccessType, ChecksumStrategy, + HubOwner, ZarrConflictResolution, ) @@ -216,6 +218,21 @@ def get_data( return arr + def upload_to_hub( + self, + access: AccessType = "private", + owner: HubOwner | str | None = None, + parent_artifact_id: str | None = None, + ): + """ + Very light, convenient wrapper around the + [`PolarisHubClient.upload_dataset`][polaris.hub.client.PolarisHubClient.upload_dataset] method. + """ + from polaris.hub.client import PolarisHubClient + + with PolarisHubClient() as client: + client.upload_dataset(self, owner=owner, access=access, parent_artifact_id=parent_artifact_id) + @classmethod def from_json(cls, path: str): """Loads a dataset from a JSON file. diff --git a/polaris/dataset/_dataset_v2.py b/polaris/dataset/_dataset_v2.py index 98a817bf..faa842d6 100644 --- a/polaris/dataset/_dataset_v2.py +++ b/polaris/dataset/_dataset_v2.py @@ -15,7 +15,7 @@ from polaris.dataset._base import BaseDataset from polaris.dataset.zarr._manifest import calculate_file_md5, generate_zarr_manifest from polaris.utils.errors import InvalidDatasetError -from polaris.utils.types import ChecksumStrategy, ZarrConflictResolution +from polaris.utils.types import AccessType, ChecksumStrategy, HubOwner, ZarrConflictResolution logger = logging.getLogger(__name__) @@ -191,6 +191,21 @@ def get_data(self, row: int, col: str, adapters: dict[str, Adapter] | None = Non arr = adapter(arr) return arr + + def upload_to_hub( + self, + access: AccessType = "private", + owner: HubOwner | str | None = None, + parent_artifact_id: str | None = None, + ): + """ + Uploads the dataset to the Polaris Hub. + """ + + from polaris.hub.client import PolarisHubClient + + with PolarisHubClient() as client: + client.upload_dataset(self, owner=owner, access=access, parent_artifact_id=parent_artifact_id) @classmethod def from_json(cls, path: str): diff --git a/polaris/hub/client.py b/polaris/hub/client.py index f5a72fab..8e403765 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -33,11 +33,14 @@ PolarisRetrieveArtifactError, PolarisSSLError, PolarisUnauthorizedError, + PolarisDeprecatedError, ) from polaris.utils.types import ( AccessType, ChecksumStrategy, HubOwner, + TimeoutTypes, + ZarrConflictResolution, ) logger = logging.getLogger(__name__) @@ -528,6 +531,30 @@ def upload_results( f"[green]Your result has been successfully uploaded to the Hub. 
View it here: {result_url}" ) + # Note: Unused parameters are included in signature for backwards compatibility. + # Removing these parameters results in a TypeError before the PolarisDeprecatedError is raised. + def upload_dataset( + self, + dataset: DatasetV1 | DatasetV2, + access: AccessType = "private", + timeout: TimeoutTypes = (10, 200), + owner: HubOwner | str | None = None, + if_exists: ZarrConflictResolution = "replace", + parent_artifact_id: str | None = None, + ): + raise PolarisDeprecatedError("dataset uploading") + + # Note: Unused parameters are included in signature for backwards compatibility. + # Removing these parameters results in a TypeError before the PolarisDeprecatedError is raised. + def upload_benchmark( + self, + benchmark: BenchmarkV1Specification | BenchmarkV2Specification, + access: AccessType = "private", + owner: HubOwner | str | None = None, + parent_artifact_id: str | None = None, + ): + raise PolarisDeprecatedError("benchmark uploading") + def get_competition(self, artifact_id: str) -> CompetitionSpecification: """Load a competition from the Polaris Hub. diff --git a/polaris/utils/errors.py b/polaris/utils/errors.py index f647f63d..e42124dd 100644 --- a/polaris/utils/errors.py +++ b/polaris/utils/errors.py @@ -98,3 +98,12 @@ def __init__(self, response_text: str = ""): "SSL verification by setting the POLARIS_CA_BUNDLE environment variable to `false`." ) super().__init__(message, response_text) + + +class PolarisDeprecatedError(PolarisHubError): + def __init__(self, feature: str, response_text: str = ""): + message = ( + f"The '{feature}' feature has been deprecated and is no longer supported. " + "Please contact the Polaris team for more information about alternative approaches." + ) + super().__init__(message, response_text) From 0f15faf3b8009b9f85066c9a03db44ba03b553f1 Mon Sep 17 00:00:00 2001 From: Daniel Peng Date: Thu, 15 May 2025 13:23:18 -0400 Subject: [PATCH 5/5] ruff formatting fix, mkdocs fix --- polaris/dataset/_dataset_v2.py | 2 +- polaris/hub/client.py | 51 ++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/polaris/dataset/_dataset_v2.py b/polaris/dataset/_dataset_v2.py index faa842d6..40ba3ba5 100644 --- a/polaris/dataset/_dataset_v2.py +++ b/polaris/dataset/_dataset_v2.py @@ -191,7 +191,7 @@ def get_data(self, row: int, col: str, adapters: dict[str, Adapter] | None = Non arr = adapter(arr) return arr - + def upload_to_hub( self, access: AccessType = "private", diff --git a/polaris/hub/client.py b/polaris/hub/client.py index 8e403765..0677cc1c 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -542,6 +542,33 @@ def upload_dataset( if_exists: ZarrConflictResolution = "replace", parent_artifact_id: str | None = None, ): + """Upload a dataset to the Polaris Hub. + + This functionality has been deprecated. + + Info: Owner + You have to manually specify the owner in the dataset data model. Because the owner could + be a user or an organization, we cannot automatically infer this from just the logged-in user. + + Note: Required metadata + The Polaris client and Hub maintain different requirements as to which metadata is required. + The requirements by the Hub are stricter, so when uploading to the Hub you might + get some errors on missing metadata. Make sure to fill-in as much of the metadata as possible + before uploading. + + Args: + dataset: The dataset to upload. + access: Grant public or private access to result + timeout: Request timeout values. 
User can modify the value when uploading large datasets as needed.
+                This can be a single value with the timeout in seconds for all IO operations, or a more granular
+                tuple with (connect_timeout, write_timeout). The type of the timeout parameter comes from `httpx`.
+                Since datasets can get large, you may need to increase the write timeout for larger datasets.
+                See also: https://www.python-httpx.org/advanced/#timeout-configuration
+            owner: Which Hub user or organization owns the artifact. Takes precedence over `dataset.owner`.
+            if_exists: Action for handling existing files in the Zarr archive. Options are 'raise' to throw
+                an error, 'replace' to overwrite, or 'skip' to proceed without altering the existing files.
+            parent_artifact_id: The `owner/slug` of the parent dataset, if uploading a new version of a dataset.
+        """
+        raise PolarisDeprecatedError("dataset uploading")

     # Note: Unused parameters are included in signature for backwards compatibility.
     # Removing these parameters results in a TypeError before the PolarisDeprecatedError is raised.
     def upload_benchmark(
         self,
         benchmark: BenchmarkV1Specification | BenchmarkV2Specification,
         access: AccessType = "private",
         owner: HubOwner | str | None = None,
         parent_artifact_id: str | None = None,
     ):
+        """Upload a benchmark to the Polaris Hub.
+
+        This functionality has been deprecated.
+
+        Info: Owner
+            You have to manually specify the owner in the benchmark data model. Because the owner could
+            be a user or an organization, we cannot automatically infer this from the logged-in user.
+
+        Note: Required metadata
+            The Polaris client and Hub maintain different requirements as to which metadata is required.
+            The requirements by the Hub are stricter, so when uploading to the Hub you might
+            get some errors on missing metadata. Make sure to fill in as much of the metadata as possible
+            before uploading.
+
+        Note: Non-existent datasets
+            The client will _not_ upload the associated dataset to the Hub if it does not yet exist.
+            Make sure to specify an existing dataset or upload the dataset first.
+
+        Args:
+            benchmark: The benchmark to upload.
+            access: Grant public or private access to result
+            owner: Which Hub user or organization owns the artifact. Takes precedence over `benchmark.owner`.
+            parent_artifact_id: The `owner/slug` of the parent benchmark, if uploading a new version of a benchmark.
+        """
         raise PolarisDeprecatedError("benchmark uploading")

     def get_competition(self, artifact_id: str) -> CompetitionSpecification: