From 08cd7b43bf2c1f690c38264910e4c5697f48a52e Mon Sep 17 00:00:00 2001 From: Daniel Peng Date: Mon, 12 May 2025 16:48:30 -0400 Subject: [PATCH 1/5] refactor/chore: remove dataset and benchmark 'upload...', 'upload_to_hub' functions from Python client --- polaris/benchmark/_base.py | 24 --- polaris/dataset/_base.py | 10 - polaris/dataset/_dataset.py | 15 -- polaris/dataset/_dataset_v2.py | 15 -- polaris/hub/client.py | 329 --------------------------------- 5 files changed, 393 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 67d73ba8..3014e753 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -167,30 +167,6 @@ def _get_subset(self, indices, hide_targets=True, featurization_fn=None) -> Subs featurization_fn=featurization_fn, ) - def upload_to_hub( - self, - settings: PolarisHubSettings | None = None, - cache_auth_token: bool = True, - access: AccessType = "private", - owner: HubOwner | str | None = None, - parent_artifact_id: str | None = None, - **kwargs: dict, - ): - """ - Very light, convenient wrapper around the - [`PolarisHubClient.upload_benchmark`][polaris.hub.client.PolarisHubClient.upload_benchmark] method. - """ - from polaris.hub.client import PolarisHubClient - - with PolarisHubClient( - settings=settings, - cache_auth_token=cache_auth_token, - **kwargs, - ) as client: - return client.upload_benchmark( - self, access=access, owner=owner, parent_artifact_id=parent_artifact_id - ) - def to_json(self, destination: str) -> str: """Save the benchmark to a destination directory as a JSON file. diff --git a/polaris/dataset/_base.py b/polaris/dataset/_base.py index 8ecccf98..115c38ba 100644 --- a/polaris/dataset/_base.py +++ b/polaris/dataset/_base.py @@ -301,16 +301,6 @@ def get_data( """ raise NotImplementedError - @abc.abstractmethod - def upload_to_hub( - self, - access: AccessType = "private", - owner: HubOwner | str | None = None, - parent_artifact_id: str | None = None, - ): - """Uploads the dataset to the Polaris Hub.""" - raise NotImplementedError - @classmethod @abc.abstractmethod def from_json(cls, path: str): diff --git a/polaris/dataset/_dataset.py b/polaris/dataset/_dataset.py index df2325c1..a2b4fa8a 100644 --- a/polaris/dataset/_dataset.py +++ b/polaris/dataset/_dataset.py @@ -218,21 +218,6 @@ def get_data( return arr - def upload_to_hub( - self, - access: AccessType = "private", - owner: HubOwner | str | None = None, - parent_artifact_id: str | None = None, - ): - """ - Very light, convenient wrapper around the - [`PolarisHubClient.upload_dataset`][polaris.hub.client.PolarisHubClient.upload_dataset] method. - """ - from polaris.hub.client import PolarisHubClient - - with PolarisHubClient() as client: - client.upload_dataset(self, owner=owner, access=access, parent_artifact_id=parent_artifact_id) - @classmethod def from_json(cls, path: str): """Loads a dataset from a JSON file. diff --git a/polaris/dataset/_dataset_v2.py b/polaris/dataset/_dataset_v2.py index 40ba3ba5..69cf3d5c 100644 --- a/polaris/dataset/_dataset_v2.py +++ b/polaris/dataset/_dataset_v2.py @@ -192,21 +192,6 @@ def get_data(self, row: int, col: str, adapters: dict[str, Adapter] | None = Non return arr - def upload_to_hub( - self, - access: AccessType = "private", - owner: HubOwner | str | None = None, - parent_artifact_id: str | None = None, - ): - """ - Uploads the dataset to the Polaris Hub. 
- """ - - from polaris.hub.client import PolarisHubClient - - with PolarisHubClient() as client: - client.upload_dataset(self, owner=owner, access=access, parent_artifact_id=parent_artifact_id) - @classmethod def from_json(cls, path: str): """ diff --git a/polaris/hub/client.py b/polaris/hub/client.py index 30627312..c4b6ee74 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -535,335 +535,6 @@ def upload_results( f"[green]Your result has been successfully uploaded to the Hub. View it here: {result_url}" ) - def upload_dataset( - self, - dataset: DatasetV1 | DatasetV2, - access: AccessType = "private", - timeout: TimeoutTypes = (10, 200), - owner: HubOwner | str | None = None, - if_exists: ZarrConflictResolution = "replace", - parent_artifact_id: str | None = None, - ): - """Upload a dataset to the Polaris Hub. - - Info: Owner - You have to manually specify the owner in the dataset data model. Because the owner could - be a user or an organization, we cannot automatically infer this from just the logged-in user. - - Note: Required metadata - The Polaris client and Hub maintain different requirements as to which metadata is required. - The requirements by the Hub are stricter, so when uploading to the Hub you might - get some errors on missing metadata. Make sure to fill-in as much of the metadata as possible - before uploading. - - Args: - dataset: The dataset to upload. - access: Grant public or private access to result - timeout: Request timeout values. User can modify the value when uploading large dataset as needed. - This can be a single value with the timeout in seconds for all IO operations, or a more granular - tuple with (connect_timeout, write_timeout). The type of the the timout parameter comes from `httpx`. - Since datasets can get large, it might be needed to increase the write timeout for larger datasets. - See also: https://www.python-httpx.org/advanced/#timeout-configuration - owner: Which Hub user or organization owns the artifact. Takes precedence over `dataset.owner`. - if_exists: Action for handling existing files in the Zarr archive. Options are 'raise' to throw - an error, 'replace' to overwrite, or 'skip' to proceed without altering the existing files. - parent_artifact_id: The `owner/slug` of the parent dataset, if uploading a new version of a dataset. - """ - # Normalize timeout - if timeout is None: - timeout = self.settings.default_timeout - - # Check if a dataset license was specified prior to upload - if not dataset.license: - raise InvalidDatasetError( - f"\nPlease specify a supported license for this dataset prior to uploading to the Polaris Hub.\nOnly some licenses are supported - {get_args(SupportedLicenseType)}." - ) - - if isinstance(dataset, DatasetV1): - self._upload_v1_dataset(dataset, timeout, access, owner, if_exists, parent_artifact_id) - elif isinstance(dataset, DatasetV2): - self._upload_v2_dataset(dataset, timeout, access, owner, if_exists, parent_artifact_id) - - def _upload_v1_dataset( - self, - dataset: DatasetV1, - timeout: TimeoutTypes, - access: AccessType, - owner: HubOwner | str | None, - if_exists: ZarrConflictResolution, - parent_artifact_id: str | None, - ): - """ - Upload a V1 dataset to the Polaris Hub. 
- """ - - with track_progress(description="Uploading dataset", total=1) as (progress, task): - # Get the serialized data-model - # We exclude the table as it handled separately - dataset.owner = HubOwner.normalize(owner or dataset.owner) - dataset_json = dataset.model_dump(exclude={"table"}, exclude_none=True, by_alias=True) - - # If the dataset uses Zarr, we will save the Zarr archive to the Hub as well - if dataset.uses_zarr: - dataset_json["zarrRootPath"] = f"{StorageSession.polaris_protocol}://data.zarr" - - # Uploading a dataset is a three-step process. - # 1. Upload the dataset metadata to the Hub and prepare the Hub to receive the data - # 2. Upload the parquet file to Hub storage - # 3. Upload the associated Zarr archive to Hub storage - - # Prepare the parquet file - in_memory_parquet = BytesIO() - dataset.table.to_parquet(in_memory_parquet) - parquet_size = len(in_memory_parquet.getbuffer()) - parquet_md5 = md5(in_memory_parquet.getbuffer()).hexdigest() - - # Step 1: Upload metadata - # Instead of directly uploading the data, we announce to the Hub that we intend to upload it. - # We do so separately for the Zarr archive and Parquet file. - url = f"/v1/dataset/{dataset.artifact_id}" - response = self._base_request_to_hub( - url=url, - method="PUT", - json={ - "tableContent": { - "size": parquet_size, - "fileType": "parquet", - "md5Sum": parquet_md5, - }, - "zarrContent": [md5sum.model_dump() for md5sum in dataset._zarr_md5sum_manifest], - "access": access, - "parentArtifactId": parent_artifact_id, - **dataset_json, - }, - timeout=timeout, - ) - - inserted_dataset = response.json() - - # We modify the slug in the server - # Update dataset.slug here so dataset.urn is constructed correctly - dataset.slug = inserted_dataset["slug"] - - with StorageSession(self, "write", dataset.urn) as storage: - with track_progress(description="Copying Parquet file", total=1) as (progress, task): - # Step 2: Upload the parquet file - progress.log("[yellow]This may take a while.") - storage.set_file("root", in_memory_parquet.getvalue()) - - # Step 3: Upload any associated Zarr archive - if dataset.uses_zarr: - with track_progress(description="Copying Zarr archive", total=1): - destination = storage.store("extension") - - # Locally consolidate Zarr archive metadata. Future updates on handling consolidated - # metadata based on Zarr developers' recommendations can be tracked at: - # https://github.com/zarr-developers/zarr-python/issues/1731 - zarr.consolidate_metadata(dataset.zarr_root.store.store) - zmetadata_content = dataset.zarr_root.store.store[".zmetadata"] - destination[".zmetadata"] = zmetadata_content - - # Copy the Zarr archive to the Hub - destination.copy_from_source( - dataset.zarr_root.store.store, if_exists=if_exists, log=logger.info - ) - - dataset_url = urljoin(self.settings.hub_url, response.headers.get("Content-Location")) - progress.log( - f"[green]Your dataset has been successfully uploaded to the Hub.\nView it here: {dataset_url}" - ) - - def _upload_v2_dataset( - self, - dataset: DatasetV2, - timeout: TimeoutTypes, - access: AccessType, - owner: HubOwner | str | None, - if_exists: ZarrConflictResolution, - parent_artifact_id: str | None, - ): - """ - Upload a V2 dataset to the Polaris Hub. 
- """ - - with track_progress(description="Uploading dataset", total=1) as (progress, task): - # Get the serialized data-model - dataset.owner = HubOwner.normalize(owner or dataset.owner) - dataset_json = dataset.model_dump(exclude_none=True, by_alias=True) - - # Step 1: Upload dataset metadata - url = f"/v2/dataset/{dataset.artifact_id}" - response = self._base_request_to_hub( - url=url, - method="PUT", - json={ - "zarrManifestFileContent": { - "md5Sum": dataset.zarr_manifest_md5sum, - }, - "access": access, - "parentArtifactId": parent_artifact_id, - **dataset_json, - }, - timeout=timeout, - ) - - inserted_dataset = response.json() - - # We modify the slug in the server - # Update dataset.slug here so dataset.urn is constructed correctly - dataset.slug = inserted_dataset["slug"] - - with StorageSession(self, "write", dataset.urn) as storage: - # Step 2: Upload the manifest file - with track_progress(description="Copying manifest file", total=1): - with open(dataset.zarr_manifest_path, "rb") as manifest_file: - storage.set_file("manifest", manifest_file.read()) - - # Step 3: Upload the Zarr archive - with track_progress(description="Copying Zarr archive", total=1) as ( - progress_zarr, - task_zarr, - ): - progress_zarr.log("[yellow]This may take a while.") - - destination = storage.store("root") - - # Locally consolidate Zarr archive metadata. Future updates on handling consolidated - # metadata based on Zarr developers' recommendations can be tracked at: - # https://github.com/zarr-developers/zarr-python/issues/1731 - zarr.consolidate_metadata(dataset.zarr_root.store.store) - zmetadata_content = dataset.zarr_root.store.store[".zmetadata"] - destination[".zmetadata"] = zmetadata_content - - # Copy the Zarr archive to the Hub - destination.copy_from_source( - dataset.zarr_root.store.store, if_exists=if_exists, log=logger.info - ) - - dataset_url = urljoin(self.settings.hub_url, response.headers.get("Content-Location")) - progress.log( - f"[green]Your V2 dataset has been successfully uploaded to the Hub.\nView it here: {dataset_url}" - ) - - def upload_benchmark( - self, - benchmark: BenchmarkV1Specification | BenchmarkV2Specification, - access: AccessType = "private", - owner: HubOwner | str | None = None, - parent_artifact_id: str | None = None, - ): - """Upload a benchmark to the Polaris Hub. - - Info: Owner - You have to manually specify the owner in the benchmark data model. Because the owner could - be a user or an organization, we cannot automatically infer this from the logged-in user. - - Note: Required metadata - The Polaris client and Hub maintain different requirements as to which metadata is required. - The requirements by the Hub are stricter, so when uploading to the Hub you might - get some errors on missing metadata. Make sure to fill-in as much of the metadata as possible - before uploading. - - Note: Non-existent datasets - The client will _not_ upload the associated dataset to the Hub if it does not yet exist. - Make sure to specify an existing dataset or upload the dataset first. - - Args: - benchmark: The benchmark to upload. - access: Grant public or private access to result - owner: Which Hub user or organization owns the artifact. Takes precedence over `benchmark.owner`. - parent_artifact_id: The `owner/slug` of the parent benchmark, if uploading a new version of a benchmark. 
- """ - match benchmark: - case BenchmarkV1Specification(): - self._upload_v1_benchmark(benchmark, access, owner, parent_artifact_id) - case BenchmarkV2Specification(): - self._upload_v2_benchmark(benchmark, access, owner, parent_artifact_id) - - def _upload_v1_benchmark( - self, - benchmark: BenchmarkV1Specification, - access: AccessType = "private", - owner: HubOwner | str | None = None, - parent_artifact_id: str | None = None, - ): - """ - Upload a V1 benchmark to the Polaris Hub. - """ - with track_progress(description="Uploading benchmark", total=1) as (progress, task): - # Get the serialized data-model - # We exclude the dataset as we expect it to exist on the Hub already. - benchmark.owner = HubOwner.normalize(owner or benchmark.owner) - benchmark_json = benchmark.model_dump(exclude={"dataset"}, exclude_none=True, by_alias=True) - benchmark_json["datasetArtifactId"] = benchmark.dataset.artifact_id - benchmark_json["access"] = access - - url = f"/v1/benchmark/{benchmark.artifact_id}" - response = self._base_request_to_hub( - url=url, method="PUT", json={"parentArtifactId": parent_artifact_id, **benchmark_json} - ) - - benchmark_url = urljoin(self.settings.hub_url, response.headers.get("Content-Location")) - progress.log( - f"[green]Your benchmark has been successfully uploaded to the Hub.\nView it here: {benchmark_url}" - ) - - def _upload_v2_benchmark( - self, - benchmark: BenchmarkV2Specification, - access: AccessType = "private", - owner: HubOwner | str | None = None, - parent_artifact_id: str | None = None, - ): - """ - Upload a V2 benchmark to the Polaris Hub. - """ - with track_progress(description="Uploading benchmark", total=1) as (progress, task): - # Get the serialized data-model - # We exclude the dataset as we expect it to exist on the Hub already. - benchmark.owner = HubOwner.normalize(owner or benchmark.owner) - benchmark_json = benchmark.model_dump(exclude_none=True, by_alias=True) - - # Uploading a V2 benchmark is a multistep process. - # 1. Upload the benchmark metadata to the Hub and prepare the Hub to receive the data - # 2. Upload each index set bitmap to the Hub storage - - # Step 1: Upload metadata - url = f"/v2/benchmark/{benchmark.artifact_id}" - response = self._base_request_to_hub( - url=url, - method="PUT", - json={ - "access": access, - "datasetArtifactId": benchmark.dataset.artifact_id, - "parentArtifactId": parent_artifact_id, - **benchmark_json, - }, - ) - - inserted_benchmark = response.json() - - # We modify the slug in the server - # Update benchmark.slug here so benchmark.urn is constructed correctly - benchmark.slug = inserted_benchmark["slug"] - - with StorageSession(self, "write", benchmark.urn) as storage: - logger.info("Copying the benchmark split to the Hub. This may take a while.") - - # 2. Upload each index set bitmap - with track_progress( - description="Copying index sets", total=benchmark.split.n_test_sets + 1 - ) as (progress_index_sets, task_index_sets): - for label, index_set in benchmark.split: - logger.info(f"Copying index set {label} to the Hub.") - storage.set_file(label, index_set.serialize()) - progress_index_sets.update(task_index_sets, advance=1, refresh=True) - - benchmark_url = urljoin(self.settings.hub_url, response.headers.get("Content-Location")) - progress.log( - f"[green]Your benchmark has been successfully uploaded to the Hub.\nView it here: {benchmark_url}" - ) - def get_competition(self, artifact_id: str) -> CompetitionSpecification: """Load a competition from the Polaris Hub. 
From 3a6ae1175c0e35aef5b8d604e82cdfe7757a7af6 Mon Sep 17 00:00:00 2001 From: Daniel Peng Date: Tue, 13 May 2025 10:16:11 -0400 Subject: [PATCH 2/5] chore: update README and Polaris documentation following dataset/benchmark upload function removals --- README.md | 1 - docs/tutorials/create_a_benchmark.ipynb | 288 ----------------- docs/tutorials/create_a_dataset.ipynb | 392 ------------------------ mkdocs.yml | 3 - 4 files changed, 684 deletions(-) delete mode 100644 docs/tutorials/create_a_benchmark.ipynb delete mode 100644 docs/tutorials/create_a_dataset.ipynb diff --git a/README.md b/README.md index 2f21f2b3..0f470ea4 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,6 @@ This library is a Python client to interact with the [Polaris Hub](https://polar - Download Polaris datasets and benchmarks. - Evaluate a custom method against a Polaris benchmark. -- Create and upload new datasets and benchmarks. ## Quick API Tour diff --git a/docs/tutorials/create_a_benchmark.ipynb b/docs/tutorials/create_a_benchmark.ipynb deleted file mode 100644 index cbce3e15..00000000 --- a/docs/tutorials/create_a_benchmark.ipynb +++ /dev/null @@ -1,288 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Polaris explicitly distinguished datasets from benchmarks. A benchmark defines the ML task and evaluation logic (e.g. split and metrics) for a dataset. Because of this, a single dataset can be the basis of multiple benchmarks.\n", - "\n", - "## Create a Benchmark\n", - "\n", - "To create a benchmark, you need to instantiate the `BenchmarkV2Specification` class. This requires you to specify: \n", - "\n", - "1. The **dataset**, which can be stored either locally or on the Hub.\n", - "1. The **task**, where a task is defined by input and target columns.\n", - "2. The **split**, where a split is defined by a bunch of indices.\n", - "3. The **metric**, where a metric needs to be officially supported by Polaris.\n", - "4. The **metadata** to contextualize your benchmark.\n", - "\n", - "### Define the dataset\n", - "To learn how to create a dataset, see [this tutorial](./create_a_dataset.html). \n", - "\n", - "Alternatively, we can also load an existing dataset from the Hub.\n", - "\n", - "
\n", - "

Not all Hub datasets are supported

\n", - "

You can only create benchmarks for DatasetV2 instances, not for DatasetV1 instances. Some of the datasets stored on the Hub are still V1 datasets.

\n", - "
\n", - "\n", - "### Define the task\n", - "Currently, Polaris only supports predictive tasks. Specifying a predictive task is simply done by specifying the input and target columns." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "input_columns = [\"SMILES\"]\n", - "target_columns = [\"LOG_SOLUBILITY\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this case, we specified just a single input and target column, but a benchmark can have multiple (e.g. a multi-task benchmark).\n", - "\n", - "### Define the split\n", - "\n", - "To ensure reproducible results, Polaris represents a split through a bunch of sets of indices.\n", - "\n", - "_But there is a catch_: We want Polaris to scale to extra large datasets. If we are to naively store millions of indices as lists of integers, this would impose a significant memory footprint. We therefore use bitmaps, more specifically [roaring bitmaps](https://roaringbitmap.org/) to store the splits in a memory efficient way." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from polaris.benchmark._split_v2 import IndexSet\n", - "\n", - "# To specify a set of integers, you can directly pass in a list of integers\n", - "# This will automatically convert the indices to a BitMap\n", - "training = IndexSet(indices=[0, 1])\n", - "test = IndexSet(indices=[2])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyroaring import BitMap\n", - "\n", - "# Or you can create the BitMap manually and iteratively\n", - "indices = BitMap()\n", - "indices.add(0)\n", - "indices.add(1)\n", - "\n", - "training = IndexSet(indices=indices)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from polaris.benchmark._split_v2 import SplitV2\n", - "\n", - "# Finally, we create the actual split object\n", - "split = SplitV2(training=training, test=test)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define the metrics\n", - "Even something as widely used as Mean Absolute Error (MAE) can be implemented in subtly different ways. Some people apply a log transform first, others might clip outliers, and sometimes an off-by-one or a bug creeps in. Over time, these variations add up. We decided to codify each metric for a Polaris benchmark in a single, transparent implementation. Our priority here is eliminating “mystery differences” that have nothing to do with actual model performance. Learn more [here](https://polarishub.io/blog/reproducible-machine-learning-in-drug-discovery-how-polaris-serves-as-a-single-source-of-truth).\n", - "\n", - "Specifying a metric is easy. You can simply specify its label." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "metrics = [\"mean_absolute_error\", \"mean_squared_error\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also specify a main metric, which will be the metric used to rank the leaderboard." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "main_metric = \"mean_absolute_error\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To get a list of all support metrics, you can use:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from polaris.evaluate._metric import DEFAULT_METRICS\n", - "\n", - "DEFAULT_METRICS.keys()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also create more complex metrics that wrap these base metrics." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from polaris.evaluate import Metric\n", - "\n", - "mae_agg = Metric(label=\"mean_absolute_error\", config={\"group_by\": \"UNIQUE_ID\", \"on_error\": \"ignore\", \"aggregation\": \"mean\"})\n", - "metrics.append(mae_agg)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "

What if my metric isn't supported yet?

\n", - "

Using a metric that's not supported yet, currently requires adding it to the Polaris codebase. We're always looking to improve support. Reach out to us over Github and we're happy to help!

\n", - "
\n", - "\n", - "### Bringing it all together\n", - "Now we can create the `BenchmarkV2Specification` instance." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "type(dataset)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from polaris.benchmark._benchmark_v2 import BenchmarkV2Specification\n", - "\n", - "benchmark = BenchmarkV2Specification(\n", - " # 1. The dataset\n", - " dataset=dataset,\n", - " # 2. The task\n", - " input_cols=input_columns,\n", - " target_cols=target_columns,\n", - " # 3. The split\n", - " split=split,\n", - " # 4. The metrics\n", - " metrics=metrics,\n", - " main_metric=main_metric,\n", - " # 5. The metadata\n", - " name=\"my-first-benchmark\",\n", - " owner=\"your-username\", \n", - " description=\"Created using the Polaris tutorial\",\n", - " tags=[\"tutorial\"], \n", - " user_attributes={\"Key\": \"Value\"}\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Share your benchmark\n", - "Want to share your benchmark with the community? Upload it to the Polaris Hub!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "benchmark.upload_to_hub(owner=\"your-username\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you want to upload a new version of your benchmark, you can specify its previous version with the `parent_artifact_id` parameter. Don't forget to add a changelog describing your updates!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "benchmark.artifact_changelog = \"In this version, I added...\"\n", - "\n", - "benchmark.upload_to_hub(\n", - " owner=\"your-username\",\n", - " parent_artifact_id=\"your-username/my-first-benchmark\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "The End." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/docs/tutorials/create_a_dataset.ipynb b/docs/tutorials/create_a_dataset.ipynb deleted file mode 100644 index 620b5657..00000000 --- a/docs/tutorials/create_a_dataset.ipynb +++ /dev/null @@ -1,392 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "On the surface, a dataset in Polaris is simply a tabular collection of data, storing datapoints in a row-wise manner. However, as you try create your own, you'll realize that there is some additional complexity under the hood.\n", - "\n", - "## Create a Dataset\n", - "\n", - "To create a dataset, you need to instantiate the `DatasetV2` class. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from polaris.dataset import DatasetV2, ColumnAnnotation\n", - "\n", - "dataset = DatasetV2(\n", - " \n", - " # Specify metadata on the dataset level\n", - " name=\"tutorial-example\",\n", - " owner=\"your-username\",\n", - " tags=[\"small-molecules\", \"predictive\", \"admet\"],\n", - " source=\"https://example.com\",\n", - " license=\"CC-BY-4.0\",\n", - " \n", - " # Specify metadata on the column level\n", - " annotations = {\n", - " \"Ligand Pose\": ColumnAnnotation(\n", - " description=\"The 3D pose of the ligand\", \n", - " user_attributes={\"Object Type\": \"rdkit.Chem.Mol\"}, \n", - " modality=\"MOLECULE_3D\"\n", - " ),\n", - " \"Ligand SMILES\": ColumnAnnotation(\n", - " description=\"The 2D graph structure of the ligand, as SMILES\", \n", - " user_attributes={\"Object Type\": \"str\"}, \n", - " modality=\"MOLECULE\"\n", - " ),\n", - " \"Permeability\": ColumnAnnotation(\n", - " description=\"MDR1-MDCK efflux ratio (B-A/A-B)\", \n", - " user_attributes={\"Unit\": \"mL/min/kg\"}\n", - " )\n", - " },\n", - " \n", - " # Specify the actual data\n", - " zarr_root_path=\"path/to/root.zarr\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For the rest of this tutorial, we will take a deeper look at the `zarr_root_path` parameter.\n", - "\n", - "First, some context.\n", - "\n", - "## Universal and ML-ready\n", - "\n", - "![image](../images/zarr.png)\n", - "_An illustration of Zarr, which is core to Polaris its datamodel_\n", - "\n", - "With the Polaris Hub we set out to design a universal data format for ML scientists in drug discovery. Whether you’re working with phenomics, small molecules, or protein structures, you shouldn’t have to spend time learning about domain-specific file formats, APIs, and software tools to be able to run some ML experiments. Beyond modalities, drug discovery datasets also come in different sizes, from kilobytes to terabytes.\n", - "
\n", - "\n", - "We found such a universal data format in [Zarr](https://zarr.readthedocs.io/). Zarr is a powerful library for storage of n-dimensional arrays, supporting chunking, compression, and various backends, making it a versatile choice for scientific and large-scale data. It's similar to HDF5, if you're familiar with that. \n", - "\n", - "Want to learn more? \n", - "- Learn about the motivation of our dataset implementation [here](https://polarishub.io/blog/dataset-v2-built-to-scale).\n", - "- Learn what we mean by ML-ready [here](https://polarishub.io/blog/dataset-v2-built-to-scale).\n", - "\n", - "## Zarr basics\n", - "Zarr is well [documented](https://zarr.readthedocs.io/en/stable/index.html) and before continuing this tutorial, we recommend you to at least read through the [Quickstart](https://zarr.readthedocs.io/en/stable/quickstart.html).\n", - "\n", - "## Converting to Zarr\n", - "In its most basic form, a Polaris compatible Zarr archive is a single Zarr group (the _root_) with equal length Zarr arrays for each of the columns in the dataset.\n", - "\n", - "Chances are that your dataset is currently not stored in a Zarr archive. We will show you how to convert a few common formats to a Polaris compatible Zarr archive.\n", - "\n", - "### From a Numpy Array\n", - "The most simple case is if you have your data in a NumPy array." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "data = np.random.random(2048)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import zarr\n", - "\n", - "# Create an empty Zarr group\n", - "root = zarr.open(path, \"w\")\n", - "\n", - "# Populate it with the array\n", - "root.array(\"column_name\", data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### From a DataFrame\n", - "Since Pandas DataFrames can be thought of as labeled NumPy arrays, converting a DataFrame is straight-forward too." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "df = pd.DataFrame({\n", - " \"A\": np.random.random(2048),\n", - " \"B\": np.random.random(2048)\n", - "})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Converting it to Zarr is as simple as creating equally named Zarr Arrays." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import zarr\n", - "\n", - "# Create an empty Zarr group\n", - "root = zarr.open(zarr_root_path, \"w\")\n", - "\n", - "# Populate it with the arrays\n", - "for col in set(df.columns):\n", - " root.array(col, data=df[col].values)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Things get a little more tricky if you have columns with the `object` dtype, for example text." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[\"C\"] = [\"test\"] * 2048" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In that case you need to tell Zarr how to encode the Python object." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numcodecs\n", - "\n", - "root.array(\"C\", data=df[\"C\"].values, dtype=object, object_codec=numcodecs.VLenUTF8())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### From RDKit (e.g. SDF)\n", - "\n", - "The ability to encode custom Python objects is powerful. \n", - "\n", - "Using custom object codecs that Polaris provides, we can for example also store RDKit [`Chem.Mol`](https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html#rdkit.Chem.rdchem.Mol) objects in a Zarr array." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create an exemplary molecule\n", - "mol = Chem.MolFromSmiles('Cc1ccccc1')\n", - "mol" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from polaris.dataset.zarr.codecs import RDKitMolCodec\n", - "\n", - "# Write it to a Zarr array\n", - "root = zarr.open(zarr_root_path, \"w\")\n", - "root.array(\"molecules\", data=[mol] * 100, dtype=object, object_codec=RDKitMolCodec())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A common use case of this is to convert a number of **SDF files** to a Zarr array.\n", - "\n", - "1. Load the SDF files using RDKit to `Chem.Mol` objects.\n", - "2. Create a Zarr array with the `RDKitMolCodec`.\n", - "3. Store all RDKit objects in the Zarr array.\n", - "\n", - "### From Biotite (e.g. mmCIF)\n", - "Similarly, we can also store entire protein structures, as represented by the Biotite [`AtomArray`](https://www.biotite-python.org/latest/apidoc/biotite.structure.AtomArray.html) class." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from tempfile import TemporaryDirectory\n", - "\n", - "import biotite.database.rcsb as rcsb\n", - "from biotite.structure.io import load_structure\n", - "\n", - "# Load an exemplary structure\n", - "with TemporaryDirectory() as tmpdir: \n", - " path = rcsb.fetch(\"1l2y\", \"pdb\", tmpdir)\n", - " struct = load_structure(path, model=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from polaris.dataset.zarr.codecs import AtomArrayCodec\n", - "\n", - "# Write it to a Zarr array\n", - "root = zarr.open(zarr_root_path, \"w\")\n", - "root.array(\"molecules\", data=[struct] * 100, dtype=object, object_codec=AtomArrayCodec())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### From Images (e.g. PNG)\n", - "For more convential formats, such as images, codecs likely exist already.\n", - "\n", - "For images for example, these codecs are bundled in [`imagecodecs`](https://github.com/cgohlke/imagecodecs), which is an optional dependency of Polaris.\n", - "\n", - "An image is commonly represented as a 3D array (i.e. width x height x channels). It's therefore not needed to use object_codecs here. Instead, we specify the _compressor_ Zarr should use to compress its _chunks_." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from imagecodecs.numcodecs import Jpeg2k\n", - "\n", - "# You need to explicitly register the codec\n", - "numcodecs.register_codec(Jpeg2k)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "root = zarr.open(zarr_root_path, \"w\")\n", - "\n", - "# Array with a single 3 channel image\n", - "arr = root.zeros(\n", - " \"image\",\n", - " shape=(1, 512, 512, 3),\n", - " chunks=(1, 512, 512, 3),\n", - " dtype='u1',\n", - " compressor=Jpeg2k(level=52, reversible=True),\n", - ")\n", - "\n", - "arr[0] = img" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Share your dataset\n", - "Want to share your dataset with the community? Upload it to the Polaris Hub!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataset.upload_to_hub(owner=\"your-username\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you want to upload a new version of your dataset, you can specify its previous version with the `parent_artifact_id` parameter. Don't forget to add a changelog describing your updates!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataset.artifact_changelog = \"In this version, I added...\"\n", - "\n", - "dataset.upload_to_hub(\n", - " owner=\"your-username\",\n", - " parent_artifact_id=\"your-username/tutorial-example\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Advanced: Optimization\n", - "In this tutorial, we only briefly touched on the high-level concepts that need to be understood to create a Polaris compatible dataset using Zarr. However, Zarr has a lot more to offer and tweaking the settings **can drastically improve storage or data access efficiency.**\n", - "\n", - "If you would like to learn more, please see the [Zarr documentation](https://zarr.readthedocs.io/en/stable/user-guide/performance.html#changing-chunk-shapes-rechunking).\n", - "\n", - "---\n", - "\n", - "The End." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/mkdocs.yml b/mkdocs.yml index 91da959c..08f62438 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -21,9 +21,6 @@ nav: - Submit: - Submit to a Benchmark: tutorials/submit_to_benchmark.ipynb - Submit to a Competition: tutorials/submit_to_competition.ipynb - - Create: - - Create a Dataset: tutorials/create_a_dataset.ipynb - - Create a Benchmark: tutorials/create_a_benchmark.ipynb - API Reference: - Load: api/load.md - Core: From 151bb26baa85ceb31fb35dc4c03e90431a862a01 Mon Sep 17 00:00:00 2001 From: Daniel Peng Date: Tue, 13 May 2025 10:58:44 -0400 Subject: [PATCH 3/5] chore: removed unused imports --- polaris/benchmark/_base.py | 3 --- polaris/dataset/_base.py | 2 -- polaris/dataset/_dataset.py | 2 -- polaris/dataset/_dataset_v2.py | 2 +- polaris/hub/client.py | 7 ------- 5 files changed, 1 insertion(+), 15 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 3014e753..a2fa6c9f 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -24,13 +24,10 @@ from polaris.dataset._base import BaseDataset from polaris.evaluate import BenchmarkResultsV1, BenchmarkResultsV2 from polaris.evaluate.utils import evaluate_benchmark -from polaris.hub.settings import PolarisHubSettings from polaris.mixins import ChecksumMixin from polaris.utils.dict2html import dict2html from polaris.utils.errors import InvalidBenchmarkError from polaris.utils.types import ( - AccessType, - HubOwner, IncomingPredictionsType, TargetType, ) diff --git a/polaris/dataset/_base.py b/polaris/dataset/_base.py index 115c38ba..86d60bed 100644 --- a/polaris/dataset/_base.py +++ b/polaris/dataset/_base.py @@ -29,11 +29,9 @@ from polaris.utils.dict2html import dict2html from polaris.utils.errors import InvalidDatasetError from polaris.utils.types import ( - AccessType, ChecksumStrategy, DatasetIndex, HttpUrlString, - HubOwner, SupportedLicenseType, ZarrConflictResolution, ) diff --git a/polaris/dataset/_dataset.py b/polaris/dataset/_dataset.py index a2b4fa8a..df097d66 100644 --- a/polaris/dataset/_dataset.py +++ b/polaris/dataset/_dataset.py @@ -18,9 +18,7 @@ from polaris.mixins._checksum import ChecksumMixin from polaris.utils.errors import InvalidDatasetError from polaris.utils.types import ( - AccessType, ChecksumStrategy, - HubOwner, ZarrConflictResolution, ) diff --git a/polaris/dataset/_dataset_v2.py b/polaris/dataset/_dataset_v2.py index 69cf3d5c..98a817bf 100644 --- a/polaris/dataset/_dataset_v2.py +++ b/polaris/dataset/_dataset_v2.py @@ -15,7 +15,7 @@ from polaris.dataset._base import BaseDataset from polaris.dataset.zarr._manifest import calculate_file_md5, generate_zarr_manifest from polaris.utils.errors import InvalidDatasetError -from polaris.utils.types import AccessType, ChecksumStrategy, HubOwner, ZarrConflictResolution +from polaris.utils.types import ChecksumStrategy, ZarrConflictResolution logger = logging.getLogger(__name__) diff --git a/polaris/hub/client.py b/polaris/hub/client.py index c4b6ee74..f5a72fab 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -1,13 +1,10 @@ import json import logging -from hashlib 
import md5 from io import BytesIO -from typing import get_args from urllib.parse import urljoin import httpx import pandas as pd -import zarr from authlib.integrations.base_client.errors import InvalidTokenError, MissingTokenError from authlib.integrations.httpx_client import OAuth2Client, OAuthError from authlib.oauth2 import OAuth2Error, TokenAuth @@ -31,7 +28,6 @@ from polaris.hub.storage import StorageSession from polaris.utils.context import track_progress from polaris.utils.errors import ( - InvalidDatasetError, PolarisCreateArtifactError, PolarisHubError, PolarisRetrieveArtifactError, @@ -42,9 +38,6 @@ AccessType, ChecksumStrategy, HubOwner, - SupportedLicenseType, - TimeoutTypes, - ZarrConflictResolution, ) logger = logging.getLogger(__name__) From fcc78cfedaf07bfb2b29055080d8e1f386cff32e Mon Sep 17 00:00:00 2001 From: Daniel Peng Date: Thu, 15 May 2025 12:38:22 -0400 Subject: [PATCH 4/5] restored artifact-specific upload_to_hub functions, raised PolarisDeprecatedError for client.py upload_to_hub functions --- polaris/benchmark/_base.py | 27 +++++++++++++++++++++++++++ polaris/dataset/_base.py | 12 ++++++++++++ polaris/dataset/_dataset.py | 17 +++++++++++++++++ polaris/dataset/_dataset_v2.py | 17 ++++++++++++++++- polaris/hub/client.py | 27 +++++++++++++++++++++++++++ polaris/utils/errors.py | 9 +++++++++ 6 files changed, 108 insertions(+), 1 deletion(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index a2fa6c9f..67d73ba8 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -24,10 +24,13 @@ from polaris.dataset._base import BaseDataset from polaris.evaluate import BenchmarkResultsV1, BenchmarkResultsV2 from polaris.evaluate.utils import evaluate_benchmark +from polaris.hub.settings import PolarisHubSettings from polaris.mixins import ChecksumMixin from polaris.utils.dict2html import dict2html from polaris.utils.errors import InvalidBenchmarkError from polaris.utils.types import ( + AccessType, + HubOwner, IncomingPredictionsType, TargetType, ) @@ -164,6 +167,30 @@ def _get_subset(self, indices, hide_targets=True, featurization_fn=None) -> Subs featurization_fn=featurization_fn, ) + def upload_to_hub( + self, + settings: PolarisHubSettings | None = None, + cache_auth_token: bool = True, + access: AccessType = "private", + owner: HubOwner | str | None = None, + parent_artifact_id: str | None = None, + **kwargs: dict, + ): + """ + Very light, convenient wrapper around the + [`PolarisHubClient.upload_benchmark`][polaris.hub.client.PolarisHubClient.upload_benchmark] method. + """ + from polaris.hub.client import PolarisHubClient + + with PolarisHubClient( + settings=settings, + cache_auth_token=cache_auth_token, + **kwargs, + ) as client: + return client.upload_benchmark( + self, access=access, owner=owner, parent_artifact_id=parent_artifact_id + ) + def to_json(self, destination: str) -> str: """Save the benchmark to a destination directory as a JSON file. 
diff --git a/polaris/dataset/_base.py b/polaris/dataset/_base.py index 86d60bed..8ecccf98 100644 --- a/polaris/dataset/_base.py +++ b/polaris/dataset/_base.py @@ -29,9 +29,11 @@ from polaris.utils.dict2html import dict2html from polaris.utils.errors import InvalidDatasetError from polaris.utils.types import ( + AccessType, ChecksumStrategy, DatasetIndex, HttpUrlString, + HubOwner, SupportedLicenseType, ZarrConflictResolution, ) @@ -299,6 +301,16 @@ def get_data( """ raise NotImplementedError + @abc.abstractmethod + def upload_to_hub( + self, + access: AccessType = "private", + owner: HubOwner | str | None = None, + parent_artifact_id: str | None = None, + ): + """Uploads the dataset to the Polaris Hub.""" + raise NotImplementedError + @classmethod @abc.abstractmethod def from_json(cls, path: str): diff --git a/polaris/dataset/_dataset.py b/polaris/dataset/_dataset.py index df097d66..df2325c1 100644 --- a/polaris/dataset/_dataset.py +++ b/polaris/dataset/_dataset.py @@ -18,7 +18,9 @@ from polaris.mixins._checksum import ChecksumMixin from polaris.utils.errors import InvalidDatasetError from polaris.utils.types import ( + AccessType, ChecksumStrategy, + HubOwner, ZarrConflictResolution, ) @@ -216,6 +218,21 @@ def get_data( return arr + def upload_to_hub( + self, + access: AccessType = "private", + owner: HubOwner | str | None = None, + parent_artifact_id: str | None = None, + ): + """ + Very light, convenient wrapper around the + [`PolarisHubClient.upload_dataset`][polaris.hub.client.PolarisHubClient.upload_dataset] method. + """ + from polaris.hub.client import PolarisHubClient + + with PolarisHubClient() as client: + client.upload_dataset(self, owner=owner, access=access, parent_artifact_id=parent_artifact_id) + @classmethod def from_json(cls, path: str): """Loads a dataset from a JSON file. diff --git a/polaris/dataset/_dataset_v2.py b/polaris/dataset/_dataset_v2.py index 98a817bf..faa842d6 100644 --- a/polaris/dataset/_dataset_v2.py +++ b/polaris/dataset/_dataset_v2.py @@ -15,7 +15,7 @@ from polaris.dataset._base import BaseDataset from polaris.dataset.zarr._manifest import calculate_file_md5, generate_zarr_manifest from polaris.utils.errors import InvalidDatasetError -from polaris.utils.types import ChecksumStrategy, ZarrConflictResolution +from polaris.utils.types import AccessType, ChecksumStrategy, HubOwner, ZarrConflictResolution logger = logging.getLogger(__name__) @@ -191,6 +191,21 @@ def get_data(self, row: int, col: str, adapters: dict[str, Adapter] | None = Non arr = adapter(arr) return arr + + def upload_to_hub( + self, + access: AccessType = "private", + owner: HubOwner | str | None = None, + parent_artifact_id: str | None = None, + ): + """ + Uploads the dataset to the Polaris Hub. + """ + + from polaris.hub.client import PolarisHubClient + + with PolarisHubClient() as client: + client.upload_dataset(self, owner=owner, access=access, parent_artifact_id=parent_artifact_id) @classmethod def from_json(cls, path: str): diff --git a/polaris/hub/client.py b/polaris/hub/client.py index f5a72fab..8e403765 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -33,11 +33,14 @@ PolarisRetrieveArtifactError, PolarisSSLError, PolarisUnauthorizedError, + PolarisDeprecatedError, ) from polaris.utils.types import ( AccessType, ChecksumStrategy, HubOwner, + TimeoutTypes, + ZarrConflictResolution, ) logger = logging.getLogger(__name__) @@ -528,6 +531,30 @@ def upload_results( f"[green]Your result has been successfully uploaded to the Hub. 
View it here: {result_url}" ) + # Note: Unused parameters are included in signature for backwards compatibility. + # Removing these parameters results in a TypeError before the PolarisDeprecatedError is raised. + def upload_dataset( + self, + dataset: DatasetV1 | DatasetV2, + access: AccessType = "private", + timeout: TimeoutTypes = (10, 200), + owner: HubOwner | str | None = None, + if_exists: ZarrConflictResolution = "replace", + parent_artifact_id: str | None = None, + ): + raise PolarisDeprecatedError("dataset uploading") + + # Note: Unused parameters are included in signature for backwards compatibility. + # Removing these parameters results in a TypeError before the PolarisDeprecatedError is raised. + def upload_benchmark( + self, + benchmark: BenchmarkV1Specification | BenchmarkV2Specification, + access: AccessType = "private", + owner: HubOwner | str | None = None, + parent_artifact_id: str | None = None, + ): + raise PolarisDeprecatedError("benchmark uploading") + def get_competition(self, artifact_id: str) -> CompetitionSpecification: """Load a competition from the Polaris Hub. diff --git a/polaris/utils/errors.py b/polaris/utils/errors.py index f647f63d..e42124dd 100644 --- a/polaris/utils/errors.py +++ b/polaris/utils/errors.py @@ -98,3 +98,12 @@ def __init__(self, response_text: str = ""): "SSL verification by setting the POLARIS_CA_BUNDLE environment variable to `false`." ) super().__init__(message, response_text) + + +class PolarisDeprecatedError(PolarisHubError): + def __init__(self, feature: str, response_text: str = ""): + message = ( + f"The '{feature}' feature has been deprecated and is no longer supported. " + "Please contact the Polaris team for more information about alternative approaches." + ) + super().__init__(message, response_text) From 0f15faf3b8009b9f85066c9a03db44ba03b553f1 Mon Sep 17 00:00:00 2001 From: Daniel Peng Date: Thu, 15 May 2025 13:23:18 -0400 Subject: [PATCH 5/5] ruff formatting fix, mkdocs fix --- polaris/dataset/_dataset_v2.py | 2 +- polaris/hub/client.py | 51 ++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/polaris/dataset/_dataset_v2.py b/polaris/dataset/_dataset_v2.py index faa842d6..40ba3ba5 100644 --- a/polaris/dataset/_dataset_v2.py +++ b/polaris/dataset/_dataset_v2.py @@ -191,7 +191,7 @@ def get_data(self, row: int, col: str, adapters: dict[str, Adapter] | None = Non arr = adapter(arr) return arr - + def upload_to_hub( self, access: AccessType = "private", diff --git a/polaris/hub/client.py b/polaris/hub/client.py index 8e403765..0677cc1c 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -542,6 +542,33 @@ def upload_dataset( if_exists: ZarrConflictResolution = "replace", parent_artifact_id: str | None = None, ): + """Upload a dataset to the Polaris Hub. + + This functionality has been deprecated. + + Info: Owner + You have to manually specify the owner in the dataset data model. Because the owner could + be a user or an organization, we cannot automatically infer this from just the logged-in user. + + Note: Required metadata + The Polaris client and Hub maintain different requirements as to which metadata is required. + The requirements by the Hub are stricter, so when uploading to the Hub you might + get some errors on missing metadata. Make sure to fill-in as much of the metadata as possible + before uploading. + + Args: + dataset: The dataset to upload. + access: Grant public or private access to result + timeout: Request timeout values. 
User can modify the value when uploading large datasets as needed.
+                This can be a single value with the timeout in seconds for all IO operations, or a more granular
+                tuple with (connect_timeout, write_timeout). The type of the timeout parameter comes from `httpx`.
+                Since datasets can get large, you may need to increase the write timeout for larger datasets.
+                See also: https://www.python-httpx.org/advanced/#timeout-configuration
+            owner: Which Hub user or organization owns the artifact. Takes precedence over `dataset.owner`.
+            if_exists: Action for handling existing files in the Zarr archive. Options are 'raise' to throw
+                an error, 'replace' to overwrite, or 'skip' to proceed without altering the existing files.
+            parent_artifact_id: The `owner/slug` of the parent dataset, if uploading a new version of a dataset.
+        """
+        raise PolarisDeprecatedError("dataset uploading")

     # Note: Unused parameters are included in signature for backwards compatibility.
     # Removing these parameters results in a TypeError before the PolarisDeprecatedError is raised.
     def upload_benchmark(
         self,
         benchmark: BenchmarkV1Specification | BenchmarkV2Specification,
         access: AccessType = "private",
         owner: HubOwner | str | None = None,
         parent_artifact_id: str | None = None,
     ):
+        """Upload a benchmark to the Polaris Hub.
+
+        This functionality has been deprecated.
+
+        Info: Owner
+            You have to manually specify the owner in the benchmark data model. Because the owner could
+            be a user or an organization, we cannot automatically infer this from the logged-in user.
+
+        Note: Required metadata
+            The Polaris client and Hub maintain different requirements as to which metadata is required.
+            The requirements by the Hub are stricter, so when uploading to the Hub you might
+            get some errors on missing metadata. Make sure to fill in as much of the metadata as possible
+            before uploading.
+
+        Note: Non-existent datasets
+            The client will _not_ upload the associated dataset to the Hub if it does not yet exist.
+            Make sure to specify an existing dataset or upload the dataset first.
+
+        Args:
+            benchmark: The benchmark to upload.
+            access: Grant public or private access to result
+            owner: Which Hub user or organization owns the artifact. Takes precedence over `benchmark.owner`.
+            parent_artifact_id: The `owner/slug` of the parent benchmark, if uploading a new version of a benchmark.
+        """
         raise PolarisDeprecatedError("benchmark uploading")

     def get_competition(self, artifact_id: str) -> CompetitionSpecification: