From 75a1b6f17c1c48e9ff3b9c5f312c18ea37f27e59 Mon Sep 17 00:00:00 2001 From: cwognum Date: Wed, 18 Oct 2023 16:14:55 -0400 Subject: [PATCH 1/2] Redid the BenchmarkResults structure to be tabular --- polaris/benchmark/_base.py | 48 +++++++++++----- polaris/dataset/_dataset.py | 19 +++++++ polaris/evaluate/_results.py | 104 ++++++++++++++++++++++++++++------- polaris/hub/client.py | 25 +++++---- polaris/loader/__init__.py | 6 -- polaris/utils/errors.py | 4 +- tests/test_evaluate.py | 7 ++- 7 files changed, 160 insertions(+), 53 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index af97fb91..d3adbd66 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -1,9 +1,11 @@ import json +import os from hashlib import md5 from typing import Any, Optional, Union import fsspec import numpy as np +import pandas as pd from pydantic import ( ConfigDict, FieldValidationInfo, @@ -15,6 +17,7 @@ from polaris._artifact import BaseArtifactModel from polaris.dataset import Dataset, Subset from polaris.evaluate import BenchmarkResults, Metric, ResultsType +from polaris.hub.settings import PolarisHubSettings from polaris.utils import fs from polaris.utils.context import tmp_attribute_change from polaris.utils.dict2html import dict2html @@ -333,40 +336,59 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: if not isinstance(y_pred, dict) or all(k in self.target_cols for k in y_pred): y_pred = {"test": y_pred} + if any(k not in y_pred for k in test.keys()): raise KeyError( f"Missing keys for at least one of the test sets. Expecting: {sorted(test.keys())}" ) - scores: ResultsType = {} + # Results are saved in a tabular format. For more info, see the BenchmarkResults docs. + scores: ResultsType = pd.DataFrame(columns=BenchmarkResults.RESULTS_COLUMNS) # For every test set... for test_label, y_true_subset in y_true.items(): - scores[test_label] = {} - # For every metric... for metric in self.metrics: - if metric.is_multitask or not isinstance(y_true_subset, dict): - # Either single-task or multi-task but with a metric across targets - scores[test_label][metric] = metric(y_true=y_true_subset, y_pred=y_pred[test_label]) + if metric.is_multitask: + # Multi-task but with a metric across targets + score = metric(y_true=y_true_subset, y_pred=y_pred[test_label]) + scores.loc[len(scores)] = (test_label, "all", metric, score) + continue + + if not isinstance(y_true_subset, dict): + # Single task + score = metric(y_true=y_true_subset, y_pred=y_pred[test_label]) + scores.loc[len(scores)] = (test_label, self.target_cols[0], metric, score) continue # Otherwise, for every target... for target_label, y_true_target in y_true_subset.items(): - if target_label not in scores[test_label]: - scores[test_label][target_label] = {} - # Single-task metrics for a multi-task benchmark # In such a setting, there can be NaN values, which we thus have to filter out. mask = ~np.isnan(y_true_target) score = metric(y_true=y_true_target[mask], y_pred=y_pred[test_label][target_label][mask]) - scores[test_label][target_label][metric] = score - - if len(scores) == 1: - scores = scores["test"] + scores.loc[len(scores)] = (test_label, target_label, metric, score) return BenchmarkResults(results=scores, benchmark_name=self.name, benchmark_owner=self.owner) + def upload_to_hub( + self, + env_file: Optional[Union[str, os.PathLike]] = None, + settings: Optional[PolarisHubSettings] = None, + cache_auth_token: bool = True, + **kwargs: dict, + ): + """ + Very light, convenient wrapper around the + [`PolarisHubClient.upload_benchmark`][polaris.hub.client.PolarisHubClient.upload_benchmark] method. + """ + from polaris.hub.client import PolarisHubClient + + with PolarisHubClient( + env_file=env_file, settings=settings, cache_auth_token=cache_auth_token, **kwargs + ) as client: + return client.upload_benchmark(self) + def to_json(self, destination: str) -> str: """Save the benchmark to a destination directory as a JSON file. diff --git a/polaris/dataset/_dataset.py b/polaris/dataset/_dataset.py index 448a751a..7ba79405 100644 --- a/polaris/dataset/_dataset.py +++ b/polaris/dataset/_dataset.py @@ -18,6 +18,7 @@ from polaris._artifact import BaseArtifactModel from polaris.dataset._column import ColumnAnnotation +from polaris.hub.settings import PolarisHubSettings from polaris.utils import fs from polaris.utils.constants import DEFAULT_CACHE_DIR from polaris.utils.dict2html import dict2html @@ -201,6 +202,24 @@ def _load(p: str, index: Optional[int]) -> np.ndarray: self._has_been_warned = True return _load(value, index) + def upload_to_hub( + self, + env_file: Optional[Union[str, os.PathLike]] = None, + settings: Optional[PolarisHubSettings] = None, + cache_auth_token: bool = True, + **kwargs: dict, + ): + """ + Very light, convenient wrapper around the + [`PolarisHubClient.upload_dataset`][polaris.hub.client.PolarisHubClient.upload_dataset] method. + """ + from polaris.hub.client import PolarisHubClient + + with PolarisHubClient( + env_file=env_file, settings=settings, cache_auth_token=cache_auth_token, **kwargs + ) as client: + return client.upload_dataset(self) + @classmethod def from_zarr(cls, path: str) -> "Dataset": """Parse a [.zarr](https://zarr.readthedocs.io/en/stable/index.html) hierarchy into a Polaris `Dataset`. diff --git a/polaris/evaluate/_results.py b/polaris/evaluate/_results.py index d5a47a26..9e483849 100644 --- a/polaris/evaluate/_results.py +++ b/polaris/evaluate/_results.py @@ -1,22 +1,23 @@ import json +import os from datetime import datetime -from typing import Optional, Union +from typing import ClassVar, Optional, Union -from pydantic import ConfigDict, Field, PrivateAttr, field_serializer +import pandas as pd +from pydantic import ConfigDict, Field, PrivateAttr, field_serializer, field_validator from polaris._artifact import BaseArtifactModel -from polaris.evaluate._metric import Metric +from polaris.evaluate import Metric +from polaris.hub.settings import PolarisHubSettings from polaris.utils.dict2html import dict2html +from polaris.utils.errors import InvalidResultError from polaris.utils.misc import to_lower_camel from polaris.utils.types import HttpUrlString, HubOwner, HubUser # Define some helpful type aliases TestLabelType = str TargetLabelType = str -MetricScoresType = dict[Union[str, Metric], float] -ResultsType = Union[ - MetricScoresType, dict[TestLabelType, Union[MetricScoresType, dict[TargetLabelType, MetricScoresType]]] -] +ResultsType = Union[pd.DataFrame, dict] class BenchmarkResults(BaseArtifactModel): @@ -26,13 +27,23 @@ class BenchmarkResults(BaseArtifactModel): In addition to the metrics on the test set, it contains additional meta-data and logic to integrate the results with the Polaris Hub. + The actual results are saved in the `results` field using the following tabular format: + + | Test set | Target label | Metric | Score | + | -------- | ------------ | ------ | ----- | + | test_iid | EGFR_WT | AUC | 0.9 | + | test_ood | EGFR_WT | AUC | 0.75 | + | ... | ... | ... | ... | + | test_ood | EGFR_L858R | AUC | 0.79 | + question: Categorizing methods An open question is how to best categorize a methodology (e.g. a model). This is needed since we would like to be able to aggregate results across benchmarks too, to say something about which (type of) methods performs best _in general_. Attributes: - results: Benchmark results are stored as a dictionary + results: Benchmark results are stored directly in a dataframe or in a serialized, JSON compatible dict + with the split orientation that can be decoded into the associated tabular format. benchmark_name: The name of the benchmark for which these results were generated. Together with the benchmark owner, this uniquely identifies the benchmark on the Hub. benchmark_owner: The owner of the benchmark for which these results were generated. @@ -44,6 +55,9 @@ class BenchmarkResults(BaseArtifactModel): For additional meta-data attributes, see the [`BaseArtifactModel`][polaris._artifact.BaseArtifactModel] class. """ + # Define the columns of the results table + RESULTS_COLUMNS: ClassVar[list[str]] = ["Test set", "Target label", "Metric", "Score"] + # Data results: ResultsType benchmark_name: str = Field(..., frozen=True) @@ -57,23 +71,75 @@ class BenchmarkResults(BaseArtifactModel): # Private attributes _created_at: datetime = PrivateAttr(default_factory=datetime.now) - model_config = ConfigDict(alias_generator=to_lower_camel, populate_by_name=True) + # Model config + model_config = ConfigDict( + alias_generator=to_lower_camel, populate_by_name=True, arbitrary_types_allowed=True + ) + + @field_validator("results") + def _validate_results(cls, v): + """Ensure the results are a valid dataframe and have the expected columns""" + + # If not a dataframe, assume it is a JSON-serialized, split-oriented export of a dataframe. + if not isinstance(v, pd.DataFrame): + try: + v = pd.read_json(json.dumps(v), orient="split") + except (ValueError, UnicodeDecodeError) as error: + print(error) + raise InvalidResultError( + "The provided dictionary is not a valid, split-oriented JSON export of a Pandas dataframe" + ) from error + + # Check if the dataframe contains _only_ the expected columns + if set(v.columns) != set(cls.RESULTS_COLUMNS): + raise InvalidResultError( + f"The results dataframe should have the following columns: {cls.RESULTS_COLUMNS}" + ) + + # Check if the results are not empty + if v.empty: + raise InvalidResultError("The results dataframe is empty") + + # NOTE (cwognum): Since we have a reference to the benchmark, I considered validating the values in the + # columns as well (e.g. are all metrics, targets and test sets actually part of the benchmark). + # However, to keep this class light-weight, I did not want to add a strict dependency on the full benchmark class. + # Especially because validation will happen on the Hub as well before it is shown there. + return v @field_serializer("results") - def serialize_results(self, value: ResultsType): + def _serialize_results(self, value: ResultsType): """Change from the Metric enum to a string representation""" - - def _recursive_enum_to_str(d: dict): - """Utility function to easily traverse the nested dictionary""" - if not isinstance(d, dict): - return d - return {k.name if isinstance(k, Metric) else k: _recursive_enum_to_str(v) for k, v in d.items()} - - return _recursive_enum_to_str(value) + self.results["Metric"] = self.results["Metric"].apply( + lambda x: x.name if isinstance(x, Metric) else x + ) + return json.loads(value.to_json(orient="split")) + + def upload_to_hub( + self, + env_file: Optional[Union[str, os.PathLike]] = None, + settings: Optional[PolarisHubSettings] = None, + cache_auth_token: bool = True, + **kwargs: dict, + ): + """ + Very light, convenient wrapper around the + [`PolarisHubClient.upload_results`][polaris.hub.client.PolarisHubClient.upload_results] method. + """ + from polaris.hub.client import PolarisHubClient + + with PolarisHubClient( + env_file=env_file, settings=settings, cache_auth_token=cache_auth_token, **kwargs + ) as client: + return client.upload_results(self) def _repr_dict_(self) -> dict: """Utility function for pretty-printing to the command line and jupyter notebooks""" - repr_dict = self.model_dump() + repr_dict = self.model_dump(exclude=["results"]) + + df = self.results.copy(deep=True) + df["Metric"] = df["Metric"].apply(lambda x: x.name if isinstance(x, Metric) else x) + repr_dict["results"] = json.loads(df.to_json(orient="records")) + return repr_dict def _repr_html_(self): diff --git a/polaris/hub/client.py b/polaris/hub/client.py index de9b5c28..a48543b3 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -205,10 +205,7 @@ def request(self, method, url, withhold_token=False, auth=httpx.USE_CLIENT_DEFAU except (MissingTokenError, InvalidTokenError, httpx.HTTPStatusError, OAuthError) as error: if isinstance(error, httpx.HTTPStatusError) and error.response.status_code != 401: raise - raise PolarisUnauthorizedError( - "You are not logged in to Polaris or your login has expired. " - "You can use the Polaris CLI to easily authenticate yourself again, see `polaris login --help`." - ) from error + raise PolarisUnauthorizedError from error return response # ========================= @@ -224,7 +221,10 @@ def user_info(self) -> dict: # Because of this, we also have to copy some code from the base `request` method to # make auto-refresh a token if needed. For more info, see: https://stackoverflow.com/a/62687390 - if self.token is None or not self.ensure_active_token(self.token): + try: + if self.token is None or not self.ensure_active_token(self.token): + raise PolarisUnauthorizedError + except OAuthError: raise PolarisUnauthorizedError if self._user_info is None: @@ -257,12 +257,15 @@ def login(self, overwrite: bool = False, auto_open_browser: bool = True): # Check if the user is already logged in if self.token is not None and not overwrite: - info = self.user_info - logger.info( - f"You are already logged in to the Polaris Hub as {info['username']} ({info['email']}). " - "Set `overwrite=True` to force re-authentication." - ) - return + try: + info = self.user_info + logger.info( + f"You are already logged in to the Polaris Hub as {info['username']} ({info['email']}). " + "Set `overwrite=True` to force re-authentication." + ) + return + except PolarisUnauthorizedError: + pass # Step 1: Redirect user to the authorization URL authorization_url, _ = self.create_authorization_url() diff --git a/polaris/loader/__init__.py b/polaris/loader/__init__.py index bc897fc0..61162261 100644 --- a/polaris/loader/__init__.py +++ b/polaris/loader/__init__.py @@ -22,9 +22,6 @@ def load_dataset(path: str): if not is_file: # Load from the Hub client = PolarisHubClient() - options = client.list_datasets() - if path not in options: - raise InvalidDatasetError(f"{path} is not a valid dataset.") return client.get_dataset(*path.split("/")) if extension == "zarr": @@ -45,9 +42,6 @@ def load_benchmark(path: str): if not is_file: # Load from the Hub client = PolarisHubClient() - options = client.list_benchmarks() - if path not in options: - raise InvalidBenchmarkError(f"{path} is not a valid benchmark. Make sure it exists!") return client.get_benchmark(*path.split("/")) with fsspec.open(path, "r") as fd: diff --git a/polaris/utils/errors.py b/polaris/utils/errors.py index c6408604..1cb46581 100644 --- a/polaris/utils/errors.py +++ b/polaris/utils/errors.py @@ -20,8 +20,8 @@ class PolarisHubError(Exception): class PolarisUnauthorizedError(PolarisHubError): DEFAULT_ERROR_MSG = ( - "You are not logged in to the Polaris Hub. Please use the Polaris CLI to login. " - "Use `polaris --help` for more information." + "You are not logged in to Polaris or your login has expired. " + "You can use the Polaris CLI to easily authenticate yourself again, see `polaris login --help`." ) def __init__(self, message: str = DEFAULT_ERROR_MSG): diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 68a3a145..427d6604 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -1,18 +1,21 @@ import os -from polaris.evaluate._metric import Metric +import pandas as pd + from polaris.evaluate._results import BenchmarkResults from polaris.utils.types import HubOwner def test_result_to_json(tmpdir: str, test_user_owner: HubOwner): + scores = pd.DataFrame({"Test set": ["A"], "Target label": "B", "Metric": "C", "Score": 0.1}) + result = BenchmarkResults( name="test", description="Lorem ipsum!", tags=["test"], user_attributes={"key": "value"}, owner=test_user_owner, - results={"test": {Metric.mean_absolute_error: 1.0}}, + results=scores, benchmark_name="my-benchmark", benchmark_owner=test_user_owner, github_url="https://github.com/", From 4f47e96e1195dcb674f50fd58a4a9eba6127b168 Mon Sep 17 00:00:00 2001 From: cwognum Date: Wed, 18 Oct 2023 16:36:57 -0400 Subject: [PATCH 2/2] Don't save the index. We don't really use it anyways! --- polaris/evaluate/_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polaris/evaluate/_results.py b/polaris/evaluate/_results.py index 9e483849..df2ee34a 100644 --- a/polaris/evaluate/_results.py +++ b/polaris/evaluate/_results.py @@ -112,7 +112,7 @@ def _serialize_results(self, value: ResultsType): self.results["Metric"] = self.results["Metric"].apply( lambda x: x.name if isinstance(x, Metric) else x ) - return json.loads(value.to_json(orient="split")) + return json.loads(value.to_json(orient="split", index=False)) def upload_to_hub( self,