diff --git a/docs/tutorials/basics.ipynb b/docs/tutorials/basics.ipynb index 8c91df01..22bbb0f0 100644 --- a/docs/tutorials/basics.ipynb +++ b/docs/tutorials/basics.ipynb @@ -63,7 +63,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2023-11-27 14:54:08.788\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mlogin\u001b[0m:\u001b[36m262\u001b[0m - \u001b[1mYou are already logged in to the Polaris Hub as cwognum (cas@valencediscovery.com). Set `overwrite=True` to force re-authentication.\u001b[0m\n" + "\u001b[32m2024-02-18 12:35:01.048\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mlogin\u001b[0m:\u001b[36m262\u001b[0m - \u001b[1mYou are already logged in to the Polaris Hub as cwognum (cas@valencelabs.com). Set `overwrite=True` to force re-authentication.\u001b[0m\n" ] } ], @@ -99,8 +99,8 @@ "metadata": {}, "outputs": [], "source": [ - "dataset = po.load_dataset(\"polaris/hello_world_dataset\")\n", - "benchmark = po.load_benchmark(\"polaris/hello_world_benchmark\")" + "dataset = po.load_dataset(\"polaris/hello-world\")\n", + "benchmark = po.load_benchmark(\"polaris/hello-world-benchmark\")" ] }, { @@ -231,10 +231,422 @@ "id": "748dd278-0fd0-4c5b-ac6a-8d974143c3b9", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to find the pandas get_adjustment() function to patch\n", + "Failed to patch pandas - PandasTools will have limited functionality\n" + ] + }, { "data": { "text/html": [ - "
RandomForestRegressor(max_depth=2, random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
RandomForestRegressor(max_depth=2, random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "RandomForestRegressor(max_depth=2, random_state=0)" @@ -249,12 +661,15 @@ "import datamol as dm\n", "from sklearn.ensemble import RandomForestRegressor\n", "\n", - "# Convert smiles to ECFP fingerprints\n", - "train_fps = [dm.to_fp(smi) for smi in train.inputs]\n", + "# Load the benchmark (automatically loads the underlying dataset as well)\n", + "benchmark = po.load_benchmark(\"polaris/hello_world_benchmark\")\n", + "\n", + "# Get the split and convert SMILES to ECFP fingerprints by specifying an featurize function.\n", + "train, test = benchmark.get_train_test_split(featurization_fn=dm.to_fp)\n", "\n", "# Define a model and train\n", "model = RandomForestRegressor(max_depth=2, random_state=0)\n", - "model.fit(train_fps, train.targets)" + "model.fit(train.X, train.y)" ] }, { @@ -272,8 +687,7 @@ "metadata": {}, "outputs": [], "source": [ - "test_fps = [dm.to_fp(smi) for smi in test.inputs]\n", - "predictions = model.predict(test_fps)" + "predictions = model.predict(test.X)" ] }, { @@ -285,7 +699,7 @@ { "data": { "text/html": [ - "
name: None
description: ""
tags: []
user_attributes: {}
owner: None
benchmark_name: hello_world_benchmark
benchmark_owner:
    slug: polaris
    external_id: org_2WG9hRFgKNIRtGw4orsMPcr1F4S
    type: organization
github_url: None
paper_url: None
contributors: None
artifact_id: None
benchmark_artifact_id: polaris/hello-world-benchmark
results:
    Test set | Target label | Metric              | Score
    test     | SOL          | mean_squared_error  | 2.6875139821
    test     | SOL          | mean_absolute_error | 1.2735690161
" + "
name: None
description: ""
tags: []
user_attributes: {}
owner: None
benchmark_name: hello-world-benchmark
benchmark_owner:
    slug: polaris
    external_id: org_2WG9hRFgKNIRtGw4orsMPcr1F4S
    type: organization
github_url: None
paper_url: None
contributors: None
artifact_id: None
benchmark_artifact_id: polaris/hello-world-benchmark
results:
    Test set | Target label | Metric              | Score
    test     | SOL          | mean_squared_error  | 2.6875139821
    test     | SOL          | mean_absolute_error | 1.2735690161
" ], "text/plain": [ "{\n", @@ -294,7 +708,7 @@ " \"tags\": [],\n", " \"user_attributes\": {},\n", " \"owner\": null,\n", - " \"benchmark_name\": \"hello_world_benchmark\",\n", + " \"benchmark_name\": \"hello-world-benchmark\",\n", " \"benchmark_owner\": {\n", " \"slug\": \"polaris\",\n", " \"external_id\": \"org_2WG9hRFgKNIRtGw4orsMPcr1F4S\",\n", @@ -349,7 +763,7 @@ "source": [ "results.name = f\"hello-world-result\"\n", "results.github_url = \"https://github.com/polaris-hub/polaris-hub\"\n", - "results.paper_url = \"https://polaris-hub.vercel.app\"\n", + "results.paper_url = \"https://polarishub.io/\"\n", "results.description = \"Hello, World!\"" ] }, @@ -373,11 +787,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/cas/micromamba/envs/polaris/lib/python3.12/site-packages/pydantic/main.py:308: UserWarning: Pydantic serializer warnings:\n", + "/Users/cas.wognum/micromamba/envs/polaris/lib/python3.12/site-packages/pydantic/main.py:314: UserWarning: Pydantic serializer warnings:\n", " Expected `url` but got `str` - serialized value may not be as expected\n", " Expected `url` but got `str` - serialized value may not be as expected\n", " return self.__pydantic_serializer__.to_python(\n", - "\u001b[32m2023-11-27 14:54:46.649\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mupload_results\u001b[0m:\u001b[36m428\u001b[0m - \u001b[32m\u001b[1mYour result has been successfully uploaded to the Hub. View it here: https://polarishub.io/benchmarks/polaris/hello_world_benchmark/ns4JrC3hQNK9M1hbVPchy\u001b[0m\n" + "\u001b[32m2024-02-18 12:35:09.465\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mupload_results\u001b[0m:\u001b[36m431\u001b[0m - \u001b[32m\u001b[1mYour result has been successfully uploaded to the Hub. View it here: https://polarishub.io/benchmarks/polaris/hello-world-benchmark/l3uWzFBEyaD09Sa4Aik21\u001b[0m\n" ] } ], @@ -415,7 +829,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.0" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 1b9e8e93..895cf25e 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -1,7 +1,7 @@ import json import os from hashlib import md5 -from typing import Any, Optional, Union +from typing import Any, Callable, Optional, Union import fsspec import numpy as np @@ -353,7 +353,10 @@ def task_type(self) -> TaskType: return v.value def get_train_test_split( - self, input_format: DataFormat = "dict", target_format: DataFormat = "dict" + self, + input_format: DataFormat = "dict", + target_format: DataFormat = "dict", + featurization_fn: Optional[Callable] = None, ) -> tuple[Subset, Union["Subset", dict[str, Subset]]]: """Construct the train and test sets, given the split in the benchmark specification. @@ -365,6 +368,8 @@ def get_train_test_split( input_format: How the input data is returned from the `Subset` object. target_format: How the target data is returned from the `Subset` object. This will only affect the train set. + featurization_fn: A function to apply to the input data. If a multi-input benchmark, this function + expects an input in the format specified by the `input_format` parameter. Returns: A tuple with the train `Subset` and test `Subset` objects. 
@@ -381,6 +386,7 @@ def _get_subset(indices, hide_targets): target_cols=self.target_cols, target_format=target_format, hide_targets=hide_targets, + featurization_fn=featurization_fn, ) train = _get_subset(self.split[0], hide_targets=False) @@ -388,6 +394,7 @@ def _get_subset(indices, hide_targets): test = {k: _get_subset(v, hide_targets=True) for k, v in self.split[1].items()} else: test = _get_subset(self.split[1], hide_targets=True) + return train, test def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: @@ -406,8 +413,10 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: 5. There can be metrics which measure across tasks. Args: - y_pred: The predictions for the test set, as NumPy arrays. If there are multiple test sets, - this should be a dictionary with the test set names as keys. + y_pred: The predictions for the test set, as NumPy arrays. + If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys. + If there are multiple test sets, the predictions should be further wrapped in a dictionary + with the test subset labels as keys. Returns: A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub. @@ -416,7 +425,7 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: # Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves. # This simplifies the API, but also was added to make accidental access to the test set targets less likely. # See also the `hide_targets` parameter in the `Subset` class. - test = self.get_train_test_split()[1] + test = self.get_train_test_split(target_format="dict")[1] if not isinstance(test, dict): test = {"test": test} diff --git a/polaris/dataset/_subset.py b/polaris/dataset/_subset.py index e1d4b9ae..04f5df95 100644 --- a/polaris/dataset/_subset.py +++ b/polaris/dataset/_subset.py @@ -1,9 +1,8 @@ -from typing import List, Literal, Optional, Sequence, Union +from typing import Callable, List, Literal, Optional, Sequence, Union import numpy as np from polaris.dataset import Dataset -from polaris.utils.context import tmp_attribute_change from polaris.utils.errors import TestAccessError from polaris.utils.types import DataFormat, DatapointType @@ -11,6 +10,20 @@ class Subset: """The `Subset` class provides easy access to a single partition of a split dataset. + Info: No need to create this class manually + You should not have to create this class manually. In most use-cases, you can create a `Subset` through the + `get_train_test_split` method of a `BenchmarkSpecification` object. + + Tip: Featurize your inputs + Not all datasets are already featurized. For example, a small-molecule task might simply provide the SMILES string. + To easily featurize the inputs, you can pass or set a transformation function. For example: + + ```python + import datamol as dm + + benchmark.get_train_test_split(..., featurization_fn=dm.to_fp) + ``` + This should be the starting point for any framework-specific (e.g. PyTorch, Tensorflow) data-loader implementation. How the data is loaded in Polaris can be non-trivial, so this class is provided to abstract away the details. To easily build framework-specific data-loaders, a `Subset` supports various styles of accessing the data: @@ -45,8 +58,6 @@ class Subset: TestAccessError: When trying to access the targets of the test set (specified by the `hide_targets` attribute). 
""" - _SUPPORTED_FORMATS = ["dict", "tuple"] - def __init__( self, dataset: Dataset, @@ -54,26 +65,19 @@ def __init__( input_cols: Union[List[str], str], target_cols: Union[List[str], str], input_format: DataFormat = "dict", - target_format: DataFormat = "tuple", + target_format: DataFormat = "dict", + featurization_fn: Optional[Callable] = None, hide_targets: bool = False, ): self.dataset = dataset self.indices = indices self.target_cols = target_cols if isinstance(target_cols, list) else [target_cols] self.input_cols = input_cols if isinstance(input_cols, list) else [input_cols] - - # Validate the output format - if input_format not in self._SUPPORTED_FORMATS: - raise ValueError( - f"Unsupported output format {input_format}. Choose from {self._SUPPORTED_FORMATS}" - ) - if target_format not in self._SUPPORTED_FORMATS: - raise ValueError( - f"Unsupported output format {target_format}. Choose from {self._SUPPORTED_FORMATS}" - ) self._input_format = input_format self._target_format = target_format + self._featurization_fn = featurization_fn + # For the iterator implementation self._pointer = 0 @@ -90,43 +94,73 @@ def is_multi_input(self): @property def inputs(self): - """ - Scikit-learn style access to the inputs. - If the dataset is multi-input, this will return a dict of arrays. - """ + """Alias for `self.as_array("x")`""" + return self.as_array("x") + + @property + def X(self): + """Alias for `self.as_array("x")`""" return self.as_array("x") @property def targets(self): - """ - Scikit-learn style access to the targets. - If the dataset is multi-target, this will return a dict of arrays. - """ + """Alias for `self.as_array("y")`""" + return self.as_array("y") + + @property + def y(self): + """Alias for `self.as_array("y")`""" return self.as_array("y") @staticmethod - def _convert(data: dict, order: List[str], fmt: str): - """Converts from the default dict format to the specified format""" + def _format(data: dict, order: List[str], fmt: str): + """ + Converts the internally used dict format to the user-specified format. + If the user-specified format is a tuple, it orders the column according to the specified order. + """ if len(data) == 1: data = list(data.values())[0] elif fmt == "tuple": data = tuple(data[k] for k in order) return data - def _extract( + def _get_single( self, - data: DatapointType, - data_type: Union[Literal["x"], Literal["y"], Literal["xy"]], - key: Optional[str] = None, + row: str | int, + cols: List[str], + featurization_fn: Optional[Callable], + format: DataFormat, ): - """Helper function to extract data from the return format of this class""" - if self._hide_targets: - return data - x, y = data - ret = x if data_type == "x" else y - if not isinstance(ret, dict) or key is None: - return ret - return ret[key] + """ + Loads a subset of the variables for a single data-point from the datasets. + The dataset stores datapoint in a row-wise manner, so this method is used to access a single row. + + Args: + row: The row index of the datapoint. + cols: The columns (i.e. variables) to load for that data point. + featurization_fn: The transformation function to apply to the data-point. + format: The format to return the data-point in. 
+        """
+        # Load the data-point
+        # Also handles loading data stored in external files for pointer columns
+        ret = {col: self.dataset.get_data(row, col) for col in cols}
+
+        # Format
+        ret = self._format(ret, cols, format)
+
+        # Featurize
+        if featurization_fn is not None:
+            ret = featurization_fn(ret)
+
+        return ret
+
+    def _get_single_input(self, row: str | int):
+        """Get a single input for a specific data-point, given the benchmark specification."""
+        return self._get_single(row, self.input_cols, self._featurization_fn, self._input_format)
+
+    def _get_single_output(self, row: str | int):
+        """Get a single output for a specific data-point, given the benchmark specification."""
+        return self._get_single(row, self.target_cols, None, self._target_format)
 
     def as_array(self, data_type: Union[Literal["x"], Literal["y"], Literal["xy"]]):
         """
@@ -138,21 +172,30 @@ def as_array(self, data_type: Union[Literal["x"], Literal["y"], Literal["xy"]]):
             return self.as_array("x"), self.as_array("y")
 
         if data_type == "y" and self._hide_targets:
-            raise TestAccessError("Within Polaris, you should not need to access the targets of the test set")
-
-        if not self.is_multi_task:
-            return np.array([self._extract(ret, data_type) for ret in self])
-
-        out = {}
-        columns = self.input_cols if data_type == "x" else self.target_cols
-
-        # Temporarily change the target format for easier conversion
-        with tmp_attribute_change(self, "_target_format", "dict"):
-            with tmp_attribute_change(self, "_input_format", "dict"):
-                for k in columns:
-                    out[k] = np.array([self._extract(ret, data_type, k) for ret in self])
-
-        return self._convert(out, self.target_cols, self._target_format)
+            raise TestAccessError("Within Polaris you should not need to access the targets of the test set")
+
+        if data_type == "x":
+            ret = [self._get_single_input(self.dataset.table.iloc[idx].name) for idx in self.indices]
+        else:
+            ret = [self._get_single_output(self.dataset.table.iloc[idx].name) for idx in self.indices]
+
+        if not ((self.is_multi_input and data_type == "x") or (self.is_multi_task and data_type == "y")):
+            # For a single-input (x) or single-task (y) subset, we can create the array directly.
+            # With a single-task or single-input data point, this will be a 1D array.
+            # With a multi-task or multi-input data point, this will be a 2D array.
+            return np.array(ret)
+
+        # If the return format is a dict, we want to convert
+        # from an array of dicts to a dict of arrays.
+        if data_type == "y" and self._target_format == "dict":
+            ret = {k: np.array([v[k] for v in ret]) for k in self.target_cols}
+        elif data_type == "x" and self._input_format == "dict":
+            ret = {k: np.array([v[k] for v in ret]) for k in self.input_cols}
+        else:
+            # The format is a tuple, so we have a list of tuples and convert it to an array
+            ret = np.array(ret)
+
+        return ret
 
     def __len__(self):
         return len(self.indices)
@@ -175,8 +218,7 @@ def __getitem__(self, item) -> DatapointType:
         row = self.dataset.table.iloc[idx]
 
         # Load the input modalities
-        ins = {col: self.dataset.get_data(row.name, col) for col in self.input_cols}
-        ins = self._convert(ins, self.input_cols, self._input_format)
+        ins = self._get_single_input(row.name)
 
         if self._hide_targets:
             # If we are not allowed to access the targets, we return the inputs only.
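Reviewer note (not part of the diff): the array-of-dicts to dict-of-arrays transposition above is the crux of the reworked `as_array`. A self-contained sketch of the two multi-task target layouts, using made-up target columns (`LogD`/`SOL` are illustrative):

```python
import numpy as np

target_cols = ["LogD", "SOL"]  # hypothetical target columns of a multi-task benchmark

# With target_format="dict", each data point loads as a dict of target values ...
per_datapoint = [
    {"LogD": 1.2, "SOL": -3.4},
    {"LogD": 0.7, "SOL": -2.9},
]

# ... which `as_array("y")` transposes into a dict of 1D arrays, one per task.
as_dict = {k: np.array([v[k] for v in per_datapoint]) for k in target_cols}
assert as_dict["SOL"].shape == (2,)

# With target_format="tuple", each data point is a tuple ordered by `target_cols`,
# and stacking the tuples yields a single (n_datapoints, n_tasks) array.
as_tuple = np.array([tuple(v[k] for k in target_cols) for v in per_datapoint])
assert as_tuple.shape == (2, 2)
```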
@@ -184,9 +226,7 @@ def __getitem__(self, item) -> DatapointType:
             return ins
 
         # Retrieve the targets
-        outs = {col: self.dataset.get_data(row.name, col) for col in self.target_cols}
-        outs = self._convert(outs, self.target_cols, self._target_format)
-
+        outs = self._get_single_output(row.name)
         return ins, outs
 
     def __iter__(self):
diff --git a/polaris/utils/types.py b/polaris/utils/types.py
index 402b98e9..8291f8b3 100644
--- a/polaris/utils/types.py
+++ b/polaris/utils/types.py
@@ -124,9 +124,9 @@ class License(BaseModel):
     Else it is required to manually specify this.
     """
 
-    SPDX_LICENSE_DATA_PATH: ClassVar[
-        str
-    ] = "https://raw.githubusercontent.com/spdx/license-list-data/main/json/licenses.json"
+    SPDX_LICENSE_DATA_PATH: ClassVar[str] = (
+        "https://raw.githubusercontent.com/spdx/license-list-data/main/json/licenses.json"
+    )
 
     id: str
     reference: Optional[HttpUrlString] = None
diff --git a/pyproject.toml b/pyproject.toml
index b7594baa..b21af303 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -107,8 +107,7 @@ lint.ignore = [
 line-length = 110
 target-version = "py310"
 
-[tool.ruff.lint.per-file-ignores]
-"__init__.py" = [
+lint.per-file-ignores."__init__.py" = [
     "F401", # imported but unused
     "E402", # Module level import not at top of file
 ]
diff --git a/tests/conftest.py b/tests/conftest.py
index f5b0077b..cd71fcf2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -74,7 +74,7 @@ def test_zarr_archive_single_array(tmp_path):
     return _get_zarr_archive(tmp_path, datapoint_per_array=False)
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def test_single_task_benchmark(test_dataset):
     train_indices = list(range(90))
     test_indices = list(range(90, 100))
@@ -96,7 +96,7 @@ def test_single_task_benchmark(test_dataset):
     )
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def test_single_task_benchmark_clf(test_dataset):
     train_indices = list(range(90))
     test_indices = list(range(90, 100))
@@ -111,7 +111,7 @@ def test_single_task_benchmark_clf(test_dataset):
     )
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def test_single_task_benchmark_multiple_test_sets(test_dataset):
     train_indices = list(range(90))
     test_indices = {"test_1": list(range(90, 95)), "test_2": list(range(95, 100))}
@@ -133,7 +133,7 @@ def test_single_task_benchmark_multiple_test_sets(test_dataset):
     )
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def test_multi_task_benchmark(test_dataset):
     # For the sake of simplicity, just use a small set of indices
     train_indices = list(range(90))
@@ -157,7 +157,7 @@ def test_multi_task_benchmark(test_dataset):
     )
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def test_multi_task_benchmark_clf(test_dataset):
     # For the sake of simplicity, just use a small set of indices
     train_indices = list(range(90))
diff --git a/tests/test_subset.py b/tests/test_subset.py
index c06d1168..c6e1c4c5 100644
--- a/tests/test_subset.py
+++ b/tests/test_subset.py
@@ -1,3 +1,5 @@
+import datamol as dm
+import numpy as np
 import pytest
 
 from polaris.dataset import Subset
@@ -24,6 +26,8 @@ def 
test_consistency_across_access_methods(test_dataset): # Property assert (task.inputs == expected_smiles).all() assert (task.targets == expected_targets).all() + assert (task.X == expected_smiles).all() + assert (task.y == expected_targets).all() def test_access_to_test_set(test_single_task_benchmark): @@ -47,3 +51,76 @@ def test_access_to_test_set(test_single_task_benchmark): # For the train set it should work assert all(isinstance(y, float) for x, y in train) assert all(isinstance(train[i][1], float) for i in range(len(train))) + + +def test_input_featurization(test_single_task_benchmark): + # Without a transformation, we expect a SMILES string + train, test = test_single_task_benchmark.get_train_test_split() + test_single_task_benchmark._n_splits_since_evaluate = 0 # Manually reset for sake of test + + x, y = train[0] + assert isinstance(x, str) + + x = test[0] + assert isinstance(x, str) + + train, test = test_single_task_benchmark.get_train_test_split(featurization_fn=dm.to_fp) + + # For all different flavours of accessing the data + # Make sure the input is now featurized + x, y = train[0] + assert isinstance(x, np.ndarray) + + x = test[0] + assert isinstance(x, np.ndarray) + + x, y = next(train) + assert isinstance(x, np.ndarray) + + x = next(test) + assert isinstance(x, np.ndarray) + + x = train.X[0] + assert isinstance(x, np.ndarray) + + x = test.X[0] + assert isinstance(x, np.ndarray) + + +@pytest.mark.parametrize("fmt", ["dict", "tuple"]) +def test_different_subset_formats_single_task(test_single_task_benchmark, fmt): + train, _ = test_single_task_benchmark.get_train_test_split(target_format=fmt) + assert isinstance(train.y, np.ndarray) + assert train.y.shape == (len(train),) + assert isinstance(train[0][1], float) + assert isinstance(next(train)[1], float) + + +def test_different_subset_formats_multi_task_dict(test_multi_task_benchmark): + train, _ = test_multi_task_benchmark.get_train_test_split(target_format="dict") + assert isinstance(train.y, dict) + assert all(c in test_multi_task_benchmark.target_cols for c in train.y) + assert all(isinstance(v, np.ndarray) and v.shape == (len(train),) for v in train.y.values()) + assert isinstance(train[0][1], dict) + assert isinstance(next(train)[1], dict) + + +def test_different_subset_formats_multi_task_tuple(test_multi_task_benchmark): + train, _ = test_multi_task_benchmark.get_train_test_split(target_format="tuple") + assert isinstance(train.y, np.ndarray) + assert train.y.shape == (len(train), len(train.target_cols)) + assert isinstance(train[0][1], tuple) + assert isinstance(next(train)[1], tuple) + + +def test_consistency_between_different_formats(test_multi_task_benchmark): + train_tup, _ = test_multi_task_benchmark.get_train_test_split(target_format="tuple") + train_dict, _ = test_multi_task_benchmark.get_train_test_split(target_format="dict") + + t = train_tup[0][1] + d = train_dict[0][1] + + assert len(d) == len(t) + for k, v in d.items(): + idx = test_multi_task_benchmark.target_cols.index(k) + assert t[idx] == v
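Reviewer note (not part of the diff): the updated `evaluate` docstring in `polaris/benchmark/_base.py` defines a nesting convention for predictions. A sketch of the accepted shapes with placeholder arrays; the `test_1`/`test_2` labels follow the fixtures above, `SOL` is the tutorial's target, and `LogD` is an illustrative second target:

```python
import numpy as np

# Single target, single test set: a plain array of predictions.
y_pred = np.zeros(10)

# Multiple targets: wrap the per-target arrays in a dict keyed by target label.
y_pred_multi_target = {"SOL": np.zeros(10), "LogD": np.zeros(10)}

# Multiple test sets: nest the per-target dicts under the test-set labels,
# matching the "test_1"/"test_2" fixtures in tests/conftest.py.
y_pred_multi_test = {
    "test_1": {"SOL": np.zeros(5), "LogD": np.zeros(5)},
    "test_2": {"SOL": np.zeros(5), "LogD": np.zeros(5)},
}

# Any of these shapes can then be scored against the benchmark:
# results = benchmark.evaluate(y_pred_multi_test)  # -> BenchmarkResults
```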