Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
444 changes: 429 additions & 15 deletions docs/tutorials/basics.ipynb

Large diffs are not rendered by default.

19 changes: 14 additions & 5 deletions polaris/benchmark/_base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
import os
from hashlib import md5
from typing import Any, Optional, Union
from typing import Any, Callable, Optional, Union

import fsspec
import numpy as np
Expand Down Expand Up @@ -353,7 +353,10 @@ def task_type(self) -> TaskType:
return v.value

def get_train_test_split(
self, input_format: DataFormat = "dict", target_format: DataFormat = "dict"
self,
input_format: DataFormat = "dict",
target_format: DataFormat = "dict",
featurization_fn: Optional[Callable] = None,
) -> tuple[Subset, Union["Subset", dict[str, Subset]]]:
"""Construct the train and test sets, given the split in the benchmark specification.

Expand All @@ -365,6 +368,8 @@ def get_train_test_split(
input_format: How the input data is returned from the `Subset` object.
target_format: How the target data is returned from the `Subset` object.
This will only affect the train set.
featurization_fn: A function to apply to the input data. For a multi-input benchmark, this function
receives the inputs in the format specified by the `input_format` parameter.

Returns:
A tuple with the train `Subset` and test `Subset` objects.
Expand All @@ -381,13 +386,15 @@ def _get_subset(indices, hide_targets):
target_cols=self.target_cols,
target_format=target_format,
hide_targets=hide_targets,
featurization_fn=featurization_fn,
)

train = _get_subset(self.split[0], hide_targets=False)
if isinstance(self.split[1], dict):
test = {k: _get_subset(v, hide_targets=True) for k, v in self.split[1].items()}
else:
test = _get_subset(self.split[1], hide_targets=True)

return train, test

def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
Expand All @@ -406,8 +413,10 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
5. There can be metrics which measure across tasks.

Args:
y_pred: The predictions for the test set, as NumPy arrays. If there are multiple test sets,
this should be a dictionary with the test set names as keys.
y_pred: The predictions for the test set, as NumPy arrays.
If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys.
If there are multiple test sets, the predictions should be further wrapped in a dictionary
with the test subset labels as keys.

Returns:
A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub.
Expand All @@ -416,7 +425,7 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
# Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves.
# This simplifies the API, but also was added to make accidental access to the test set targets less likely.
# See also the `hide_targets` parameter in the `Subset` class.
test = self.get_train_test_split()[1]
test = self.get_train_test_split(target_format="dict")[1]

if not isinstance(test, dict):
test = {"test": test}
Expand Down
154 changes: 97 additions & 57 deletions polaris/dataset/_subset.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,29 @@
from typing import List, Literal, Optional, Sequence, Union
from typing import Callable, List, Literal, Optional, Sequence, Union

import numpy as np

from polaris.dataset import Dataset
from polaris.utils.context import tmp_attribute_change
from polaris.utils.errors import TestAccessError
from polaris.utils.types import DataFormat, DatapointType


class Subset:
"""The `Subset` class provides easy access to a single partition of a split dataset.

Info: No need to create this class manually
You should not have to create this class manually. In most use-cases, you can create a `Subset` through the
`get_train_test_split` method of a `BenchmarkSpecification` object.

Tip: Featurize your inputs
Not all datasets are already featurized. For example, a small-molecule task might simply provide the SMILES string.
To easily featurize the inputs, you can pass or set a transformation function. For example:

```python
import datamol as dm

benchmark.get_train_test_split(..., featurization_fn=dm.to_fp)
```

This should be the starting point for any framework-specific (e.g. PyTorch, TensorFlow) data-loader implementation.
How the data is loaded in Polaris can be non-trivial, so this class is provided to abstract away the details.
To easily build framework-specific data-loaders, a `Subset` supports various styles of accessing the data:
Expand Down Expand Up @@ -45,35 +58,26 @@ class Subset:
TestAccessError: When trying to access the targets of the test set (specified by the `hide_targets` attribute).
"""

_SUPPORTED_FORMATS = ["dict", "tuple"]

def __init__(
self,
dataset: Dataset,
indices: List[Union[int, Sequence[int]]],
input_cols: Union[List[str], str],
target_cols: Union[List[str], str],
input_format: DataFormat = "dict",
target_format: DataFormat = "tuple",
target_format: DataFormat = "dict",
featurization_fn: Optional[Callable] = None,
hide_targets: bool = False,
):
self.dataset = dataset
self.indices = indices
self.target_cols = target_cols if isinstance(target_cols, list) else [target_cols]
self.input_cols = input_cols if isinstance(input_cols, list) else [input_cols]

# Validate the output format
if input_format not in self._SUPPORTED_FORMATS:
raise ValueError(
f"Unsupported output format {input_format}. Choose from {self._SUPPORTED_FORMATS}"
)
if target_format not in self._SUPPORTED_FORMATS:
raise ValueError(
f"Unsupported output format {target_format}. Choose from {self._SUPPORTED_FORMATS}"
)
self._input_format = input_format
self._target_format = target_format

self._featurization_fn = featurization_fn

# For the iterator implementation
self._pointer = 0

Expand All @@ -90,43 +94,73 @@ def is_multi_input(self):

@property
def inputs(self):
"""
Scikit-learn style access to the inputs.
If the dataset is multi-input, this will return a dict of arrays.
"""
"""Alias for `self.as_array("x")`"""
return self.as_array("x")

@property
def X(self):
    """Shorthand for `self.as_array("x")`: the inputs of every data-point in this subset."""
    features = self.as_array("x")
    return features

@property
def targets(self):
"""
Scikit-learn style access to the targets.
If the dataset is multi-target, this will return a dict of arrays.
"""
"""Alias for `self.as_array("y")`"""
return self.as_array("y")

@property
def y(self):
    """Shorthand for `self.as_array("y")`: the targets of every data-point in this subset.

    Raises:
        TestAccessError: Propagated from `as_array` when the targets of this subset are hidden.
    """
    labels = self.as_array("y")
    return labels

@staticmethod
def _convert(data: dict, order: List[str], fmt: str):
"""Converts from the default dict format to the specified format"""
def _format(data: dict, order: List[str], fmt: str):
"""
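Converts the internally used dict format to the user-specified format.
If the dict holds a single column, its value is returned directly, regardless of the format.
Otherwise, if the user-specified format is a tuple, the columns are ordered according to the specified order.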
"""
if len(data) == 1:
data = list(data.values())[0]
elif fmt == "tuple":
data = tuple(data[k] for k in order)
return data

def _extract(
def _get_single(
self,
data: DatapointType,
data_type: Union[Literal["x"], Literal["y"], Literal["xy"]],
key: Optional[str] = None,
row: str | int,
cols: List[str],
featurization_fn: Optional[Callable],
format: DataFormat,
):
"""Helper function to extract data from the return format of this class"""
if self._hide_targets:
return data
x, y = data
ret = x if data_type == "x" else y
if not isinstance(ret, dict) or key is None:
return ret
return ret[key]
"""
Loads a subset of the variables for a single data-point from the dataset.
The dataset stores data-points in a row-wise manner, so this method is used to access a single row.

Args:
row: The row index of the datapoint.
cols: The columns (i.e. variables) to load for that data point.
featurization_fn: The transformation function to apply to the data-point.
format: The format to return the data-point in.
"""
# Load the data-point
# Also handles loading data stored in external files for pointer columns
ret = {col: self.dataset.get_data(row, col) for col in cols}

# Format
ret = self._format(ret, cols, format)

# Featurize
if featurization_fn is not None:
ret = featurization_fn(ret)

return ret

def _get_single_input(self, row: str | int):
    """Load the input column(s) of a single data-point.

    Delegates to `_get_single` using this subset's input columns,
    featurization function and input format.

    Args:
        row: The row index of the data-point.
    """
    return self._get_single(
        row,
        cols=self.input_cols,
        featurization_fn=self._featurization_fn,
        format=self._input_format,
    )

def _get_single_output(self, row: str | int):
    """Load the target column(s) of a single data-point.

    Delegates to `_get_single` using this subset's target columns and target
    format. No featurization function is applied to the targets.

    Args:
        row: The row index of the data-point.
    """
    return self._get_single(
        row,
        cols=self.target_cols,
        featurization_fn=None,
        format=self._target_format,
    )

def as_array(self, data_type: Union[Literal["x"], Literal["y"], Literal["xy"]]):
"""
Expand All @@ -138,21 +172,30 @@ def as_array(self, data_type: Union[Literal["x"], Literal["y"], Literal["xy"]]):
return self.as_array("x"), self.as_array("y")

if data_type == "y" and self._hide_targets:
raise TestAccessError("Within Polaris, you should not need to access the targets of the test set")

if not self.is_multi_task:
return np.array([self._extract(ret, data_type) for ret in self])

out = {}
columns = self.input_cols if data_type == "x" else self.target_cols

# Temporarily change the target format for easier conversion
with tmp_attribute_change(self, "_target_format", "dict"):
with tmp_attribute_change(self, "_input_format", "dict"):
for k in columns:
out[k] = np.array([self._extract(ret, data_type, k) for ret in self])

return self._convert(out, self.target_cols, self._target_format)
raise TestAccessError("Within Polaris you should not need to access the targets of the test set")

if data_type == "x":
ret = [self._get_single_input(self.dataset.table.iloc[idx].name) for idx in self.indices]
else:
ret = [self._get_single_output(self.dataset.table.iloc[idx].name) for idx in self.indices]

if not ((self.is_multi_input and data_type == "x") or (self.is_multi_task and data_type == "y")):
# Single-input (for "x") or single-task (for "y"): the entries are not dicts or tuples
# that need regrouping per column, so we can create the array directly.
# This will be a 1D array, unless each entry is itself array-like (e.g. a featurized input).
return np.array(ret)

# If the return format is a dict, we want to convert
# from an array of dicts to a dict of arrays.
if data_type == "y" and self._target_format == "dict":
ret = {k: np.array([v[k] for v in ret]) for k in self.target_cols}
elif data_type == "x" and self._input_format == "dict":
ret = {k: np.array([v[k] for v in ret]) for k in self.input_cols}
else:
# The format is a tuple, so we have a list of tuples, which we convert to an array
ret = np.array(ret)

return ret

def __len__(self):
return len(self.indices)
Expand All @@ -175,18 +218,15 @@ def __getitem__(self, item) -> DatapointType:
row = self.dataset.table.iloc[idx]

# Load the input modalities
ins = {col: self.dataset.get_data(row.name, col) for col in self.input_cols}
ins = self._convert(ins, self.input_cols, self._input_format)
ins = self._get_single_input(row.name)

if self._hide_targets:
# If we are not allowed to access the targets, we return the inputs only.
# This is useful to make accidental access to the test set less likely.
return ins

# Retrieve the targets
outs = {col: self.dataset.get_data(row.name, col) for col in self.target_cols}
outs = self._convert(outs, self.target_cols, self._target_format)

outs = self._get_single_output(row.name)
return ins, outs

def __iter__(self):
Expand Down
6 changes: 3 additions & 3 deletions polaris/utils/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,9 @@ class License(BaseModel):
Else it is required to manually specify this.
"""

SPDX_LICENSE_DATA_PATH: ClassVar[
str
] = "https://raw.githubusercontent.com/spdx/license-list-data/main/json/licenses.json"
SPDX_LICENSE_DATA_PATH: ClassVar[str] = (
"https://raw.githubusercontent.com/spdx/license-list-data/main/json/licenses.json"
)

id: str
reference: Optional[HttpUrlString] = None
Expand Down
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,7 @@ lint.ignore = [
line-length = 110
target-version = "py310"

[tool.ruff.lint.per-file-ignores]
"__init__.py" = [
lint.per-file-ignores."__init__.py" = [
"F401", # imported but unused
"E402", # Module level import not at top of file
]
10 changes: 5 additions & 5 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def test_zarr_archive_single_array(tmp_path):
return _get_zarr_archive(tmp_path, datapoint_per_array=False)


@pytest.fixture(scope="module")
@pytest.fixture(scope="function")
def test_single_task_benchmark(test_dataset):
train_indices = list(range(90))
test_indices = list(range(90, 100))
Expand All @@ -96,7 +96,7 @@ def test_single_task_benchmark(test_dataset):
)


@pytest.fixture(scope="module")
@pytest.fixture(scope="function")
def test_single_task_benchmark_clf(test_dataset):
train_indices = list(range(90))
test_indices = list(range(90, 100))
Expand All @@ -111,7 +111,7 @@ def test_single_task_benchmark_clf(test_dataset):
)


@pytest.fixture(scope="module")
@pytest.fixture(scope="function")
def test_single_task_benchmark_multiple_test_sets(test_dataset):
train_indices = list(range(90))
test_indices = {"test_1": list(range(90, 95)), "test_2": list(range(95, 100))}
Expand All @@ -133,7 +133,7 @@ def test_single_task_benchmark_multiple_test_sets(test_dataset):
)


@pytest.fixture(scope="module")
@pytest.fixture(scope="function")
def test_multi_task_benchmark(test_dataset):
# For the sake of simplicity, just use a small set of indices
train_indices = list(range(90))
Expand All @@ -157,7 +157,7 @@ def test_multi_task_benchmark(test_dataset):
)


@pytest.fixture(scope="module")
@pytest.fixture(scope="function")
def test_multi_task_benchmark_clf(test_dataset):
# For the sake of simplicity, just use a small set of indices
train_indices = list(range(90))
Expand Down
1 change: 1 addition & 0 deletions tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def test_multi_task_benchmark_loop(test_multi_task_benchmark):
x_test = np.array([dm.to_fp(dm.to_mol(smi)) for smi in test.inputs])

y_pred = {}
print(multi_y)
for k, y in multi_y.items():
model = RandomForestRegressor()

Expand Down
Loading