polaris-hub · cwognum · Mar 6, 2024 · Feb 16, 2024 · Feb 18, 2024 · Feb 19, 2024
@@ -1,7 +1,7 @@
 import json
 import os
 from hashlib import md5
-from typing import Any, Optional, Union
+from typing import Any, Callable, Optional, Union
 
 import fsspec
 import numpy as np
@@ -353,7 +353,10 @@ def task_type(self) -> TaskType:
         return v.value
 
     def get_train_test_split(
-        self, input_format: DataFormat = "dict", target_format: DataFormat = "dict"
+        self,
+        input_format: DataFormat = "dict",
+        target_format: DataFormat = "dict",
+        featurization_fn: Optional[Callable] = None,
     ) -> tuple[Subset, Union["Subset", dict[str, Subset]]]:
         """Construct the train and test sets, given the split in the benchmark specification.
 
@@ -365,6 +368,8 @@ def get_train_test_split(
             input_format: How the input data is returned from the `Subset` object.
             target_format: How the target data is returned from the `Subset` object.
                 This will only affect the train set.
+            featurization_fn: A function to apply to the input data. For a multi-input benchmark, this function
+                expects an input in the format specified by the `input_format` parameter.
 
         Returns:
             A tuple with the train `Subset` and test `Subset` objects.
@@ -381,13 +386,15 @@ def _get_subset(indices, hide_targets):
                 target_cols=self.target_cols,
                 target_format=target_format,
                 hide_targets=hide_targets,
+                featurization_fn=featurization_fn,
             )
 
         train = _get_subset(self.split[0], hide_targets=False)
         if isinstance(self.split[1], dict):
             test = {k: _get_subset(v, hide_targets=True) for k, v in self.split[1].items()}
         else:
             test = _get_subset(self.split[1], hide_targets=True)
+
         return train, test
 
     def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
@@ -406,8 +413,10 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
         5. There can be metrics which measure across tasks.
 
         Args:
-            y_pred: The predictions for the test set, as NumPy arrays. If there are multiple test sets,
-                this should be a dictionary with the test set names as keys.
+            y_pred: The predictions for the test set, as NumPy arrays.
+                If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys.
+                If there are multiple test sets, the predictions should be further wrapped in a dictionary
+                    with the test subset labels as keys.
 
         Returns:
             A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub.
@@ -416,7 +425,7 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
         # Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves.
         # This simplifies the API, but also was added to make accidental access to the test set targets less likely.
         # See also the `hide_targets` parameter in the `Subset` class.
-        test = self.get_train_test_split()[1]
+        test = self.get_train_test_split(target_format="dict")[1]
 
         if not isinstance(test, dict):
             test = {"test": test}

@@ -1,16 +1,29 @@
-from typing import List, Literal, Optional, Sequence, Union
+from typing import Callable, List, Literal, Optional, Sequence, Union
 
 import numpy as np
 
 from polaris.dataset import Dataset
-from polaris.utils.context import tmp_attribute_change
 from polaris.utils.errors import TestAccessError
 from polaris.utils.types import DataFormat, DatapointType
 
 
 class Subset:
     """The `Subset` class provides easy access to a single partition of a split dataset.
 
+    Info: No need to create this class manually
+        You should not have to create this class manually. In most use-cases, you can create a `Subset` through the
+        `get_train_test_split` method of a `BenchmarkSpecification` object.
+
+    Tip: Featurize your inputs
+        Not all datasets are already featurized. For example, a small-molecule task might simply provide the SMILES string.
+        To easily featurize the inputs, you can pass or set a transformation function. For example:
+
+        ```python
+        import datamol as dm
+
+        benchmark.get_train_test_split(..., featurization_fn=dm.to_fp)
+        ```
+
     This should be the starting point for any framework-specific (e.g. PyTorch, Tensorflow) data-loader implementation.
     How the data is loaded in Polaris can be non-trivial, so this class is provided to abstract away the details.
     To easily build framework-specific data-loaders, a `Subset` supports various styles of accessing the data:
@@ -45,35 +58,26 @@ class Subset:
         TestAccessError: When trying to access the targets of the test set (specified by the `hide_targets` attribute).
     """
 
-    _SUPPORTED_FORMATS = ["dict", "tuple"]
-
     def __init__(
         self,
         dataset: Dataset,
         indices: List[Union[int, Sequence[int]]],
         input_cols: Union[List[str], str],
         target_cols: Union[List[str], str],
         input_format: DataFormat = "dict",
-        target_format: DataFormat = "tuple",
+        target_format: DataFormat = "dict",
+        featurization_fn: Optional[Callable] = None,
         hide_targets: bool = False,
     ):
         self.dataset = dataset
         self.indices = indices
         self.target_cols = target_cols if isinstance(target_cols, list) else [target_cols]
         self.input_cols = input_cols if isinstance(input_cols, list) else [input_cols]
-
-        # Validate the output format
-        if input_format not in self._SUPPORTED_FORMATS:
-            raise ValueError(
-                f"Unsupported output format {input_format}. Choose from {self._SUPPORTED_FORMATS}"
-            )
-        if target_format not in self._SUPPORTED_FORMATS:
-            raise ValueError(
-                f"Unsupported output format {target_format}. Choose from {self._SUPPORTED_FORMATS}"
-            )
         self._input_format = input_format
         self._target_format = target_format
 
+        self._featurization_fn = featurization_fn
+
         # For the iterator implementation
         self._pointer = 0
 
@@ -90,43 +94,73 @@ def is_multi_input(self):
 
     @property
     def inputs(self):
-        """
-        Scikit-learn style access to the inputs.
-        If the dataset is multi-input, this will return a dict of arrays.
-        """
+        """Alias for `self.as_array("x")`"""
+        return self.as_array("x")
+
+    @property
+    def X(self):
+        """Alias for `self.as_array("x")`"""
         return self.as_array("x")
 
     @property
     def targets(self):
-        """
-        Scikit-learn style access to the targets.
-        If the dataset is multi-target, this will return a dict of arrays.
-        """
+        """Alias for `self.as_array("y")`"""
+        return self.as_array("y")
+
+    @property
+    def y(self):
+        """Alias for `self.as_array("y")`"""
         return self.as_array("y")
 
     @staticmethod
-    def _convert(data: dict, order: List[str], fmt: str):
-        """Converts from the default dict format to the specified format"""
+    def _format(data: dict, order: List[str], fmt: str):
+        """
+        Converts the internally used dict format to the user-specified format. A dict with a single
+        column is unwrapped to its value; for the tuple format, the values follow the specified order.
+        """
         if len(data) == 1:
             data = list(data.values())[0]
         elif fmt == "tuple":
             data = tuple(data[k] for k in order)
         return data
 
-    def _extract(
+    def _get_single(
         self,
-        data: DatapointType,
-        data_type: Union[Literal["x"], Literal["y"], Literal["xy"]],
-        key: Optional[str] = None,
+        row: str | int,
+        cols: List[str],
+        featurization_fn: Optional[Callable],
+        format: DataFormat,
     ):
-        """Helper function to extract data from the return format of this class"""
-        if self._hide_targets:
-            return data
-        x, y = data
-        ret = x if data_type == "x" else y
-        if not isinstance(ret, dict) or key is None:
-            return ret
-        return ret[key]
+        """
+        Loads a subset of the variables for a single data-point from the dataset.
+        The dataset stores datapoints in a row-wise manner, so this method is used to access a single row.
+
+        Args:
+            row: The row index of the datapoint.
+            cols: The columns (i.e. variables) to load for that data point.
+            featurization_fn: The transformation function to apply to the data-point.
+            format: The format to return the data-point in.
+        """
+        # Load the data-point
+        # Also handles loading data stored in external files for pointer columns
+        ret = {col: self.dataset.get_data(row, col) for col in cols}
+
+        # Format
+        ret = self._format(ret, cols, format)
+
+        # Featurize
+        if featurization_fn is not None:
+            ret = featurization_fn(ret)
+
+        return ret
+
+    def _get_single_input(self, row: str | int):
+        """Get a single input for a specific data-point, given the benchmark specification."""
+        return self._get_single(row, self.input_cols, self._featurization_fn, self._input_format)
+
+    def _get_single_output(self, row: str | int):
+        """Get a single output for a specific data-point, given the benchmark specification."""
+        return self._get_single(row, self.target_cols, None, self._target_format)
 
     def as_array(self, data_type: Union[Literal["x"], Literal["y"], Literal["xy"]]):
         """
@@ -138,21 +172,30 @@ def as_array(self, data_type: Union[Literal["x"], Literal["y"], Literal["xy"]]):
             return self.as_array("x"), self.as_array("y")
 
         if data_type == "y" and self._hide_targets:
-            raise TestAccessError("Within Polaris, you should not need to access the targets of the test set")
-
-        if not self.is_multi_task:
-            return np.array([self._extract(ret, data_type) for ret in self])
-
-        out = {}
-        columns = self.input_cols if data_type == "x" else self.target_cols
-
-        # Temporarily change the target format for easier conversion
-        with tmp_attribute_change(self, "_target_format", "dict"):
-            with tmp_attribute_change(self, "_input_format", "dict"):
-                for k in columns:
-                    out[k] = np.array([self._extract(ret, data_type, k) for ret in self])
-
-        return self._convert(out, self.target_cols, self._target_format)
+            raise TestAccessError("Within Polaris you should not need to access the targets of the test set")
+
+        if data_type == "x":
+            ret = [self._get_single_input(self.dataset.table.iloc[idx].name) for idx in self.indices]
+        else:
+            ret = [self._get_single_output(self.dataset.table.iloc[idx].name) for idx in self.indices]
+
+        if not ((self.is_multi_input and data_type == "x") or (self.is_multi_task and data_type == "y")):
+            # Neither the multi-input ("x") nor the multi-task ("y") case applies here, so the
+            # result does not need to be regrouped per column: each element of `ret` is one data point.
+            # We can just create the array directly.
+            return np.array(ret)
+
+        # If the return format is a dict, we want to convert
+        # from an array of dicts to a dict of arrays.
+        if data_type == "y" and self._target_format == "dict":
+            ret = {k: np.array([v[k] for v in ret]) for k in self.target_cols}
+        elif data_type == "x" and self._input_format == "dict":
+            ret = {k: np.array([v[k] for v in ret]) for k in self.input_cols}
+        else:
+            # The format is a tuple, so we have list of tuples and convert this to an array
+            ret = np.array(ret)
+
+        return ret
 
     def __len__(self):
         return len(self.indices)
@@ -175,18 +218,15 @@ def __getitem__(self, item) -> DatapointType:
         row = self.dataset.table.iloc[idx]
 
         # Load the input modalities
-        ins = {col: self.dataset.get_data(row.name, col) for col in self.input_cols}
-        ins = self._convert(ins, self.input_cols, self._input_format)
+        ins = self._get_single_input(row.name)
 
         if self._hide_targets:
             # If we are not allowed to access the targets, we return the inputs only.
             # This is useful to make accidental access to the test set less likely.
             return ins
 
         # Retrieve the targets
-        outs = {col: self.dataset.get_data(row.name, col) for col in self.target_cols}
-        outs = self._convert(outs, self.target_cols, self._target_format)
-
+        outs = self._get_single_output(row.name)
         return ins, outs
 
     def __iter__(self):

@@ -124,9 +124,9 @@ class License(BaseModel):
             Else it is required to manually specify this.
     """
 
-    SPDX_LICENSE_DATA_PATH: ClassVar[
-        str
-    ] = "https://raw.githubusercontent.com/spdx/license-list-data/main/json/licenses.json"
+    SPDX_LICENSE_DATA_PATH: ClassVar[str] = (
+        "https://raw.githubusercontent.com/spdx/license-list-data/main/json/licenses.json"
+    )
 
     id: str
     reference: Optional[HttpUrlString] = None

@@ -107,8 +107,7 @@ lint.ignore = [
 line-length = 110
 target-version = "py310"
 
-[tool.ruff.lint.per-file-ignores]
-"__init__.py" = [
+lint.per-file-ignores."__init__.py" = [
     "F401", # imported but unused
     "E402", # Module level import not at top of file
 ]
@@ -74,7 +74,7 @@ def test_zarr_archive_single_array(tmp_path):
     return _get_zarr_archive(tmp_path, datapoint_per_array=False)
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def test_single_task_benchmark(test_dataset):
     train_indices = list(range(90))
     test_indices = list(range(90, 100))
@@ -96,7 +96,7 @@ def test_single_task_benchmark(test_dataset):
     )
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def test_single_task_benchmark_clf(test_dataset):
     train_indices = list(range(90))
     test_indices = list(range(90, 100))
@@ -111,7 +111,7 @@ def test_single_task_benchmark_clf(test_dataset):
     )
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def test_single_task_benchmark_multiple_test_sets(test_dataset):
     train_indices = list(range(90))
     test_indices = {"test_1": list(range(90, 95)), "test_2": list(range(95, 100))}
@@ -133,7 +133,7 @@ def test_single_task_benchmark_multiple_test_sets(test_dataset):
     )
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def test_multi_task_benchmark(test_dataset):
     # For the sake of simplicity, just use a small set of indices
     train_indices = list(range(90))
@@ -157,7 +157,7 @@ def test_multi_task_benchmark(test_dataset):
     )
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def test_multi_task_benchmark_clf(test_dataset):
     # For the sake of simplicity, just use a small set of indices
     train_indices = list(range(90))

@@ -50,6 +50,7 @@ def test_multi_task_benchmark_loop(test_multi_task_benchmark):
     x_test = np.array([dm.to_fp(dm.to_mol(smi)) for smi in test.inputs])
 
     y_pred = {}
+    print(multi_y)
     for k, y in multi_y.items():
         model = RandomForestRegressor()