From a8343aae702723d307f9da4a2e0533ac46c0958e Mon Sep 17 00:00:00 2001 From: cwognum Date: Fri, 16 Feb 2024 18:35:52 -0500 Subject: [PATCH 1/8] First implementation to add featurization to the Subset class --- polaris/benchmark/_base.py | 14 +++- polaris/dataset/_subset.py | 137 +++++++++++++++++++++++-------------- tests/test_integration.py | 1 + 3 files changed, 100 insertions(+), 52 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 1b9e8e93..a942067d 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -1,7 +1,7 @@ import json import os from hashlib import md5 -from typing import Any, Optional, Union +from typing import Any, Callable, Optional, Union import fsspec import numpy as np @@ -353,7 +353,11 @@ def task_type(self) -> TaskType: return v.value def get_train_test_split( - self, input_format: DataFormat = "dict", target_format: DataFormat = "dict" + self, + input_format: DataFormat = "dict", + target_format: DataFormat = "dict", + input_transform_fn: Optional[Callable] = None, + target_transform_fn: Optional[Callable] = None, ) -> tuple[Subset, Union["Subset", dict[str, Subset]]]: """Construct the train and test sets, given the split in the benchmark specification. @@ -365,6 +369,10 @@ def get_train_test_split( input_format: How the input data is returned from the `Subset` object. target_format: How the target data is returned from the `Subset` object. This will only affect the train set. + input_transform_fn: A function to apply to the input data. If a multi-input benchmark, this function + receives a dict with the columns as keys. + target_transform_fn: A function to apply to the target data. If a multi-target benchmark, this function + receives a dict with the columns as keys. Returns: A tuple with the train `Subset` and test `Subset` objects. @@ -381,6 +389,8 @@ def _get_subset(indices, hide_targets): target_cols=self.target_cols, target_format=target_format, hide_targets=hide_targets, + input_transform_fn=input_transform_fn, + target_transform_fn=target_transform_fn, ) train = _get_subset(self.split[0], hide_targets=False) diff --git a/polaris/dataset/_subset.py b/polaris/dataset/_subset.py index e1d4b9ae..f776d1fa 100644 --- a/polaris/dataset/_subset.py +++ b/polaris/dataset/_subset.py @@ -1,9 +1,8 @@ -from typing import List, Literal, Optional, Sequence, Union +from typing import Callable, List, Literal, Optional, Sequence, Union import numpy as np from polaris.dataset import Dataset -from polaris.utils.context import tmp_attribute_change from polaris.utils.errors import TestAccessError from polaris.utils.types import DataFormat, DatapointType @@ -11,6 +10,19 @@ class Subset: """The `Subset` class provides easy access to a single partition of a split dataset. + Info: No need to create this class manually + You should not have to create this class manually. In most use-cases, you can create a `Subset` through the + `get_train_test_split` method of a `BenchmarkSpecification` object. + + Tip: Featurize your inputs + Not all datasets are already featurized. For example, a small-molecule task might simply provide the SMILES string. + To easily featurize the inputs, you can pass or set a transformation function. For example: + + ```python + import datamol as dm + benchmark.get_train_test_split(..., input_transform_fn=dm.to_fp) + ``` + This should be the starting point for any framework-specific (e.g. PyTorch, Tensorflow) data-loader implementation. 
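As a concrete illustration of the framework-specific data-loaders mentioned above, here is a minimal sketch (not part of the diff) of a PyTorch wrapper around a `Subset`. It assumes PyTorch and Polaris Hub access are available, uses the hello-world benchmark from the tutorial, and featurizes the SMILES inputs so they can be converted to tensors; note that `input_transform_fn` is renamed to `featurization_fn` later in this patch series.

```python
import datamol as dm
import torch
from torch.utils.data import DataLoader, Dataset

import polaris as po

# Load a benchmark and featurize the SMILES inputs into fingerprints,
# so each input is a numeric array rather than a string.
benchmark = po.load_benchmark("polaris/hello-world-benchmark")
train, _ = benchmark.get_train_test_split(input_transform_fn=dm.to_fp)


class PolarisTorchDataset(Dataset):
    """Thin wrapper exposing a Polaris Subset to PyTorch."""

    def __init__(self, subset):
        self.subset = subset

    def __len__(self):
        return len(self.subset)

    def __getitem__(self, idx):
        # Index-based access returns an (input, target) tuple for the train set.
        x, y = self.subset[idx]
        return torch.as_tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)


loader = DataLoader(PolarisTorchDataset(train), batch_size=32, shuffle=True)
```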
How the data is loaded in Polaris can be non-trivial, so this class is provided to abstract away the details. To easily build framework-specific data-loaders, a `Subset` supports various styles of accessing the data: @@ -55,24 +67,18 @@ def __init__( target_cols: Union[List[str], str], input_format: DataFormat = "dict", target_format: DataFormat = "tuple", + input_transform_fn: Optional[Callable] = None, + target_transform_fn: Optional[Callable] = None, hide_targets: bool = False, ): self.dataset = dataset self.indices = indices self.target_cols = target_cols if isinstance(target_cols, list) else [target_cols] self.input_cols = input_cols if isinstance(input_cols, list) else [input_cols] - - # Validate the output format - if input_format not in self._SUPPORTED_FORMATS: - raise ValueError( - f"Unsupported output format {input_format}. Choose from {self._SUPPORTED_FORMATS}" - ) - if target_format not in self._SUPPORTED_FORMATS: - raise ValueError( - f"Unsupported output format {target_format}. Choose from {self._SUPPORTED_FORMATS}" - ) self._input_format = input_format self._target_format = target_format + self._input_transform_fn = input_transform_fn + self._target_transform_fn = target_transform_fn # For the iterator implementation self._pointer = 0 @@ -90,43 +96,71 @@ def is_multi_input(self): @property def inputs(self): - """ - Scikit-learn style access to the inputs. - If the dataset is multi-input, this will return a dict of arrays. - """ + """Alias for `self.as_array("x")`""" + return self.as_array("x") + + @property + def X(self): + """Alias for `self.as_array("x")`""" return self.as_array("x") @property def targets(self): - """ - Scikit-learn style access to the targets. - If the dataset is multi-target, this will return a dict of arrays. - """ + """Alias for `self.as_array("y")`""" + return self.as_array("y") + + @property + def y(self): + """Alias for `self.as_array("y")`""" return self.as_array("y") @staticmethod - def _convert(data: dict, order: List[str], fmt: str): - """Converts from the default dict format to the specified format""" + def _format(data: dict, order: List[str], fmt: str): + """ + Converts the internally used dict format to the user-specified format. + If the user-specified format is a tuple, it orders the column according to the specified order. + """ if len(data) == 1: data = list(data.values())[0] elif fmt == "tuple": data = tuple(data[k] for k in order) return data - def _extract( + def _get_single( self, - data: DatapointType, - data_type: Union[Literal["x"], Literal["y"], Literal["xy"]], - key: Optional[str] = None, + row: str | int, + cols: List[str], + transform_fn: Optional[Callable], + format: DataFormat, ): - """Helper function to extract data from the return format of this class""" - if self._hide_targets: - return data - x, y = data - ret = x if data_type == "x" else y - if not isinstance(ret, dict) or key is None: - return ret - return ret[key] + """ + Loads a subset of the variables for a single data-point from the datasets. + The dataset stores datapoint in a row-wise manner, so this method is used to access a single row. + + Args: + row: The row index of the datapoint. + cols: The columns (i.e. variables) to load for that data point. + transform_fn: The transformation function to apply to the data-point. + format: The format to return the data-point in. 
+ """ + # Load the data-point + # Also handles loading data stored in external files for pointer columns + ret = {col: self.dataset.get_data(row, col) for col in cols} + + # Transform + if transform_fn is not None: + ret = transform_fn(ret) + + # Format + return self._format(ret, cols, format) + + def _get_single_input(self, row: str | int): + """Get a single input for a specific data-point and given the benchmark specification.""" + return self._get_single(row, self.input_cols, self._input_transform_fn, self._input_format) + + def _get_single_output(self, row: str | int): + """Get a single output for a specific data-point and given the benchmark specification.""" + return self._get_single(row, self.target_cols, self._target_transform_fn, self._target_format) def as_array(self, data_type: Union[Literal["x"], Literal["y"], Literal["xy"]]): """ @@ -138,21 +172,27 @@ def as_array(self, data_type: Union[Literal["x"], Literal["y"], Literal["xy"]]): return self.as_array("x"), self.as_array("y") if data_type == "y" and self._hide_targets: - raise TestAccessError("Within Polaris, you should not need to access the targets of the test set") + raise TestAccessError("Within Polaris you should not need to access the targets of the test set") - if not self.is_multi_task: - return np.array([self._extract(ret, data_type) for ret in self]) + if data_type == "x": + ret = [self._get_single_input(self.dataset.table.iloc[idx].name) for idx in self.indices] + else: + ret = [self._get_single_output(self.dataset.table.iloc[idx].name) for idx in self.indices] - out = {} - columns = self.input_cols if data_type == "x" else self.target_cols + if not ((self.is_multi_input and data_type == "x") or (self.is_multi_task and data_type == "y")): + # If the target format is not a dict, we can just create the array directly. + # With a single-task or single-input data point, this will be a 1D array. + # With a multi-task or multi-input data point, this will be a 2D array. + return np.array(ret) - # Temporarily change the target format for easier conversion - with tmp_attribute_change(self, "_target_format", "dict"): - with tmp_attribute_change(self, "_input_format", "dict"): - for k in columns: - out[k] = np.array([self._extract(ret, data_type, k) for ret in self]) + # If the return type is a dict, we want to convert + # from an array of dicts to a dict of arrays. + if data_type == "y": + ret = {k: np.array([v[k] for v in ret]) for k in self.target_cols} + else: + ret = {k: np.array([v[k] for v in ret]) for k in self.input_cols} - return self._convert(out, self.target_cols, self._target_format) + return ret def __len__(self): return len(self.indices) @@ -175,8 +215,7 @@ def __getitem__(self, item) -> DatapointType: row = self.dataset.table.iloc[idx] # Load the input modalities - ins = {col: self.dataset.get_data(row.name, col) for col in self.input_cols} - ins = self._convert(ins, self.input_cols, self._input_format) + ins = self._get_single_input(row.name) if self._hide_targets: # If we are not allowed to access the targets, we return the inputs only. 
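To make the access styles implemented above concrete, the following is a small usage sketch; it assumes Polaris Hub access and uses the hello-world benchmark slug from the tutorial.

```python
import numpy as np

import polaris as po

# Grab a split; without a transform function the inputs stay as SMILES strings.
benchmark = po.load_benchmark("polaris/hello-world-benchmark")
train, test = benchmark.get_train_test_split()

x_arr = train.X      # array of inputs; a dict of arrays for multi-input benchmarks
y_arr = train.y      # array of targets; a dict of arrays for multi-task benchmarks
x0, y0 = train[0]    # index-based access to a single datapoint
for x, y in train:   # iterator access, convenient for custom data-loaders
    pass

x0_test = test[0]    # the test set only ever returns the inputs
assert isinstance(x_arr, (np.ndarray, dict))
```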
@@ -184,9 +223,7 @@ def __getitem__(self, item) -> DatapointType: return ins # Retrieve the targets - outs = {col: self.dataset.get_data(row.name, col) for col in self.target_cols} - outs = self._convert(outs, self.target_cols, self._target_format) - + outs = self._get_single_output(row.name) return ins, outs def __iter__(self): diff --git a/tests/test_integration.py b/tests/test_integration.py index 7b96a4e3..b2f4626c 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -50,6 +50,7 @@ def test_multi_task_benchmark_loop(test_multi_task_benchmark): x_test = np.array([dm.to_fp(dm.to_mol(smi)) for smi in test.inputs]) y_pred = {} + print(multi_y) for k, y in multi_y.items(): model = RandomForestRegressor() From b99dfbfdce664419faa1adf87f8b1b12c8f7300c Mon Sep 17 00:00:00 2001 From: Cas Wognum Date: Sun, 18 Feb 2024 12:37:13 -0500 Subject: [PATCH 2/8] Added test cases and throw an error when there is test set ambiguity --- docs/tutorials/basics.ipynb | 444 ++++++++++++++++++++++++++++++++++-- polaris/benchmark/_base.py | 40 +++- polaris/dataset/_subset.py | 7 +- polaris/utils/errors.py | 4 + tests/conftest.py | 10 +- tests/test_subset.py | 85 ++++++- 6 files changed, 562 insertions(+), 28 deletions(-) diff --git a/docs/tutorials/basics.ipynb b/docs/tutorials/basics.ipynb index 8c91df01..8dde27ba 100644 --- a/docs/tutorials/basics.ipynb +++ b/docs/tutorials/basics.ipynb @@ -63,7 +63,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2023-11-27 14:54:08.788\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mlogin\u001b[0m:\u001b[36m262\u001b[0m - \u001b[1mYou are already logged in to the Polaris Hub as cwognum (cas@valencediscovery.com). Set `overwrite=True` to force re-authentication.\u001b[0m\n" + "\u001b[32m2024-02-18 12:35:01.048\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mlogin\u001b[0m:\u001b[36m262\u001b[0m - \u001b[1mYou are already logged in to the Polaris Hub as cwognum (cas@valencelabs.com). Set `overwrite=True` to force re-authentication.\u001b[0m\n" ] } ], @@ -99,8 +99,8 @@ "metadata": {}, "outputs": [], "source": [ - "dataset = po.load_dataset(\"polaris/hello_world_dataset\")\n", - "benchmark = po.load_benchmark(\"polaris/hello_world_benchmark\")" + "dataset = po.load_dataset(\"polaris/hello-world\")\n", + "benchmark = po.load_benchmark(\"polaris/hello-world-benchmark\")" ] }, { @@ -231,10 +231,422 @@ "id": "748dd278-0fd0-4c5b-ac6a-8d974143c3b9", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to find the pandas get_adjustment() function to patch\n", + "Failed to patch pandas - PandasTools will have limited functionality\n" + ] + }, { "data": { "text/html": [ - "
RandomForestRegressor(max_depth=2, random_state=0)
" + "
RandomForestRegressor(max_depth=2, random_state=0)
" ], "text/plain": [ "RandomForestRegressor(max_depth=2, random_state=0)" @@ -249,12 +661,15 @@ "import datamol as dm\n", "from sklearn.ensemble import RandomForestRegressor\n", "\n", - "# Convert smiles to ECFP fingerprints\n", - "train_fps = [dm.to_fp(smi) for smi in train.inputs]\n", + "# Load the benchmark (automatically loads the underlying dataset as well)\n", + "benchmark = po.load_benchmark(\"polaris/hello_world_benchmark\")\n", + "\n", + "# Get the split and convert SMILES to ECFP fingerprints by specifying an featurize function.\n", + "train, test = benchmark.get_train_test_split(input_transform_fn=dm.to_fp)\n", "\n", "# Define a model and train\n", "model = RandomForestRegressor(max_depth=2, random_state=0)\n", - "model.fit(train_fps, train.targets)" + "model.fit(train.X, train.y)" ] }, { @@ -272,8 +687,7 @@ "metadata": {}, "outputs": [], "source": [ - "test_fps = [dm.to_fp(smi) for smi in test.inputs]\n", - "predictions = model.predict(test_fps)" + "predictions = model.predict(test.X)" ] }, { @@ -285,7 +699,7 @@ { "data": { "text/html": [ - "
name: None
description:
tags:
user_attributes:
owner: None
benchmark_name: hello_world_benchmark
benchmark_owner:
  slug: polaris
  external_id: org_2WG9hRFgKNIRtGw4orsMPcr1F4S
  type: organization
github_url: None
paper_url: None
contributors: None
artifact_id: None
benchmark_artifact_id: polaris/hello-world-benchmark
results:
  Test set | Target label | Metric | Score
  test | SOL | mean_squared_error | 2.6875139821
  test | SOL | mean_absolute_error | 1.2735690161
" + "
name: None
description:
tags:
user_attributes:
owner: None
benchmark_name: hello-world-benchmark
benchmark_owner:
  slug: polaris
  external_id: org_2WG9hRFgKNIRtGw4orsMPcr1F4S
  type: organization
github_url: None
paper_url: None
contributors: None
artifact_id: None
benchmark_artifact_id: polaris/hello-world-benchmark
results:
  Test set | Target label | Metric | Score
  test | SOL | mean_squared_error | 2.6875139821
  test | SOL | mean_absolute_error | 1.2735690161
" ], "text/plain": [ "{\n", @@ -294,7 +708,7 @@ " \"tags\": [],\n", " \"user_attributes\": {},\n", " \"owner\": null,\n", - " \"benchmark_name\": \"hello_world_benchmark\",\n", + " \"benchmark_name\": \"hello-world-benchmark\",\n", " \"benchmark_owner\": {\n", " \"slug\": \"polaris\",\n", " \"external_id\": \"org_2WG9hRFgKNIRtGw4orsMPcr1F4S\",\n", @@ -349,7 +763,7 @@ "source": [ "results.name = f\"hello-world-result\"\n", "results.github_url = \"https://github.com/polaris-hub/polaris-hub\"\n", - "results.paper_url = \"https://polaris-hub.vercel.app\"\n", + "results.paper_url = \"https://polarishub.io/\"\n", "results.description = \"Hello, World!\"" ] }, @@ -373,11 +787,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/cas/micromamba/envs/polaris/lib/python3.12/site-packages/pydantic/main.py:308: UserWarning: Pydantic serializer warnings:\n", + "/Users/cas.wognum/micromamba/envs/polaris/lib/python3.12/site-packages/pydantic/main.py:314: UserWarning: Pydantic serializer warnings:\n", " Expected `url` but got `str` - serialized value may not be as expected\n", " Expected `url` but got `str` - serialized value may not be as expected\n", " return self.__pydantic_serializer__.to_python(\n", - "\u001b[32m2023-11-27 14:54:46.649\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mupload_results\u001b[0m:\u001b[36m428\u001b[0m - \u001b[32m\u001b[1mYour result has been successfully uploaded to the Hub. View it here: https://polarishub.io/benchmarks/polaris/hello_world_benchmark/ns4JrC3hQNK9M1hbVPchy\u001b[0m\n" + "\u001b[32m2024-02-18 12:35:09.465\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mupload_results\u001b[0m:\u001b[36m431\u001b[0m - \u001b[32m\u001b[1mYour result has been successfully uploaded to the Hub. View it here: https://polarishub.io/benchmarks/polaris/hello-world-benchmark/l3uWzFBEyaD09Sa4Aik21\u001b[0m\n" ] } ], @@ -415,7 +829,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.0" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index a942067d..244df5ac 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -9,6 +9,7 @@ from pydantic import ( Field, FieldValidationInfo, + PrivateAttr, computed_field, field_serializer, field_validator, @@ -23,7 +24,7 @@ from polaris.utils import fs from polaris.utils.context import tmp_attribute_change from polaris.utils.dict2html import dict2html -from polaris.utils.errors import InvalidBenchmarkError, PolarisChecksumError +from polaris.utils.errors import EvaluationError, InvalidBenchmarkError, PolarisChecksumError from polaris.utils.misc import listit from polaris.utils.types import ( AccessType, @@ -112,6 +113,10 @@ class BenchmarkSpecification(BaseArtifactModel): default_factory=dict, validate_default=True ) + # Private attributes to track the internal state + _test_subset: Optional[Subset] = PrivateAttr(None) + _n_splits_since_evaluate: int = PrivateAttr(0) + @field_validator("dataset") def _validate_dataset(cls, v): """ @@ -370,9 +375,9 @@ def get_train_test_split( target_format: How the target data is returned from the `Subset` object. This will only affect the train set. input_transform_fn: A function to apply to the input data. If a multi-input benchmark, this function - receives a dict with the columns as keys. + expects an input in the format specified by the `input_format` parameter. 
target_transform_fn: A function to apply to the target data. If a multi-target benchmark, this function - receives a dict with the columns as keys. + expects an input in the format specified by the `target_format` parameter. Returns: A tuple with the train `Subset` and test `Subset` objects. @@ -398,6 +403,29 @@ def _get_subset(indices, hide_targets): test = {k: _get_subset(v, hide_targets=True) for k, v in self.split[1].items()} else: test = _get_subset(self.split[1], hide_targets=True) + + # Polaris is designed to reduce the risk of accidental access to the test set. + # One of the design decisions was to ask the users to just provide the predictions in evaluate(), not y_true. + # Because of this, we do need to check if a user has created multiple test objects with different parameters + # without calling evaluate in between. This would lead to ambiguity as to which test object to use. + previous_test_set = ( + list(self._test_subset.values())[0] if isinstance(self._test_subset, dict) else self._test_subset + ) + different_parameters = previous_test_set is not None and ( + previous_test_set._target_format != target_format + or previous_test_set._target_transform_fn is not target_transform_fn + or previous_test_set._input_format != input_format + or previous_test_set._input_transform_fn is not input_transform_fn + ) + self._n_splits_since_evaluate += 1 + if self._n_splits_since_evaluate > 1 and different_parameters: + raise EvaluationError( + "You have called get_train_test_split() multiple times with different parameters, " + "without calling evaluate() in between. This leads to ambiguity when evaluating the model " + "because Polaris cannot infer which test set object should be used." + ) + + self._test_subset = test return train, test def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: @@ -423,11 +451,13 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub. """ - # Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves. + # Instead of having the user pass the ground truth, we maintain a copy of the last created test set internally. # This simplifies the API, but also was added to make accidental access to the test set targets less likely. # See also the `hide_targets` parameter in the `Subset` class. 
- test = self.get_train_test_split()[1] + self._n_splits_since_evaluate = 0 + + test = self._test_subset if not isinstance(test, dict): test = {"test": test} diff --git a/polaris/dataset/_subset.py b/polaris/dataset/_subset.py index f776d1fa..acfea682 100644 --- a/polaris/dataset/_subset.py +++ b/polaris/dataset/_subset.py @@ -20,6 +20,7 @@ class Subset: ```python import datamol as dm + benchmark.get_train_test_split(..., input_transform_fn=dm.to_fp) ``` @@ -147,12 +148,14 @@ def _get_single( # Also handles loading data stored in external files for pointer columns ret = {col: self.dataset.get_data(row, col) for col in cols} + # Format + ret = self._format(ret, cols, format) + # Transform if transform_fn is not None: ret = transform_fn(ret) - # Format - return self._format(ret, cols, format) + return ret def _get_single_input(self, row: str | int): """Get a single input for a specific data-point and given the benchmark specification.""" diff --git a/polaris/utils/errors.py b/polaris/utils/errors.py index 1cb46581..daf66c15 100644 --- a/polaris/utils/errors.py +++ b/polaris/utils/errors.py @@ -33,3 +33,7 @@ class TestAccessError(Exception): __test__ = False pass + + +class EvaluationError(Exception): + pass diff --git a/tests/conftest.py b/tests/conftest.py index f5b0077b..cd71fcf2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -74,7 +74,7 @@ def test_zarr_archive_single_array(tmp_path): return _get_zarr_archive(tmp_path, datapoint_per_array=False) -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def test_single_task_benchmark(test_dataset): train_indices = list(range(90)) test_indices = list(range(90, 100)) @@ -96,7 +96,7 @@ def test_single_task_benchmark(test_dataset): ) -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def test_single_task_benchmark_clf(test_dataset): train_indices = list(range(90)) test_indices = list(range(90, 100)) @@ -111,7 +111,7 @@ def test_single_task_benchmark_clf(test_dataset): ) -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def test_single_task_benchmark_multiple_test_sets(test_dataset): train_indices = list(range(90)) test_indices = {"test_1": list(range(90, 95)), "test_2": list(range(95, 100))} @@ -133,7 +133,7 @@ def test_single_task_benchmark_multiple_test_sets(test_dataset): ) -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def test_multi_task_benchmark(test_dataset): # For the sake of simplicity, just use a small set of indices train_indices = list(range(90)) @@ -157,7 +157,7 @@ def test_multi_task_benchmark(test_dataset): ) -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def test_multi_task_benchmark_clf(test_dataset): # For the sake of simplicity, just use a small set of indices train_indices = list(range(90)) diff --git a/tests/test_subset.py b/tests/test_subset.py index c06d1168..4538e325 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -1,7 +1,9 @@ +import datamol as dm +import numpy as np import pytest from polaris.dataset import Subset -from polaris.utils.errors import TestAccessError +from polaris.utils.errors import EvaluationError, TestAccessError def test_consistency_across_access_methods(test_dataset): @@ -24,6 +26,8 @@ def test_consistency_across_access_methods(test_dataset): # Property assert (task.inputs == expected_smiles).all() assert (task.targets == expected_targets).all() + assert (task.X == expected_smiles).all() + assert (task.y == expected_targets).all() def test_access_to_test_set(test_single_task_benchmark): 
@@ -47,3 +51,82 @@ def test_access_to_test_set(test_single_task_benchmark): # For the train set it should work assert all(isinstance(y, float) for x, y in train) assert all(isinstance(train[i][1], float) for i in range(len(train))) + + +def test_input_featurization(test_single_task_benchmark): + + # Without a transformation, we expect a SMILES string + train, test = test_single_task_benchmark.get_train_test_split() + test_single_task_benchmark._n_splits_since_evaluate = 0 # Manually reset for sake of test + + x, y = train[0] + assert isinstance(x, str) + + x = test[0] + assert isinstance(x, str) + + train, test = test_single_task_benchmark.get_train_test_split(input_transform_fn=dm.to_fp) + + # For all different flavours of accessing the data + # Make sure the input is now featurized + x, y = train[0] + assert isinstance(x, np.ndarray) + + x = test[0] + assert isinstance(x, np.ndarray) + + x, y = next(train) + assert isinstance(x, np.ndarray) + + x = next(test) + assert isinstance(x, np.ndarray) + + x = train.X[0] + assert isinstance(x, np.ndarray) + + x = test.X[0] + assert isinstance(x, np.ndarray) + + +def test_target_transformation(test_single_task_benchmark): + + # Get the normal value without transformation + train, test = test_single_task_benchmark.get_train_test_split() + test_single_task_benchmark._n_splits_since_evaluate = 0 # Manually reset for sake of test + + x, original_y = train[0] + + train, test = test_single_task_benchmark.get_train_test_split(target_transform_fn=lambda y: y * 2) + + # For all different flavours of accessing the data + # Make sure the target is now transformed + # We do not need to test the test set, because this + x, y = train[0] + assert y == original_y * 2 + + x, y = next(train) + assert y == original_y * 2 + + y = train.y[0] + assert y == original_y * 2 + + +def test_expected_exception_with_test_set_ambiguity(test_single_task_benchmark): + + y_pred = np.random.random(len(test_single_task_benchmark.split[1])) + + # When getting the same split twice, we do not expect an error + test_single_task_benchmark.get_train_test_split() + test_single_task_benchmark.get_train_test_split() + test_single_task_benchmark.evaluate(y_pred) + + # With two different split, we do expect an error + test_single_task_benchmark.get_train_test_split() + with pytest.raises(EvaluationError): + test_single_task_benchmark.get_train_test_split(target_transform_fn=lambda x: x * 2) + test_single_task_benchmark.evaluate(y_pred) + + # With two different splits, but an evaluate in between, all is good again! 
+ test_single_task_benchmark.get_train_test_split() + test_single_task_benchmark.evaluate(y_pred) + test_single_task_benchmark.get_train_test_split(target_transform_fn=lambda x: x * 2) From 83403d9ff53281935f35024847c085a62b783b07 Mon Sep 17 00:00:00 2001 From: Cas Wognum Date: Mon, 19 Feb 2024 11:36:57 -0500 Subject: [PATCH 3/8] Removed target transformation function --- polaris/benchmark/_base.py | 13 ++++--------- polaris/dataset/__init__.py | 3 ++- polaris/dataset/_subset.py | 28 ++++++++++++++-------------- pyproject.toml | 6 ++---- tests/test_subset.py | 29 +++-------------------------- 5 files changed, 25 insertions(+), 54 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 244df5ac..aab58d12 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -361,8 +361,7 @@ def get_train_test_split( self, input_format: DataFormat = "dict", target_format: DataFormat = "dict", - input_transform_fn: Optional[Callable] = None, - target_transform_fn: Optional[Callable] = None, + featurization_fn: Optional[Callable] = None, ) -> tuple[Subset, Union["Subset", dict[str, Subset]]]: """Construct the train and test sets, given the split in the benchmark specification. @@ -374,10 +373,8 @@ def get_train_test_split( input_format: How the input data is returned from the `Subset` object. target_format: How the target data is returned from the `Subset` object. This will only affect the train set. - input_transform_fn: A function to apply to the input data. If a multi-input benchmark, this function + featurization_fn: A function to apply to the input data. If a multi-input benchmark, this function expects an input in the format specified by the `input_format` parameter. - target_transform_fn: A function to apply to the target data. If a multi-target benchmark, this function - expects an input in the format specified by the `target_format` parameter. Returns: A tuple with the train `Subset` and test `Subset` objects. 
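A minimal sketch of the renamed parameter in use, assuming Polaris Hub access; the single `featurization_fn` replaces the earlier `input_transform_fn` and `target_transform_fn` pair.

```python
import datamol as dm

import polaris as po

# The featurization function is applied to the inputs of both the train and test subsets.
benchmark = po.load_benchmark("polaris/hello-world-benchmark")
train, test = benchmark.get_train_test_split(featurization_fn=dm.to_fp)

x, y = train[0]  # x is now an ECFP fingerprint array rather than a SMILES string
```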
@@ -394,8 +391,7 @@ def _get_subset(indices, hide_targets): target_cols=self.target_cols, target_format=target_format, hide_targets=hide_targets, - input_transform_fn=input_transform_fn, - target_transform_fn=target_transform_fn, + featurization_fn=featurization_fn, ) train = _get_subset(self.split[0], hide_targets=False) @@ -413,9 +409,8 @@ def _get_subset(indices, hide_targets): ) different_parameters = previous_test_set is not None and ( previous_test_set._target_format != target_format - or previous_test_set._target_transform_fn is not target_transform_fn or previous_test_set._input_format != input_format - or previous_test_set._input_transform_fn is not input_transform_fn + or previous_test_set._featurization_fn is not featurization_fn ) self._n_splits_since_evaluate += 1 if self._n_splits_since_evaluate > 1 and different_parameters: diff --git a/polaris/dataset/__init__.py b/polaris/dataset/__init__.py index ecd3098b..0b420ebb 100644 --- a/polaris/dataset/__init__.py +++ b/polaris/dataset/__init__.py @@ -1,4 +1,4 @@ -from polaris.dataset._column import ColumnAnnotation +from polaris.dataset._column import ColumnAnnotation, Modality from polaris.dataset._dataset import Dataset from polaris.dataset._subset import Subset @@ -6,4 +6,5 @@ "ColumnAnnotation", "Dataset", "Subset", + "Modality" ] diff --git a/polaris/dataset/_subset.py b/polaris/dataset/_subset.py index acfea682..a083d941 100644 --- a/polaris/dataset/_subset.py +++ b/polaris/dataset/_subset.py @@ -1,9 +1,10 @@ from typing import Callable, List, Literal, Optional, Sequence, Union +from loguru import logger import numpy as np -from polaris.dataset import Dataset -from polaris.utils.errors import TestAccessError +from polaris.dataset import Dataset, Modality +from polaris.utils.errors import EvaluationError, TestAccessError from polaris.utils.types import DataFormat, DatapointType @@ -21,7 +22,7 @@ class Subset: ```python import datamol as dm - benchmark.get_train_test_split(..., input_transform_fn=dm.to_fp) + benchmark.get_train_test_split(..., featurization_fn=dm.to_fp) ``` This should be the starting point for any framework-specific (e.g. PyTorch, Tensorflow) data-loader implementation. @@ -68,8 +69,7 @@ def __init__( target_cols: Union[List[str], str], input_format: DataFormat = "dict", target_format: DataFormat = "tuple", - input_transform_fn: Optional[Callable] = None, - target_transform_fn: Optional[Callable] = None, + featurization_fn: Optional[Callable] = None, hide_targets: bool = False, ): self.dataset = dataset @@ -78,8 +78,8 @@ def __init__( self.input_cols = input_cols if isinstance(input_cols, list) else [input_cols] self._input_format = input_format self._target_format = target_format - self._input_transform_fn = input_transform_fn - self._target_transform_fn = target_transform_fn + + self._featurization_fn = featurization_fn # For the iterator implementation self._pointer = 0 @@ -131,7 +131,7 @@ def _get_single( self, row: str | int, cols: List[str], - transform_fn: Optional[Callable], + featurization_fn: Optional[Callable], format: DataFormat, ): """ @@ -141,7 +141,7 @@ def _get_single( Args: row: The row index of the datapoint. cols: The columns (i.e. variables) to load for that data point. - transform_fn: The transformation function to apply to the data-point. + featurization_fn: The transformation function to apply to the data-point. format: The format to return the data-point in. 
""" # Load the data-point @@ -151,19 +151,19 @@ def _get_single( # Format ret = self._format(ret, cols, format) - # Transform - if transform_fn is not None: - ret = transform_fn(ret) + # Featurize + if featurization_fn is not None: + ret = featurization_fn(ret) return ret def _get_single_input(self, row: str | int): """Get a single input for a specific data-point and given the benchmark specification.""" - return self._get_single(row, self.input_cols, self._input_transform_fn, self._input_format) + return self._get_single(row, self.input_cols, self._featurization_fn, self._input_format) def _get_single_output(self, row: str | int): """Get a single output for a specific data-point and given the benchmark specification.""" - return self._get_single(row, self.target_cols, self._target_transform_fn, self._target_format) + return self._get_single(row, self.target_cols, None, self._target_format) def as_array(self, data_type: Union[Literal["x"], Literal["y"], Literal["xy"]]): """ diff --git a/pyproject.toml b/pyproject.toml index 74e2edcb..6c0be7fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,14 +106,12 @@ omit = [ output = "coverage.xml" [tool.ruff] -ignore = [ +lint.ignore = [ "E501", # Never enforce `E501` (line length violations). ] line-length = 110 target-version = "py310" - -[tool.ruff.per-file-ignores] -"__init__.py" = [ +lint.per-file-ignores."__init__.py" = [ "F401", # imported but unused "E402", # Module level import not at top of file ] diff --git a/tests/test_subset.py b/tests/test_subset.py index 4538e325..900d1062 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -65,7 +65,7 @@ def test_input_featurization(test_single_task_benchmark): x = test[0] assert isinstance(x, str) - train, test = test_single_task_benchmark.get_train_test_split(input_transform_fn=dm.to_fp) + train, test = test_single_task_benchmark.get_train_test_split(featurization_fn=dm.to_fp) # For all different flavours of accessing the data # Make sure the input is now featurized @@ -88,29 +88,6 @@ def test_input_featurization(test_single_task_benchmark): assert isinstance(x, np.ndarray) -def test_target_transformation(test_single_task_benchmark): - - # Get the normal value without transformation - train, test = test_single_task_benchmark.get_train_test_split() - test_single_task_benchmark._n_splits_since_evaluate = 0 # Manually reset for sake of test - - x, original_y = train[0] - - train, test = test_single_task_benchmark.get_train_test_split(target_transform_fn=lambda y: y * 2) - - # For all different flavours of accessing the data - # Make sure the target is now transformed - # We do not need to test the test set, because this - x, y = train[0] - assert y == original_y * 2 - - x, y = next(train) - assert y == original_y * 2 - - y = train.y[0] - assert y == original_y * 2 - - def test_expected_exception_with_test_set_ambiguity(test_single_task_benchmark): y_pred = np.random.random(len(test_single_task_benchmark.split[1])) @@ -123,10 +100,10 @@ def test_expected_exception_with_test_set_ambiguity(test_single_task_benchmark): # With two different split, we do expect an error test_single_task_benchmark.get_train_test_split() with pytest.raises(EvaluationError): - test_single_task_benchmark.get_train_test_split(target_transform_fn=lambda x: x * 2) + test_single_task_benchmark.get_train_test_split(featurization_fn=dm.to_fp) test_single_task_benchmark.evaluate(y_pred) # With two different splits, but an evaluate in between, all is good again! 
test_single_task_benchmark.get_train_test_split() test_single_task_benchmark.evaluate(y_pred) - test_single_task_benchmark.get_train_test_split(target_transform_fn=lambda x: x * 2) + test_single_task_benchmark.get_train_test_split(featurization_fn=dm.to_fp) From 1c5ca97c5541e3aa31682c4fbb0b80519861ee01 Mon Sep 17 00:00:00 2001 From: Cas Wognum Date: Mon, 19 Feb 2024 11:41:32 -0500 Subject: [PATCH 4/8] Update notebook --- docs/tutorials/basics.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorials/basics.ipynb b/docs/tutorials/basics.ipynb index 8dde27ba..22bbb0f0 100644 --- a/docs/tutorials/basics.ipynb +++ b/docs/tutorials/basics.ipynb @@ -665,7 +665,7 @@ "benchmark = po.load_benchmark(\"polaris/hello_world_benchmark\")\n", "\n", "# Get the split and convert SMILES to ECFP fingerprints by specifying an featurize function.\n", - "train, test = benchmark.get_train_test_split(input_transform_fn=dm.to_fp)\n", + "train, test = benchmark.get_train_test_split(featurization_fn=dm.to_fp)\n", "\n", "# Define a model and train\n", "model = RandomForestRegressor(max_depth=2, random_state=0)\n", From 19b20298ff75d4c62b5685d5224063726e56abe9 Mon Sep 17 00:00:00 2001 From: Cas Wognum Date: Mon, 19 Feb 2024 12:52:19 -0500 Subject: [PATCH 5/8] Added test cases for the different Subset formats --- polaris/benchmark/_base.py | 38 +++++---------------------- polaris/dataset/__init__.py | 3 +-- polaris/dataset/_subset.py | 18 ++++++------- polaris/utils/errors.py | 4 --- polaris/utils/types.py | 10 +++++--- tests/test_subset.py | 51 +++++++++++++++++++++++++------------ 6 files changed, 58 insertions(+), 66 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index aab58d12..97847e2a 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -9,7 +9,6 @@ from pydantic import ( Field, FieldValidationInfo, - PrivateAttr, computed_field, field_serializer, field_validator, @@ -24,7 +23,7 @@ from polaris.utils import fs from polaris.utils.context import tmp_attribute_change from polaris.utils.dict2html import dict2html -from polaris.utils.errors import EvaluationError, InvalidBenchmarkError, PolarisChecksumError +from polaris.utils.errors import InvalidBenchmarkError, PolarisChecksumError from polaris.utils.misc import listit from polaris.utils.types import ( AccessType, @@ -113,10 +112,6 @@ class BenchmarkSpecification(BaseArtifactModel): default_factory=dict, validate_default=True ) - # Private attributes to track the internal state - _test_subset: Optional[Subset] = PrivateAttr(None) - _n_splits_since_evaluate: int = PrivateAttr(0) - @field_validator("dataset") def _validate_dataset(cls, v): """ @@ -400,27 +395,6 @@ def _get_subset(indices, hide_targets): else: test = _get_subset(self.split[1], hide_targets=True) - # Polaris is designed to reduce the risk of accidental access to the test set. - # One of the design decisions was to ask the users to just provide the predictions in evaluate(), not y_true. - # Because of this, we do need to check if a user has created multiple test objects with different parameters - # without calling evaluate in between. This would lead to ambiguity as to which test object to use. 
- previous_test_set = ( - list(self._test_subset.values())[0] if isinstance(self._test_subset, dict) else self._test_subset - ) - different_parameters = previous_test_set is not None and ( - previous_test_set._target_format != target_format - or previous_test_set._input_format != input_format - or previous_test_set._featurization_fn is not featurization_fn - ) - self._n_splits_since_evaluate += 1 - if self._n_splits_since_evaluate > 1 and different_parameters: - raise EvaluationError( - "You have called get_train_test_split() multiple times with different parameters, " - "without calling evaluate() in between. This leads to ambiguity when evaluating the model " - "because Polaris cannot infer which test set object should be used." - ) - - self._test_subset = test return train, test def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: @@ -439,8 +413,10 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: 5. There can be metrics which measure across tasks. Args: - y_pred: The predictions for the test set, as NumPy arrays. If there are multiple test sets, - this should be a dictionary with the test set names as keys. + y_pred: The predictions for the test set, as NumPy arrays. + If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys. + If there are multiple test sets, the predictions should be further wrapped in a dictionary + with the test subset labels as keys. Returns: A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub. @@ -449,10 +425,8 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: # Instead of having the user pass the ground truth, we maintain a copy of the last created test set internally. # This simplifies the API, but also was added to make accidental access to the test set targets less likely. # See also the `hide_targets` parameter in the `Subset` class. + test = self.get_train_test_split(target_format="dict")[1] - self._n_splits_since_evaluate = 0 - - test = self._test_subset if not isinstance(test, dict): test = {"test": test} diff --git a/polaris/dataset/__init__.py b/polaris/dataset/__init__.py index 0b420ebb..ecd3098b 100644 --- a/polaris/dataset/__init__.py +++ b/polaris/dataset/__init__.py @@ -1,4 +1,4 @@ -from polaris.dataset._column import ColumnAnnotation, Modality +from polaris.dataset._column import ColumnAnnotation from polaris.dataset._dataset import Dataset from polaris.dataset._subset import Subset @@ -6,5 +6,4 @@ "ColumnAnnotation", "Dataset", "Subset", - "Modality" ] diff --git a/polaris/dataset/_subset.py b/polaris/dataset/_subset.py index a083d941..04f5df95 100644 --- a/polaris/dataset/_subset.py +++ b/polaris/dataset/_subset.py @@ -1,10 +1,9 @@ from typing import Callable, List, Literal, Optional, Sequence, Union -from loguru import logger import numpy as np -from polaris.dataset import Dataset, Modality -from polaris.utils.errors import EvaluationError, TestAccessError +from polaris.dataset import Dataset +from polaris.utils.errors import TestAccessError from polaris.utils.types import DataFormat, DatapointType @@ -59,8 +58,6 @@ class Subset: TestAccessError: When trying to access the targets of the test set (specified by the `hide_targets` attribute). 
""" - _SUPPORTED_FORMATS = ["dict", "tuple"] - def __init__( self, dataset: Dataset, @@ -68,7 +65,7 @@ def __init__( input_cols: Union[List[str], str], target_cols: Union[List[str], str], input_format: DataFormat = "dict", - target_format: DataFormat = "tuple", + target_format: DataFormat = "dict", featurization_fn: Optional[Callable] = None, hide_targets: bool = False, ): @@ -188,12 +185,15 @@ def as_array(self, data_type: Union[Literal["x"], Literal["y"], Literal["xy"]]): # With a multi-task or multi-input data point, this will be a 2D array. return np.array(ret) - # If the return type is a dict, we want to convert + # If the return format is a dict, we want to convert # from an array of dicts to a dict of arrays. - if data_type == "y": + if data_type == "y" and self._target_format == "dict": ret = {k: np.array([v[k] for v in ret]) for k in self.target_cols} - else: + elif data_type == "x" and self._input_format == "dict": ret = {k: np.array([v[k] for v in ret]) for k in self.input_cols} + else: + # The format is a tuple, so we have list of tuples and convert this to an array + ret = np.array(ret) return ret diff --git a/polaris/utils/errors.py b/polaris/utils/errors.py index daf66c15..1cb46581 100644 --- a/polaris/utils/errors.py +++ b/polaris/utils/errors.py @@ -33,7 +33,3 @@ class TestAccessError(Exception): __test__ = False pass - - -class EvaluationError(Exception): - pass diff --git a/polaris/utils/types.py b/polaris/utils/types.py index 91c586db..e57f440d 100644 --- a/polaris/utils/types.py +++ b/polaris/utils/types.py @@ -10,7 +10,7 @@ BaseModel, ConfigDict, HttpUrl, - constr, + StringConstraints, model_validator, ) from pydantic.alias_generators import to_camel @@ -50,12 +50,16 @@ The target formats that are supported by the `Subset` class. """ -SlugStringType: TypeAlias = constr(pattern="^[a-z0-9-]+$", min_length=4, max_length=64) +SlugStringType: TypeAlias = Annotated[ + str, StringConstraints(pattern="^[a-z0-9-]+$", min_length=4, max_length=64) +] """ A URL-compatible string that can serve as slug on the hub. """ -SlugCompatibleStringType: TypeAlias = constr(pattern="^[A-Za-z0-9_-]+$", min_length=4, max_length=64) +SlugCompatibleStringType: TypeAlias = Annotated[ + str, StringConstraints(pattern="^[A-Za-z0-9_-]+$", min_length=4, max_length=64) +] """ A URL-compatible string that can be turned into a slug by the hub. 
diff --git a/tests/test_subset.py b/tests/test_subset.py index 900d1062..0b255dd6 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -3,7 +3,7 @@ import pytest from polaris.dataset import Subset -from polaris.utils.errors import EvaluationError, TestAccessError +from polaris.utils.errors import TestAccessError def test_consistency_across_access_methods(test_dataset): @@ -88,22 +88,41 @@ def test_input_featurization(test_single_task_benchmark): assert isinstance(x, np.ndarray) -def test_expected_exception_with_test_set_ambiguity(test_single_task_benchmark): +@pytest.mark.parametrize("fmt", ["dict", "tuple"]) +def test_different_subset_formats_single_task(test_single_task_benchmark, fmt): + train, _ = test_single_task_benchmark.get_train_test_split(target_format=fmt) + assert isinstance(train.y, np.ndarray) + assert train.y.shape == (len(train),) + assert isinstance(train[0][1], float) + assert isinstance(next(train)[1], float) - y_pred = np.random.random(len(test_single_task_benchmark.split[1])) - # When getting the same split twice, we do not expect an error - test_single_task_benchmark.get_train_test_split() - test_single_task_benchmark.get_train_test_split() - test_single_task_benchmark.evaluate(y_pred) +def test_different_subset_formats_multi_task_dict(test_multi_task_benchmark): + train, _ = test_multi_task_benchmark.get_train_test_split(target_format="dict") + assert isinstance(train.y, dict) + assert all(c in test_multi_task_benchmark.target_cols for c in train.y) + assert all(isinstance(v, np.ndarray) and v.shape == (len(train),) for v in train.y.values()) + assert isinstance(train[0][1], dict) + assert isinstance(next(train)[1], dict) - # With two different split, we do expect an error - test_single_task_benchmark.get_train_test_split() - with pytest.raises(EvaluationError): - test_single_task_benchmark.get_train_test_split(featurization_fn=dm.to_fp) - test_single_task_benchmark.evaluate(y_pred) - # With two different splits, but an evaluate in between, all is good again! - test_single_task_benchmark.get_train_test_split() - test_single_task_benchmark.evaluate(y_pred) - test_single_task_benchmark.get_train_test_split(featurization_fn=dm.to_fp) +def test_different_subset_formats_multi_task_tuple(test_multi_task_benchmark): + train, _ = test_multi_task_benchmark.get_train_test_split(target_format="tuple") + assert isinstance(train.y, np.ndarray) + assert train.y.shape == (len(train), len(train.target_cols)) + assert isinstance(train[0][1], tuple) + assert isinstance(next(train)[1], tuple) + + +def test_consistency_between_different_formats(test_multi_task_benchmark): + + train_tup, _ = test_multi_task_benchmark.get_train_test_split(target_format="tuple") + train_dict, _ = test_multi_task_benchmark.get_train_test_split(target_format="dict") + + t = train_tup[0][1] + d = train_dict[0][1] + + assert len(d) == len(t) + for k, v in d.items(): + idx = test_multi_task_benchmark.target_cols.index(k) + assert t[idx] == v From 8820e3d2e3a3918abf2654e61f97b07bc428b3fb Mon Sep 17 00:00:00 2001 From: Cas Wognum Date: Mon, 19 Feb 2024 13:16:37 -0500 Subject: [PATCH 6/8] updated outdated docstring --- polaris/benchmark/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 97847e2a..895cf25e 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -422,7 +422,7 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: A `BenchmarkResults` object. 
This object can be directly submitted to the Polaris Hub. """ - # Instead of having the user pass the ground truth, we maintain a copy of the last created test set internally. + # Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves. # This simplifies the API, but also was added to make accidental access to the test set targets less likely. # See also the `hide_targets` parameter in the `Subset` class. test = self.get_train_test_split(target_format="dict")[1] From e623f153e4068f88a832337a6dd765c04a3e8494 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 6 Mar 2024 15:05:49 -0500 Subject: [PATCH 7/8] ruff format --- tests/test_subset.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_subset.py b/tests/test_subset.py index 0b255dd6..c6e1c4c5 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -54,7 +54,6 @@ def test_access_to_test_set(test_single_task_benchmark): def test_input_featurization(test_single_task_benchmark): - # Without a transformation, we expect a SMILES string train, test = test_single_task_benchmark.get_train_test_split() test_single_task_benchmark._n_splits_since_evaluate = 0 # Manually reset for sake of test @@ -115,7 +114,6 @@ def test_different_subset_formats_multi_task_tuple(test_multi_task_benchmark): def test_consistency_between_different_formats(test_multi_task_benchmark): - train_tup, _ = test_multi_task_benchmark.get_train_test_split(target_format="tuple") train_dict, _ = test_multi_task_benchmark.get_train_test_split(target_format="dict") From 871fcc76ee161d66a590199ca484f437b6637d0c Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Wed, 6 Mar 2024 15:08:43 -0500 Subject: [PATCH 8/8] format --- polaris/utils/types.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/polaris/utils/types.py b/polaris/utils/types.py index 402b98e9..8291f8b3 100644 --- a/polaris/utils/types.py +++ b/polaris/utils/types.py @@ -124,9 +124,9 @@ class License(BaseModel): Else it is required to manually specify this. """ - SPDX_LICENSE_DATA_PATH: ClassVar[ - str - ] = "https://raw.githubusercontent.com/spdx/license-list-data/main/json/licenses.json" + SPDX_LICENSE_DATA_PATH: ClassVar[str] = ( + "https://raw.githubusercontent.com/spdx/license-list-data/main/json/licenses.json" + ) id: str reference: Optional[HttpUrlString] = None
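Taken together, the patch series leaves the workflow below; this is a sketch based on the updated tutorial and assumes you are logged in to the Polaris Hub.

```python
import datamol as dm
import polaris as po
from sklearn.ensemble import RandomForestRegressor

# Load the benchmark and get a featurized split in one call.
benchmark = po.load_benchmark("polaris/hello-world-benchmark")
train, test = benchmark.get_train_test_split(featurization_fn=dm.to_fp)

# Train on the featurized inputs via the scikit-learn style X / y accessors.
model = RandomForestRegressor(max_depth=2, random_state=0)
model.fit(train.X, train.y)

# Predict on the test inputs and let the benchmark compute the metrics.
predictions = model.predict(test.X)
results = benchmark.evaluate(predictions)
# The resulting BenchmarkResults object can then be submitted to the Polaris Hub.
```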