From 56dc9d1bf98a3dd6868e0a8658275abd0bd01280 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Mon, 22 Apr 2024 16:44:13 -0400 Subject: [PATCH 01/18] add aafe and ba --- polaris/evaluate/_metric.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 57dfd0ee..43a5ad3d 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -15,6 +15,7 @@ mean_squared_error, r2_score, roc_auc_score, + balanced_accuracy_score, ) from polaris.utils.types import DirectionType @@ -30,6 +31,28 @@ def spearman(y_true: np.ndarray, y_pred: np.ndarray): return stats.spearmanr(y_true, y_pred).statistic +def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray): + """ + Calculate the Absolute Average Fold Error (AAFE) metric. + + Parameters: + y_true : array-like of shape (n_samples,) + The true target values. + y_pred : array-like of shape (n_samples,) + The predicted target values. + + Returns: + aafe : float + The Absolute Average Fold Error. + """ + if len(y_true) != len(y_pred): + raise ValueError("Length of y_true and y_pred must be the same.") + + aafe = np.mean(np.abs(y_pred) / np.abs(y_true)) + + return aafe + + class MetricInfo(BaseModel): """ Metric metadata @@ -65,6 +88,7 @@ class Metric(Enum): pearsonr = MetricInfo(fn=pearsonr, direction="max") spearmanr = MetricInfo(fn=spearman, direction="max") explained_var = MetricInfo(fn=explained_variance_score, direction="max") + aafe = MetricInfo(fn=absolute_average_fold_error, direction="max") # classification accuracy = MetricInfo(fn=accuracy_score, direction="max") @@ -75,7 +99,9 @@ class Metric(Enum): pr_auc = MetricInfo(fn=average_precision_score, direction="max") mcc = MetricInfo(fn=matthews_corrcoef, direction="max") cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max") + # TODO: adding metrics for multiclass tasks + ba = MetricInfo(fn=balanced_accuracy_score, direction="max") @property def fn(self) -> Callable: From 11b86851cd43a39e3a295f288813a563464e3a44 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 23 Apr 2024 11:37:07 -0400 Subject: [PATCH 02/18] add docs --- polaris/evaluate/_metric.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 43a5ad3d..94ad9582 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -34,16 +34,18 @@ def spearman(y_true: np.ndarray, y_pred: np.ndarray): def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray): """ Calculate the Absolute Average Fold Error (AAFE) metric. + It measures the fold change between predicted values and observed values. + The implementation is based on https://pubs.acs.org/doi/10.1021/acs.chemrestox.3c00305. - Parameters: - y_true : array-like of shape (n_samples,) - The true target values. - y_pred : array-like of shape (n_samples,) - The predicted target values. + Args: + y_true : array-like of shape (n_samples,) + The true target values. + y_pred : array-like of shape (n_samples,) + The predicted target values. Returns: - aafe : float - The Absolute Average Fold Error. + aafe : float + The Absolute Average Fold Error. 
""" if len(y_true) != len(y_pred): raise ValueError("Length of y_true and y_pred must be the same.") @@ -88,7 +90,7 @@ class Metric(Enum): pearsonr = MetricInfo(fn=pearsonr, direction="max") spearmanr = MetricInfo(fn=spearman, direction="max") explained_var = MetricInfo(fn=explained_variance_score, direction="max") - aafe = MetricInfo(fn=absolute_average_fold_error, direction="max") + absolute_average_fold_error = MetricInfo(fn=absolute_average_fold_error, direction="max") # classification accuracy = MetricInfo(fn=accuracy_score, direction="max") @@ -100,8 +102,8 @@ class Metric(Enum): mcc = MetricInfo(fn=matthews_corrcoef, direction="max") cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max") - # TODO: adding metrics for multiclass tasks - ba = MetricInfo(fn=balanced_accuracy_score, direction="max") + # multiclass tasks + balanced_accuracy = MetricInfo(fn=balanced_accuracy_score, direction="max") @property def fn(self) -> Callable: From eb1824aeeb31aaab45ec8f605e544f5640b9dd21 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 23 Apr 2024 13:11:47 -0400 Subject: [PATCH 03/18] add tests --- polaris/evaluate/_metric.py | 15 +++++++++------ tests/conftest.py | 23 ++++++++++++++++++++++- tests/test_evaluate.py | 10 ++++++++++ 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 94ad9582..bd774e79 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -92,18 +92,21 @@ class Metric(Enum): explained_var = MetricInfo(fn=explained_variance_score, direction="max") absolute_average_fold_error = MetricInfo(fn=absolute_average_fold_error, direction="max") - # classification + # binary and multiclass classification accuracy = MetricInfo(fn=accuracy_score, direction="max") + balanced_accuracy = MetricInfo(fn=balanced_accuracy_score, direction="max") + mcc = MetricInfo(fn=matthews_corrcoef, direction="max") + cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max") + + # binary only f1 = MetricInfo(fn=f1_score, kwargs={"average": "binary"}, direction="max") - f1_macro = MetricInfo(fn=f1_score, kwargs={"average": "macro"}, direction="max") - f1_micro = MetricInfo(fn=f1_score, kwargs={"average": "micro"}, direction="max") + # note: At the moment, multi-dimension inputs for classification are not supported roc_auc = MetricInfo(fn=roc_auc_score, direction="max") pr_auc = MetricInfo(fn=average_precision_score, direction="max") - mcc = MetricInfo(fn=matthews_corrcoef, direction="max") - cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max") # multiclass tasks - balanced_accuracy = MetricInfo(fn=balanced_accuracy_score, direction="max") + f1_macro = MetricInfo(fn=f1_score, kwargs={"average": "macro"}, direction="max") + f1_micro = MetricInfo(fn=f1_score, kwargs={"average": "micro"}, direction="max") @property def fn(self) -> Callable: diff --git a/tests/conftest.py b/tests/conftest.py index 1ebc0a02..56e77f05 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,6 +23,8 @@ def test_data(): # set an abitrary threshold for testing purpose. 
data["CLASS_expt"] = data["expt"].gt(0).astype(int).values data["CLASS_calc"] = data["calc"].gt(0).astype(int).values + data["MULTICLASS_expt"] = np.random.randint(low=0, high=3, size=data.shape[0]) + data["MULTICLASS_calc"] =np.random.randint(low=0, high=3, size=data.shape[0]) return data @@ -99,6 +101,7 @@ def test_single_task_benchmark(test_dataset): "spearmanr", "pearsonr", "explained_var", + "absolute_average_fold_error" ], main_metric="mean_absolute_error", split=(train_indices, test_indices), @@ -117,7 +120,7 @@ def test_single_task_benchmark_clf(test_dataset): name="single-task-benchmark", dataset=test_dataset, main_metric="accuracy", - metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa"], + metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa", "balanced_accuracy"], split=(train_indices, test_indices), target_cols="CLASS_expt", input_cols="smiles", @@ -125,6 +128,22 @@ def test_single_task_benchmark_clf(test_dataset): check_version(benchmark) return benchmark +@pytest.fixture(scope="function") +def test_single_task_benchmark_multi_clf(test_dataset): + train_indices = list(range(90)) + test_indices = list(range(90, 100)) + + benchmark = SingleTaskBenchmarkSpecification( + name="single-task-benchmark", + dataset=test_dataset, + main_metric="accuracy", + metrics=["accuracy", "balanced_accuracy", "mcc", "cohen_kappa" , "f1_macro", "f1_micro"], + split=(train_indices, test_indices), + target_cols="MULTICLASS_expt", + input_cols="smiles", + ) + check_version(benchmark) + return benchmark @pytest.fixture(scope="function") def test_single_task_benchmark_multiple_test_sets(test_dataset): @@ -140,6 +159,7 @@ def test_single_task_benchmark_multiple_test_sets(test_dataset): "spearmanr", "pearsonr", "explained_var", + "absolute_average_fold_error" ], main_metric="r2", split=(train_indices, test_indices), @@ -166,6 +186,7 @@ def test_multi_task_benchmark(test_dataset): "spearmanr", "pearsonr", "explained_var", + "absolute_average_fold_error" ], split=(train_indices, test_indices), target_cols=["expt", "calc"], diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 5a4332c9..5ec8643a 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -78,6 +78,16 @@ def test_metrics_singletask_clf( assert metric in result.results.Metric.tolist() +def test_metrics_singletask_multicls_clf( + tmpdir: str, test_single_task_benchmark_multi_clf: SingleTaskBenchmarkSpecification +): + _, test = test_single_task_benchmark_multi_clf.get_train_test_split() + predictions = np.random.randint(4, size=test.inputs.shape[0]) + result = test_single_task_benchmark_multi_clf.evaluate(predictions) + for metric in test_single_task_benchmark_multi_clf.metrics: + assert metric in result.results.Metric.tolist() + + def test_metrics_multitask_clf(tmpdir: str, test_multi_task_benchmark_clf: MultiTaskBenchmarkSpecification): train, test = test_multi_task_benchmark_clf.get_train_test_split() predictions = { From de11628ffe73da3f5b274b62417c39c2aadfc264 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 23 Apr 2024 13:13:42 -0400 Subject: [PATCH 04/18] format --- tests/conftest.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 56e77f05..9001ebaf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,7 +24,7 @@ def test_data(): data["CLASS_expt"] = data["expt"].gt(0).astype(int).values data["CLASS_calc"] = data["calc"].gt(0).astype(int).values data["MULTICLASS_expt"] = np.random.randint(low=0, high=3, 
size=data.shape[0]) - data["MULTICLASS_calc"] =np.random.randint(low=0, high=3, size=data.shape[0]) + data["MULTICLASS_calc"] = np.random.randint(low=0, high=3, size=data.shape[0]) return data @@ -101,7 +101,7 @@ def test_single_task_benchmark(test_dataset): "spearmanr", "pearsonr", "explained_var", - "absolute_average_fold_error" + "absolute_average_fold_error", ], main_metric="mean_absolute_error", split=(train_indices, test_indices), @@ -128,6 +128,7 @@ def test_single_task_benchmark_clf(test_dataset): check_version(benchmark) return benchmark + @pytest.fixture(scope="function") def test_single_task_benchmark_multi_clf(test_dataset): train_indices = list(range(90)) @@ -137,7 +138,7 @@ def test_single_task_benchmark_multi_clf(test_dataset): name="single-task-benchmark", dataset=test_dataset, main_metric="accuracy", - metrics=["accuracy", "balanced_accuracy", "mcc", "cohen_kappa" , "f1_macro", "f1_micro"], + metrics=["accuracy", "balanced_accuracy", "mcc", "cohen_kappa", "f1_macro", "f1_micro"], split=(train_indices, test_indices), target_cols="MULTICLASS_expt", input_cols="smiles", @@ -145,6 +146,7 @@ def test_single_task_benchmark_multi_clf(test_dataset): check_version(benchmark) return benchmark + @pytest.fixture(scope="function") def test_single_task_benchmark_multiple_test_sets(test_dataset): train_indices = list(range(90)) @@ -159,7 +161,7 @@ def test_single_task_benchmark_multiple_test_sets(test_dataset): "spearmanr", "pearsonr", "explained_var", - "absolute_average_fold_error" + "absolute_average_fold_error", ], main_metric="r2", split=(train_indices, test_indices), @@ -186,7 +188,7 @@ def test_multi_task_benchmark(test_dataset): "spearmanr", "pearsonr", "explained_var", - "absolute_average_fold_error" + "absolute_average_fold_error", ], split=(train_indices, test_indices), target_cols=["expt", "calc"], From 56da3adb8da1b9694e2e024b8fa6a0d76fde271e Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 25 Apr 2024 16:36:15 -0400 Subject: [PATCH 05/18] allows probability inputs for evaluation --- polaris/benchmark/_base.py | 15 +++++++++++---- polaris/evaluate/_metric.py | 25 ++++++++++++++++++++----- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 0d4f12ed..0a8e7aff 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -388,7 +388,7 @@ def _get_subset(indices, hide_targets): return train, test - def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: + def evaluate(self, y_pred: PredictionsType, y_prob: PredictionsType = None) -> BenchmarkResults: """Execute the evaluation protocol for the benchmark, given a set of predictions. info: What about `y_true`? @@ -408,6 +408,8 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys. If there are multiple test sets, the predictions should be further wrapped in a dictionary with the test subset labels as keys. + y_prob: The predicted probabilities for the test set, as NumPy arrays. + Currently only multiclass in singletask setting is supported. Returns: A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub. 
@@ -429,6 +431,9 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: if not isinstance(y_pred, dict) or all(k in self.target_cols for k in y_pred): y_pred = {"test": y_pred} + if not isinstance(y_prob, dict) or all(k in self.target_cols for k in y_prob): + y_prob = {"test": y_prob} + if any(k not in y_pred for k in test.keys()): raise KeyError( f"Missing keys for at least one of the test sets. Expecting: {sorted(test.keys())}" @@ -441,15 +446,17 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: for test_label, y_true_subset in y_true.items(): # For every metric... for metric in self.metrics: + y_pred_eval = y_prob if metric.needs_probs else y_pred + if metric.is_multitask: # Multi-task but with a metric across targets - score = metric(y_true=y_true_subset, y_pred=y_pred[test_label]) + score = metric(y_true=y_true_subset, y_pred=y_pred_eval[test_label]) scores.loc[len(scores)] = (test_label, "aggregated", metric, score) continue if not isinstance(y_true_subset, dict): # Single task - score = metric(y_true=y_true_subset, y_pred=y_pred[test_label]) + score = metric(y_true=y_true_subset, y_pred=y_pred_eval[test_label]) scores.loc[len(scores)] = ( test_label, self.target_cols[0], @@ -465,7 +472,7 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: mask = ~np.isnan(y_true_target) score = metric( y_true=y_true_target[mask], - y_pred=y_pred[test_label][target_label][mask], + y_pred=y_pred_eval[test_label][target_label][mask], ) scores.loc[len(scores)] = (test_label, target_label, metric, score) diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index bd774e79..361fdba6 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -35,7 +35,7 @@ def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray): """ Calculate the Absolute Average Fold Error (AAFE) metric. It measures the fold change between predicted values and observed values. - The implementation is based on https://pubs.acs.org/doi/10.1021/acs.chemrestox.3c00305. + The implementation is based on [this paper](https://pubs.acs.org/doi/10.1021/acs.chemrestox.3c00305). 
Args: y_true : array-like of shape (n_samples,) @@ -50,6 +50,9 @@ def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray): if len(y_true) != len(y_pred): raise ValueError("Length of y_true and y_pred must be the same.") + if np.any(y_true == 0): + raise ValueError("`y_true` contains zero which will result `Inf` value.") + aafe = np.mean(np.abs(y_pred) / np.abs(y_true)) return aafe @@ -70,6 +73,7 @@ class MetricInfo(BaseModel): is_multitask: bool = False kwargs: dict = Field(default_factory=dict) direction: DirectionType + needs_probs: bool = False class Metric(Enum): @@ -90,23 +94,29 @@ class Metric(Enum): pearsonr = MetricInfo(fn=pearsonr, direction="max") spearmanr = MetricInfo(fn=spearman, direction="max") explained_var = MetricInfo(fn=explained_variance_score, direction="max") - absolute_average_fold_error = MetricInfo(fn=absolute_average_fold_error, direction="max") + absolute_average_fold_error = MetricInfo(fn=absolute_average_fold_error, direction=1) # binary and multiclass classification accuracy = MetricInfo(fn=accuracy_score, direction="max") balanced_accuracy = MetricInfo(fn=balanced_accuracy_score, direction="max") mcc = MetricInfo(fn=matthews_corrcoef, direction="max") cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max") + pr_auc = MetricInfo(fn=average_precision_score, direction="max", needs_probs=True) # binary only f1 = MetricInfo(fn=f1_score, kwargs={"average": "binary"}, direction="max") # note: At the moment, multi-dimension inputs for classification are not supported - roc_auc = MetricInfo(fn=roc_auc_score, direction="max") - pr_auc = MetricInfo(fn=average_precision_score, direction="max") + roc_auc = MetricInfo(fn=roc_auc_score, direction="max", needs_probs=True) - # multiclass tasks + # multiclass tasks only f1_macro = MetricInfo(fn=f1_score, kwargs={"average": "macro"}, direction="max") f1_micro = MetricInfo(fn=f1_score, kwargs={"average": "micro"}, direction="max") + roc_auc_ovr = MetricInfo( + fn=roc_auc_score, kwargs={"multi_class": "ovr"}, direction="max", needs_probs=True + ) + roc_auc_ovo = MetricInfo( + fn=roc_auc_score, kwargs={"multi_class": "ovo"}, direction="max", needs_probs=True + ) @property def fn(self) -> Callable: @@ -118,6 +128,11 @@ def is_multitask(self) -> bool: """Whether the metric expects a single set of predictions or a dict of predictions.""" return self.value.is_multitask + @property + def needs_probs(self) -> bool: + """Whether the metric expects preditive probablities.""" + return self.value.needs_probs + def score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float: """Endpoint for computing the metric. 
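For reference, a minimal standalone sketch of the computation behind the AAFE metric added above (the input arrays are illustrative values, not taken from the test suite):

import numpy as np

# AAFE = mean(|y_pred| / |y_true|), i.e. the average fold change between
# predictions and observations. A perfect model scores exactly 1, which is why
# the metric's direction is the target value 1 rather than "min" or "max",
# and why `y_true` must not contain zeros (see the added check above).
y_true = np.array([2.0, 4.0, 10.0])
y_pred = np.array([4.0, 2.0, 10.0])

aafe = np.mean(np.abs(y_pred) / np.abs(y_true))  # (2.0 + 0.5 + 1.0) / 3 ≈ 1.17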
From 9c4c9af49d7bf463fbcf48f35f40baf153364ef3 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 25 Apr 2024 16:36:32 -0400 Subject: [PATCH 06/18] update tests --- tests/conftest.py | 19 ++++++++++++++--- tests/test_evaluate.py | 46 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 56 insertions(+), 9 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 9001ebaf..545cb26c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -131,14 +131,27 @@ def test_single_task_benchmark_clf(test_dataset): @pytest.fixture(scope="function") def test_single_task_benchmark_multi_clf(test_dataset): - train_indices = list(range(90)) - test_indices = list(range(90, 100)) + np.random.seed(111) + indices = np.arange(100) + np.random.shuffle(indices) + train_indices = indices[:80] + test_indices = indices[80:] benchmark = SingleTaskBenchmarkSpecification( name="single-task-benchmark", dataset=test_dataset, main_metric="accuracy", - metrics=["accuracy", "balanced_accuracy", "mcc", "cohen_kappa", "f1_macro", "f1_micro"], + metrics=[ + "accuracy", + "balanced_accuracy", + "mcc", + "cohen_kappa", + "f1_macro", + "f1_micro", + "roc_auc_ovr", + "roc_auc_ovo", + "pr_auc", + ], split=(train_indices, test_indices), target_cols="MULTICLASS_expt", input_cols="smiles", diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 5ec8643a..1e788df4 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -1,5 +1,5 @@ import os - +import pytest import numpy as np import pandas as pd @@ -73,7 +73,8 @@ def test_metrics_singletask_clf( ): _, test = test_single_task_benchmark_clf.get_train_test_split() predictions = np.random.randint(2, size=test.inputs.shape[0]) - result = test_single_task_benchmark_clf.evaluate(predictions) + probabilities = np.random.uniform(size=test.inputs.shape[0]) + result = test_single_task_benchmark_clf.evaluate(predictions, probabilities) for metric in test_single_task_benchmark_clf.metrics: assert metric in result.results.Metric.tolist() @@ -82,8 +83,10 @@ def test_metrics_singletask_multicls_clf( tmpdir: str, test_single_task_benchmark_multi_clf: SingleTaskBenchmarkSpecification ): _, test = test_single_task_benchmark_multi_clf.get_train_test_split() - predictions = np.random.randint(4, size=test.inputs.shape[0]) - result = test_single_task_benchmark_multi_clf.evaluate(predictions) + predictions = np.random.randint(3, size=test.inputs.shape[0]) + probablities = np.random.random(size=(test.inputs.shape[0], 3)) + probablities = probablities / probablities.sum(axis=1, keepdims=True) + result = test_single_task_benchmark_multi_clf.evaluate(predictions, probablities) for metric in test_single_task_benchmark_multi_clf.metrics: assert metric in result.results.Metric.tolist() @@ -93,7 +96,10 @@ def test_metrics_multitask_clf(tmpdir: str, test_multi_task_benchmark_clf: Multi predictions = { target_col: np.random.randint(2, size=test.inputs.shape[0]) for target_col in train.target_cols } - result = test_multi_task_benchmark_clf.evaluate(predictions) + probabilities = { + target_col: np.random.uniform(size=test.inputs.shape[0]) for target_col in train.target_cols + } + result = test_multi_task_benchmark_clf.evaluate(predictions, probabilities) assert isinstance(result.results, pd.DataFrame) assert set(result.results.columns) == { "Test set", @@ -111,4 +117,32 @@ def test_metrics_multitask_clf(tmpdir: str, test_multi_task_benchmark_clf: Multi def test_metric_direction(): for metric in Metric: - assert metric.value.direction in ["min", "max"] + assert 
metric.value.direction in ["min", "max", 1] + + +def test_absolute_average_fold_error(): + y_true = np.random.uniform(low=50, high=100, size=200) + y_pred_1 = y_true + np.random.uniform(low=0, high=5, size=200) + y_pred_2 = y_true + np.random.uniform(low=5, high=20, size=200) + y_pred_3 = y_true - 10 + y_zero = np.zeros(shape=200) + + # Optimal value + aafe_0 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_true) + assert aafe_0 == 1 + + # small fold change + aafe_1 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_pred_1) + assert aafe_1 > 1 + + # larger fold change + aafe_2 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_pred_2) + assert aafe_2 > aafe_1 + + # undershoot + aafe_3 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_pred_3) + assert aafe_3 < 1 + + # undershoot + with pytest.raises(ValueError): + aafe_4 = Metric.absolute_average_fold_error(y_true=y_zero, y_pred=y_pred_3) From d42eeecf55d11fd572e46cae5da9dba51ff9836d Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 25 Apr 2024 16:36:47 -0400 Subject: [PATCH 07/18] update docs --- docs/api/evaluation.md | 2 ++ polaris/utils/types.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/api/evaluation.md b/docs/api/evaluation.md index 6f5561fd..e69e20f9 100644 --- a/docs/api/evaluation.md +++ b/docs/api/evaluation.md @@ -5,6 +5,8 @@ ::: polaris.evaluate.MetricInfo +::: polaris.evaluate._metric.absolute_average_fold_error + --- ::: polaris.evaluate.Metric diff --git a/polaris/utils/types.py b/polaris/utils/types.py index b8093450..256224fa 100644 --- a/polaris/utils/types.py +++ b/polaris/utils/types.py @@ -71,7 +71,7 @@ This is useful for interactions with httpx and authlib, who have their own URL types. """ -DirectionType: TypeAlias = Literal["min", "max"] +DirectionType: TypeAlias = float | Literal["min", "max"] """ The direction of any variable to be sorted. This can be used to sort the metric score, indicate the optmization direction of endpoint. From 3d2ead8a2748c2a1ea6b523db872115c6656d89e Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 25 Apr 2024 16:38:55 -0400 Subject: [PATCH 08/18] ruff --- tests/test_evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 1e788df4..1ebbdb32 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -145,4 +145,4 @@ def test_absolute_average_fold_error(): # undershoot with pytest.raises(ValueError): - aafe_4 = Metric.absolute_average_fold_error(y_true=y_zero, y_pred=y_pred_3) + Metric.absolute_average_fold_error(y_true=y_zero, y_pred=y_pred_3) From 430ab31c3a1ced32d33b440f82c0083209c9eae6 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 25 Apr 2024 16:39:24 -0400 Subject: [PATCH 09/18] Update polaris/evaluate/_metric.py Co-authored-by: Cas Wognum --- polaris/evaluate/_metric.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 361fdba6..6b2f4638 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -38,14 +38,11 @@ def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray): The implementation is based on [this paper](https://pubs.acs.org/doi/10.1021/acs.chemrestox.3c00305). Args: - y_true : array-like of shape (n_samples,) - The true target values. - y_pred : array-like of shape (n_samples,) - The predicted target values. 
+ y_true: The true target values of shape (n_samples,) + y_pred: The predicted target values of shape (n_samples,). Returns: - aafe : float - The Absolute Average Fold Error. + aafe: The Absolute Average Fold Error. """ if len(y_true) != len(y_pred): raise ValueError("Length of y_true and y_pred must be the same.") From 9ddf8c8b9bc0913c7113941d8c7cc3fcf232f49f Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 25 Apr 2024 16:39:34 -0400 Subject: [PATCH 10/18] Update polaris/evaluate/_metric.py Co-authored-by: Cas Wognum --- polaris/evaluate/_metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 6b2f4638..460e06c7 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -31,7 +31,7 @@ def spearman(y_true: np.ndarray, y_pred: np.ndarray): return stats.spearmanr(y_true, y_pred).statistic -def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray): +def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray) -> float: """ Calculate the Absolute Average Fold Error (AAFE) metric. It measures the fold change between predicted values and observed values. From a61143dcaeaa602e76ef50de91eb59e48150a34c Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Mon, 29 Apr 2024 17:48:42 -0400 Subject: [PATCH 11/18] specify y_type --- docs/api/evaluation.md | 2 +- polaris/benchmark/_base.py | 15 ++++++++----- polaris/evaluate/_metric.py | 42 ++++++++++++++++++++++++------------- tests/test_evaluate.py | 6 +++--- 4 files changed, 42 insertions(+), 23 deletions(-) diff --git a/docs/api/evaluation.md b/docs/api/evaluation.md index e69e20f9..9187a763 100644 --- a/docs/api/evaluation.md +++ b/docs/api/evaluation.md @@ -11,6 +11,6 @@ ::: polaris.evaluate.Metric options: - filters: ["!^_", "!fn", "!is_multitask"] + filters: ["!^_", "!fn", "!is_multitask", "!y_type"] --- \ No newline at end of file diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 0a8e7aff..2f1f3579 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -446,17 +446,19 @@ def evaluate(self, y_pred: PredictionsType, y_prob: PredictionsType = None) -> B for test_label, y_true_subset in y_true.items(): # For every metric... 
for metric in self.metrics: - y_pred_eval = y_prob if metric.needs_probs else y_pred - if metric.is_multitask: # Multi-task but with a metric across targets - score = metric(y_true=y_true_subset, y_pred=y_pred_eval[test_label]) + score = metric( + y_true=y_true_subset, y_pred=y_pred[test_label], y_prob=y_prob.get(test_label, None) + ) scores.loc[len(scores)] = (test_label, "aggregated", metric, score) continue if not isinstance(y_true_subset, dict): # Single task - score = metric(y_true=y_true_subset, y_pred=y_pred_eval[test_label]) + score = metric( + y_true=y_true_subset, y_pred=y_pred[test_label], y_prob=y_prob.get(test_label, None) + ) scores.loc[len(scores)] = ( test_label, self.target_cols[0], @@ -472,7 +474,10 @@ def evaluate(self, y_pred: PredictionsType, y_prob: PredictionsType = None) -> B mask = ~np.isnan(y_true_target) score = metric( y_true=y_true_target[mask], - y_pred=y_pred_eval[test_label][target_label][mask], + y_pred=y_pred[test_label][target_label][mask], + y_prob=y_prob[test_label][target_label][mask] + if y_prob[test_label] is not None + else None, ) scores.loc[len(scores)] = (test_label, target_label, metric, score) diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 460e06c7..9d589733 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -1,5 +1,5 @@ from enum import Enum -from typing import Callable +from typing import Callable, Literal, Optional import numpy as np from pydantic import BaseModel, Field @@ -7,7 +7,7 @@ from sklearn.metrics import ( accuracy_score, average_precision_score, - cohen_kappa_score, + cohen_kappa_score as sk_cohen_kappa_score, explained_variance_score, f1_score, matthews_corrcoef, @@ -55,6 +55,11 @@ def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray) -> float return aafe +def cohen_kappa_score(y_true, y_pred, **kwargs): + """Scikit learn cohen_kappa_score wraper with renamed arguments""" + return sk_cohen_kappa_score(y1=y_true, y2=y_pred, **kwargs) + + class MetricInfo(BaseModel): """ Metric metadata @@ -70,7 +75,7 @@ class MetricInfo(BaseModel): is_multitask: bool = False kwargs: dict = Field(default_factory=dict) direction: DirectionType - needs_probs: bool = False + y_type: Literal["y_pred", "y_prob", "y_score"] = "y_pred" class Metric(Enum): @@ -98,21 +103,20 @@ class Metric(Enum): balanced_accuracy = MetricInfo(fn=balanced_accuracy_score, direction="max") mcc = MetricInfo(fn=matthews_corrcoef, direction="max") cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max") - pr_auc = MetricInfo(fn=average_precision_score, direction="max", needs_probs=True) + pr_auc = MetricInfo(fn=average_precision_score, direction="max", y_type="y_score") # binary only f1 = MetricInfo(fn=f1_score, kwargs={"average": "binary"}, direction="max") - # note: At the moment, multi-dimension inputs for classification are not supported - roc_auc = MetricInfo(fn=roc_auc_score, direction="max", needs_probs=True) + roc_auc = MetricInfo(fn=roc_auc_score, direction="max", y_type="y_score") # multiclass tasks only f1_macro = MetricInfo(fn=f1_score, kwargs={"average": "macro"}, direction="max") f1_micro = MetricInfo(fn=f1_score, kwargs={"average": "micro"}, direction="max") roc_auc_ovr = MetricInfo( - fn=roc_auc_score, kwargs={"multi_class": "ovr"}, direction="max", needs_probs=True + fn=roc_auc_score, kwargs={"multi_class": "ovr"}, direction="max", y_type="y_score" ) roc_auc_ovo = MetricInfo( - fn=roc_auc_score, kwargs={"multi_class": "ovo"}, direction="max", needs_probs=True + fn=roc_auc_score, 
kwargs={"multi_class": "ovo"}, direction="max", y_type="y_score" ) @property @@ -126,11 +130,11 @@ def is_multitask(self) -> bool: return self.value.is_multitask @property - def needs_probs(self) -> bool: + def y_type(self) -> bool: """Whether the metric expects preditive probablities.""" - return self.value.needs_probs + return self.value.y_type - def score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float: + def score(self, y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optional[np.ndarray] = None) -> float: """Endpoint for computing the metric. For convenience, calling a `Metric` will result in this method being called. @@ -140,8 +144,18 @@ def score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float: assert metric.score(y_true=first, y_pred=second) == metric(y_true=first, y_pred=second) ``` """ - return self.fn(y_true, y_pred, **self.value.kwargs) + # return self.fn(y_true, y_pred, **self.value.kwargs) + if y_pred is None and y_prob is None: + raise ValueError("Neither `y_pred` nor `y_prob` is specified.") + + if self.y_type == "y_pred": + pred = y_pred + else: + pred = y_prob + + kwargs = {"y_true": y_true, self.y_type: pred} + return self.fn(**kwargs, **self.value.kwargs) - def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float: + def __call__(self, y_true: np.ndarray, y_pred: np.ndarray, y_prob: np.ndarray = None) -> float: """For convenience, make metrics callable""" - return self.score(y_true, y_pred) + return self.score(y_true, y_pred, y_prob) diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 1ebbdb32..3134dfe0 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -74,7 +74,7 @@ def test_metrics_singletask_clf( _, test = test_single_task_benchmark_clf.get_train_test_split() predictions = np.random.randint(2, size=test.inputs.shape[0]) probabilities = np.random.uniform(size=test.inputs.shape[0]) - result = test_single_task_benchmark_clf.evaluate(predictions, probabilities) + result = test_single_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities) for metric in test_single_task_benchmark_clf.metrics: assert metric in result.results.Metric.tolist() @@ -86,7 +86,7 @@ def test_metrics_singletask_multicls_clf( predictions = np.random.randint(3, size=test.inputs.shape[0]) probablities = np.random.random(size=(test.inputs.shape[0], 3)) probablities = probablities / probablities.sum(axis=1, keepdims=True) - result = test_single_task_benchmark_multi_clf.evaluate(predictions, probablities) + result = test_single_task_benchmark_multi_clf.evaluate(y_pred=predictions, y_prob=probablities) for metric in test_single_task_benchmark_multi_clf.metrics: assert metric in result.results.Metric.tolist() @@ -99,7 +99,7 @@ def test_metrics_multitask_clf(tmpdir: str, test_multi_task_benchmark_clf: Multi probabilities = { target_col: np.random.uniform(size=test.inputs.shape[0]) for target_col in train.target_cols } - result = test_multi_task_benchmark_clf.evaluate(predictions, probabilities) + result = test_multi_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities) assert isinstance(result.results, pd.DataFrame) assert set(result.results.columns) == { "Test set", From acac572147cee50c962dd0bfe3ba8b57732eb9b3 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 30 Apr 2024 10:08:15 -0400 Subject: [PATCH 12/18] add todos --- polaris/benchmark/_base.py | 1 - polaris/evaluate/_metric.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 
2f1f3579..c3f82da4 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -409,7 +409,6 @@ def evaluate(self, y_pred: PredictionsType, y_prob: PredictionsType = None) -> B If there are multiple test sets, the predictions should be further wrapped in a dictionary with the test subset labels as keys. y_prob: The predicted probabilities for the test set, as NumPy arrays. - Currently only multiclass in singletask setting is supported. Returns: A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub. diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 9d589733..6aa2dc7b 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -88,7 +88,7 @@ class Metric(Enum): # TODO (cwognum): # - Any preprocessing needed? For example changing the shape / dtype? Converting from torch tensors or lists? - + # regression mean_absolute_error = MetricInfo(fn=mean_absolute_error, direction="min") mean_squared_error = MetricInfo(fn=mean_squared_error, direction="min") @@ -118,6 +118,7 @@ class Metric(Enum): roc_auc_ovo = MetricInfo( fn=roc_auc_score, kwargs={"multi_class": "ovo"}, direction="max", y_type="y_score" ) + # TODO: add metrics to handle multitask multiclass predictions. @property def fn(self) -> Callable: From 355bd67724796ea543f7a13514852b3032fe0655 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 30 Apr 2024 10:10:12 -0400 Subject: [PATCH 13/18] format --- polaris/evaluate/_metric.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 6aa2dc7b..eff562c6 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -88,7 +88,7 @@ class Metric(Enum): # TODO (cwognum): # - Any preprocessing needed? For example changing the shape / dtype? Converting from torch tensors or lists? - + # regression mean_absolute_error = MetricInfo(fn=mean_absolute_error, direction="min") mean_squared_error = MetricInfo(fn=mean_squared_error, direction="min") @@ -118,7 +118,7 @@ class Metric(Enum): roc_auc_ovo = MetricInfo( fn=roc_auc_score, kwargs={"multi_class": "ovo"}, direction="max", y_type="y_score" ) - # TODO: add metrics to handle multitask multiclass predictions. + # TODO: add metrics to handle multitask multiclass predictions. @property def fn(self) -> Callable: From d7bb39fd3af545fd0419e4a4955bca201e07dafc Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 30 Apr 2024 12:57:42 -0400 Subject: [PATCH 14/18] add more tests --- polaris/benchmark/_base.py | 2 +- polaris/evaluate/_metric.py | 11 ++++++++--- tests/test_evaluate.py | 37 ++++++++++++++++++++++++++++++++++++- 3 files changed, 45 insertions(+), 5 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index c3f82da4..145ba515 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -388,7 +388,7 @@ def _get_subset(indices, hide_targets): return train, test - def evaluate(self, y_pred: PredictionsType, y_prob: PredictionsType = None) -> BenchmarkResults: + def evaluate(self, y_pred: PredictionsType = None, y_prob: PredictionsType = None) -> BenchmarkResults: """Execute the evaluation protocol for the benchmark, given a set of predictions. info: What about `y_true`? 
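With `y_pred` now optional, a benchmark whose metrics all consume probabilities (e.g. `roc_auc`, `pr_auc`) can be scored from `y_prob` alone, while any label-based metric still requires `y_pred`. A hedged usage sketch — the `benchmark` object and the array sizes are placeholders, not part of this patch set:

import numpy as np

# Hypothetical single-task binary classification benchmark with 10 test rows.
y_pred = np.random.randint(2, size=10)   # hard labels, consumed by accuracy, f1, mcc, ...
y_prob = np.random.uniform(size=10)      # positive-class probabilities, consumed by roc_auc, pr_auc

# Passing both lets each metric pick the input it needs via `MetricInfo.y_type`:
# results = benchmark.evaluate(y_pred=y_pred, y_prob=y_prob)

# Passing only `y_prob` works as long as no configured metric has y_type == "y_pred";
# otherwise `Metric.score` raises a ValueError saying the metric requires `y_pred` input:
# results = benchmark.evaluate(y_prob=y_prob)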
diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index eff562c6..0f253a0e 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -135,7 +135,9 @@ def y_type(self) -> bool: """Whether the metric expects preditive probablities.""" return self.value.y_type - def score(self, y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optional[np.ndarray] = None) -> float: + def score( + self, y_true: np.ndarray, y_pred: np.ndarray = None, y_prob: Optional[np.ndarray] = None + ) -> float: """Endpoint for computing the metric. For convenience, calling a `Metric` will result in this method being called. @@ -145,18 +147,21 @@ def score(self, y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optional[np.ndar assert metric.score(y_true=first, y_pred=second) == metric(y_true=first, y_pred=second) ``` """ - # return self.fn(y_true, y_pred, **self.value.kwargs) if y_pred is None and y_prob is None: raise ValueError("Neither `y_pred` nor `y_prob` is specified.") if self.y_type == "y_pred": + if y_pred is None: + raise ValueError(f"{self} requires `y_pred` input. ") pred = y_pred else: + if y_prob is None: + raise ValueError(f"{self} requires `y_prob` input. ") pred = y_prob kwargs = {"y_true": y_true, self.y_type: pred} return self.fn(**kwargs, **self.value.kwargs) - def __call__(self, y_true: np.ndarray, y_pred: np.ndarray, y_prob: np.ndarray = None) -> float: + def __call__(self, y_true: np.ndarray, y_pred: np.ndarray = None, y_prob: np.ndarray = None) -> float: """For convenience, make metrics callable""" return self.score(y_true, y_pred, y_prob) diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 3134dfe0..f5fb7235 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -11,6 +11,7 @@ from polaris.evaluate._metric import Metric from polaris.evaluate._results import BenchmarkResults from polaris.utils.types import HubOwner +from polaris.dataset import Dataset def test_result_to_json(tmpdir: str, test_user_owner: HubOwner): @@ -143,6 +144,40 @@ def test_absolute_average_fold_error(): aafe_3 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_pred_3) assert aafe_3 < 1 - # undershoot + # y_true contains zeros with pytest.raises(ValueError): Metric.absolute_average_fold_error(y_true=y_zero, y_pred=y_pred_3) + + +def test_metric_y_types( + tmpdir: str, test_single_task_benchmark_clf: SingleTaskBenchmarkSpecification, test_data: Dataset +): + # here we use train split for testing purpose. + _, test = test_single_task_benchmark_clf.get_train_test_split() + predictions = np.random.randint(2, size=test.inputs.shape[0]) + probabilities = np.random.uniform(size=test.inputs.shape[0]) + test_y = test_data.loc[test.indices, "CLASS_expt"] + + # If y_pred is None and y_prob is None, an error is thrown. + with pytest.raises(ValueError, match="Neither `y_pred` nor `y_prob` is specified."): + test_single_task_benchmark_clf.evaluate() + + # If y_type == "y_pred" and y_pred is None, an error is thrown. + with pytest.raises(ValueError, match="Metric.accuracy requires `y_pred` input"): + test_single_task_benchmark_clf.metrics = [Metric.accuracy] + test_single_task_benchmark_clf.evaluate(y_prob=probabilities) + + # If y_type != "y_pred" and y_prob is None, an error is thrown. 
+ with pytest.raises(ValueError, match="Metric.roc_auc requires `y_prob` input"): + test_single_task_benchmark_clf.metrics = [Metric.roc_auc] + test_single_task_benchmark_clf.evaluate(y_pred=predictions) + + # If y_type != "y_pred" and y_pred is not None and y_prob is not None, it uses y_prob as expected! + test_single_task_benchmark_clf.metrics = [Metric.roc_auc] + result = test_single_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities) + assert result.results.Score.values[0] == Metric.roc_auc(y_true=test_y, y_prob=probabilities) + + # If y_type == "y_pred" and y_pred is not None and y_prob is not None, it uses y_pred as expected! + test_single_task_benchmark_clf.metrics = [Metric.f1] + result = test_single_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities) + assert result.results.Score.values[0] == Metric.f1(y_true=test_y, y_pred=predictions) From 0216ff6a6ce4fbe27a0e55984bf75ec94c3afb74 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 30 Apr 2024 13:00:04 -0400 Subject: [PATCH 15/18] Update polaris/benchmark/_base.py Co-authored-by: Cas Wognum --- polaris/benchmark/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 145ba515..3df5e0ea 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -448,7 +448,7 @@ def evaluate(self, y_pred: PredictionsType = None, y_prob: PredictionsType = Non if metric.is_multitask: # Multi-task but with a metric across targets score = metric( - y_true=y_true_subset, y_pred=y_pred[test_label], y_prob=y_prob.get(test_label, None) + y_true=y_true_subset, y_pred=y_pred[test_label], y_prob=y_prob.get(test_label) ) scores.loc[len(scores)] = (test_label, "aggregated", metric, score) continue From e0aa6c256ddee6c16f0377961b83a35245af57bd Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 30 Apr 2024 14:10:46 -0400 Subject: [PATCH 16/18] more tests --- polaris/benchmark/_base.py | 10 ++++++---- tests/conftest.py | 20 ++++++++++++++++++++ tests/test_integration.py | 28 +++++++++++++++++++++++++++- 3 files changed, 53 insertions(+), 5 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 3df5e0ea..3c18d875 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -433,7 +433,7 @@ def evaluate(self, y_pred: PredictionsType = None, y_prob: PredictionsType = Non if not isinstance(y_prob, dict) or all(k in self.target_cols for k in y_prob): y_prob = {"test": y_prob} - if any(k not in y_pred for k in test.keys()): + if any(k not in y_pred for k in test.keys()) and any(k not in y_prob for k in test.keys()): raise KeyError( f"Missing keys for at least one of the test sets. 
Expecting: {sorted(test.keys())}" ) @@ -448,7 +448,7 @@ def evaluate(self, y_pred: PredictionsType = None, y_prob: PredictionsType = Non if metric.is_multitask: # Multi-task but with a metric across targets score = metric( - y_true=y_true_subset, y_pred=y_pred[test_label], y_prob=y_prob.get(test_label) + y_true=y_true_subset, y_pred=y_pred.get(test_label), y_prob=y_prob.get(test_label) ) scores.loc[len(scores)] = (test_label, "aggregated", metric, score) continue @@ -456,7 +456,7 @@ def evaluate(self, y_pred: PredictionsType = None, y_prob: PredictionsType = Non if not isinstance(y_true_subset, dict): # Single task score = metric( - y_true=y_true_subset, y_pred=y_pred[test_label], y_prob=y_prob.get(test_label, None) + y_true=y_true_subset, y_pred=y_pred.get(test_label), y_prob=y_prob.get(test_label) ) scores.loc[len(scores)] = ( test_label, @@ -473,7 +473,9 @@ def evaluate(self, y_pred: PredictionsType = None, y_prob: PredictionsType = Non mask = ~np.isnan(y_true_target) score = metric( y_true=y_true_target[mask], - y_pred=y_pred[test_label][target_label][mask], + y_pred=y_pred[test_label][target_label][mask] + if y_pred[test_label] is not None + else None, y_prob=y_prob[test_label][target_label][mask] if y_prob[test_label] is not None else None, diff --git a/tests/conftest.py b/tests/conftest.py index 545cb26c..aa02c3f7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -185,6 +185,26 @@ def test_single_task_benchmark_multiple_test_sets(test_dataset): return benchmark +@pytest.fixture(scope="function") +def test_single_task_benchmark_clf_multiple_test_sets(test_dataset): + np.random.seed(111) # make sure two classes in `y_true` + indices = np.arange(100) + np.random.shuffle(indices) + train_indices = indices[:80] + test_indices = {"test_1": indices[80:90], "test_2": indices[90:]} + benchmark = SingleTaskBenchmarkSpecification( + name="single-task-benchmark-clf", + dataset=test_dataset, + metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa"], + main_metric="pr_auc", + split=(train_indices, test_indices), + target_cols="CLASS_calc", + input_cols="smiles", + ) + check_version(benchmark) + return benchmark + + @pytest.fixture(scope="function") def test_multi_task_benchmark(test_dataset): # For the sake of simplicity, just use a small set of indices diff --git a/tests/test_integration.py b/tests/test_integration.py index 7b96a4e3..f0de3fcc 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,6 +1,7 @@ import datamol as dm import numpy as np -from sklearn.ensemble import RandomForestRegressor +from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier +from sklearn.svm import SVC from polaris.evaluate import BenchmarkResults @@ -41,6 +42,31 @@ def test_single_task_benchmark_loop_with_multiple_test_sets(test_single_task_ben assert isinstance(scores, BenchmarkResults) +def test_single_task_benchmark_clf_loop_with_multiple_test_sets( + test_single_task_benchmark_clf_multiple_test_sets, +): + """Tests the integrated API for a single-task benchmark for classification probabilities with multiple test sets.""" + train, test = test_single_task_benchmark_clf_multiple_test_sets.get_train_test_split() + + smiles, y = train.as_array("xy") + + x_train = np.array([dm.to_fp(dm.to_mol(smi)) for smi in smiles]) + + model = RandomForestClassifier() + model.fit(X=x_train, y=y) + + y_prob = {} + y_pred = {} + for k, test_subset in test.items(): + print(k, test_subset) + x_test = np.array([dm.to_fp(dm.to_mol(smi)) for smi in test_subset.inputs]) + 
y_prob[k] = model.predict_proba(x_test)[:, :1] # for binary classification + y_pred[k] = model.predict(x_test) + + scores = test_single_task_benchmark_clf_multiple_test_sets.evaluate(y_prob=y_prob, y_pred=y_pred) + assert isinstance(scores, BenchmarkResults) + + def test_multi_task_benchmark_loop(test_multi_task_benchmark): """Tests the integrated API for a multi-task benchmark.""" train, test = test_multi_task_benchmark.get_train_test_split() From 6985976e8dc43be20ec10d04ca7b5d9a9b057e9d Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 30 Apr 2024 14:12:55 -0400 Subject: [PATCH 17/18] lint --- tests/test_integration.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index f0de3fcc..5d9a983b 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,7 +1,6 @@ import datamol as dm import numpy as np from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier -from sklearn.svm import SVC from polaris.evaluate import BenchmarkResults From 0848f1c23637b4343f8f2f259a1f1491ec83f91a Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 30 Apr 2024 14:42:43 -0400 Subject: [PATCH 18/18] minor changes --- polaris/benchmark/_base.py | 4 +++- polaris/evaluate/_metric.py | 6 ++++-- tests/test_evaluate.py | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 3c18d875..70fd02da 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -388,7 +388,9 @@ def _get_subset(indices, hide_targets): return train, test - def evaluate(self, y_pred: PredictionsType = None, y_prob: PredictionsType = None) -> BenchmarkResults: + def evaluate( + self, y_pred: Optional[PredictionsType] = None, y_prob: Optional[PredictionsType] = None + ) -> BenchmarkResults: """Execute the evaluation protocol for the benchmark, given a set of predictions. info: What about `y_true`? diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 0f253a0e..eee9cd27 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -136,7 +136,7 @@ def y_type(self) -> bool: return self.value.y_type def score( - self, y_true: np.ndarray, y_pred: np.ndarray = None, y_prob: Optional[np.ndarray] = None + self, y_true: np.ndarray, y_pred: Optional[np.ndarray] = None, y_prob: Optional[np.ndarray] = None ) -> float: """Endpoint for computing the metric. @@ -162,6 +162,8 @@ def score( kwargs = {"y_true": y_true, self.y_type: pred} return self.fn(**kwargs, **self.value.kwargs) - def __call__(self, y_true: np.ndarray, y_pred: np.ndarray = None, y_prob: np.ndarray = None) -> float: + def __call__( + self, y_true: np.ndarray, y_pred: Optional[np.ndarray] = None, y_prob: Optional[np.ndarray] = None + ) -> float: """For convenience, make metrics callable""" return self.score(y_true, y_pred, y_prob) diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index f5fb7235..ecefa97c 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -175,9 +175,9 @@ def test_metric_y_types( # If y_type != "y_pred" and y_pred is not None and y_prob is not None, it uses y_prob as expected! 
test_single_task_benchmark_clf.metrics = [Metric.roc_auc] result = test_single_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities) - assert result.results.Score.values[0] == Metric.roc_auc(y_true=test_y, y_prob=probabilities) + assert result.results.Score.values[0] == Metric.roc_auc.fn(y_true=test_y, y_score=probabilities) # If y_type == "y_pred" and y_pred is not None and y_prob is not None, it uses y_pred as expected! test_single_task_benchmark_clf.metrics = [Metric.f1] result = test_single_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities) - assert result.results.Score.values[0] == Metric.f1(y_true=test_y, y_pred=predictions) + assert result.results.Score.values[0] == Metric.f1.fn(y_true=test_y, y_pred=predictions)
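Taken together, the series routes hard predictions and probabilities to the underlying scikit-learn functions through `MetricInfo.y_type`. A minimal sketch of that routing at the `Metric` level, using made-up labels and scores rather than values from the tests:

import numpy as np

from polaris.evaluate._metric import Metric

y_true = np.array([0, 1, 1, 0])
y_pred = np.array([0, 1, 0, 0])          # hard class labels
y_prob = np.array([0.1, 0.8, 0.4, 0.3])  # positive-class probabilities

# y_type == "y_pred": the label input is forwarded to sklearn as `y_pred`.
acc = Metric.accuracy(y_true=y_true, y_pred=y_pred)

# y_type == "y_score": the probabilities are forwarded to sklearn as `y_score`.
auc = Metric.roc_auc(y_true=y_true, y_prob=y_prob)

# Supplying neither input raises a ValueError, as exercised in test_metric_y_types above.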