Merged
4 changes: 3 additions & 1 deletion docs/api/evaluation.md
@@ -5,10 +5,12 @@

::: polaris.evaluate.MetricInfo

::: polaris.evaluate._metric.absolute_average_fold_error

---

::: polaris.evaluate.Metric
options:
filters: ["!^_", "!fn", "!is_multitask"]
filters: ["!^_", "!fn", "!is_multitask", "!y_type"]

---
25 changes: 20 additions & 5 deletions polaris/benchmark/_base.py
@@ -388,7 +388,9 @@ def _get_subset(indices, hide_targets):

return train, test

def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
def evaluate(
self, y_pred: Optional[PredictionsType] = None, y_prob: Optional[PredictionsType] = None
) -> BenchmarkResults:
"""Execute the evaluation protocol for the benchmark, given a set of predictions.

info: What about `y_true`?
@@ -408,6 +410,7 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys.
If there are multiple test sets, the predictions should be further wrapped in a dictionary
with the test subset labels as keys.
y_prob: The predicted probabilities for the test set, as NumPy arrays.

Returns:
A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub.
@@ -429,7 +432,10 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
if not isinstance(y_pred, dict) or all(k in self.target_cols for k in y_pred):
y_pred = {"test": y_pred}

if any(k not in y_pred for k in test.keys()):
if not isinstance(y_prob, dict) or all(k in self.target_cols for k in y_prob):
y_prob = {"test": y_prob}

if any(k not in y_pred for k in test.keys()) and any(k not in y_prob for k in test.keys()):
raise KeyError(
f"Missing keys for at least one of the test sets. Expecting: {sorted(test.keys())}"
)
@@ -443,13 +449,17 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
for metric in self.metrics:
if metric.is_multitask:
# Multi-task but with a metric across targets
score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
score = metric(
y_true=y_true_subset, y_pred=y_pred.get(test_label), y_prob=y_prob.get(test_label)
)
scores.loc[len(scores)] = (test_label, "aggregated", metric, score)
continue

if not isinstance(y_true_subset, dict):
# Single task
score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
score = metric(
y_true=y_true_subset, y_pred=y_pred.get(test_label), y_prob=y_prob.get(test_label)
)
scores.loc[len(scores)] = (
test_label,
self.target_cols[0],
@@ -465,7 +475,12 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
mask = ~np.isnan(y_true_target)
score = metric(
y_true=y_true_target[mask],
y_pred=y_pred[test_label][target_label][mask],
y_pred=y_pred[test_label][target_label][mask]
if y_pred[test_label] is not None
else None,
y_prob=y_prob[test_label][target_label][mask]
if y_prob[test_label] is not None
else None,
)
scores.loc[len(scores)] = (test_label, target_label, metric, score)

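For reference, a minimal usage sketch of the extended signature (the benchmark name, the random predictions, and the assumption that the test subset reports its size via `len(test)` are illustrative, not part of this diff):

```python
import numpy as np
import polaris as po

# Illustrative benchmark name; any binary-classification benchmark with a single test set applies.
benchmark = po.load_benchmark("org/some-classification-benchmark")
train, test = benchmark.get_train_test_split()

rng = np.random.default_rng(0)
y_prob = rng.random(len(test))       # positive-class "probabilities" -> roc_auc, pr_auc
y_pred = (y_prob > 0.5).astype(int)  # hard labels -> accuracy, f1, mcc, cohen_kappa, ...

# Metrics declared with y_type="y_pred" read y_pred; y_type="y_prob"/"y_score" metrics read y_prob.
results = benchmark.evaluate(y_pred=y_pred, y_prob=y_prob)
```

With multiple targets or multiple test sets, both arguments take the nested dictionary shape described in the docstring above; a sketch for the multiple-test-set case follows the conftest.py changes at the end of this diff.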
91 changes: 78 additions & 13 deletions polaris/evaluate/_metric.py
@@ -1,20 +1,21 @@
from enum import Enum
from typing import Callable
from typing import Callable, Literal, Optional

import numpy as np
from pydantic import BaseModel, Field
from scipy import stats
from sklearn.metrics import (
accuracy_score,
average_precision_score,
cohen_kappa_score,
cohen_kappa_score as sk_cohen_kappa_score,
explained_variance_score,
f1_score,
matthews_corrcoef,
mean_absolute_error,
mean_squared_error,
r2_score,
roc_auc_score,
balanced_accuracy_score,
)

from polaris.utils.types import DirectionType
@@ -30,6 +31,35 @@ def spearman(y_true: np.ndarray, y_pred: np.ndarray):
return stats.spearmanr(y_true, y_pred).statistic


def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray) -> float:
"""
Calculate the Absolute Average Fold Error (AAFE) metric.
It measures the fold change between predicted values and observed values.
The implementation is based on [this paper](https://pubs.acs.org/doi/10.1021/acs.chemrestox.3c00305).

Args:
y_true: The true target values of shape (n_samples,)
y_pred: The predicted target values of shape (n_samples,).

Returns:
aafe: The Absolute Average Fold Error.
"""
if len(y_true) != len(y_pred):
raise ValueError("Length of y_true and y_pred must be the same.")

if np.any(y_true == 0):
raise ValueError("`y_true` contains zero which will result `Inf` value.")

aafe = np.mean(np.abs(y_pred) / np.abs(y_true))

return aafe


def cohen_kappa_score(y_true, y_pred, **kwargs):
"""Scikit learn cohen_kappa_score wraper with renamed arguments"""
return sk_cohen_kappa_score(y1=y_true, y2=y_pred, **kwargs)


class MetricInfo(BaseModel):
"""
Metric metadata
@@ -45,6 +75,7 @@ class MetricInfo(BaseModel):
is_multitask: bool = False
kwargs: dict = Field(default_factory=dict)
direction: DirectionType
y_type: Literal["y_pred", "y_prob", "y_score"] = "y_pred"


class Metric(Enum):
@@ -65,17 +96,29 @@ class Metric(Enum):
pearsonr = MetricInfo(fn=pearsonr, direction="max")
spearmanr = MetricInfo(fn=spearman, direction="max")
explained_var = MetricInfo(fn=explained_variance_score, direction="max")
absolute_average_fold_error = MetricInfo(fn=absolute_average_fold_error, direction=1)

# classification
# binary and multiclass classification
accuracy = MetricInfo(fn=accuracy_score, direction="max")
balanced_accuracy = MetricInfo(fn=balanced_accuracy_score, direction="max")
mcc = MetricInfo(fn=matthews_corrcoef, direction="max")
cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max")
pr_auc = MetricInfo(fn=average_precision_score, direction="max", y_type="y_score")

# binary only
f1 = MetricInfo(fn=f1_score, kwargs={"average": "binary"}, direction="max")
roc_auc = MetricInfo(fn=roc_auc_score, direction="max", y_type="y_score")

# multiclass tasks only
f1_macro = MetricInfo(fn=f1_score, kwargs={"average": "macro"}, direction="max")
f1_micro = MetricInfo(fn=f1_score, kwargs={"average": "micro"}, direction="max")
roc_auc = MetricInfo(fn=roc_auc_score, direction="max")
pr_auc = MetricInfo(fn=average_precision_score, direction="max")
mcc = MetricInfo(fn=matthews_corrcoef, direction="max")
cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max")
# TODO: adding metrics for multiclass tasks
roc_auc_ovr = MetricInfo(
fn=roc_auc_score, kwargs={"multi_class": "ovr"}, direction="max", y_type="y_score"
)
roc_auc_ovo = MetricInfo(
fn=roc_auc_score, kwargs={"multi_class": "ovo"}, direction="max", y_type="y_score"
)
# TODO: add metrics to handle multitask multiclass predictions.

@property
def fn(self) -> Callable:
@@ -87,7 +130,14 @@ def is_multitask(self) -> bool:
"""Whether the metric expects a single set of predictions or a dict of predictions."""
return self.value.is_multitask

def score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
@property
def y_type(self) -> str:
"""The type of input the metric consumes: `y_pred`, `y_prob`, or `y_score`."""
return self.value.y_type

def score(
self, y_true: np.ndarray, y_pred: Optional[np.ndarray] = None, y_prob: Optional[np.ndarray] = None
) -> float:
"""Endpoint for computing the metric.

For convenience, calling a `Metric` will result in this method being called.
@@ -97,8 +147,23 @@ def score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
assert metric.score(y_true=first, y_pred=second) == metric(y_true=first, y_pred=second)
```
"""
return self.fn(y_true, y_pred, **self.value.kwargs)

def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
if y_pred is None and y_prob is None:
raise ValueError("Neither `y_pred` nor `y_prob` is specified.")

if self.y_type == "y_pred":
if y_pred is None:
raise ValueError(f"{self} requires `y_pred` input. ")
pred = y_pred
else:
if y_prob is None:
raise ValueError(f"{self} requires `y_prob` input. ")
pred = y_prob

kwargs = {"y_true": y_true, self.y_type: pred}
return self.fn(**kwargs, **self.value.kwargs)

def __call__(
self, y_true: np.ndarray, y_pred: Optional[np.ndarray] = None, y_prob: Optional[np.ndarray] = None
) -> float:
"""For convenience, make metrics callable"""
return self.score(y_true, y_pred)
return self.score(y_true, y_pred, y_prob)
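As a quick sanity sketch of the new dispatch (not part of the diff): each `Metric` declares which input it consumes via `y_type`, and `score` forwards `y_pred` or `y_prob` under the keyword the wrapped scikit-learn function expects.

```python
import numpy as np
from polaris.evaluate import Metric

y_true = np.array([0, 1, 1, 0, 1])
y_pred = np.array([0, 1, 0, 0, 1])            # hard labels
y_prob = np.array([0.2, 0.9, 0.4, 0.3, 0.8])  # positive-class probabilities

Metric.accuracy(y_true=y_true, y_pred=y_pred)  # y_type == "y_pred"
Metric.roc_auc(y_true=y_true, y_prob=y_prob)   # y_type == "y_score"; forwarded to roc_auc_score as y_score
# Metric.roc_auc(y_true=y_true, y_pred=y_pred) would raise ValueError: roc_auc requires `y_prob`.

# AAFE is mean(|y_pred| / |y_true|); 1 is the perfect score, hence direction=1 above.
Metric.absolute_average_fold_error(
    y_true=np.array([1.0, 2.0, 4.0]), y_pred=np.array([2.0, 2.0, 2.0])
)  # (2 + 1 + 0.5) / 3 ≈ 1.17
```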
2 changes: 1 addition & 1 deletion polaris/utils/types.py
@@ -71,7 +71,7 @@
This is useful for interactions with httpx and authlib, who have their own URL types.
"""

DirectionType: TypeAlias = Literal["min", "max"]
DirectionType: TypeAlias = float | Literal["min", "max"]
"""
The direction of any variable to be sorted.
This can be used to sort metric scores or to indicate the optimization direction of an endpoint.
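For illustration (a sketch, not part of the diff), the widened alias now admits a numeric optimum alongside the two sort directions:

```python
from polaris.utils.types import DirectionType

lower_is_better: DirectionType = "min"   # e.g. mean_absolute_error
higher_is_better: DirectionType = "max"  # e.g. roc_auc
optimum_at_one: DirectionType = 1.0      # e.g. absolute_average_fold_error, where 1 means no fold error
```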
58 changes: 57 additions & 1 deletion tests/conftest.py
@@ -23,6 +23,8 @@ def test_data():
# set an arbitrary threshold for testing purposes.
data["CLASS_expt"] = data["expt"].gt(0).astype(int).values
data["CLASS_calc"] = data["calc"].gt(0).astype(int).values
data["MULTICLASS_expt"] = np.random.randint(low=0, high=3, size=data.shape[0])
data["MULTICLASS_calc"] = np.random.randint(low=0, high=3, size=data.shape[0])
return data


@@ -99,6 +101,7 @@ def test_single_task_benchmark(test_dataset):
"spearmanr",
"pearsonr",
"explained_var",
"absolute_average_fold_error",
],
main_metric="mean_absolute_error",
split=(train_indices, test_indices),
@@ -117,7 +120,7 @@ def test_single_task_benchmark_clf(test_dataset):
name="single-task-benchmark",
dataset=test_dataset,
main_metric="accuracy",
metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa"],
metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa", "balanced_accuracy"],
split=(train_indices, test_indices),
target_cols="CLASS_expt",
input_cols="smiles",
@@ -126,6 +129,37 @@ def test_single_task_benchmark_clf(test_dataset):
return benchmark


@pytest.fixture(scope="function")
def test_single_task_benchmark_multi_clf(test_dataset):
np.random.seed(111)
indices = np.arange(100)
np.random.shuffle(indices)
train_indices = indices[:80]
test_indices = indices[80:]

benchmark = SingleTaskBenchmarkSpecification(
name="single-task-benchmark",
dataset=test_dataset,
main_metric="accuracy",
metrics=[
"accuracy",
"balanced_accuracy",
"mcc",
"cohen_kappa",
"f1_macro",
"f1_micro",
"roc_auc_ovr",
"roc_auc_ovo",
"pr_auc",
],
split=(train_indices, test_indices),
target_cols="MULTICLASS_expt",
input_cols="smiles",
)
check_version(benchmark)
return benchmark


@pytest.fixture(scope="function")
def test_single_task_benchmark_multiple_test_sets(test_dataset):
train_indices = list(range(90))
@@ -140,6 +174,7 @@ def test_single_task_benchmark_multiple_test_sets(test_dataset):
"spearmanr",
"pearsonr",
"explained_var",
"absolute_average_fold_error",
],
main_metric="r2",
split=(train_indices, test_indices),
@@ -150,6 +185,26 @@ def test_single_task_benchmark_multiple_test_sets(test_dataset):
return benchmark


@pytest.fixture(scope="function")
def test_single_task_benchmark_clf_multiple_test_sets(test_dataset):
np.random.seed(111)  # make sure both classes appear in `y_true`
indices = np.arange(100)
np.random.shuffle(indices)
train_indices = indices[:80]
test_indices = {"test_1": indices[80:90], "test_2": indices[90:]}
benchmark = SingleTaskBenchmarkSpecification(
name="single-task-benchmark-clf",
dataset=test_dataset,
metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa"],
main_metric="pr_auc",
split=(train_indices, test_indices),
target_cols="CLASS_calc",
input_cols="smiles",
)
check_version(benchmark)
return benchmark


@pytest.fixture(scope="function")
def test_multi_task_benchmark(test_dataset):
# For the sake of simplicity, just use a small set of indices
@@ -166,6 +221,7 @@ def test_multi_task_benchmark(test_dataset):
"spearmanr",
"pearsonr",
"explained_var",
"absolute_average_fold_error",
],
split=(train_indices, test_indices),
target_cols=["expt", "calc"],
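To close the loop, a hedged sketch of a test that could exercise the new multiple-test-set classification fixture with the dict-shaped `y_pred`/`y_prob` accepted by `evaluate` (the test name, the random predictions, and the assumption that `get_train_test_split` returns the test sets as a dict of subsets are illustrative, not part of this diff):

```python
import numpy as np


def test_evaluate_clf_multiple_test_sets(test_single_task_benchmark_clf_multiple_test_sets):
    benchmark = test_single_task_benchmark_clf_multiple_test_sets
    _, test = benchmark.get_train_test_split()  # assumed: dict of subsets, one per test set

    rng = np.random.default_rng(0)
    y_prob = {label: rng.random(len(subset)) for label, subset in test.items()}
    y_pred = {label: (probs > 0.5).astype(int) for label, probs in y_prob.items()}

    # Probability metrics (roc_auc, pr_auc) read y_prob; label metrics read y_pred.
    benchmark.evaluate(y_pred=y_pred, y_prob=y_prob)
```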