Merged
4 changes: 3 additions & 1 deletion docs/api/evaluation.md
@@ -5,10 +5,12 @@

::: polaris.evaluate.MetricInfo

::: polaris.evaluate._metric.absolute_average_fold_error

---

::: polaris.evaluate.Metric
options:
filters: ["!^_", "!fn", "!is_multitask"]
filters: ["!^_", "!fn", "!is_multitask", "!y_type"]

---
25 changes: 20 additions & 5 deletions polaris/benchmark/_base.py
@@ -388,7 +388,9 @@ def _get_subset(indices, hide_targets):

return train, test

def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
def evaluate(
self, y_pred: Optional[PredictionsType] = None, y_prob: Optional[PredictionsType] = None
) -> BenchmarkResults:
"""Execute the evaluation protocol for the benchmark, given a set of predictions.

info: What about `y_true`?
@@ -408,6 +410,7 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys.
If there are multiple test sets, the predictions should be further wrapped in a dictionary
with the test subset labels as keys.
y_prob: The predicted probabilities for the test set, as NumPy arrays.

Returns:
A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub.
@@ -429,7 +432,10 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
if not isinstance(y_pred, dict) or all(k in self.target_cols for k in y_pred):
y_pred = {"test": y_pred}

if any(k not in y_pred for k in test.keys()):
if not isinstance(y_prob, dict) or all(k in self.target_cols for k in y_prob):
y_prob = {"test": y_prob}

if any(k not in y_pred for k in test.keys()) and any(k not in y_prob for k in test.keys()):
raise KeyError(
f"Missing keys for at least one of the test sets. Expecting: {sorted(test.keys())}"
)
@@ -443,13 +449,17 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
for metric in self.metrics:
if metric.is_multitask:
# Multi-task but with a metric across targets
score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
score = metric(
y_true=y_true_subset, y_pred=y_pred.get(test_label), y_prob=y_prob.get(test_label)
)
scores.loc[len(scores)] = (test_label, "aggregated", metric, score)
continue

if not isinstance(y_true_subset, dict):
# Single task
score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
score = metric(
y_true=y_true_subset, y_pred=y_pred.get(test_label), y_prob=y_prob.get(test_label)
)
scores.loc[len(scores)] = (
test_label,
self.target_cols[0],
@@ -465,7 +475,12 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
mask = ~np.isnan(y_true_target)
score = metric(
y_true=y_true_target[mask],
y_pred=y_pred[test_label][target_label][mask],
y_pred=y_pred[test_label][target_label][mask]
if y_pred[test_label] is not None
else None,
y_prob=y_prob[test_label][target_label][mask]
if y_prob[test_label] is not None
else None,
)
scores.loc[len(scores)] = (test_label, target_label, metric, score)

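For reference, a minimal usage sketch of the extended signature (the benchmark name, the random predictions, and the assumption that the test subset reports its size via `len(test)` are illustrative, not part of this diff):

```python
import numpy as np
import polaris as po

# Illustrative benchmark name; any binary-classification benchmark with a single test set applies.
benchmark = po.load_benchmark("org/some-classification-benchmark")
train, test = benchmark.get_train_test_split()

rng = np.random.default_rng(0)
y_prob = rng.random(len(test))       # positive-class "probabilities" -> roc_auc, pr_auc
y_pred = (y_prob > 0.5).astype(int)  # hard labels -> accuracy, f1, mcc, cohen_kappa, ...

# Metrics declared with y_type="y_pred" read y_pred; y_type="y_prob"/"y_score" metrics read y_prob.
results = benchmark.evaluate(y_pred=y_pred, y_prob=y_prob)
```

With multiple targets or multiple test sets, both arguments take the nested dictionary shape described in the docstring above; a sketch for the multiple-test-set case follows the conftest.py changes at the end of this diff.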
91 changes: 78 additions & 13 deletions polaris/evaluate/_metric.py
@@ -1,20 +1,21 @@
from enum import Enum
from typing import Callable
from typing import Callable, Literal, Optional

import numpy as np
from pydantic import BaseModel, Field
from scipy import stats
from sklearn.metrics import (
accuracy_score,
average_precision_score,
cohen_kappa_score,
cohen_kappa_score as sk_cohen_kappa_score,
explained_variance_score,
f1_score,
matthews_corrcoef,
mean_absolute_error,
mean_squared_error,
r2_score,
roc_auc_score,
balanced_accuracy_score,
)

from polaris.utils.types import DirectionType
@@ -30,6 +31,35 @@ def spearman(y_true: np.ndarray, y_pred: np.ndarray):
return stats.spearmanr(y_true, y_pred).statistic


def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray) -> float:
"""
Calculate the Absolute Average Fold Error (AAFE) metric.
It measures the fold change between predicted values and observed values.
The implementation is based on [this paper](https://pubs.acs.org/doi/10.1021/acs.chemrestox.3c00305).

Args:
y_true: The true target values of shape (n_samples,)
y_pred: The predicted target values of shape (n_samples,).

Returns:
aafe: The Absolute Average Fold Error.
"""
if len(y_true) != len(y_pred):
raise ValueError("Length of y_true and y_pred must be the same.")

if np.any(y_true == 0):
raise ValueError("`y_true` contains zero which will result `Inf` value.")

aafe = np.mean(np.abs(y_pred) / np.abs(y_true))

return aafe


def cohen_kappa_score(y_true, y_pred, **kwargs):
"""Scikit learn cohen_kappa_score wraper with renamed arguments"""
return sk_cohen_kappa_score(y1=y_true, y2=y_pred, **kwargs)


class MetricInfo(BaseModel):
"""
Metric metadata
@@ -45,6 +75,7 @@ class MetricInfo(BaseModel):
is_multitask: bool = False
kwargs: dict = Field(default_factory=dict)
direction: DirectionType
y_type: Literal["y_pred", "y_prob", "y_score"] = "y_pred"


class Metric(Enum):
@@ -65,17 +96,29 @@ class Metric(Enum):
pearsonr = MetricInfo(fn=pearsonr, direction="max")
spearmanr = MetricInfo(fn=spearman, direction="max")
explained_var = MetricInfo(fn=explained_variance_score, direction="max")
absolute_average_fold_error = MetricInfo(fn=absolute_average_fold_error, direction=1)

# classification
# binary and multiclass classification
accuracy = MetricInfo(fn=accuracy_score, direction="max")
balanced_accuracy = MetricInfo(fn=balanced_accuracy_score, direction="max")
mcc = MetricInfo(fn=matthews_corrcoef, direction="max")
cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max")
pr_auc = MetricInfo(fn=average_precision_score, direction="max", y_type="y_score")

# binary only
f1 = MetricInfo(fn=f1_score, kwargs={"average": "binary"}, direction="max")
roc_auc = MetricInfo(fn=roc_auc_score, direction="max", y_type="y_score")

# multiclass tasks only
f1_macro = MetricInfo(fn=f1_score, kwargs={"average": "macro"}, direction="max")
f1_micro = MetricInfo(fn=f1_score, kwargs={"average": "micro"}, direction="max")
roc_auc = MetricInfo(fn=roc_auc_score, direction="max")
pr_auc = MetricInfo(fn=average_precision_score, direction="max")
mcc = MetricInfo(fn=matthews_corrcoef, direction="max")
cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max")
# TODO: adding metrics for multiclass tasks
roc_auc_ovr = MetricInfo(
fn=roc_auc_score, kwargs={"multi_class": "ovr"}, direction="max", y_type="y_score"
)
roc_auc_ovo = MetricInfo(
fn=roc_auc_score, kwargs={"multi_class": "ovo"}, direction="max", y_type="y_score"
)
# TODO: add metrics to handle multitask multiclass predictions.

@property
def fn(self) -> Callable:
@@ -87,7 +130,14 @@ def is_multitask(self) -> bool:
"""Whether the metric expects a single set of predictions or a dict of predictions."""
return self.value.is_multitask

def score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
@property
def y_type(self) -> str:
"""The type of input the metric consumes: `y_pred`, `y_prob`, or `y_score`."""
return self.value.y_type

def score(
self, y_true: np.ndarray, y_pred: Optional[np.ndarray] = None, y_prob: Optional[np.ndarray] = None
) -> float:
"""Endpoint for computing the metric.

For convenience, calling a `Metric` will result in this method being called.
@@ -97,8 +147,23 @@ def score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
assert metric.score(y_true=first, y_pred=second) == metric(y_true=first, y_pred=second)
```
"""
return self.fn(y_true, y_pred, **self.value.kwargs)

def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
if y_pred is None and y_prob is None:
raise ValueError("Neither `y_pred` nor `y_prob` is specified.")

if self.y_type == "y_pred":
if y_pred is None:
raise ValueError(f"{self} requires `y_pred` input. ")
pred = y_pred
else:
if y_prob is None:
raise ValueError(f"{self} requires `y_prob` input. ")
pred = y_prob

kwargs = {"y_true": y_true, self.y_type: pred}
return self.fn(**kwargs, **self.value.kwargs)

def __call__(
self, y_true: np.ndarray, y_pred: Optional[np.ndarray] = None, y_prob: Optional[np.ndarray] = None
) -> float:
"""For convenience, make metrics callable"""
return self.score(y_true, y_pred)
return self.score(y_true, y_pred, y_prob)
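As a quick sanity sketch of the new dispatch (not part of the diff): each `Metric` declares which input it consumes via `y_type`, and `score` forwards `y_pred` or `y_prob` under the keyword the wrapped scikit-learn function expects.

```python
import numpy as np
from polaris.evaluate import Metric

y_true = np.array([0, 1, 1, 0, 1])
y_pred = np.array([0, 1, 0, 0, 1])            # hard labels
y_prob = np.array([0.2, 0.9, 0.4, 0.3, 0.8])  # positive-class probabilities

Metric.accuracy(y_true=y_true, y_pred=y_pred)  # y_type == "y_pred"
Metric.roc_auc(y_true=y_true, y_prob=y_prob)   # y_type == "y_score"; forwarded to roc_auc_score as y_score
# Metric.roc_auc(y_true=y_true, y_pred=y_pred) would raise ValueError: roc_auc requires `y_prob`.

# AAFE is mean(|y_pred| / |y_true|); 1 is the perfect score, hence direction=1 above.
Metric.absolute_average_fold_error(
    y_true=np.array([1.0, 2.0, 4.0]), y_pred=np.array([2.0, 2.0, 2.0])
)  # (2 + 1 + 0.5) / 3 ≈ 1.17
```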
2 changes: 1 addition & 1 deletion polaris/utils/types.py
@@ -71,7 +71,7 @@
This is useful for interactions with httpx and authlib, who have their own URL types.
"""

DirectionType: TypeAlias = Literal["min", "max"]
DirectionType: TypeAlias = float | Literal["min", "max"]
"""
The direction of any variable to be sorted.
This can be used to sort metric scores or to indicate the optimization direction of an endpoint.
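For illustration (a sketch, not part of the diff), the widened alias now admits a numeric optimum alongside the two sort directions:

```python
from polaris.utils.types import DirectionType

lower_is_better: DirectionType = "min"   # e.g. mean_absolute_error
higher_is_better: DirectionType = "max"  # e.g. roc_auc
optimum_at_one: DirectionType = 1.0      # e.g. absolute_average_fold_error, where 1 means no fold error
```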
58 changes: 57 additions & 1 deletion tests/conftest.py
@@ -23,6 +23,8 @@ def test_data():
# set an arbitrary threshold for testing purposes.
data["CLASS_expt"] = data["expt"].gt(0).astype(int).values
data["CLASS_calc"] = data["calc"].gt(0).astype(int).values
data["MULTICLASS_expt"] = np.random.randint(low=0, high=3, size=data.shape[0])
data["MULTICLASS_calc"] = np.random.randint(low=0, high=3, size=data.shape[0])
return data


@@ -99,6 +101,7 @@ def test_single_task_benchmark(test_dataset):
"spearmanr",
"pearsonr",
"explained_var",
"absolute_average_fold_error",
],
main_metric="mean_absolute_error",
split=(train_indices, test_indices),
@@ -117,7 +120,7 @@ def test_single_task_benchmark_clf(test_dataset):
name="single-task-benchmark",
dataset=test_dataset,
main_metric="accuracy",
metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa"],
metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa", "balanced_accuracy"],
split=(train_indices, test_indices),
target_cols="CLASS_expt",
input_cols="smiles",
@@ -126,6 +129,37 @@ def test_single_task_benchmark_clf(test_dataset):
return benchmark


@pytest.fixture(scope="function")
def test_single_task_benchmark_multi_clf(test_dataset):
np.random.seed(111)
indices = np.arange(100)
np.random.shuffle(indices)
train_indices = indices[:80]
test_indices = indices[80:]

benchmark = SingleTaskBenchmarkSpecification(
name="single-task-benchmark",
dataset=test_dataset,
main_metric="accuracy",
metrics=[
"accuracy",
"balanced_accuracy",
"mcc",
"cohen_kappa",
"f1_macro",
"f1_micro",
"roc_auc_ovr",
"roc_auc_ovo",
"pr_auc",
],
split=(train_indices, test_indices),
target_cols="MULTICLASS_expt",
input_cols="smiles",
)
check_version(benchmark)
return benchmark


@pytest.fixture(scope="function")
def test_single_task_benchmark_multiple_test_sets(test_dataset):
train_indices = list(range(90))
@@ -140,6 +174,7 @@ def test_single_task_benchmark_multiple_test_sets(test_dataset):
"spearmanr",
"pearsonr",
"explained_var",
"absolute_average_fold_error",
],
main_metric="r2",
split=(train_indices, test_indices),
@@ -150,6 +185,26 @@ def test_single_task_benchmark_multiple_test_sets(test_dataset):
return benchmark


@pytest.fixture(scope="function")
def test_single_task_benchmark_clf_multiple_test_sets(test_dataset):
np.random.seed(111)  # make sure both classes appear in `y_true`
indices = np.arange(100)
np.random.shuffle(indices)
train_indices = indices[:80]
test_indices = {"test_1": indices[80:90], "test_2": indices[90:]}
benchmark = SingleTaskBenchmarkSpecification(
name="single-task-benchmark-clf",
dataset=test_dataset,
metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa"],
main_metric="pr_auc",
split=(train_indices, test_indices),
target_cols="CLASS_calc",
input_cols="smiles",
)
check_version(benchmark)
return benchmark


@pytest.fixture(scope="function")
def test_multi_task_benchmark(test_dataset):
# For the sake of simplicity, just use a small set of indices
@@ -166,6 +221,7 @@ def test_multi_task_benchmark(test_dataset):
"spearmanr",
"pearsonr",
"explained_var",
"absolute_average_fold_error",
],
split=(train_indices, test_indices),
target_cols=["expt", "calc"],
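To close the loop, a hedged sketch of a test that could exercise the new multiple-test-set classification fixture with the dict-shaped `y_pred`/`y_prob` accepted by `evaluate` (the test name, the random predictions, and the assumption that `get_train_test_split` returns the test sets as a dict of subsets are illustrative, not part of this diff):

```python
import numpy as np


def test_evaluate_clf_multiple_test_sets(test_single_task_benchmark_clf_multiple_test_sets):
    benchmark = test_single_task_benchmark_clf_multiple_test_sets
    _, test = benchmark.get_train_test_split()  # assumed: dict of subsets, one per test set

    rng = np.random.default_rng(0)
    y_prob = {label: rng.random(len(subset)) for label, subset in test.items()}
    y_pred = {label: (probs > 0.5).astype(int) for label, probs in y_prob.items()}

    # Probability metrics (roc_auc, pr_auc) read y_prob; label metrics read y_pred.
    benchmark.evaluate(y_pred=y_pred, y_prob=y_prob)
```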