diff --git a/docs/api/evaluation.md b/docs/api/evaluation.md
index 6f5561fd..9187a763 100644
--- a/docs/api/evaluation.md
+++ b/docs/api/evaluation.md
@@ -5,10 +5,12 @@

 ::: polaris.evaluate.MetricInfo

+::: polaris.evaluate._metric.absolute_average_fold_error
+
 ---

 ::: polaris.evaluate.Metric
     options:
-        filters: ["!^_", "!fn", "!is_multitask"]
+        filters: ["!^_", "!fn", "!is_multitask", "!y_type"]

 ---
\ No newline at end of file
diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py
index 0d4f12ed..70fd02da 100644
--- a/polaris/benchmark/_base.py
+++ b/polaris/benchmark/_base.py
@@ -388,7 +388,9 @@ def _get_subset(indices, hide_targets):

         return train, test

-    def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
+    def evaluate(
+        self, y_pred: Optional[PredictionsType] = None, y_prob: Optional[PredictionsType] = None
+    ) -> BenchmarkResults:
         """Execute the evaluation protocol for the benchmark, given a set of predictions.

         info: What about `y_true`?
@@ -408,6 +410,7 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
                 If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys.
                 If there are multiple test sets, the predictions should be further wrapped in a dictionary
                 with the test subset labels as keys.
+            y_prob: The predicted probabilities for the test set, as NumPy arrays.

         Returns:
             A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub.
@@ -429,7 +432,10 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
         if not isinstance(y_pred, dict) or all(k in self.target_cols for k in y_pred):
             y_pred = {"test": y_pred}

-        if any(k not in y_pred for k in test.keys()):
+        if not isinstance(y_prob, dict) or all(k in self.target_cols for k in y_prob):
+            y_prob = {"test": y_prob}
+
+        if any(k not in y_pred for k in test.keys()) and any(k not in y_prob for k in test.keys()):
            raise KeyError(
                f"Missing keys for at least one of the test sets. Expecting: {sorted(test.keys())}"
            )
@@ -443,13 +449,17 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
            for metric in self.metrics:
                if metric.is_multitask:
                    # Multi-task but with a metric across targets
-                    score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
+                    score = metric(
+                        y_true=y_true_subset, y_pred=y_pred.get(test_label), y_prob=y_prob.get(test_label)
+                    )
                    scores.loc[len(scores)] = (test_label, "aggregated", metric, score)
                    continue

                if not isinstance(y_true_subset, dict):
                    # Single task
-                    score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
+                    score = metric(
+                        y_true=y_true_subset, y_pred=y_pred.get(test_label), y_prob=y_prob.get(test_label)
+                    )
                    scores.loc[len(scores)] = (
                        test_label,
                        self.target_cols[0],
@@ -465,7 +475,12 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
                    mask = ~np.isnan(y_true_target)
                    score = metric(
                        y_true=y_true_target[mask],
-                        y_pred=y_pred[test_label][target_label][mask],
+                        y_pred=y_pred[test_label][target_label][mask]
+                        if y_pred[test_label] is not None
+                        else None,
+                        y_prob=y_prob[test_label][target_label][mask]
+                        if y_prob[test_label] is not None
+                        else None,
                    )
                    scores.loc[len(scores)] = (test_label, target_label, metric, score)

diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py
index 57dfd0ee..eee9cd27 100644
--- a/polaris/evaluate/_metric.py
+++ b/polaris/evaluate/_metric.py
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Callable
+from typing import Callable, Literal, Optional

 import numpy as np
 from pydantic import BaseModel, Field
@@ -7,7 +7,7 @@
 from sklearn.metrics import (
     accuracy_score,
     average_precision_score,
-    cohen_kappa_score,
+    cohen_kappa_score as sk_cohen_kappa_score,
     explained_variance_score,
     f1_score,
     matthews_corrcoef,
@@ -15,6 +15,7 @@
     mean_squared_error,
     r2_score,
     roc_auc_score,
+    balanced_accuracy_score,
 )

 from polaris.utils.types import DirectionType
@@ -30,6 +31,35 @@ def spearman(y_true: np.ndarray, y_pred: np.ndarray):
     return stats.spearmanr(y_true, y_pred).statistic


+def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray) -> float:
+    """
+    Calculate the Absolute Average Fold Error (AAFE) metric.
+    It measures the fold change between predicted values and observed values.
+    The implementation is based on [this paper](https://pubs.acs.org/doi/10.1021/acs.chemrestox.3c00305).
+
+    Args:
+        y_true: The true target values of shape (n_samples,)
+        y_pred: The predicted target values of shape (n_samples,).
+
+    Returns:
+        aafe: The Absolute Average Fold Error.
+    """
+    if len(y_true) != len(y_pred):
+        raise ValueError("Length of y_true and y_pred must be the same.")
+
+    if np.any(y_true == 0):
+        raise ValueError("`y_true` contains zero, which would result in an `Inf` value.")
+
+    aafe = np.mean(np.abs(y_pred) / np.abs(y_true))
+
+    return aafe
+
+
+def cohen_kappa_score(y_true, y_pred, **kwargs):
+    """Scikit-learn `cohen_kappa_score` wrapper with renamed arguments."""
+    return sk_cohen_kappa_score(y1=y_true, y2=y_pred, **kwargs)
+
+
 class MetricInfo(BaseModel):
     """
     Metric metadata
@@ -45,6 +75,7 @@ class MetricInfo(BaseModel):
     is_multitask: bool = False
     kwargs: dict = Field(default_factory=dict)
     direction: DirectionType
+    y_type: Literal["y_pred", "y_prob", "y_score"] = "y_pred"


 class Metric(Enum):
@@ -65,17 +96,29 @@ class Metric(Enum):
     pearsonr = MetricInfo(fn=pearsonr, direction="max")
     spearmanr = MetricInfo(fn=spearman, direction="max")
     explained_var = MetricInfo(fn=explained_variance_score, direction="max")
+    absolute_average_fold_error = MetricInfo(fn=absolute_average_fold_error, direction=1)

-    # classification
+    # binary and multiclass classification
     accuracy = MetricInfo(fn=accuracy_score, direction="max")
+    balanced_accuracy = MetricInfo(fn=balanced_accuracy_score, direction="max")
+    mcc = MetricInfo(fn=matthews_corrcoef, direction="max")
+    cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max")
+    pr_auc = MetricInfo(fn=average_precision_score, direction="max", y_type="y_score")
+
+    # binary only
     f1 = MetricInfo(fn=f1_score, kwargs={"average": "binary"}, direction="max")
+    roc_auc = MetricInfo(fn=roc_auc_score, direction="max", y_type="y_score")
+
+    # multiclass tasks only
     f1_macro = MetricInfo(fn=f1_score, kwargs={"average": "macro"}, direction="max")
     f1_micro = MetricInfo(fn=f1_score, kwargs={"average": "micro"}, direction="max")
-    roc_auc = MetricInfo(fn=roc_auc_score, direction="max")
-    pr_auc = MetricInfo(fn=average_precision_score, direction="max")
-    mcc = MetricInfo(fn=matthews_corrcoef, direction="max")
-    cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max")
-    # TODO: adding metrics for multiclass tasks
+    roc_auc_ovr = MetricInfo(
+        fn=roc_auc_score, kwargs={"multi_class": "ovr"}, direction="max", y_type="y_score"
+    )
+    roc_auc_ovo = MetricInfo(
+        fn=roc_auc_score, kwargs={"multi_class": "ovo"}, direction="max", y_type="y_score"
+    )
+    # TODO: add metrics to handle multitask multiclass predictions.

     @property
     def fn(self) -> Callable:
@@ -87,7 +130,14 @@ def is_multitask(self) -> bool:
         """Whether the metric expects a single set of predictions or a dict of predictions."""
         return self.value.is_multitask

-    def score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
+    @property
+    def y_type(self) -> str:
+        """The type of input the metric expects: `y_pred` (predictions) or `y_prob`/`y_score` (predicted probabilities)."""
+        return self.value.y_type
+
+    def score(
+        self, y_true: np.ndarray, y_pred: Optional[np.ndarray] = None, y_prob: Optional[np.ndarray] = None
+    ) -> float:
         """Endpoint for computing the metric.

         For convenience, calling a `Metric` will result in this method being called.
@@ -97,8 +147,23 @@ def score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
         assert metric.score(y_true=first, y_pred=second) == metric(y_true=first, y_pred=second)
         ```
         """
-        return self.fn(y_true, y_pred, **self.value.kwargs)
-
-    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
+        if y_pred is None and y_prob is None:
+            raise ValueError("Neither `y_pred` nor `y_prob` is specified.")
+
+        if self.y_type == "y_pred":
+            if y_pred is None:
+                raise ValueError(f"{self} requires `y_pred` input.")
+            pred = y_pred
+        else:
+            if y_prob is None:
+                raise ValueError(f"{self} requires `y_prob` input.")
+            pred = y_prob
+
+        kwargs = {"y_true": y_true, self.y_type: pred}
+        return self.fn(**kwargs, **self.value.kwargs)
+
+    def __call__(
+        self, y_true: np.ndarray, y_pred: Optional[np.ndarray] = None, y_prob: Optional[np.ndarray] = None
+    ) -> float:
         """For convenience, make metrics callable"""
-        return self.score(y_true, y_pred)
+        return self.score(y_true, y_pred, y_prob)
diff --git a/polaris/utils/types.py b/polaris/utils/types.py
index b8093450..256224fa 100644
--- a/polaris/utils/types.py
+++ b/polaris/utils/types.py
@@ -71,7 +71,7 @@
 This is useful for interactions with httpx and authlib, who have their own URL types.
 """

-DirectionType: TypeAlias = Literal["min", "max"]
+DirectionType: TypeAlias = float | Literal["min", "max"]
 """
 The direction of any variable to be sorted.
 This can be used to sort the metric score, indicate the optmization direction of endpoint.
diff --git a/tests/conftest.py b/tests/conftest.py
index 1ebc0a02..aa02c3f7 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -23,6 +23,8 @@ def test_data():
     # set an abitrary threshold for testing purpose.
     data["CLASS_expt"] = data["expt"].gt(0).astype(int).values
     data["CLASS_calc"] = data["calc"].gt(0).astype(int).values
+    data["MULTICLASS_expt"] = np.random.randint(low=0, high=3, size=data.shape[0])
+    data["MULTICLASS_calc"] = np.random.randint(low=0, high=3, size=data.shape[0])
     return data


@@ -99,6 +101,7 @@ def test_single_task_benchmark(test_dataset):
             "spearmanr",
             "pearsonr",
             "explained_var",
+            "absolute_average_fold_error",
         ],
         main_metric="mean_absolute_error",
         split=(train_indices, test_indices),
@@ -117,7 +120,7 @@ def test_single_task_benchmark_clf(test_dataset):
         name="single-task-benchmark",
         dataset=test_dataset,
         main_metric="accuracy",
-        metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa"],
+        metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa", "balanced_accuracy"],
         split=(train_indices, test_indices),
         target_cols="CLASS_expt",
         input_cols="smiles",
@@ -126,6 +129,37 @@
     return benchmark


+@pytest.fixture(scope="function")
+def test_single_task_benchmark_multi_clf(test_dataset):
+    np.random.seed(111)
+    indices = np.arange(100)
+    np.random.shuffle(indices)
+    train_indices = indices[:80]
+    test_indices = indices[80:]
+
+    benchmark = SingleTaskBenchmarkSpecification(
+        name="single-task-benchmark",
+        dataset=test_dataset,
+        main_metric="accuracy",
+        metrics=[
+            "accuracy",
+            "balanced_accuracy",
+            "mcc",
+            "cohen_kappa",
+            "f1_macro",
+            "f1_micro",
+            "roc_auc_ovr",
+            "roc_auc_ovo",
+            "pr_auc",
+        ],
+        split=(train_indices, test_indices),
+        target_cols="MULTICLASS_expt",
+        input_cols="smiles",
+    )
+    check_version(benchmark)
+    return benchmark
+
+
 @pytest.fixture(scope="function")
 def test_single_task_benchmark_multiple_test_sets(test_dataset):
     train_indices = list(range(90))
@@ -140,6 +174,7 @@ def test_single_task_benchmark_multiple_test_sets(test_dataset):
             "spearmanr",
             "pearsonr",
             "explained_var",
+            "absolute_average_fold_error",
         ],
         main_metric="r2",
         split=(train_indices, test_indices),
@@ -150,6 +185,26 @@
     return benchmark


+@pytest.fixture(scope="function")
+def test_single_task_benchmark_clf_multiple_test_sets(test_dataset):
+    np.random.seed(111)  # make sure both classes are present in `y_true`
+    indices = np.arange(100)
+    np.random.shuffle(indices)
+    train_indices = indices[:80]
+    test_indices = {"test_1": indices[80:90], "test_2": indices[90:]}
+    benchmark = SingleTaskBenchmarkSpecification(
+        name="single-task-benchmark-clf",
+        dataset=test_dataset,
+        metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa"],
+        main_metric="pr_auc",
+        split=(train_indices, test_indices),
+        target_cols="CLASS_calc",
+        input_cols="smiles",
+    )
+    check_version(benchmark)
+    return benchmark
+
+
 @pytest.fixture(scope="function")
 def test_multi_task_benchmark(test_dataset):
     # For the sake of simplicity, just use a small set of indices
@@ -166,6 +221,7 @@ def test_multi_task_benchmark(test_dataset):
             "spearmanr",
             "pearsonr",
             "explained_var",
+            "absolute_average_fold_error",
         ],
         split=(train_indices, test_indices),
         target_cols=["expt", "calc"],
diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
index 5a4332c9..ecefa97c 100644
--- a/tests/test_evaluate.py
+++ b/tests/test_evaluate.py
@@ -1,5 +1,5 @@
 import os
-
+import pytest
 import numpy as np
 import pandas as pd

@@ -11,6 +11,7 @@
 from polaris.evaluate._metric import Metric
 from polaris.evaluate._results import BenchmarkResults
 from polaris.utils.types import HubOwner
+from polaris.dataset import Dataset


 def test_result_to_json(tmpdir: str, test_user_owner: HubOwner):
@@ -73,17 +74,33 @@ def test_metrics_singletask_clf(
 ):
     _, test = test_single_task_benchmark_clf.get_train_test_split()
     predictions = np.random.randint(2, size=test.inputs.shape[0])
-    result = test_single_task_benchmark_clf.evaluate(predictions)
+    probabilities = np.random.uniform(size=test.inputs.shape[0])
+    result = test_single_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities)
     for metric in test_single_task_benchmark_clf.metrics:
         assert metric in result.results.Metric.tolist()


+def test_metrics_singletask_multicls_clf(
+    tmpdir: str, test_single_task_benchmark_multi_clf: SingleTaskBenchmarkSpecification
+):
+    _, test = test_single_task_benchmark_multi_clf.get_train_test_split()
+    predictions = np.random.randint(3, size=test.inputs.shape[0])
+    probabilities = np.random.random(size=(test.inputs.shape[0], 3))
+    probabilities = probabilities / probabilities.sum(axis=1, keepdims=True)
+    result = test_single_task_benchmark_multi_clf.evaluate(y_pred=predictions, y_prob=probabilities)
+    for metric in test_single_task_benchmark_multi_clf.metrics:
+        assert metric in result.results.Metric.tolist()
+
+
 def test_metrics_multitask_clf(tmpdir: str, test_multi_task_benchmark_clf: MultiTaskBenchmarkSpecification):
     train, test = test_multi_task_benchmark_clf.get_train_test_split()
     predictions = {
         target_col: np.random.randint(2, size=test.inputs.shape[0]) for target_col in train.target_cols
     }
-    result = test_multi_task_benchmark_clf.evaluate(predictions)
+    probabilities = {
+        target_col: np.random.uniform(size=test.inputs.shape[0]) for target_col in train.target_cols
+    }
+    result = test_multi_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities)
     assert isinstance(result.results, pd.DataFrame)
     assert set(result.results.columns) == {
         "Test set",
@@ -101,4 +118,66 @@ def test_metrics_multitask_clf(tmpdir: str, test_multi_task_benchmark_clf: Multi

 def test_metric_direction():
     for metric in Metric:
-        assert metric.value.direction in ["min", "max"]
+        assert metric.value.direction in ["min", "max", 1]
+
+
+def test_absolute_average_fold_error():
+    y_true = np.random.uniform(low=50, high=100, size=200)
+    y_pred_1 = y_true + np.random.uniform(low=0, high=5, size=200)
+    y_pred_2 = y_true + np.random.uniform(low=5, high=20, size=200)
+    y_pred_3 = y_true - 10
+    y_zero = np.zeros(shape=200)
+
+    # Optimal value
+    aafe_0 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_true)
+    assert aafe_0 == 1
+
+    # Small fold change
+    aafe_1 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_pred_1)
+    assert aafe_1 > 1
+
+    # Larger fold change
+    aafe_2 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_pred_2)
+    assert aafe_2 > aafe_1
+
+    # Undershoot
+    aafe_3 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_pred_3)
+    assert aafe_3 < 1
+
+    # y_true contains zeros
+    with pytest.raises(ValueError):
+        Metric.absolute_average_fold_error(y_true=y_zero, y_pred=y_pred_3)
+
+
+def test_metric_y_types(
+    tmpdir: str, test_single_task_benchmark_clf: SingleTaskBenchmarkSpecification, test_data: Dataset
+):
+    # Use the benchmark's test split and the raw data as ground truth for this test.
+    _, test = test_single_task_benchmark_clf.get_train_test_split()
+    predictions = np.random.randint(2, size=test.inputs.shape[0])
+    probabilities = np.random.uniform(size=test.inputs.shape[0])
+    test_y = test_data.loc[test.indices, "CLASS_expt"]
+
+    # If y_pred is None and y_prob is None, an error is thrown.
+    with pytest.raises(ValueError, match="Neither `y_pred` nor `y_prob` is specified."):
+        test_single_task_benchmark_clf.evaluate()
+
+    # If y_type == "y_pred" and y_pred is None, an error is thrown.
+    with pytest.raises(ValueError, match="Metric.accuracy requires `y_pred` input"):
+        test_single_task_benchmark_clf.metrics = [Metric.accuracy]
+        test_single_task_benchmark_clf.evaluate(y_prob=probabilities)
+
+    # If y_type != "y_pred" and y_prob is None, an error is thrown.
+    with pytest.raises(ValueError, match="Metric.roc_auc requires `y_prob` input"):
+        test_single_task_benchmark_clf.metrics = [Metric.roc_auc]
+        test_single_task_benchmark_clf.evaluate(y_pred=predictions)
+
+    # If y_type != "y_pred" and both y_pred and y_prob are given, y_prob is used as expected.
+    test_single_task_benchmark_clf.metrics = [Metric.roc_auc]
+    result = test_single_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities)
+    assert result.results.Score.values[0] == Metric.roc_auc.fn(y_true=test_y, y_score=probabilities)
+
+    # If y_type == "y_pred" and both y_pred and y_prob are given, y_pred is used as expected.
+    test_single_task_benchmark_clf.metrics = [Metric.f1]
+    result = test_single_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities)
+    assert result.results.Score.values[0] == Metric.f1.fn(y_true=test_y, y_pred=predictions)
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 7b96a4e3..5d9a983b 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -1,6 +1,6 @@
 import datamol as dm
 import numpy as np
-from sklearn.ensemble import RandomForestRegressor
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

 from polaris.evaluate import BenchmarkResults

@@ -41,6 +41,31 @@ def test_single_task_benchmark_loop_with_multiple_test_sets(test_single_task_ben
     assert isinstance(scores, BenchmarkResults)


+def test_single_task_benchmark_clf_loop_with_multiple_test_sets(
+    test_single_task_benchmark_clf_multiple_test_sets,
+):
+    """Tests the integrated API for a single-task classification benchmark with probabilities and multiple test sets."""
+    train, test = test_single_task_benchmark_clf_multiple_test_sets.get_train_test_split()
+
+    smiles, y = train.as_array("xy")
+
+    x_train = np.array([dm.to_fp(dm.to_mol(smi)) for smi in smiles])
+
+    model = RandomForestClassifier()
+    model.fit(X=x_train, y=y)
+
+    y_prob = {}
+    y_pred = {}
+    for k, test_subset in test.items():
+        print(k, test_subset)
+        x_test = np.array([dm.to_fp(dm.to_mol(smi)) for smi in test_subset.inputs])
+        y_prob[k] = model.predict_proba(x_test)[:, 1]  # probability of the positive class
+        y_pred[k] = model.predict(x_test)
+
+    scores = test_single_task_benchmark_clf_multiple_test_sets.evaluate(y_prob=y_prob, y_pred=y_pred)
+    assert isinstance(scores, BenchmarkResults)
+
+
 def test_multi_task_benchmark_loop(test_multi_task_benchmark):
     """Tests the integrated API for a multi-task benchmark."""
     train, test = test_multi_task_benchmark.get_train_test_split()
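For context, a minimal usage sketch of the workflow this patch enables, assuming a hypothetical binary-classification benchmark (the slug and the random model outputs below are illustrative only, not taken from the patch): `evaluate` now accepts `y_pred` and/or `y_prob`, and each metric consumes whichever input matches its `y_type`.

import numpy as np
import polaris as po

# Hypothetical slug; any binary classification benchmark would work here.
benchmark = po.load_benchmark("org/binary-clf-benchmark")
train, test = benchmark.get_train_test_split()

# Stand-in model outputs; a real model would compute these from test.inputs.
n = test.inputs.shape[0]
y_prob = np.random.uniform(size=n)   # positive-class probabilities, consumed by roc_auc / pr_auc
y_pred = (y_prob > 0.5).astype(int)  # hard labels, consumed by accuracy, f1, mcc, ...

results = benchmark.evaluate(y_pred=y_pred, y_prob=y_prob)
print(results)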