From 56dc9d1bf98a3dd6868e0a8658275abd0bd01280 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Mon, 22 Apr 2024 16:44:13 -0400 Subject: [PATCH 01/18] add aafe and ba --- polaris/evaluate/_metric.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 57dfd0ee..43a5ad3d 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -15,6 +15,7 @@ mean_squared_error, r2_score, roc_auc_score, + balanced_accuracy_score, ) from polaris.utils.types import DirectionType @@ -30,6 +31,28 @@ def spearman(y_true: np.ndarray, y_pred: np.ndarray): return stats.spearmanr(y_true, y_pred).statistic +def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray): + """ + Calculate the Absolute Average Fold Error (AAFE) metric. + + Parameters: + y_true : array-like of shape (n_samples,) + The true target values. + y_pred : array-like of shape (n_samples,) + The predicted target values. + + Returns: + aafe : float + The Absolute Average Fold Error. + """ + if len(y_true) != len(y_pred): + raise ValueError("Length of y_true and y_pred must be the same.") + + aafe = np.mean(np.abs(y_pred) / np.abs(y_true)) + + return aafe + + class MetricInfo(BaseModel): """ Metric metadata @@ -65,6 +88,7 @@ class Metric(Enum): pearsonr = MetricInfo(fn=pearsonr, direction="max") spearmanr = MetricInfo(fn=spearman, direction="max") explained_var = MetricInfo(fn=explained_variance_score, direction="max") + aafe = MetricInfo(fn=absolute_average_fold_error, direction="max") # classification accuracy = MetricInfo(fn=accuracy_score, direction="max") @@ -75,7 +99,9 @@ class Metric(Enum): pr_auc = MetricInfo(fn=average_precision_score, direction="max") mcc = MetricInfo(fn=matthews_corrcoef, direction="max") cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max") + # TODO: adding metrics for multiclass tasks + ba = MetricInfo(fn=balanced_accuracy_score, direction="max") @property def fn(self) -> Callable: From 11b86851cd43a39e3a295f288813a563464e3a44 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 23 Apr 2024 11:37:07 -0400 Subject: [PATCH 02/18] add docs --- polaris/evaluate/_metric.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 43a5ad3d..94ad9582 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -34,16 +34,18 @@ def spearman(y_true: np.ndarray, y_pred: np.ndarray): def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray): """ Calculate the Absolute Average Fold Error (AAFE) metric. + It measures the fold change between predicted values and observed values. + The implementation is based on https://pubs.acs.org/doi/10.1021/acs.chemrestox.3c00305. - Parameters: - y_true : array-like of shape (n_samples,) - The true target values. - y_pred : array-like of shape (n_samples,) - The predicted target values. + Args: + y_true : array-like of shape (n_samples,) + The true target values. + y_pred : array-like of shape (n_samples,) + The predicted target values. Returns: - aafe : float - The Absolute Average Fold Error. + aafe : float + The Absolute Average Fold Error. 
""" if len(y_true) != len(y_pred): raise ValueError("Length of y_true and y_pred must be the same.") @@ -88,7 +90,7 @@ class Metric(Enum): pearsonr = MetricInfo(fn=pearsonr, direction="max") spearmanr = MetricInfo(fn=spearman, direction="max") explained_var = MetricInfo(fn=explained_variance_score, direction="max") - aafe = MetricInfo(fn=absolute_average_fold_error, direction="max") + absolute_average_fold_error = MetricInfo(fn=absolute_average_fold_error, direction="max") # classification accuracy = MetricInfo(fn=accuracy_score, direction="max") @@ -100,8 +102,8 @@ class Metric(Enum): mcc = MetricInfo(fn=matthews_corrcoef, direction="max") cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max") - # TODO: adding metrics for multiclass tasks - ba = MetricInfo(fn=balanced_accuracy_score, direction="max") + # multiclass tasks + balanced_accuracy = MetricInfo(fn=balanced_accuracy_score, direction="max") @property def fn(self) -> Callable: From eb1824aeeb31aaab45ec8f605e544f5640b9dd21 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 23 Apr 2024 13:11:47 -0400 Subject: [PATCH 03/18] add tests --- polaris/evaluate/_metric.py | 15 +++++++++------ tests/conftest.py | 23 ++++++++++++++++++++++- tests/test_evaluate.py | 10 ++++++++++ 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 94ad9582..bd774e79 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -92,18 +92,21 @@ class Metric(Enum): explained_var = MetricInfo(fn=explained_variance_score, direction="max") absolute_average_fold_error = MetricInfo(fn=absolute_average_fold_error, direction="max") - # classification + # binary and multiclass classification accuracy = MetricInfo(fn=accuracy_score, direction="max") + balanced_accuracy = MetricInfo(fn=balanced_accuracy_score, direction="max") + mcc = MetricInfo(fn=matthews_corrcoef, direction="max") + cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max") + + # binary only f1 = MetricInfo(fn=f1_score, kwargs={"average": "binary"}, direction="max") - f1_macro = MetricInfo(fn=f1_score, kwargs={"average": "macro"}, direction="max") - f1_micro = MetricInfo(fn=f1_score, kwargs={"average": "micro"}, direction="max") + # note: At the moment, multi-dimension inputs for classification are not supported roc_auc = MetricInfo(fn=roc_auc_score, direction="max") pr_auc = MetricInfo(fn=average_precision_score, direction="max") - mcc = MetricInfo(fn=matthews_corrcoef, direction="max") - cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max") # multiclass tasks - balanced_accuracy = MetricInfo(fn=balanced_accuracy_score, direction="max") + f1_macro = MetricInfo(fn=f1_score, kwargs={"average": "macro"}, direction="max") + f1_micro = MetricInfo(fn=f1_score, kwargs={"average": "micro"}, direction="max") @property def fn(self) -> Callable: diff --git a/tests/conftest.py b/tests/conftest.py index 1ebc0a02..56e77f05 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,6 +23,8 @@ def test_data(): # set an abitrary threshold for testing purpose. 
data["CLASS_expt"] = data["expt"].gt(0).astype(int).values data["CLASS_calc"] = data["calc"].gt(0).astype(int).values + data["MULTICLASS_expt"] = np.random.randint(low=0, high=3, size=data.shape[0]) + data["MULTICLASS_calc"] =np.random.randint(low=0, high=3, size=data.shape[0]) return data @@ -99,6 +101,7 @@ def test_single_task_benchmark(test_dataset): "spearmanr", "pearsonr", "explained_var", + "absolute_average_fold_error" ], main_metric="mean_absolute_error", split=(train_indices, test_indices), @@ -117,7 +120,7 @@ def test_single_task_benchmark_clf(test_dataset): name="single-task-benchmark", dataset=test_dataset, main_metric="accuracy", - metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa"], + metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa", "balanced_accuracy"], split=(train_indices, test_indices), target_cols="CLASS_expt", input_cols="smiles", @@ -125,6 +128,22 @@ def test_single_task_benchmark_clf(test_dataset): check_version(benchmark) return benchmark +@pytest.fixture(scope="function") +def test_single_task_benchmark_multi_clf(test_dataset): + train_indices = list(range(90)) + test_indices = list(range(90, 100)) + + benchmark = SingleTaskBenchmarkSpecification( + name="single-task-benchmark", + dataset=test_dataset, + main_metric="accuracy", + metrics=["accuracy", "balanced_accuracy", "mcc", "cohen_kappa" , "f1_macro", "f1_micro"], + split=(train_indices, test_indices), + target_cols="MULTICLASS_expt", + input_cols="smiles", + ) + check_version(benchmark) + return benchmark @pytest.fixture(scope="function") def test_single_task_benchmark_multiple_test_sets(test_dataset): @@ -140,6 +159,7 @@ def test_single_task_benchmark_multiple_test_sets(test_dataset): "spearmanr", "pearsonr", "explained_var", + "absolute_average_fold_error" ], main_metric="r2", split=(train_indices, test_indices), @@ -166,6 +186,7 @@ def test_multi_task_benchmark(test_dataset): "spearmanr", "pearsonr", "explained_var", + "absolute_average_fold_error" ], split=(train_indices, test_indices), target_cols=["expt", "calc"], diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 5a4332c9..5ec8643a 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -78,6 +78,16 @@ def test_metrics_singletask_clf( assert metric in result.results.Metric.tolist() +def test_metrics_singletask_multicls_clf( + tmpdir: str, test_single_task_benchmark_multi_clf: SingleTaskBenchmarkSpecification +): + _, test = test_single_task_benchmark_multi_clf.get_train_test_split() + predictions = np.random.randint(4, size=test.inputs.shape[0]) + result = test_single_task_benchmark_multi_clf.evaluate(predictions) + for metric in test_single_task_benchmark_multi_clf.metrics: + assert metric in result.results.Metric.tolist() + + def test_metrics_multitask_clf(tmpdir: str, test_multi_task_benchmark_clf: MultiTaskBenchmarkSpecification): train, test = test_multi_task_benchmark_clf.get_train_test_split() predictions = { From de11628ffe73da3f5b274b62417c39c2aadfc264 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 23 Apr 2024 13:13:42 -0400 Subject: [PATCH 04/18] format --- tests/conftest.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 56e77f05..9001ebaf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,7 +24,7 @@ def test_data(): data["CLASS_expt"] = data["expt"].gt(0).astype(int).values data["CLASS_calc"] = data["calc"].gt(0).astype(int).values data["MULTICLASS_expt"] = np.random.randint(low=0, high=3, 
size=data.shape[0]) - data["MULTICLASS_calc"] =np.random.randint(low=0, high=3, size=data.shape[0]) + data["MULTICLASS_calc"] = np.random.randint(low=0, high=3, size=data.shape[0]) return data @@ -101,7 +101,7 @@ def test_single_task_benchmark(test_dataset): "spearmanr", "pearsonr", "explained_var", - "absolute_average_fold_error" + "absolute_average_fold_error", ], main_metric="mean_absolute_error", split=(train_indices, test_indices), @@ -128,6 +128,7 @@ def test_single_task_benchmark_clf(test_dataset): check_version(benchmark) return benchmark + @pytest.fixture(scope="function") def test_single_task_benchmark_multi_clf(test_dataset): train_indices = list(range(90)) @@ -137,7 +138,7 @@ def test_single_task_benchmark_multi_clf(test_dataset): name="single-task-benchmark", dataset=test_dataset, main_metric="accuracy", - metrics=["accuracy", "balanced_accuracy", "mcc", "cohen_kappa" , "f1_macro", "f1_micro"], + metrics=["accuracy", "balanced_accuracy", "mcc", "cohen_kappa", "f1_macro", "f1_micro"], split=(train_indices, test_indices), target_cols="MULTICLASS_expt", input_cols="smiles", @@ -145,6 +146,7 @@ def test_single_task_benchmark_multi_clf(test_dataset): check_version(benchmark) return benchmark + @pytest.fixture(scope="function") def test_single_task_benchmark_multiple_test_sets(test_dataset): train_indices = list(range(90)) @@ -159,7 +161,7 @@ def test_single_task_benchmark_multiple_test_sets(test_dataset): "spearmanr", "pearsonr", "explained_var", - "absolute_average_fold_error" + "absolute_average_fold_error", ], main_metric="r2", split=(train_indices, test_indices), @@ -186,7 +188,7 @@ def test_multi_task_benchmark(test_dataset): "spearmanr", "pearsonr", "explained_var", - "absolute_average_fold_error" + "absolute_average_fold_error", ], split=(train_indices, test_indices), target_cols=["expt", "calc"], From 56da3adb8da1b9694e2e024b8fa6a0d76fde271e Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 25 Apr 2024 16:36:15 -0400 Subject: [PATCH 05/18] allows probability inputs for evaluation --- polaris/benchmark/_base.py | 15 +++++++++++---- polaris/evaluate/_metric.py | 25 ++++++++++++++++++++----- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 0d4f12ed..0a8e7aff 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -388,7 +388,7 @@ def _get_subset(indices, hide_targets): return train, test - def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: + def evaluate(self, y_pred: PredictionsType, y_prob: PredictionsType = None) -> BenchmarkResults: """Execute the evaluation protocol for the benchmark, given a set of predictions. info: What about `y_true`? @@ -408,6 +408,8 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys. If there are multiple test sets, the predictions should be further wrapped in a dictionary with the test subset labels as keys. + y_prob: The predicted probabilities for the test set, as NumPy arrays. + Currently only multiclass in singletask setting is supported. Returns: A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub. 
@@ -429,6 +431,9 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: if not isinstance(y_pred, dict) or all(k in self.target_cols for k in y_pred): y_pred = {"test": y_pred} + if not isinstance(y_prob, dict) or all(k in self.target_cols for k in y_prob): + y_prob = {"test": y_prob} + if any(k not in y_pred for k in test.keys()): raise KeyError( f"Missing keys for at least one of the test sets. Expecting: {sorted(test.keys())}" @@ -441,15 +446,17 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: for test_label, y_true_subset in y_true.items(): # For every metric... for metric in self.metrics: + y_pred_eval = y_prob if metric.needs_probs else y_pred + if metric.is_multitask: # Multi-task but with a metric across targets - score = metric(y_true=y_true_subset, y_pred=y_pred[test_label]) + score = metric(y_true=y_true_subset, y_pred=y_pred_eval[test_label]) scores.loc[len(scores)] = (test_label, "aggregated", metric, score) continue if not isinstance(y_true_subset, dict): # Single task - score = metric(y_true=y_true_subset, y_pred=y_pred[test_label]) + score = metric(y_true=y_true_subset, y_pred=y_pred_eval[test_label]) scores.loc[len(scores)] = ( test_label, self.target_cols[0], @@ -465,7 +472,7 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: mask = ~np.isnan(y_true_target) score = metric( y_true=y_true_target[mask], - y_pred=y_pred[test_label][target_label][mask], + y_pred=y_pred_eval[test_label][target_label][mask], ) scores.loc[len(scores)] = (test_label, target_label, metric, score) diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index bd774e79..361fdba6 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -35,7 +35,7 @@ def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray): """ Calculate the Absolute Average Fold Error (AAFE) metric. It measures the fold change between predicted values and observed values. - The implementation is based on https://pubs.acs.org/doi/10.1021/acs.chemrestox.3c00305. + The implementation is based on [this paper](https://pubs.acs.org/doi/10.1021/acs.chemrestox.3c00305). 
Args: y_true : array-like of shape (n_samples,) @@ -50,6 +50,9 @@ def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray): if len(y_true) != len(y_pred): raise ValueError("Length of y_true and y_pred must be the same.") + if np.any(y_true == 0): + raise ValueError("`y_true` contains zero which will result `Inf` value.") + aafe = np.mean(np.abs(y_pred) / np.abs(y_true)) return aafe @@ -70,6 +73,7 @@ class MetricInfo(BaseModel): is_multitask: bool = False kwargs: dict = Field(default_factory=dict) direction: DirectionType + needs_probs: bool = False class Metric(Enum): @@ -90,23 +94,29 @@ class Metric(Enum): pearsonr = MetricInfo(fn=pearsonr, direction="max") spearmanr = MetricInfo(fn=spearman, direction="max") explained_var = MetricInfo(fn=explained_variance_score, direction="max") - absolute_average_fold_error = MetricInfo(fn=absolute_average_fold_error, direction="max") + absolute_average_fold_error = MetricInfo(fn=absolute_average_fold_error, direction=1) # binary and multiclass classification accuracy = MetricInfo(fn=accuracy_score, direction="max") balanced_accuracy = MetricInfo(fn=balanced_accuracy_score, direction="max") mcc = MetricInfo(fn=matthews_corrcoef, direction="max") cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max") + pr_auc = MetricInfo(fn=average_precision_score, direction="max", needs_probs=True) # binary only f1 = MetricInfo(fn=f1_score, kwargs={"average": "binary"}, direction="max") # note: At the moment, multi-dimension inputs for classification are not supported - roc_auc = MetricInfo(fn=roc_auc_score, direction="max") - pr_auc = MetricInfo(fn=average_precision_score, direction="max") + roc_auc = MetricInfo(fn=roc_auc_score, direction="max", needs_probs=True) - # multiclass tasks + # multiclass tasks only f1_macro = MetricInfo(fn=f1_score, kwargs={"average": "macro"}, direction="max") f1_micro = MetricInfo(fn=f1_score, kwargs={"average": "micro"}, direction="max") + roc_auc_ovr = MetricInfo( + fn=roc_auc_score, kwargs={"multi_class": "ovr"}, direction="max", needs_probs=True + ) + roc_auc_ovo = MetricInfo( + fn=roc_auc_score, kwargs={"multi_class": "ovo"}, direction="max", needs_probs=True + ) @property def fn(self) -> Callable: @@ -118,6 +128,11 @@ def is_multitask(self) -> bool: """Whether the metric expects a single set of predictions or a dict of predictions.""" return self.value.is_multitask + @property + def needs_probs(self) -> bool: + """Whether the metric expects preditive probablities.""" + return self.value.needs_probs + def score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float: """Endpoint for computing the metric. 
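For reference, a minimal standalone sketch of the computation behind the AAFE metric added above (the input arrays are illustrative values, not taken from the test suite):

import numpy as np

# AAFE = mean(|y_pred| / |y_true|), i.e. the average fold change between
# predictions and observations. A perfect model scores exactly 1, which is why
# the metric's direction is the target value 1 rather than "min" or "max",
# and why `y_true` must not contain zeros (see the added check above).
y_true = np.array([2.0, 4.0, 10.0])
y_pred = np.array([4.0, 2.0, 10.0])

aafe = np.mean(np.abs(y_pred) / np.abs(y_true))  # (2.0 + 0.5 + 1.0) / 3 ≈ 1.17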
From 9c4c9af49d7bf463fbcf48f35f40baf153364ef3 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 25 Apr 2024 16:36:32 -0400 Subject: [PATCH 06/18] update tests --- tests/conftest.py | 19 ++++++++++++++--- tests/test_evaluate.py | 46 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 56 insertions(+), 9 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 9001ebaf..545cb26c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -131,14 +131,27 @@ def test_single_task_benchmark_clf(test_dataset): @pytest.fixture(scope="function") def test_single_task_benchmark_multi_clf(test_dataset): - train_indices = list(range(90)) - test_indices = list(range(90, 100)) + np.random.seed(111) + indices = np.arange(100) + np.random.shuffle(indices) + train_indices = indices[:80] + test_indices = indices[80:] benchmark = SingleTaskBenchmarkSpecification( name="single-task-benchmark", dataset=test_dataset, main_metric="accuracy", - metrics=["accuracy", "balanced_accuracy", "mcc", "cohen_kappa", "f1_macro", "f1_micro"], + metrics=[ + "accuracy", + "balanced_accuracy", + "mcc", + "cohen_kappa", + "f1_macro", + "f1_micro", + "roc_auc_ovr", + "roc_auc_ovo", + "pr_auc", + ], split=(train_indices, test_indices), target_cols="MULTICLASS_expt", input_cols="smiles", diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 5ec8643a..1e788df4 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -1,5 +1,5 @@ import os - +import pytest import numpy as np import pandas as pd @@ -73,7 +73,8 @@ def test_metrics_singletask_clf( ): _, test = test_single_task_benchmark_clf.get_train_test_split() predictions = np.random.randint(2, size=test.inputs.shape[0]) - result = test_single_task_benchmark_clf.evaluate(predictions) + probabilities = np.random.uniform(size=test.inputs.shape[0]) + result = test_single_task_benchmark_clf.evaluate(predictions, probabilities) for metric in test_single_task_benchmark_clf.metrics: assert metric in result.results.Metric.tolist() @@ -82,8 +83,10 @@ def test_metrics_singletask_multicls_clf( tmpdir: str, test_single_task_benchmark_multi_clf: SingleTaskBenchmarkSpecification ): _, test = test_single_task_benchmark_multi_clf.get_train_test_split() - predictions = np.random.randint(4, size=test.inputs.shape[0]) - result = test_single_task_benchmark_multi_clf.evaluate(predictions) + predictions = np.random.randint(3, size=test.inputs.shape[0]) + probablities = np.random.random(size=(test.inputs.shape[0], 3)) + probablities = probablities / probablities.sum(axis=1, keepdims=True) + result = test_single_task_benchmark_multi_clf.evaluate(predictions, probablities) for metric in test_single_task_benchmark_multi_clf.metrics: assert metric in result.results.Metric.tolist() @@ -93,7 +96,10 @@ def test_metrics_multitask_clf(tmpdir: str, test_multi_task_benchmark_clf: Multi predictions = { target_col: np.random.randint(2, size=test.inputs.shape[0]) for target_col in train.target_cols } - result = test_multi_task_benchmark_clf.evaluate(predictions) + probabilities = { + target_col: np.random.uniform(size=test.inputs.shape[0]) for target_col in train.target_cols + } + result = test_multi_task_benchmark_clf.evaluate(predictions, probabilities) assert isinstance(result.results, pd.DataFrame) assert set(result.results.columns) == { "Test set", @@ -111,4 +117,32 @@ def test_metrics_multitask_clf(tmpdir: str, test_multi_task_benchmark_clf: Multi def test_metric_direction(): for metric in Metric: - assert metric.value.direction in ["min", "max"] + assert 
metric.value.direction in ["min", "max", 1] + + +def test_absolute_average_fold_error(): + y_true = np.random.uniform(low=50, high=100, size=200) + y_pred_1 = y_true + np.random.uniform(low=0, high=5, size=200) + y_pred_2 = y_true + np.random.uniform(low=5, high=20, size=200) + y_pred_3 = y_true - 10 + y_zero = np.zeros(shape=200) + + # Optimal value + aafe_0 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_true) + assert aafe_0 == 1 + + # small fold change + aafe_1 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_pred_1) + assert aafe_1 > 1 + + # larger fold change + aafe_2 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_pred_2) + assert aafe_2 > aafe_1 + + # undershoot + aafe_3 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_pred_3) + assert aafe_3 < 1 + + # undershoot + with pytest.raises(ValueError): + aafe_4 = Metric.absolute_average_fold_error(y_true=y_zero, y_pred=y_pred_3) From d42eeecf55d11fd572e46cae5da9dba51ff9836d Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 25 Apr 2024 16:36:47 -0400 Subject: [PATCH 07/18] update docs --- docs/api/evaluation.md | 2 ++ polaris/utils/types.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/api/evaluation.md b/docs/api/evaluation.md index 6f5561fd..e69e20f9 100644 --- a/docs/api/evaluation.md +++ b/docs/api/evaluation.md @@ -5,6 +5,8 @@ ::: polaris.evaluate.MetricInfo +::: polaris.evaluate._metric.absolute_average_fold_error + --- ::: polaris.evaluate.Metric diff --git a/polaris/utils/types.py b/polaris/utils/types.py index b8093450..256224fa 100644 --- a/polaris/utils/types.py +++ b/polaris/utils/types.py @@ -71,7 +71,7 @@ This is useful for interactions with httpx and authlib, who have their own URL types. """ -DirectionType: TypeAlias = Literal["min", "max"] +DirectionType: TypeAlias = float | Literal["min", "max"] """ The direction of any variable to be sorted. This can be used to sort the metric score, indicate the optmization direction of endpoint. From 3d2ead8a2748c2a1ea6b523db872115c6656d89e Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 25 Apr 2024 16:38:55 -0400 Subject: [PATCH 08/18] ruff --- tests/test_evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 1e788df4..1ebbdb32 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -145,4 +145,4 @@ def test_absolute_average_fold_error(): # undershoot with pytest.raises(ValueError): - aafe_4 = Metric.absolute_average_fold_error(y_true=y_zero, y_pred=y_pred_3) + Metric.absolute_average_fold_error(y_true=y_zero, y_pred=y_pred_3) From 430ab31c3a1ced32d33b440f82c0083209c9eae6 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 25 Apr 2024 16:39:24 -0400 Subject: [PATCH 09/18] Update polaris/evaluate/_metric.py Co-authored-by: Cas Wognum --- polaris/evaluate/_metric.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 361fdba6..6b2f4638 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -38,14 +38,11 @@ def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray): The implementation is based on [this paper](https://pubs.acs.org/doi/10.1021/acs.chemrestox.3c00305). Args: - y_true : array-like of shape (n_samples,) - The true target values. - y_pred : array-like of shape (n_samples,) - The predicted target values. 
+ y_true: The true target values of shape (n_samples,) + y_pred: The predicted target values of shape (n_samples,). Returns: - aafe : float - The Absolute Average Fold Error. + aafe: The Absolute Average Fold Error. """ if len(y_true) != len(y_pred): raise ValueError("Length of y_true and y_pred must be the same.") From 9ddf8c8b9bc0913c7113941d8c7cc3fcf232f49f Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Thu, 25 Apr 2024 16:39:34 -0400 Subject: [PATCH 10/18] Update polaris/evaluate/_metric.py Co-authored-by: Cas Wognum --- polaris/evaluate/_metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 6b2f4638..460e06c7 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -31,7 +31,7 @@ def spearman(y_true: np.ndarray, y_pred: np.ndarray): return stats.spearmanr(y_true, y_pred).statistic -def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray): +def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray) -> float: """ Calculate the Absolute Average Fold Error (AAFE) metric. It measures the fold change between predicted values and observed values. From a61143dcaeaa602e76ef50de91eb59e48150a34c Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Mon, 29 Apr 2024 17:48:42 -0400 Subject: [PATCH 11/18] specify y_type --- docs/api/evaluation.md | 2 +- polaris/benchmark/_base.py | 15 ++++++++----- polaris/evaluate/_metric.py | 42 ++++++++++++++++++++++++------------- tests/test_evaluate.py | 6 +++--- 4 files changed, 42 insertions(+), 23 deletions(-) diff --git a/docs/api/evaluation.md b/docs/api/evaluation.md index e69e20f9..9187a763 100644 --- a/docs/api/evaluation.md +++ b/docs/api/evaluation.md @@ -11,6 +11,6 @@ ::: polaris.evaluate.Metric options: - filters: ["!^_", "!fn", "!is_multitask"] + filters: ["!^_", "!fn", "!is_multitask", "!y_type"] --- \ No newline at end of file diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 0a8e7aff..2f1f3579 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -446,17 +446,19 @@ def evaluate(self, y_pred: PredictionsType, y_prob: PredictionsType = None) -> B for test_label, y_true_subset in y_true.items(): # For every metric... 
for metric in self.metrics: - y_pred_eval = y_prob if metric.needs_probs else y_pred - if metric.is_multitask: # Multi-task but with a metric across targets - score = metric(y_true=y_true_subset, y_pred=y_pred_eval[test_label]) + score = metric( + y_true=y_true_subset, y_pred=y_pred[test_label], y_prob=y_prob.get(test_label, None) + ) scores.loc[len(scores)] = (test_label, "aggregated", metric, score) continue if not isinstance(y_true_subset, dict): # Single task - score = metric(y_true=y_true_subset, y_pred=y_pred_eval[test_label]) + score = metric( + y_true=y_true_subset, y_pred=y_pred[test_label], y_prob=y_prob.get(test_label, None) + ) scores.loc[len(scores)] = ( test_label, self.target_cols[0], @@ -472,7 +474,10 @@ def evaluate(self, y_pred: PredictionsType, y_prob: PredictionsType = None) -> B mask = ~np.isnan(y_true_target) score = metric( y_true=y_true_target[mask], - y_pred=y_pred_eval[test_label][target_label][mask], + y_pred=y_pred[test_label][target_label][mask], + y_prob=y_prob[test_label][target_label][mask] + if y_prob[test_label] is not None + else None, ) scores.loc[len(scores)] = (test_label, target_label, metric, score) diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 460e06c7..9d589733 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -1,5 +1,5 @@ from enum import Enum -from typing import Callable +from typing import Callable, Literal, Optional import numpy as np from pydantic import BaseModel, Field @@ -7,7 +7,7 @@ from sklearn.metrics import ( accuracy_score, average_precision_score, - cohen_kappa_score, + cohen_kappa_score as sk_cohen_kappa_score, explained_variance_score, f1_score, matthews_corrcoef, @@ -55,6 +55,11 @@ def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray) -> float return aafe +def cohen_kappa_score(y_true, y_pred, **kwargs): + """Scikit learn cohen_kappa_score wraper with renamed arguments""" + return sk_cohen_kappa_score(y1=y_true, y2=y_pred, **kwargs) + + class MetricInfo(BaseModel): """ Metric metadata @@ -70,7 +75,7 @@ class MetricInfo(BaseModel): is_multitask: bool = False kwargs: dict = Field(default_factory=dict) direction: DirectionType - needs_probs: bool = False + y_type: Literal["y_pred", "y_prob", "y_score"] = "y_pred" class Metric(Enum): @@ -98,21 +103,20 @@ class Metric(Enum): balanced_accuracy = MetricInfo(fn=balanced_accuracy_score, direction="max") mcc = MetricInfo(fn=matthews_corrcoef, direction="max") cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max") - pr_auc = MetricInfo(fn=average_precision_score, direction="max", needs_probs=True) + pr_auc = MetricInfo(fn=average_precision_score, direction="max", y_type="y_score") # binary only f1 = MetricInfo(fn=f1_score, kwargs={"average": "binary"}, direction="max") - # note: At the moment, multi-dimension inputs for classification are not supported - roc_auc = MetricInfo(fn=roc_auc_score, direction="max", needs_probs=True) + roc_auc = MetricInfo(fn=roc_auc_score, direction="max", y_type="y_score") # multiclass tasks only f1_macro = MetricInfo(fn=f1_score, kwargs={"average": "macro"}, direction="max") f1_micro = MetricInfo(fn=f1_score, kwargs={"average": "micro"}, direction="max") roc_auc_ovr = MetricInfo( - fn=roc_auc_score, kwargs={"multi_class": "ovr"}, direction="max", needs_probs=True + fn=roc_auc_score, kwargs={"multi_class": "ovr"}, direction="max", y_type="y_score" ) roc_auc_ovo = MetricInfo( - fn=roc_auc_score, kwargs={"multi_class": "ovo"}, direction="max", needs_probs=True + fn=roc_auc_score, 
kwargs={"multi_class": "ovo"}, direction="max", y_type="y_score" ) @property @@ -126,11 +130,11 @@ def is_multitask(self) -> bool: return self.value.is_multitask @property - def needs_probs(self) -> bool: + def y_type(self) -> bool: """Whether the metric expects preditive probablities.""" - return self.value.needs_probs + return self.value.y_type - def score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float: + def score(self, y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optional[np.ndarray] = None) -> float: """Endpoint for computing the metric. For convenience, calling a `Metric` will result in this method being called. @@ -140,8 +144,18 @@ def score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float: assert metric.score(y_true=first, y_pred=second) == metric(y_true=first, y_pred=second) ``` """ - return self.fn(y_true, y_pred, **self.value.kwargs) + # return self.fn(y_true, y_pred, **self.value.kwargs) + if y_pred is None and y_prob is None: + raise ValueError("Neither `y_pred` nor `y_prob` is specified.") + + if self.y_type == "y_pred": + pred = y_pred + else: + pred = y_prob + + kwargs = {"y_true": y_true, self.y_type: pred} + return self.fn(**kwargs, **self.value.kwargs) - def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float: + def __call__(self, y_true: np.ndarray, y_pred: np.ndarray, y_prob: np.ndarray = None) -> float: """For convenience, make metrics callable""" - return self.score(y_true, y_pred) + return self.score(y_true, y_pred, y_prob) diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 1ebbdb32..3134dfe0 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -74,7 +74,7 @@ def test_metrics_singletask_clf( _, test = test_single_task_benchmark_clf.get_train_test_split() predictions = np.random.randint(2, size=test.inputs.shape[0]) probabilities = np.random.uniform(size=test.inputs.shape[0]) - result = test_single_task_benchmark_clf.evaluate(predictions, probabilities) + result = test_single_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities) for metric in test_single_task_benchmark_clf.metrics: assert metric in result.results.Metric.tolist() @@ -86,7 +86,7 @@ def test_metrics_singletask_multicls_clf( predictions = np.random.randint(3, size=test.inputs.shape[0]) probablities = np.random.random(size=(test.inputs.shape[0], 3)) probablities = probablities / probablities.sum(axis=1, keepdims=True) - result = test_single_task_benchmark_multi_clf.evaluate(predictions, probablities) + result = test_single_task_benchmark_multi_clf.evaluate(y_pred=predictions, y_prob=probablities) for metric in test_single_task_benchmark_multi_clf.metrics: assert metric in result.results.Metric.tolist() @@ -99,7 +99,7 @@ def test_metrics_multitask_clf(tmpdir: str, test_multi_task_benchmark_clf: Multi probabilities = { target_col: np.random.uniform(size=test.inputs.shape[0]) for target_col in train.target_cols } - result = test_multi_task_benchmark_clf.evaluate(predictions, probabilities) + result = test_multi_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities) assert isinstance(result.results, pd.DataFrame) assert set(result.results.columns) == { "Test set", From acac572147cee50c962dd0bfe3ba8b57732eb9b3 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 30 Apr 2024 10:08:15 -0400 Subject: [PATCH 12/18] add todos --- polaris/benchmark/_base.py | 1 - polaris/evaluate/_metric.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 
2f1f3579..c3f82da4 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -409,7 +409,6 @@ def evaluate(self, y_pred: PredictionsType, y_prob: PredictionsType = None) -> B If there are multiple test sets, the predictions should be further wrapped in a dictionary with the test subset labels as keys. y_prob: The predicted probabilities for the test set, as NumPy arrays. - Currently only multiclass in singletask setting is supported. Returns: A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub. diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 9d589733..6aa2dc7b 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -88,7 +88,7 @@ class Metric(Enum): # TODO (cwognum): # - Any preprocessing needed? For example changing the shape / dtype? Converting from torch tensors or lists? - + # regression mean_absolute_error = MetricInfo(fn=mean_absolute_error, direction="min") mean_squared_error = MetricInfo(fn=mean_squared_error, direction="min") @@ -118,6 +118,7 @@ class Metric(Enum): roc_auc_ovo = MetricInfo( fn=roc_auc_score, kwargs={"multi_class": "ovo"}, direction="max", y_type="y_score" ) + # TODO: add metrics to handle multitask multiclass predictions. @property def fn(self) -> Callable: From 355bd67724796ea543f7a13514852b3032fe0655 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 30 Apr 2024 10:10:12 -0400 Subject: [PATCH 13/18] format --- polaris/evaluate/_metric.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 6aa2dc7b..eff562c6 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -88,7 +88,7 @@ class Metric(Enum): # TODO (cwognum): # - Any preprocessing needed? For example changing the shape / dtype? Converting from torch tensors or lists? - + # regression mean_absolute_error = MetricInfo(fn=mean_absolute_error, direction="min") mean_squared_error = MetricInfo(fn=mean_squared_error, direction="min") @@ -118,7 +118,7 @@ class Metric(Enum): roc_auc_ovo = MetricInfo( fn=roc_auc_score, kwargs={"multi_class": "ovo"}, direction="max", y_type="y_score" ) - # TODO: add metrics to handle multitask multiclass predictions. + # TODO: add metrics to handle multitask multiclass predictions. @property def fn(self) -> Callable: From d7bb39fd3af545fd0419e4a4955bca201e07dafc Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 30 Apr 2024 12:57:42 -0400 Subject: [PATCH 14/18] add more tests --- polaris/benchmark/_base.py | 2 +- polaris/evaluate/_metric.py | 11 ++++++++--- tests/test_evaluate.py | 37 ++++++++++++++++++++++++++++++++++++- 3 files changed, 45 insertions(+), 5 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index c3f82da4..145ba515 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -388,7 +388,7 @@ def _get_subset(indices, hide_targets): return train, test - def evaluate(self, y_pred: PredictionsType, y_prob: PredictionsType = None) -> BenchmarkResults: + def evaluate(self, y_pred: PredictionsType = None, y_prob: PredictionsType = None) -> BenchmarkResults: """Execute the evaluation protocol for the benchmark, given a set of predictions. info: What about `y_true`? 
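With `y_pred` now optional, a benchmark whose metrics all consume probabilities (e.g. `roc_auc`, `pr_auc`) can be scored from `y_prob` alone, while any label-based metric still requires `y_pred`. A hedged usage sketch — the `benchmark` object and the array sizes are placeholders, not part of this patch set:

import numpy as np

# Hypothetical single-task binary classification benchmark with 10 test rows.
y_pred = np.random.randint(2, size=10)   # hard labels, consumed by accuracy, f1, mcc, ...
y_prob = np.random.uniform(size=10)      # positive-class probabilities, consumed by roc_auc, pr_auc

# Passing both lets each metric pick the input it needs via `MetricInfo.y_type`:
# results = benchmark.evaluate(y_pred=y_pred, y_prob=y_prob)

# Passing only `y_prob` works as long as no configured metric has y_type == "y_pred";
# otherwise `Metric.score` raises a ValueError saying the metric requires `y_pred` input:
# results = benchmark.evaluate(y_prob=y_prob)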
diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index eff562c6..0f253a0e 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -135,7 +135,9 @@ def y_type(self) -> bool: """Whether the metric expects preditive probablities.""" return self.value.y_type - def score(self, y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optional[np.ndarray] = None) -> float: + def score( + self, y_true: np.ndarray, y_pred: np.ndarray = None, y_prob: Optional[np.ndarray] = None + ) -> float: """Endpoint for computing the metric. For convenience, calling a `Metric` will result in this method being called. @@ -145,18 +147,21 @@ def score(self, y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optional[np.ndar assert metric.score(y_true=first, y_pred=second) == metric(y_true=first, y_pred=second) ``` """ - # return self.fn(y_true, y_pred, **self.value.kwargs) if y_pred is None and y_prob is None: raise ValueError("Neither `y_pred` nor `y_prob` is specified.") if self.y_type == "y_pred": + if y_pred is None: + raise ValueError(f"{self} requires `y_pred` input. ") pred = y_pred else: + if y_prob is None: + raise ValueError(f"{self} requires `y_prob` input. ") pred = y_prob kwargs = {"y_true": y_true, self.y_type: pred} return self.fn(**kwargs, **self.value.kwargs) - def __call__(self, y_true: np.ndarray, y_pred: np.ndarray, y_prob: np.ndarray = None) -> float: + def __call__(self, y_true: np.ndarray, y_pred: np.ndarray = None, y_prob: np.ndarray = None) -> float: """For convenience, make metrics callable""" return self.score(y_true, y_pred, y_prob) diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 3134dfe0..f5fb7235 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -11,6 +11,7 @@ from polaris.evaluate._metric import Metric from polaris.evaluate._results import BenchmarkResults from polaris.utils.types import HubOwner +from polaris.dataset import Dataset def test_result_to_json(tmpdir: str, test_user_owner: HubOwner): @@ -143,6 +144,40 @@ def test_absolute_average_fold_error(): aafe_3 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_pred_3) assert aafe_3 < 1 - # undershoot + # y_true contains zeros with pytest.raises(ValueError): Metric.absolute_average_fold_error(y_true=y_zero, y_pred=y_pred_3) + + +def test_metric_y_types( + tmpdir: str, test_single_task_benchmark_clf: SingleTaskBenchmarkSpecification, test_data: Dataset +): + # here we use train split for testing purpose. + _, test = test_single_task_benchmark_clf.get_train_test_split() + predictions = np.random.randint(2, size=test.inputs.shape[0]) + probabilities = np.random.uniform(size=test.inputs.shape[0]) + test_y = test_data.loc[test.indices, "CLASS_expt"] + + # If y_pred is None and y_prob is None, an error is thrown. + with pytest.raises(ValueError, match="Neither `y_pred` nor `y_prob` is specified."): + test_single_task_benchmark_clf.evaluate() + + # If y_type == "y_pred" and y_pred is None, an error is thrown. + with pytest.raises(ValueError, match="Metric.accuracy requires `y_pred` input"): + test_single_task_benchmark_clf.metrics = [Metric.accuracy] + test_single_task_benchmark_clf.evaluate(y_prob=probabilities) + + # If y_type != "y_pred" and y_prob is None, an error is thrown. 
+ with pytest.raises(ValueError, match="Metric.roc_auc requires `y_prob` input"): + test_single_task_benchmark_clf.metrics = [Metric.roc_auc] + test_single_task_benchmark_clf.evaluate(y_pred=predictions) + + # If y_type != "y_pred" and y_pred is not None and y_prob is not None, it uses y_prob as expected! + test_single_task_benchmark_clf.metrics = [Metric.roc_auc] + result = test_single_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities) + assert result.results.Score.values[0] == Metric.roc_auc(y_true=test_y, y_prob=probabilities) + + # If y_type == "y_pred" and y_pred is not None and y_prob is not None, it uses y_pred as expected! + test_single_task_benchmark_clf.metrics = [Metric.f1] + result = test_single_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities) + assert result.results.Score.values[0] == Metric.f1(y_true=test_y, y_pred=predictions) From 0216ff6a6ce4fbe27a0e55984bf75ec94c3afb74 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 30 Apr 2024 13:00:04 -0400 Subject: [PATCH 15/18] Update polaris/benchmark/_base.py Co-authored-by: Cas Wognum --- polaris/benchmark/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 145ba515..3df5e0ea 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -448,7 +448,7 @@ def evaluate(self, y_pred: PredictionsType = None, y_prob: PredictionsType = Non if metric.is_multitask: # Multi-task but with a metric across targets score = metric( - y_true=y_true_subset, y_pred=y_pred[test_label], y_prob=y_prob.get(test_label, None) + y_true=y_true_subset, y_pred=y_pred[test_label], y_prob=y_prob.get(test_label) ) scores.loc[len(scores)] = (test_label, "aggregated", metric, score) continue From e0aa6c256ddee6c16f0377961b83a35245af57bd Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 30 Apr 2024 14:10:46 -0400 Subject: [PATCH 16/18] more tests --- polaris/benchmark/_base.py | 10 ++++++---- tests/conftest.py | 20 ++++++++++++++++++++ tests/test_integration.py | 28 +++++++++++++++++++++++++++- 3 files changed, 53 insertions(+), 5 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 3df5e0ea..3c18d875 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -433,7 +433,7 @@ def evaluate(self, y_pred: PredictionsType = None, y_prob: PredictionsType = Non if not isinstance(y_prob, dict) or all(k in self.target_cols for k in y_prob): y_prob = {"test": y_prob} - if any(k not in y_pred for k in test.keys()): + if any(k not in y_pred for k in test.keys()) and any(k not in y_prob for k in test.keys()): raise KeyError( f"Missing keys for at least one of the test sets. 
Expecting: {sorted(test.keys())}" ) @@ -448,7 +448,7 @@ def evaluate(self, y_pred: PredictionsType = None, y_prob: PredictionsType = Non if metric.is_multitask: # Multi-task but with a metric across targets score = metric( - y_true=y_true_subset, y_pred=y_pred[test_label], y_prob=y_prob.get(test_label) + y_true=y_true_subset, y_pred=y_pred.get(test_label), y_prob=y_prob.get(test_label) ) scores.loc[len(scores)] = (test_label, "aggregated", metric, score) continue @@ -456,7 +456,7 @@ def evaluate(self, y_pred: PredictionsType = None, y_prob: PredictionsType = Non if not isinstance(y_true_subset, dict): # Single task score = metric( - y_true=y_true_subset, y_pred=y_pred[test_label], y_prob=y_prob.get(test_label, None) + y_true=y_true_subset, y_pred=y_pred.get(test_label), y_prob=y_prob.get(test_label) ) scores.loc[len(scores)] = ( test_label, @@ -473,7 +473,9 @@ def evaluate(self, y_pred: PredictionsType = None, y_prob: PredictionsType = Non mask = ~np.isnan(y_true_target) score = metric( y_true=y_true_target[mask], - y_pred=y_pred[test_label][target_label][mask], + y_pred=y_pred[test_label][target_label][mask] + if y_pred[test_label] is not None + else None, y_prob=y_prob[test_label][target_label][mask] if y_prob[test_label] is not None else None, diff --git a/tests/conftest.py b/tests/conftest.py index 545cb26c..aa02c3f7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -185,6 +185,26 @@ def test_single_task_benchmark_multiple_test_sets(test_dataset): return benchmark +@pytest.fixture(scope="function") +def test_single_task_benchmark_clf_multiple_test_sets(test_dataset): + np.random.seed(111) # make sure two classes in `y_true` + indices = np.arange(100) + np.random.shuffle(indices) + train_indices = indices[:80] + test_indices = {"test_1": indices[80:90], "test_2": indices[90:]} + benchmark = SingleTaskBenchmarkSpecification( + name="single-task-benchmark-clf", + dataset=test_dataset, + metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa"], + main_metric="pr_auc", + split=(train_indices, test_indices), + target_cols="CLASS_calc", + input_cols="smiles", + ) + check_version(benchmark) + return benchmark + + @pytest.fixture(scope="function") def test_multi_task_benchmark(test_dataset): # For the sake of simplicity, just use a small set of indices diff --git a/tests/test_integration.py b/tests/test_integration.py index 7b96a4e3..f0de3fcc 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,6 +1,7 @@ import datamol as dm import numpy as np -from sklearn.ensemble import RandomForestRegressor +from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier +from sklearn.svm import SVC from polaris.evaluate import BenchmarkResults @@ -41,6 +42,31 @@ def test_single_task_benchmark_loop_with_multiple_test_sets(test_single_task_ben assert isinstance(scores, BenchmarkResults) +def test_single_task_benchmark_clf_loop_with_multiple_test_sets( + test_single_task_benchmark_clf_multiple_test_sets, +): + """Tests the integrated API for a single-task benchmark for classification probabilities with multiple test sets.""" + train, test = test_single_task_benchmark_clf_multiple_test_sets.get_train_test_split() + + smiles, y = train.as_array("xy") + + x_train = np.array([dm.to_fp(dm.to_mol(smi)) for smi in smiles]) + + model = RandomForestClassifier() + model.fit(X=x_train, y=y) + + y_prob = {} + y_pred = {} + for k, test_subset in test.items(): + print(k, test_subset) + x_test = np.array([dm.to_fp(dm.to_mol(smi)) for smi in test_subset.inputs]) + 
y_prob[k] = model.predict_proba(x_test)[:, :1] # for binary classification + y_pred[k] = model.predict(x_test) + + scores = test_single_task_benchmark_clf_multiple_test_sets.evaluate(y_prob=y_prob, y_pred=y_pred) + assert isinstance(scores, BenchmarkResults) + + def test_multi_task_benchmark_loop(test_multi_task_benchmark): """Tests the integrated API for a multi-task benchmark.""" train, test = test_multi_task_benchmark.get_train_test_split() From 6985976e8dc43be20ec10d04ca7b5d9a9b057e9d Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 30 Apr 2024 14:12:55 -0400 Subject: [PATCH 17/18] lint --- tests/test_integration.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index f0de3fcc..5d9a983b 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,7 +1,6 @@ import datamol as dm import numpy as np from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier -from sklearn.svm import SVC from polaris.evaluate import BenchmarkResults From 0848f1c23637b4343f8f2f259a1f1491ec83f91a Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Tue, 30 Apr 2024 14:42:43 -0400 Subject: [PATCH 18/18] minor changes --- polaris/benchmark/_base.py | 4 +++- polaris/evaluate/_metric.py | 6 ++++-- tests/test_evaluate.py | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 3c18d875..70fd02da 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -388,7 +388,9 @@ def _get_subset(indices, hide_targets): return train, test - def evaluate(self, y_pred: PredictionsType = None, y_prob: PredictionsType = None) -> BenchmarkResults: + def evaluate( + self, y_pred: Optional[PredictionsType] = None, y_prob: Optional[PredictionsType] = None + ) -> BenchmarkResults: """Execute the evaluation protocol for the benchmark, given a set of predictions. info: What about `y_true`? diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py index 0f253a0e..eee9cd27 100644 --- a/polaris/evaluate/_metric.py +++ b/polaris/evaluate/_metric.py @@ -136,7 +136,7 @@ def y_type(self) -> bool: return self.value.y_type def score( - self, y_true: np.ndarray, y_pred: np.ndarray = None, y_prob: Optional[np.ndarray] = None + self, y_true: np.ndarray, y_pred: Optional[np.ndarray] = None, y_prob: Optional[np.ndarray] = None ) -> float: """Endpoint for computing the metric. @@ -162,6 +162,8 @@ def score( kwargs = {"y_true": y_true, self.y_type: pred} return self.fn(**kwargs, **self.value.kwargs) - def __call__(self, y_true: np.ndarray, y_pred: np.ndarray = None, y_prob: np.ndarray = None) -> float: + def __call__( + self, y_true: np.ndarray, y_pred: Optional[np.ndarray] = None, y_prob: Optional[np.ndarray] = None + ) -> float: """For convenience, make metrics callable""" return self.score(y_true, y_pred, y_prob) diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index f5fb7235..ecefa97c 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -175,9 +175,9 @@ def test_metric_y_types( # If y_type != "y_pred" and y_pred is not None and y_prob is not None, it uses y_prob as expected! 
test_single_task_benchmark_clf.metrics = [Metric.roc_auc] result = test_single_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities) - assert result.results.Score.values[0] == Metric.roc_auc(y_true=test_y, y_prob=probabilities) + assert result.results.Score.values[0] == Metric.roc_auc.fn(y_true=test_y, y_score=probabilities) # If y_type == "y_pred" and y_pred is not None and y_prob is not None, it uses y_pred as expected! test_single_task_benchmark_clf.metrics = [Metric.f1] result = test_single_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities) - assert result.results.Score.values[0] == Metric.f1(y_true=test_y, y_pred=predictions) + assert result.results.Score.values[0] == Metric.f1.fn(y_true=test_y, y_pred=predictions)
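Taken together, the series routes hard predictions and probabilities to the underlying scikit-learn functions through `MetricInfo.y_type`. A minimal sketch of that routing at the `Metric` level, using made-up labels and scores rather than values from the tests:

import numpy as np

from polaris.evaluate._metric import Metric

y_true = np.array([0, 1, 1, 0])
y_pred = np.array([0, 1, 0, 0])          # hard class labels
y_prob = np.array([0.1, 0.8, 0.4, 0.3])  # positive-class probabilities

# y_type == "y_pred": the label input is forwarded to sklearn as `y_pred`.
acc = Metric.accuracy(y_true=y_true, y_pred=y_pred)

# y_type == "y_score": the probabilities are forwarded to sklearn as `y_score`.
auc = Metric.roc_auc(y_true=y_true, y_prob=y_prob)

# Supplying neither input raises a ValueError, as exercised in test_metric_y_types above.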