diff --git a/docs/api/evaluation.md b/docs/api/evaluation.md
index 6f5561fd..9187a763 100644
--- a/docs/api/evaluation.md
+++ b/docs/api/evaluation.md
@@ -5,10 +5,12 @@

 ::: polaris.evaluate.MetricInfo

+::: polaris.evaluate._metric.absolute_average_fold_error
+
 ---

 ::: polaris.evaluate.Metric
     options:
-        filters: ["!^_", "!fn", "!is_multitask"]
+        filters: ["!^_", "!fn", "!is_multitask", "!y_type"]

 ---
\ No newline at end of file
diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py
index 0d4f12ed..70fd02da 100644
--- a/polaris/benchmark/_base.py
+++ b/polaris/benchmark/_base.py
@@ -388,7 +388,9 @@ def _get_subset(indices, hide_targets):

         return train, test

-    def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
+    def evaluate(
+        self, y_pred: Optional[PredictionsType] = None, y_prob: Optional[PredictionsType] = None
+    ) -> BenchmarkResults:
         """Execute the evaluation protocol for the benchmark, given a set of predictions.

         info: What about `y_true`?
@@ -408,6 +410,7 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
                 If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys.
                 If there are multiple test sets, the predictions should be further wrapped in a dictionary
                 with the test subset labels as keys.
+            y_prob: The predicted probabilities for the test set, as NumPy arrays.

         Returns:
             A `BenchmarkResults` object. This object can be directly submitted to the Polaris Hub.
@@ -429,7 +432,10 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
         if not isinstance(y_pred, dict) or all(k in self.target_cols for k in y_pred):
             y_pred = {"test": y_pred}

-        if any(k not in y_pred for k in test.keys()):
+        if not isinstance(y_prob, dict) or all(k in self.target_cols for k in y_prob):
+            y_prob = {"test": y_prob}
+
+        if any(k not in y_pred for k in test.keys()) and any(k not in y_prob for k in test.keys()):
            raise KeyError(
                f"Missing keys for at least one of the test sets. Expecting: {sorted(test.keys())}"
            )
@@ -443,13 +449,17 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
            for metric in self.metrics:
                if metric.is_multitask:
                    # Multi-task but with a metric across targets
-                    score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
+                    score = metric(
+                        y_true=y_true_subset, y_pred=y_pred.get(test_label), y_prob=y_prob.get(test_label)
+                    )
                    scores.loc[len(scores)] = (test_label, "aggregated", metric, score)
                    continue

                if not isinstance(y_true_subset, dict):
                    # Single task
-                    score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
+                    score = metric(
+                        y_true=y_true_subset, y_pred=y_pred.get(test_label), y_prob=y_prob.get(test_label)
+                    )
                    scores.loc[len(scores)] = (
                        test_label,
                        self.target_cols[0],
@@ -465,7 +475,12 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
                    mask = ~np.isnan(y_true_target)
                    score = metric(
                        y_true=y_true_target[mask],
-                        y_pred=y_pred[test_label][target_label][mask],
+                        y_pred=y_pred[test_label][target_label][mask]
+                        if y_pred[test_label] is not None
+                        else None,
+                        y_prob=y_prob[test_label][target_label][mask]
+                        if y_prob[test_label] is not None
+                        else None,
                    )
                    scores.loc[len(scores)] = (test_label, target_label, metric, score)

diff --git a/polaris/evaluate/_metric.py b/polaris/evaluate/_metric.py
index 57dfd0ee..eee9cd27 100644
--- a/polaris/evaluate/_metric.py
+++ b/polaris/evaluate/_metric.py
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Callable
+from typing import Callable, Literal, Optional

 import numpy as np
 from pydantic import BaseModel, Field
@@ -7,7 +7,7 @@
 from sklearn.metrics import (
     accuracy_score,
     average_precision_score,
-    cohen_kappa_score,
+    cohen_kappa_score as sk_cohen_kappa_score,
     explained_variance_score,
     f1_score,
     matthews_corrcoef,
@@ -15,6 +15,7 @@
     mean_squared_error,
     r2_score,
     roc_auc_score,
+    balanced_accuracy_score,
 )

 from polaris.utils.types import DirectionType
@@ -30,6 +31,35 @@ def spearman(y_true: np.ndarray, y_pred: np.ndarray):
     return stats.spearmanr(y_true, y_pred).statistic


+def absolute_average_fold_error(y_true: np.ndarray, y_pred: np.ndarray) -> float:
+    """
+    Calculate the Absolute Average Fold Error (AAFE) metric.
+    It measures the fold change between predicted values and observed values.
+    The implementation is based on [this paper](https://pubs.acs.org/doi/10.1021/acs.chemrestox.3c00305).
+
+    Args:
+        y_true: The true target values of shape (n_samples,)
+        y_pred: The predicted target values of shape (n_samples,).
+
+    Returns:
+        aafe: The Absolute Average Fold Error.
+    """
+    if len(y_true) != len(y_pred):
+        raise ValueError("Length of y_true and y_pred must be the same.")
+
+    if np.any(y_true == 0):
+        raise ValueError("`y_true` contains zero, which would result in an `Inf` value.")
+
+    aafe = np.mean(np.abs(y_pred) / np.abs(y_true))
+
+    return aafe
+
+
+def cohen_kappa_score(y_true, y_pred, **kwargs):
+    """Scikit-learn `cohen_kappa_score` wrapper with renamed arguments."""
+    return sk_cohen_kappa_score(y1=y_true, y2=y_pred, **kwargs)
+
+
 class MetricInfo(BaseModel):
     """
     Metric metadata
@@ -45,6 +75,7 @@ class MetricInfo(BaseModel):
     is_multitask: bool = False
     kwargs: dict = Field(default_factory=dict)
     direction: DirectionType
+    y_type: Literal["y_pred", "y_prob", "y_score"] = "y_pred"


 class Metric(Enum):
@@ -65,17 +96,29 @@ class Metric(Enum):
     pearsonr = MetricInfo(fn=pearsonr, direction="max")
     spearmanr = MetricInfo(fn=spearman, direction="max")
     explained_var = MetricInfo(fn=explained_variance_score, direction="max")
+    absolute_average_fold_error = MetricInfo(fn=absolute_average_fold_error, direction=1)

-    # classification
+    # binary and multiclass classification
     accuracy = MetricInfo(fn=accuracy_score, direction="max")
+    balanced_accuracy = MetricInfo(fn=balanced_accuracy_score, direction="max")
+    mcc = MetricInfo(fn=matthews_corrcoef, direction="max")
+    cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max")
+    pr_auc = MetricInfo(fn=average_precision_score, direction="max", y_type="y_score")
+
+    # binary only
     f1 = MetricInfo(fn=f1_score, kwargs={"average": "binary"}, direction="max")
+    roc_auc = MetricInfo(fn=roc_auc_score, direction="max", y_type="y_score")
+
+    # multiclass tasks only
     f1_macro = MetricInfo(fn=f1_score, kwargs={"average": "macro"}, direction="max")
     f1_micro = MetricInfo(fn=f1_score, kwargs={"average": "micro"}, direction="max")
-    roc_auc = MetricInfo(fn=roc_auc_score, direction="max")
-    pr_auc = MetricInfo(fn=average_precision_score, direction="max")
-    mcc = MetricInfo(fn=matthews_corrcoef, direction="max")
-    cohen_kappa = MetricInfo(fn=cohen_kappa_score, direction="max")
-    # TODO: adding metrics for multiclass tasks
+    roc_auc_ovr = MetricInfo(
+        fn=roc_auc_score, kwargs={"multi_class": "ovr"}, direction="max", y_type="y_score"
+    )
+    roc_auc_ovo = MetricInfo(
+        fn=roc_auc_score, kwargs={"multi_class": "ovo"}, direction="max", y_type="y_score"
+    )
+    # TODO: add metrics to handle multitask multiclass predictions.

     @property
     def fn(self) -> Callable:
@@ -87,7 +130,14 @@ def is_multitask(self) -> bool:
         """Whether the metric expects a single set of predictions or a dict of predictions."""
         return self.value.is_multitask

-    def score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
+    @property
+    def y_type(self) -> str:
+        """The type of input the metric expects: `y_pred` (predictions) or `y_prob`/`y_score` (predicted probabilities)."""
+        return self.value.y_type
+
+    def score(
+        self, y_true: np.ndarray, y_pred: Optional[np.ndarray] = None, y_prob: Optional[np.ndarray] = None
+    ) -> float:
         """Endpoint for computing the metric.

         For convenience, calling a `Metric` will result in this method being called.
@@ -97,8 +147,23 @@ def score(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
         assert metric.score(y_true=first, y_pred=second) == metric(y_true=first, y_pred=second)
         ```
         """
-        return self.fn(y_true, y_pred, **self.value.kwargs)
-
-    def __call__(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
+        if y_pred is None and y_prob is None:
+            raise ValueError("Neither `y_pred` nor `y_prob` is specified.")
+
+        if self.y_type == "y_pred":
+            if y_pred is None:
+                raise ValueError(f"{self} requires `y_pred` input.")
+            pred = y_pred
+        else:
+            if y_prob is None:
+                raise ValueError(f"{self} requires `y_prob` input.")
+            pred = y_prob
+
+        kwargs = {"y_true": y_true, self.y_type: pred}
+        return self.fn(**kwargs, **self.value.kwargs)
+
+    def __call__(
+        self, y_true: np.ndarray, y_pred: Optional[np.ndarray] = None, y_prob: Optional[np.ndarray] = None
+    ) -> float:
         """For convenience, make metrics callable"""
-        return self.score(y_true, y_pred)
+        return self.score(y_true, y_pred, y_prob)
diff --git a/polaris/utils/types.py b/polaris/utils/types.py
index b8093450..256224fa 100644
--- a/polaris/utils/types.py
+++ b/polaris/utils/types.py
@@ -71,7 +71,7 @@
 This is useful for interactions with httpx and authlib, who have their own URL types.
 """

-DirectionType: TypeAlias = Literal["min", "max"]
+DirectionType: TypeAlias = float | Literal["min", "max"]
 """
 The direction of any variable to be sorted.
 This can be used to sort the metric score, indicate the optmization direction of endpoint.
diff --git a/tests/conftest.py b/tests/conftest.py
index 1ebc0a02..aa02c3f7 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -23,6 +23,8 @@ def test_data():
     # set an abitrary threshold for testing purpose.
     data["CLASS_expt"] = data["expt"].gt(0).astype(int).values
     data["CLASS_calc"] = data["calc"].gt(0).astype(int).values
+    data["MULTICLASS_expt"] = np.random.randint(low=0, high=3, size=data.shape[0])
+    data["MULTICLASS_calc"] = np.random.randint(low=0, high=3, size=data.shape[0])
     return data


@@ -99,6 +101,7 @@ def test_single_task_benchmark(test_dataset):
             "spearmanr",
             "pearsonr",
             "explained_var",
+            "absolute_average_fold_error",
         ],
         main_metric="mean_absolute_error",
         split=(train_indices, test_indices),
@@ -117,7 +120,7 @@ def test_single_task_benchmark_clf(test_dataset):
         name="single-task-benchmark",
         dataset=test_dataset,
         main_metric="accuracy",
-        metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa"],
+        metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa", "balanced_accuracy"],
         split=(train_indices, test_indices),
         target_cols="CLASS_expt",
         input_cols="smiles",
@@ -126,6 +129,37 @@
     return benchmark


+@pytest.fixture(scope="function")
+def test_single_task_benchmark_multi_clf(test_dataset):
+    np.random.seed(111)
+    indices = np.arange(100)
+    np.random.shuffle(indices)
+    train_indices = indices[:80]
+    test_indices = indices[80:]
+
+    benchmark = SingleTaskBenchmarkSpecification(
+        name="single-task-benchmark",
+        dataset=test_dataset,
+        main_metric="accuracy",
+        metrics=[
+            "accuracy",
+            "balanced_accuracy",
+            "mcc",
+            "cohen_kappa",
+            "f1_macro",
+            "f1_micro",
+            "roc_auc_ovr",
+            "roc_auc_ovo",
+            "pr_auc",
+        ],
+        split=(train_indices, test_indices),
+        target_cols="MULTICLASS_expt",
+        input_cols="smiles",
+    )
+    check_version(benchmark)
+    return benchmark
+
+
 @pytest.fixture(scope="function")
 def test_single_task_benchmark_multiple_test_sets(test_dataset):
     train_indices = list(range(90))
@@ -140,6 +174,7 @@ def test_single_task_benchmark_multiple_test_sets(test_dataset):
             "spearmanr",
             "pearsonr",
             "explained_var",
+            "absolute_average_fold_error",
         ],
         main_metric="r2",
         split=(train_indices, test_indices),
@@ -150,6 +185,26 @@
     return benchmark


+@pytest.fixture(scope="function")
+def test_single_task_benchmark_clf_multiple_test_sets(test_dataset):
+    np.random.seed(111)  # make sure both classes are present in `y_true`
+    indices = np.arange(100)
+    np.random.shuffle(indices)
+    train_indices = indices[:80]
+    test_indices = {"test_1": indices[80:90], "test_2": indices[90:]}
+    benchmark = SingleTaskBenchmarkSpecification(
+        name="single-task-benchmark-clf",
+        dataset=test_dataset,
+        metrics=["accuracy", "f1", "roc_auc", "pr_auc", "mcc", "cohen_kappa"],
+        main_metric="pr_auc",
+        split=(train_indices, test_indices),
+        target_cols="CLASS_calc",
+        input_cols="smiles",
+    )
+    check_version(benchmark)
+    return benchmark
+
+
 @pytest.fixture(scope="function")
 def test_multi_task_benchmark(test_dataset):
     # For the sake of simplicity, just use a small set of indices
@@ -166,6 +221,7 @@ def test_multi_task_benchmark(test_dataset):
             "spearmanr",
             "pearsonr",
             "explained_var",
+            "absolute_average_fold_error",
         ],
         split=(train_indices, test_indices),
         target_cols=["expt", "calc"],
diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
index 5a4332c9..ecefa97c 100644
--- a/tests/test_evaluate.py
+++ b/tests/test_evaluate.py
@@ -1,5 +1,5 @@
 import os
-
+import pytest
 import numpy as np
 import pandas as pd

@@ -11,6 +11,7 @@
 from polaris.evaluate._metric import Metric
 from polaris.evaluate._results import BenchmarkResults
 from polaris.utils.types import HubOwner
+from polaris.dataset import Dataset


 def test_result_to_json(tmpdir: str, test_user_owner: HubOwner):
@@ -73,17 +74,33 @@ def test_metrics_singletask_clf(
 ):
     _, test = test_single_task_benchmark_clf.get_train_test_split()
     predictions = np.random.randint(2, size=test.inputs.shape[0])
-    result = test_single_task_benchmark_clf.evaluate(predictions)
+    probabilities = np.random.uniform(size=test.inputs.shape[0])
+    result = test_single_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities)
     for metric in test_single_task_benchmark_clf.metrics:
         assert metric in result.results.Metric.tolist()


+def test_metrics_singletask_multicls_clf(
+    tmpdir: str, test_single_task_benchmark_multi_clf: SingleTaskBenchmarkSpecification
+):
+    _, test = test_single_task_benchmark_multi_clf.get_train_test_split()
+    predictions = np.random.randint(3, size=test.inputs.shape[0])
+    probabilities = np.random.random(size=(test.inputs.shape[0], 3))
+    probabilities = probabilities / probabilities.sum(axis=1, keepdims=True)
+    result = test_single_task_benchmark_multi_clf.evaluate(y_pred=predictions, y_prob=probabilities)
+    for metric in test_single_task_benchmark_multi_clf.metrics:
+        assert metric in result.results.Metric.tolist()
+
+
 def test_metrics_multitask_clf(tmpdir: str, test_multi_task_benchmark_clf: MultiTaskBenchmarkSpecification):
     train, test = test_multi_task_benchmark_clf.get_train_test_split()
     predictions = {
         target_col: np.random.randint(2, size=test.inputs.shape[0]) for target_col in train.target_cols
     }
-    result = test_multi_task_benchmark_clf.evaluate(predictions)
+    probabilities = {
+        target_col: np.random.uniform(size=test.inputs.shape[0]) for target_col in train.target_cols
+    }
+    result = test_multi_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities)
     assert isinstance(result.results, pd.DataFrame)
     assert set(result.results.columns) == {
         "Test set",
@@ -101,4 +118,66 @@ def test_metrics_multitask_clf(tmpdir: str, test_multi_task_benchmark_clf: Multi

 def test_metric_direction():
     for metric in Metric:
-        assert metric.value.direction in ["min", "max"]
+        assert metric.value.direction in ["min", "max", 1]
+
+
+def test_absolute_average_fold_error():
+    y_true = np.random.uniform(low=50, high=100, size=200)
+    y_pred_1 = y_true + np.random.uniform(low=0, high=5, size=200)
+    y_pred_2 = y_true + np.random.uniform(low=5, high=20, size=200)
+    y_pred_3 = y_true - 10
+    y_zero = np.zeros(shape=200)
+
+    # Optimal value
+    aafe_0 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_true)
+    assert aafe_0 == 1
+
+    # Small fold change
+    aafe_1 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_pred_1)
+    assert aafe_1 > 1
+
+    # Larger fold change
+    aafe_2 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_pred_2)
+    assert aafe_2 > aafe_1
+
+    # Undershoot
+    aafe_3 = Metric.absolute_average_fold_error(y_true=y_true, y_pred=y_pred_3)
+    assert aafe_3 < 1
+
+    # y_true contains zeros
+    with pytest.raises(ValueError):
+        Metric.absolute_average_fold_error(y_true=y_zero, y_pred=y_pred_3)
+
+
+def test_metric_y_types(
+    tmpdir: str, test_single_task_benchmark_clf: SingleTaskBenchmarkSpecification, test_data: Dataset
+):
+    # Use the benchmark's test split and the raw data as ground truth for this test.
+    _, test = test_single_task_benchmark_clf.get_train_test_split()
+    predictions = np.random.randint(2, size=test.inputs.shape[0])
+    probabilities = np.random.uniform(size=test.inputs.shape[0])
+    test_y = test_data.loc[test.indices, "CLASS_expt"]
+
+    # If y_pred is None and y_prob is None, an error is thrown.
+    with pytest.raises(ValueError, match="Neither `y_pred` nor `y_prob` is specified."):
+        test_single_task_benchmark_clf.evaluate()
+
+    # If y_type == "y_pred" and y_pred is None, an error is thrown.
+    with pytest.raises(ValueError, match="Metric.accuracy requires `y_pred` input"):
+        test_single_task_benchmark_clf.metrics = [Metric.accuracy]
+        test_single_task_benchmark_clf.evaluate(y_prob=probabilities)
+
+    # If y_type != "y_pred" and y_prob is None, an error is thrown.
+    with pytest.raises(ValueError, match="Metric.roc_auc requires `y_prob` input"):
+        test_single_task_benchmark_clf.metrics = [Metric.roc_auc]
+        test_single_task_benchmark_clf.evaluate(y_pred=predictions)
+
+    # If y_type != "y_pred" and both y_pred and y_prob are given, y_prob is used as expected.
+    test_single_task_benchmark_clf.metrics = [Metric.roc_auc]
+    result = test_single_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities)
+    assert result.results.Score.values[0] == Metric.roc_auc.fn(y_true=test_y, y_score=probabilities)
+
+    # If y_type == "y_pred" and both y_pred and y_prob are given, y_pred is used as expected.
+    test_single_task_benchmark_clf.metrics = [Metric.f1]
+    result = test_single_task_benchmark_clf.evaluate(y_pred=predictions, y_prob=probabilities)
+    assert result.results.Score.values[0] == Metric.f1.fn(y_true=test_y, y_pred=predictions)
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 7b96a4e3..5d9a983b 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -1,6 +1,6 @@
 import datamol as dm
 import numpy as np
-from sklearn.ensemble import RandomForestRegressor
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

 from polaris.evaluate import BenchmarkResults

@@ -41,6 +41,31 @@ def test_single_task_benchmark_loop_with_multiple_test_sets(test_single_task_ben
     assert isinstance(scores, BenchmarkResults)


+def test_single_task_benchmark_clf_loop_with_multiple_test_sets(
+    test_single_task_benchmark_clf_multiple_test_sets,
+):
+    """Tests the integrated API for a single-task classification benchmark with probabilities and multiple test sets."""
+    train, test = test_single_task_benchmark_clf_multiple_test_sets.get_train_test_split()
+
+    smiles, y = train.as_array("xy")
+
+    x_train = np.array([dm.to_fp(dm.to_mol(smi)) for smi in smiles])
+
+    model = RandomForestClassifier()
+    model.fit(X=x_train, y=y)
+
+    y_prob = {}
+    y_pred = {}
+    for k, test_subset in test.items():
+        print(k, test_subset)
+        x_test = np.array([dm.to_fp(dm.to_mol(smi)) for smi in test_subset.inputs])
+        y_prob[k] = model.predict_proba(x_test)[:, 1]  # probability of the positive class
+        y_pred[k] = model.predict(x_test)
+
+    scores = test_single_task_benchmark_clf_multiple_test_sets.evaluate(y_prob=y_prob, y_pred=y_pred)
+    assert isinstance(scores, BenchmarkResults)
+
+
 def test_multi_task_benchmark_loop(test_multi_task_benchmark):
     """Tests the integrated API for a multi-task benchmark."""
     train, test = test_multi_task_benchmark.get_train_test_split()
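For context, a minimal usage sketch of the workflow this patch enables, assuming a hypothetical binary-classification benchmark (the slug and the random model outputs below are illustrative only, not taken from the patch): `evaluate` now accepts `y_pred` and/or `y_prob`, and each metric consumes whichever input matches its `y_type`.

import numpy as np
import polaris as po

# Hypothetical slug; any binary classification benchmark would work here.
benchmark = po.load_benchmark("org/binary-clf-benchmark")
train, test = benchmark.get_train_test_split()

# Stand-in model outputs; a real model would compute these from test.inputs.
n = test.inputs.shape[0]
y_prob = np.random.uniform(size=n)   # positive-class probabilities, consumed by roc_auc / pr_auc
y_pred = (y_prob > 0.5).astype(int)  # hard labels, consumed by accuracy, f1, mcc, ...

results = benchmark.evaluate(y_pred=y_pred, y_prob=y_prob)
print(results)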