In [1]:
import mlflow
from mlflow.metrics import EvaluationMetric
from mlflow.metrics import MetricValue
import pandas as pd
from typing import Dict
from typing import Union
import random

In [2]:
# Select the experiment that every run in this notebook is logged under.
# mlflow.set_experiment activates the experiment with this name and creates it
# first when it does not exist yet, so no separate create/lookup step (and no
# error-swallowing bare `except:`) is needed. Real failures, e.g. an
# unreachable tracking server, now surface instead of being hidden.
experiment_name = "custom-metrics"
experiment = mlflow.set_experiment(experiment_name)

In [None]:
def custom_accuracy(predictions: pd.Series, targets: pd.Series, metrics: Dict[str, MetricValue] = None) -> Union[float, MetricValue]:
    

SyntaxError: incomplete input (1704088905.py, line 1)

In [3]:
def custom_accuracy(predictions: pd.Series, targets: pd.Series, metrics: Dict[str, MetricValue]) -> Union[float, MetricValue]:
    """Return the fraction of predictions that equal their target.

    Args:
        predictions: Predicted labels.
        targets: Ground-truth labels, compared element-wise with ``predictions``.
        metrics: Previously computed metrics; accepted to match the metric
            function signature but not read here.

    Returns:
        Number of matching positions divided by ``len(targets)``.
    """
    n_correct = (predictions == targets).sum()
    return n_correct / len(targets)

In [4]:
# Synthetic binary-classification data: two independent columns of random 0/1
# labels. A dedicated, seeded generator makes the frame (and every metric
# derived from it) identical on each Restart & Run All, without touching the
# global `random` state used elsewhere.
SEED = 42
N_ROWS = 100
rng = random.Random(SEED)

test_df = pd.DataFrame({
    "predictions": [rng.randint(0, 1) for _ in range(N_ROWS)],
    "targets": [rng.randint(0, 1) for _ in range(N_ROWS)],
})
test_df.head()

Unnamed: 0,predictions,targets
0,1,1
1,1,1
2,0,1
3,1,0
4,1,0


In [5]:
# Sanity-check the metric function directly on the frame's columns before
# handing it to MLflow. custom_accuracy does not read its `metrics` argument,
# so an empty dict is passed.
custom_accuracy(test_df["predictions"], test_df["targets"], {})

0.48

`EvaluationMetric` wraps the logic of my function so that `mlflow.evaluate` can compute it.

In [6]:
# Wrap custom_accuracy in an EvaluationMetric so it can be passed to
# mlflow.evaluate through `extra_metrics`.
my_accuracy = EvaluationMetric(
    name="custom_accuracy",
    version="v0.0.1",
    eval_fn=custom_accuracy,
    greater_is_better=True,
    metric_details="Custom accuracy metric",
    metric_metadata={"version": "0.0.1", "type": "custom"},
)

In [7]:
# Evaluate the static "predictions" column of test_df against its "targets"
# column as a classifier, adding the custom accuracy metric.
results = mlflow.evaluate(
    data=test_df,
    targets="targets",
    predictions="predictions",
    model_type="classifier",
    extra_metrics=[my_accuracy],
)

2025/09/16 14:42:44 INFO mlflow.models.evaluation.evaluators.classifier: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2025/09/16 14:42:44 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2025/09/16 14:42:51 INFO mlflow.models.evaluation.evaluators.shap: Shap explainer ExactExplainer is used.


<Figure size 1050x700 with 0 Axes>

In [8]:
# Display the metrics attached to the evaluation result.
results.metrics

{'true_negatives': 27,
 'false_positives': 24,
 'false_negatives': 28,
 'true_positives': 21,
 'example_count': 100,
 'accuracy_score': 0.48,
 'recall_score': 0.42857142857142855,
 'precision_score': 0.4666666666666667,
 'f1_score': 0.44680851063829785,
 'custom_accuracy/v0.0.1': 0.48}

Calculate a metric from other metrics: the `metrics` argument gives access to the metrics that were already computed.

In [11]:
def good_enough(predictions:pd.Series, targets:pd.Series, metrics:Dict[str, MetricValue]) -> Union[float, MetricValue]:
    """Flag whether the previously computed custom accuracy reaches 0.5.

    Args:
        predictions: Predicted labels; not read here.
        targets: Ground-truth labels; not read here.
        metrics: Previously computed metrics. Must contain the key
            ``"custom_accuracy/v0.0.1"``, whose ``aggregate_results`` mapping
            is looked up under ``"custom_accuracy"``.

    Returns:
        1 when that accuracy is at least 0.5, otherwise 0.
    """
    accuracy_entry = metrics["custom_accuracy/v0.0.1"]
    accuracy = accuracy_entry.aggregate_results.get("custom_accuracy")
    return 1 if accuracy >= 0.5 else 0
# Wrap good_enough in an EvaluationMetric so it can be passed to
# mlflow.evaluate through `extra_metrics`.
good_enough_metric = EvaluationMetric(
    name="good_enough",
    version="v0.0.1",
    eval_fn=good_enough,
    greater_is_better=True,
    metric_details="good enough metric",
    metric_metadata={"version": "0.0.1", "type": "custom"},
)


In [14]:
# Re-run the evaluation with both custom metrics. my_accuracy is listed before
# good_enough_metric, which reads the "custom_accuracy/v0.0.1" entry.
results = mlflow.evaluate(
    data=test_df,
    targets="targets",
    predictions="predictions",
    model_type="classifier",
    extra_metrics=[my_accuracy, good_enough_metric],
)

2025/09/16 14:51:36 INFO mlflow.models.evaluation.evaluators.classifier: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2025/09/16 14:51:36 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2025/09/16 14:51:37 INFO mlflow.models.evaluation.evaluators.shap: Shap explainer ExactExplainer is used.


In [15]:
# Display the metrics attached to the second evaluation result.
results.metrics

{'true_negatives': 27,
 'false_positives': 24,
 'false_negatives': 28,
 'true_positives': 21,
 'example_count': 100,
 'accuracy_score': 0.48,
 'recall_score': 0.42857142857142855,
 'precision_score': 0.4666666666666667,
 'f1_score': 0.44680851063829785,
 'custom_accuracy/v0.0.1': 0.48,
 'good_enough/v0.0.1': 0}

Using `mlflow.metrics.make_metric`

In [16]:
import random
def random_metric(predictions:pd.Series, targets:pd.Series, metrics:Dict[str, MetricValue]) -> Union[float, MetricValue]:
    """Return a random float drawn with ``random.random()``.

    All three arguments are accepted to match the metric function signature
    but are ignored; the value comes from the global ``random`` generator.
    """
    value = random.random()
    return value

In [18]:
# Build the metric through the make_metric factory instead of constructing an
# EvaluationMetric directly.
my_random_metric = mlflow.metrics.make_metric(
    name="random_metric",
    version="0.0.1",
    eval_fn=random_metric,
    greater_is_better=False,
    metric_details="random metric",
    metric_metadata={"version": "0.0.1", "type": "custom"},
)

In [20]:
# Evaluate once more with all three custom metrics. Note that this result is
# stored in `result`, not `results`.
result = mlflow.evaluate(
    data=test_df,
    targets="targets",
    predictions="predictions",
    model_type="classifier",
    extra_metrics=[my_accuracy, good_enough_metric, my_random_metric],
)

2025/09/16 15:42:26 INFO mlflow.models.evaluation.evaluators.classifier: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2025/09/16 15:42:26 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2025/09/16 15:42:28 INFO mlflow.models.evaluation.evaluators.shap: Shap explainer ExactExplainer is used.


In [21]:
# Display the metrics attached to the third evaluation result.
result.metrics

{'true_negatives': 27,
 'false_positives': 24,
 'false_negatives': 28,
 'true_positives': 21,
 'example_count': 100,
 'accuracy_score': 0.48,
 'recall_score': 0.42857142857142855,
 'precision_score': 0.4666666666666667,
 'f1_score': 0.44680851063829785,
 'custom_accuracy/v0.0.1': 0.48,
 'good_enough/v0.0.1': 0,
 'random_metric/0.0.1': 0.81107985565389}