In [None]:
from sklearn import metrics
import inspect

In [None]:
# 1) Collect the public surface of sklearn.metrics:
#    every attribute name reported by dir() that does not start with an underscore.
all_items = list(filter(lambda attr: not attr.startswith('_'), dir(metrics)))

print("All items in metrics module:")
print(all_items)

In [None]:
# Split the public names into functions, classes and submodules.
# Names whose object is none of the three (e.g. plain constants) are left out.
functions, classes, submodules = [], [], []

# Ordered (predicate, bucket) pairs; the first predicate that matches wins,
# which mirrors an if/elif chain.
_kind_buckets = (
    (inspect.isfunction, functions),
    (inspect.isclass, classes),
    (inspect.ismodule, submodules),
)

for name in all_items:
    member = getattr(metrics, name)
    for matches_kind, bucket in _kind_buckets:
        if matches_kind(member):
            bucket.append(name)
            break

# Report each category: a count line followed by the names themselves.
# Every header after the first is preceded by a blank line.
for position, (label, bucket) in enumerate(
    (("Functions", functions), ("Classes", classes), ("Submodules", submodules))
):
    spacer = "\n" if position else ""
    print(f"{spacer}{label}: {len(bucket)}")
    print(bucket)

In [None]:
# 2) Filter classification metrics suitable for TFD (binary classification)
# Strategy: start from every function found in sklearn.metrics and subtract the
# names that belong to other problem families (regression, clustering, ranking,
# pairwise distances) or that are scorer utilities. Whatever is left is treated
# as a classification metric.
#
# Listing a name that a given sklearn version does not export is harmless:
# the sets are only used for membership tests.

regression_metrics = {
    'mean_absolute_error', 'mean_squared_error', 'mean_squared_log_error',
    'median_absolute_error', 'r2_score', 'explained_variance_score',
    'max_error', 'mean_absolute_percentage_error', 'mean_tweedie_deviance',
    'mean_poisson_deviance', 'mean_gamma_deviance', 'mean_pinball_loss',
    'd2_absolute_error_score', 'd2_pinball_score', 'd2_tweedie_score',
    'root_mean_squared_error', 'root_mean_squared_log_error'
}

clustering_metrics = {
    'adjusted_mutual_info_score', 'adjusted_rand_score', 'calinski_harabasz_score',
    'completeness_score', 'davies_bouldin_score', 'fowlkes_mallows_score',
    'homogeneity_completeness_v_measure', 'homogeneity_score', 'mutual_info_score',
    'normalized_mutual_info_score', 'rand_score', 'silhouette_samples',
    'silhouette_score', 'v_measure_score', 'consensus_score',
    # Clustering pair-counting matrix; without this entry it would be
    # reported as a classification metric.
    'pair_confusion_matrix',
}

ranking_metrics = {
    'coverage_error', 'dcg_score', 'label_ranking_average_precision_score',
    'label_ranking_loss', 'ndcg_score'
}

# NOTE: 'auc' is a generic area-under-curve helper rather than a pairwise
# utility; it is grouped here only so that it is excluded from the scalar
# classification metrics.
pairwise_utils = {
    'euclidean_distances', 'nan_euclidean_distances', 'pairwise_distances',
    'pairwise_distances_argmin', 'pairwise_distances_argmin_min',
    'pairwise_distances_chunked', 'pairwise_kernels', 'auc'
}

utility_functions = {
    'check_scoring', 'get_scorer', 'get_scorer_names', 'make_scorer'
}

# One combined exclusion set -> a single membership test per candidate name.
_non_classification = (
    regression_metrics
    | clustering_metrics
    | ranking_metrics
    | pairwise_utils
    | utility_functions
)

# Classification metrics functions (order follows `functions`)
classification_functions = [
    name for name in functions
    if name not in _non_classification
]

print(f"Classification metrics functions: {len(classification_functions)}")
print(classification_functions)

In [None]:
# =============================================================================
# COMPREHENSIVE METRICS CONFIGURATION FOR TRANSACTION FRAUD DETECTION (TFD)
# =============================================================================
#
# Problem profile:
# - Binary target: fraud=1, non-fraud=0
# - Strong class imbalance (~1-5% fraud)
# - False Negatives are expensive (missed fraud = financial loss)
# - False Positives cost too (blocked legitimate transaction = customer friction)
#
# How sklearn differs from River (online ML):
# - sklearn metrics are BATCH metrics, evaluated over a whole dataset per call
# - they are plain FUNCTIONS, not stateful classes (there is no .update())
# - every call takes y_true plus y_pred (labels) or y_score (probabilities),
#   depending on the metric
#
# =============================================================================
# FINAL RECOMMENDED CONFIGURATION
# =============================================================================

# -----------------------------------------------------------------------------
# PRIMARY METRICS - the ones that should drive model selection and evaluation
# -----------------------------------------------------------------------------

# Metric name -> sklearn callable, resolved by name on the metrics module.
primary_metric_functions = {
    metric_name: getattr(metrics, metric_name)
    for metric_name in (
        # Threshold-independent ranking quality; consumes probability scores.
        "roc_auc_score",
        # Area under the precision-recall curve (PR-AUC); more informative than
        # ROC-AUC when positives are rare. Consumes probability scores.
        "average_precision_score",
        # TP / (TP + FN): share of actual fraud that was caught.
        "recall_score",
        # TP / (TP + FP): share of fraud alerts that were really fraud.
        "precision_score",
        # Harmonic mean of precision and recall.
        "f1_score",
        # F-beta; configured below with beta=2 so recall outweighs precision.
        "fbeta_score",
    )
}

# Keyword arguments shared by the label-based metrics: fraud (1) is the
# positive class, binary averaging, and 0.0 instead of a warning when a
# denominator is zero.
_primary_label_kwargs = {
    "pos_label": 1,
    "average": "binary",
    "zero_division": 0.0,
}

# Metric name -> extra keyword arguments for the call.
primary_metric_args = {
    # Expects y_score (probabilities); nothing extra needed for binary targets.
    "roc_auc_score": {},

    # Expects y_score (probabilities); fraud is the positive class.
    "average_precision_score": {"pos_label": 1},

    # The next three expect y_pred (labels). Each gets its own dict copy so the
    # entries can be edited independently.
    "recall_score": dict(_primary_label_kwargs),
    "precision_score": dict(_primary_label_kwargs),
    "f1_score": dict(_primary_label_kwargs),

    # Expects y_pred (labels). beta=2 favours recall: a missed fraud (FN) is
    # treated as more costly than a false alarm (FP).
    "fbeta_score": {"beta": 2.0, **_primary_label_kwargs},
}

print("Primary metrics defined:")
for name in primary_metric_functions:
    configured_kwargs = primary_metric_args.get(name, {})
    print(f"  - {name}: {configured_kwargs}")

In [None]:
# -----------------------------------------------------------------------------
# SECONDARY METRICS - monitoring and complementary insight
# Useful context, but not what model selection should be driven by.
# -----------------------------------------------------------------------------

# Metric name -> sklearn callable, resolved by name on the metrics module.
secondary_metric_functions = {
    metric_name: getattr(metrics, metric_name)
    for metric_name in (
        # (TP + TN) / total. Misleading on its own under imbalance: with 3%
        # fraud, predicting "non-fraud" everywhere already scores 97%.
        # Kept for baselines, sanity checks and stakeholder reporting; always
        # show it next to balanced accuracy and recall.
        "accuracy_score",
        # Mean of per-class recall = (TPR + TNR) / 2; penalizes ignoring the
        # minority class.
        "balanced_accuracy_score",
        # Matthews correlation coefficient, range [-1, +1]: 0 = random,
        # +1 = perfect, -1 = inverse. Only high when all four confusion-matrix
        # cells look good.
        "matthews_corrcoef",
        # Agreement beyond chance, range [-1, +1]: 0 = chance level.
        "cohen_kappa_score",
        # Intersection over union, TP / (TP + FP + FN); ignores TN and is
        # stricter than F1.
        "jaccard_score",
    )
}

# Metric name -> extra keyword arguments. All of these take y_true and y_pred.
secondary_metric_args = {
    # normalize=True -> fraction in [0, 1] rather than a raw count.
    # sample_weight is not fixed here; it can be supplied per call to
    # re-weight classes (e.g. weight fraud samples higher).
    "accuracy_score": dict(normalize=True),

    # adjusted=False keeps the score in [0, 1], where chance is about 0.5.
    # adjusted=True rescales so chance scores 0 and perfect scores 1
    # (for a binary problem the range becomes [-1, 1]).
    "balanced_accuracy_score": dict(adjusted=False),

    # No extra arguments.
    "matthews_corrcoef": dict(),

    # weights=None -> unweighted agreement ('linear' / 'quadratic' are meant
    # for ordinal labels).
    "cohen_kappa_score": dict(weights=None),

    # Fraud (1) is the positive class; 0.0 instead of a warning on 0/0.
    "jaccard_score": dict(pos_label=1, average="binary", zero_division=0.0),
}

print("Secondary metrics defined:")
for name in secondary_metric_functions:
    configured_kwargs = secondary_metric_args.get(name, {})
    print(f"  - {name}: {configured_kwargs}")

In [None]:
# -----------------------------------------------------------------------------
# PROBABILISTIC/CALIBRATION METRICS - quality of the predicted probabilities
# All of these are fed probability scores rather than hard labels.
# -----------------------------------------------------------------------------

# Metric name -> sklearn callable, resolved by name on the metrics module.
probabilistic_metric_functions = {
    metric_name: getattr(metrics, metric_name)
    for metric_name in (
        # Cross-entropy; lower is better, confident mistakes are punished hard.
        "log_loss",
        # Mean squared error of the predicted probabilities; lower is better,
        # range [0, 1] for binary with scale_by_half=True.
        "brier_score_loss",
        # Fraction of log loss explained (R^2 analogue); higher is better.
        "d2_log_loss_score",
        # Fraction of Brier score explained (R^2 analogue); best is 1.0 and it
        # can go negative when worse than the null model.
        "d2_brier_score",
    )
}

# Metric name -> extra keyword arguments for the call.
probabilistic_metric_args = {
    # normalize=True -> mean loss per sample instead of the summed loss.
    "log_loss": dict(normalize=True),

    # Fraud (1) is the positive class.
    "brier_score_loss": dict(pos_label=1),

    # No extra arguments.
    "d2_log_loss_score": dict(),

    # Fraud (1) is the positive class.
    "d2_brier_score": dict(pos_label=1),
}

print("Probabilistic/Calibration metrics defined:")
for name in probabilistic_metric_functions:
    configured_kwargs = probabilistic_metric_args.get(name, {})
    print(f"  - {name}: {configured_kwargs}")

In [None]:
# -----------------------------------------------------------------------------
# ANALYSIS/REPORTING METRICS - For detailed analysis and threshold tuning
# Unlike the scalar metrics, these return arrays, tuples or structured reports.
# -----------------------------------------------------------------------------

# Metric name -> sklearn callable.
analysis_metric_functions = {
    # Confusion Matrix: Foundation for many other metrics
    # Returns 2x2 matrix: [[TN, FP], [FN, TP]] (with labels=[0, 1])
    "confusion_matrix": metrics.confusion_matrix,

    # NEW IN SKLEARN 1.8! Confusion Matrix at Thresholds
    # Returns TN, FP, FN, TP arrays for each threshold
    # CRITICAL for threshold optimization in fraud detection
    "confusion_matrix_at_thresholds": metrics.confusion_matrix_at_thresholds,

    # Classification Report: Text summary of P, R, F1 per class
    # Can return dict with output_dict=True
    "classification_report": metrics.classification_report,

    # Precision-Recall Curve: For threshold analysis
    # Returns (precision, recall, thresholds)
    "precision_recall_curve": metrics.precision_recall_curve,

    # ROC Curve: For threshold analysis
    # Returns (fpr, tpr, thresholds)
    "roc_curve": metrics.roc_curve,

    # DET Curve: Detection Error Tradeoff
    # Returns (fpr, fnr, thresholds) - plots FNR vs FPR
    # Useful for fraud: visualize false alarm vs missed fraud tradeoff
    "det_curve": metrics.det_curve,

    # Class Likelihood Ratios: LR+, LR- for diagnostic testing
    # Returns (positive_lr, negative_lr)
    "class_likelihood_ratios": metrics.class_likelihood_ratios,

    # Precision-Recall-FScore-Support: All in one
    # Returns (precision, recall, fbeta, support)
    "precision_recall_fscore_support": metrics.precision_recall_fscore_support,

    # AUC: General utility to compute area under any curve
    "auc": metrics.auc,
}

# Metric name -> extra keyword arguments for the call.
analysis_metric_args = {
    # Confusion Matrix: labels=[0, 1] ensures consistent ordering
    # (and a 2x2 result even when a class is missing from the batch).
    "confusion_matrix": {
        "labels": [0, 1],  # [non-fraud, fraud]
        "normalize": None,  # Return raw counts; use 'true'/'pred'/'all' for proportions
    },

    # Confusion Matrix at Thresholds: pos_label=1 for fraud
    # Returns (tns, fps, fns, tps, thresholds) arrays
    "confusion_matrix_at_thresholds": {
        "pos_label": 1,  # Fraud is positive class
    },

    # Classification Report: output_dict=True for programmatic access.
    # labels is pinned to [0, 1] so the two target_names always line up with
    # two classes; without it, a batch that contains a single class makes
    # classification_report fail on the labels/target_names size mismatch.
    # This also matches the label ordering used for confusion_matrix and
    # class_likelihood_ratios in this dict.
    "classification_report": {
        "labels": [0, 1],  # [non-fraud, fraud], same order as target_names
        "target_names": ["Non-Fraud", "Fraud"],
        "output_dict": True,  # Return dict instead of string
        "zero_division": 0.0,
    },

    # Precision-Recall Curve: pos_label=1 for fraud class
    "precision_recall_curve": {
        "pos_label": 1,
    },

    # ROC Curve: pos_label=1 for fraud class
    "roc_curve": {
        "pos_label": 1,
        "drop_intermediate": True,  # Reduce points for efficiency
    },

    # DET Curve: pos_label=1 for fraud class
    "det_curve": {
        "pos_label": 1,
        "drop_intermediate": True,  # Reduce points for efficiency
    },

    # Class Likelihood Ratios: labels=[non-fraud, fraud] ordering
    "class_likelihood_ratios": {
        "labels": [0, 1],  # [negative_class, positive_class]
    },

    # Precision-Recall-FScore-Support: beta=1.0 for F1
    "precision_recall_fscore_support": {
        "beta": 1.0,
        "pos_label": 1,
        "average": "binary",
        "zero_division": 0.0,
    },

    # AUC: No default args, takes x and y arrays directly
    "auc": {},
}

print("Analysis/Reporting metrics defined:")
for name in analysis_metric_functions:
    print(f"  - {name}: {analysis_metric_args.get(name, {})}")

In [None]:
# -----------------------------------------------------------------------------
# VISUALIZATION CLASSES - matplotlib-based plotting helpers
# -----------------------------------------------------------------------------

# Class name -> sklearn Display class, resolved by name on the metrics module.
display_classes = {
    class_name: getattr(metrics, class_name)
    for class_name in (
        "ConfusionMatrixDisplay",   # heatmap of the confusion matrix
        "RocCurveDisplay",          # ROC curve with AUC
        "PrecisionRecallDisplay",   # PR curve with AP
        "DetCurveDisplay",          # Detection Error Tradeoff curve
    )
}

print("Display classes for visualization:")
for name in display_classes:
    # Two output lines per class: the name, then the shared entry points.
    print(f"  - {name}" + "\n" + "    Methods: from_estimator(), from_predictions(), plot()")

In [None]:
# -----------------------------------------------------------------------------
# EXCLUDED METRICS - classification metrics left out of the TFD configuration
# Maps each metric name to a one-line reason for skipping it.
# -----------------------------------------------------------------------------

excluded_metrics = dict([
    # Multiclass ranking metric; not meaningful for a binary target.
    ("top_k_accuracy_score", "For multiclass ranking scenarios, not binary classification"),
    # Mostly of interest for multilabel problems.
    ("hamming_loss", "More relevant for multilabel classification"),
    # Error rate = 1 - accuracy_score, so it adds nothing over accuracy
    # (and shares its weaknesses).
    ("zero_one_loss", "Equivalent to 1 - accuracy, redundant"),
    # Margin-based loss tied to SVM-style classifiers, not tree-based models
    # such as XGBoost/CatBoost.
    ("hinge_loss", "Specific to SVM margin-based classifiers"),
    # Multilabel variant of the confusion matrix.
    ("multilabel_confusion_matrix", "For multilabel classification scenarios"),
])

print("Excluded metrics and reasons:")
print("\n".join(f"  - {name}: {reason}" for name, reason in excluded_metrics.items()))

In [None]:
# =============================================================================
# USAGE EXAMPLE - synthetic data for exercising the metric configuration
# =============================================================================

import numpy as np

# Fix the global NumPy seed so the simulated data is reproducible.
np.random.seed(42)
n_samples = 1000
fraud_rate = 0.03  # 3% fraud rate (imbalanced)

# Ground-truth labels: 1 = fraud, drawn with probability `fraud_rate`.
class_probabilities = [1-fraud_rate, fraud_rate]
y_true = np.random.choice([0, 1], size=n_samples, p=class_probabilities)

# Simulated fraud probabilities from a reasonably good model.
# Both candidate score vectors are drawn for every sample (fraud-like first,
# then legit-like) and the true label picks which one is kept.
fraud_like_scores = np.random.beta(5, 2, size=n_samples)  # skewed high
legit_like_scores = np.random.beta(2, 5, size=n_samples)  # skewed low
y_proba = np.where(y_true == 1, fraud_like_scores, legit_like_scores)

# Hard labels: flag as fraud when the score reaches the threshold.
threshold = 0.5
y_pred = (y_proba >= threshold).astype(int)

actual_fraud_count = y_true.sum()
actual_fraud_pct = 100*y_true.mean()
predicted_fraud_count = y_pred.sum()
predicted_fraud_pct = 100*y_pred.mean()
print(f"Dataset: {n_samples} samples, {actual_fraud_count} fraud ({actual_fraud_pct:.1f}%)")
print(f"Predictions: {predicted_fraud_count} predicted fraud ({predicted_fraud_pct:.1f}%)")
print()

In [None]:
# PRIMARY metrics on the simulated data, each called with its configured kwargs
banner = "=" * 60
print(banner)
print("PRIMARY METRICS (Most important for fraud detection)")
print(banner)

# --- Score-based metrics: fed the fraud probabilities (y_proba) ---------------
roc_auc_kwargs = primary_metric_args["roc_auc_score"]
roc_auc = metrics.roc_auc_score(y_true, y_proba, **roc_auc_kwargs)
print(f"ROC-AUC Score: {roc_auc:.4f}")

avg_precision_kwargs = primary_metric_args["average_precision_score"]
avg_precision = metrics.average_precision_score(y_true, y_proba, **avg_precision_kwargs)
print(f"Average Precision (PR-AUC): {avg_precision:.4f}")

# --- Label-based metrics: fed the thresholded predictions (y_pred) ------------
recall_kwargs = primary_metric_args["recall_score"]
recall = metrics.recall_score(y_true, y_pred, **recall_kwargs)
print(f"Recall (Fraud Detection Rate): {recall:.4f}")

precision_kwargs = primary_metric_args["precision_score"]
precision = metrics.precision_score(y_true, y_pred, **precision_kwargs)
print(f"Precision (True Fraud Rate): {precision:.4f}")

f1_kwargs = primary_metric_args["f1_score"]
f1 = metrics.f1_score(y_true, y_pred, **f1_kwargs)
print(f"F1 Score: {f1:.4f}")

# fbeta_score with the configured beta=2.0, i.e. the F2 score
f2_kwargs = primary_metric_args["fbeta_score"]
f2 = metrics.fbeta_score(y_true, y_pred, **f2_kwargs)
print(f"F2 Score (Recall-weighted): {f2:.4f}")

In [None]:
# SECONDARY metrics on the simulated data (all label-based)
banner = "=" * 60
print("\n" + banner)
print("SECONDARY METRICS (Additional monitoring)")
print(banner)

# Plain accuracy - printed with an explicit caution
accuracy_kwargs = secondary_metric_args["accuracy_score"]
accuracy = metrics.accuracy_score(y_true, y_pred, **accuracy_kwargs)
print(f"Accuracy: {accuracy:.4f}  ⚠️ (misleading alone for imbalanced data)")

# Balanced accuracy - the imbalance-aware counterpart
bal_acc_kwargs = secondary_metric_args["balanced_accuracy_score"]
bal_acc = metrics.balanced_accuracy_score(y_true, y_pred, **bal_acc_kwargs)
print(f"Balanced Accuracy: {bal_acc:.4f}  ✓ (use this instead of accuracy)")

# The difference between the two makes the effect of imbalance visible
accuracy_gap = accuracy - bal_acc
print(f"\n  → Gap (Accuracy - Balanced Acc): {accuracy_gap:.4f}")
print("  → Large gap indicates class imbalance affecting accuracy")

# Matthews Correlation Coefficient
mcc_kwargs = secondary_metric_args["matthews_corrcoef"]
mcc = metrics.matthews_corrcoef(y_true, y_pred, **mcc_kwargs)
print(f"\nMatthews Correlation Coefficient: {mcc:.4f}")

# Cohen's Kappa
kappa_kwargs = secondary_metric_args["cohen_kappa_score"]
kappa = metrics.cohen_kappa_score(y_true, y_pred, **kappa_kwargs)
print(f"Cohen's Kappa: {kappa:.4f}")

# Jaccard Score
jaccard_kwargs = secondary_metric_args["jaccard_score"]
jaccard = metrics.jaccard_score(y_true, y_pred, **jaccard_kwargs)
print(f"Jaccard Score: {jaccard:.4f}")

# ---- Accuracy with per-sample weights that offset the class imbalance --------
rule = "-" * 60
print("\n" + rule)
print("ACCURACY WITH SAMPLE WEIGHTS (Correcting for Imbalance)")
print(rule)

# Inverse-frequency class weights: n_total / (2 * n_class).
# A class with no samples falls back to a weight of 1.
n_total = len(y_true)
n_fraud = y_true.sum()
n_non_fraud = n_total - n_fraud

if n_fraud > 0:
    weight_fraud = n_total / (2 * n_fraud)
else:
    weight_fraud = 1

if n_non_fraud > 0:
    weight_non_fraud = n_total / (2 * n_non_fraud)
else:
    weight_non_fraud = 1

# One weight per sample, chosen by its true class
sample_weights = np.where(y_true == 1, weight_fraud, weight_non_fraud)

# Accuracy re-computed with those weights
weighted_accuracy = metrics.accuracy_score(y_true, y_pred, sample_weight=sample_weights)
print(f"Unweighted Accuracy: {accuracy:.4f}")
print(f"Weighted Accuracy:   {weighted_accuracy:.4f}")
print(f"Balanced Accuracy:   {bal_acc:.4f}")
print("\n  → Weighted accuracy ≈ Balanced accuracy when using inverse class weights")

In [None]:
# PROBABILISTIC/CALIBRATION metrics on the simulated data.
# Every call receives the fraud probabilities (y_proba), not hard labels.
banner = "=" * 60
print("\n" + banner)
print("PROBABILISTIC METRICS (Calibration monitoring)")
print(banner)

# Log Loss
logloss_kwargs = probabilistic_metric_args["log_loss"]
logloss = metrics.log_loss(y_true, y_proba, **logloss_kwargs)
print(f"Log Loss: {logloss:.4f}")

# Brier Score
brier_kwargs = probabilistic_metric_args["brier_score_loss"]
brier = metrics.brier_score_loss(y_true, y_proba, **brier_kwargs)
print(f"Brier Score: {brier:.4f}")

# D^2 Log Loss Score
d2_logloss_kwargs = probabilistic_metric_args["d2_log_loss_score"]
d2_logloss = metrics.d2_log_loss_score(y_true, y_proba, **d2_logloss_kwargs)
print(f"D^2 Log Loss Score: {d2_logloss:.4f}")

# D^2 Brier Score
d2_brier_kwargs = probabilistic_metric_args["d2_brier_score"]
d2_brier = metrics.d2_brier_score(y_true, y_proba, **d2_brier_kwargs)
print(f"D^2 Brier Score: {d2_brier:.4f}")

In [None]:
# ANALYSIS metrics on the simulated data (structured outputs, label-based)
banner = "=" * 60
print("\n" + banner)
print("ANALYSIS METRICS (Detailed reporting)")
print(banner)

# Confusion matrix. With labels=[0, 1] configured it is 2x2:
# first row = actual non-fraud (TN, FP), second row = actual fraud (FN, TP).
cm_kwargs = analysis_metric_args["confusion_matrix"]
cm = metrics.confusion_matrix(y_true, y_pred, **cm_kwargs)
(tn, fp), (fn, tp) = cm
print("\nConfusion Matrix:")
print("                 Predicted")
print("              Non-Fraud  Fraud")
print(f"Actual Non-Fraud  {tn:5d}    {fp:5d}")
print(f"       Fraud      {fn:5d}    {tp:5d}")

# Classification report as a dict (output_dict=True is part of the config)
print("\nClassification Report:")
report_kwargs = analysis_metric_args["classification_report"]
report = metrics.classification_report(y_true, y_pred, **report_kwargs)
for cls, vals in report.items():
    # Only dict-valued entries carry precision/recall/f1; skip anything else.
    if not isinstance(vals, dict):
        continue
    print(f"  {cls}: P={vals['precision']:.3f}, R={vals['recall']:.3f}, F1={vals['f1-score']:.3f}")

# Positive / negative class likelihood ratios
lr_kwargs = analysis_metric_args["class_likelihood_ratios"]
lr_pos, lr_neg = metrics.class_likelihood_ratios(y_true, y_pred, **lr_kwargs)
print("\nClass Likelihood Ratios:")
print(f"  LR+ (Positive): {lr_pos:.4f}")
print(f"  LR- (Negative): {lr_neg:.4f}")

In [None]:
# =============================================================================
# THRESHOLD-ORIENTED METRICS DEMONSTRATION
# confusion_matrix_at_thresholds and det_curve, both fed the fraud probabilities
# =============================================================================

banner = "=" * 60
print("\n" + banner)
print("NEW SKLEARN 1.8 METRICS (Threshold Optimization)")
print(banner)

# Per-threshold confusion-matrix counts
cm_thr_kwargs = analysis_metric_args["confusion_matrix_at_thresholds"]
tns, fps, fns, tps, thresholds = metrics.confusion_matrix_at_thresholds(
    y_true, y_proba, **cm_thr_kwargs
)
print("\nConfusion Matrix at Thresholds (first 5 thresholds):")
print(f"{'Threshold':>10} | {'TN':>6} | {'FP':>6} | {'FN':>6} | {'TP':>6} | {'Recall':>8} | {'Precision':>10}")
print("-" * 75)

# Walk the first five rows (fewer if there are fewer thresholds).
first_rows = zip(thresholds[:5], tns[:5], fps[:5], fns[:5], tps[:5])
for thr_i, tn_i, fp_i, fn_i, tp_i in first_rows:
    # Recall / precision derived from the counts; 0 when the denominator is empty.
    actual_positives = tp_i + fn_i
    predicted_positives = tp_i + fp_i
    recall_i = tp_i / actual_positives if actual_positives > 0 else 0
    precision_i = tp_i / predicted_positives if predicted_positives > 0 else 0
    print(f"{thr_i:>10.4f} | {tn_i:>6.0f} | {fp_i:>6.0f} | {fn_i:>6.0f} | {tp_i:>6.0f} | {recall_i:>8.4f} | {precision_i:>10.4f}")

# DET curve: false positive rate vs false negative rate per threshold
det_kwargs = analysis_metric_args["det_curve"]
fpr_det, fnr_det, thresholds_det = metrics.det_curve(
    y_true, y_proba, **det_kwargs
)
n_det_points = len(thresholds_det)
print("\nDET Curve (Detection Error Tradeoff):")
print(f"  FPR (False Positive Rate) range: {fpr_det.min():.4f} - {fpr_det.max():.4f}")
print(f"  FNR (False Negative Rate) range: {fnr_det.min():.4f} - {fnr_det.max():.4f}")
print(f"  Number of thresholds: {n_det_points}")

# A handful of points along the curve: first, quarter-way, half-way, last
print("\nSample DET curve points (FPR vs FNR tradeoff):")
print(f"{'Threshold':>10} | {'FPR':>8} | {'FNR':>8}")
print("-" * 35)
sample_indices = [0, n_det_points // 4, n_det_points // 2, -1]
for i in sample_indices:
    if i >= n_det_points:
        continue
    print(f"{thresholds_det[i]:>10.4f} | {fpr_det[i]:>8.4f} | {fnr_det[i]:>8.4f}")

In [None]:
# =============================================================================
# CONSOLIDATED CONFIGURATION FOR TFD BATCH ML
# =============================================================================

# Merge the primary, secondary and probabilistic groups (in that order) into a
# single name -> callable dict and a single name -> kwargs dict.
scalar_metric_functions = {}
scalar_metric_args = {}
for function_group, kwargs_group in (
    (primary_metric_functions, primary_metric_args),
    (secondary_metric_functions, secondary_metric_args),
    (probabilistic_metric_functions, probabilistic_metric_args),
):
    scalar_metric_functions.update(function_group)
    scalar_metric_args.update(kwargs_group)

# Scalar metrics that must be given probability scores (y_proba)
proba_metrics = set([
    "roc_auc_score",
    "average_precision_score",
    "log_loss",
    "brier_score_loss",
    "d2_log_loss_score",
    "d2_brier_score",
])

# Scalar metrics that must be given predicted labels (y_pred)
label_metrics = set([
    "accuracy_score",  # kept for baseline/sanity checks
    "recall_score",
    "precision_score",
    "f1_score",
    "fbeta_score",
    "balanced_accuracy_score",
    "matthews_corrcoef",
    "cohen_kappa_score",
    "jaccard_score",
])

# Curve/threshold metrics: these return arrays rather than a single number
curve_metrics = set([
    "roc_curve",
    "precision_recall_curve",
    "det_curve",
    "confusion_matrix_at_thresholds",
])

print("Consolidated configuration:")
for summary_label, collection in (
    ("Total scalar metrics", scalar_metric_functions),
    ("Probability-based", proba_metrics),
    ("Label-based", label_metrics),
    ("Curve/threshold metrics", curve_metrics),
):
    print(f"  {summary_label}: {len(collection)}")

In [None]:
# =============================================================================
# HELPER FUNCTION FOR COMPUTING ALL METRICS
# =============================================================================

def compute_all_classification_metrics(y_true, y_pred, y_proba):
    """
    Evaluate the recommended TFD metric set in one call.

    Label-based metrics are computed from ``y_pred`` and score-based metrics
    from ``y_proba``; fraud (label 1) is the positive class throughout.

    Parameters
    ----------
    y_true : array-like
        True binary labels (0 = non-fraud, 1 = fraud).
    y_pred : array-like
        Predicted binary labels.
    y_proba : array-like
        Predicted probability of fraud (positive class).

    Returns
    -------
    dict
        Metric name -> value, in this order: primary metrics, secondary
        metrics, probabilistic metrics, then the four confusion-matrix
        counts as plain ``int``.
    """
    # Keyword arguments shared by the label-based metrics that accept them.
    fraud_kw = {"pos_label": 1, "zero_division": 0.0}

    # Entries are evaluated top to bottom, so the dict order below is also
    # the call order.
    results = {
        # Primary
        "roc_auc": metrics.roc_auc_score(y_true, y_proba),
        "average_precision": metrics.average_precision_score(y_true, y_proba, pos_label=1),
        "recall": metrics.recall_score(y_true, y_pred, **fraud_kw),
        "precision": metrics.precision_score(y_true, y_pred, **fraud_kw),
        "f1": metrics.f1_score(y_true, y_pred, **fraud_kw),
        "f2": metrics.fbeta_score(y_true, y_pred, beta=2.0, **fraud_kw),
        # Secondary
        "accuracy": metrics.accuracy_score(y_true, y_pred),
        "balanced_accuracy": metrics.balanced_accuracy_score(y_true, y_pred),
        "mcc": metrics.matthews_corrcoef(y_true, y_pred),
        "cohen_kappa": metrics.cohen_kappa_score(y_true, y_pred),
        "jaccard": metrics.jaccard_score(y_true, y_pred, **fraud_kw),
        # Probabilistic
        "log_loss": metrics.log_loss(y_true, y_proba),
        "brier_score": metrics.brier_score_loss(y_true, y_proba, pos_label=1),
        "d2_log_loss": metrics.d2_log_loss_score(y_true, y_proba),
        "d2_brier": metrics.d2_brier_score(y_true, y_proba, pos_label=1),
    }

    # Confusion-matrix cells. labels=[0, 1] fixes the layout, so the flattened
    # order is TN, FP, FN, TP.
    flat_counts = metrics.confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    cell_names = ("true_negatives", "false_positives", "false_negatives", "true_positives")
    for cell_name, cell_count in zip(cell_names, flat_counts):
        results[cell_name] = int(cell_count)

    return results

# Smoke-run the helper on the simulated data and list every result
all_metrics = compute_all_classification_metrics(y_true, y_pred, y_proba)
print("All metrics computed:")
for name, value in all_metrics.items():
    # Floats get four decimals; everything else (the int counts) prints as-is.
    rendered = f"{value:.4f}" if isinstance(value, float) else f"{value}"
    print(f"  {name}: {rendered}")

In [None]:
# =============================================================================
# KEY INSIGHTS FOR TFD BATCH ML
# =============================================================================
#
# 1. PRIMARY METRICS (Use for model selection):
#    - ROC-AUC: Best overall threshold-independent metric
#    - Average Precision (PR-AUC): Better for highly imbalanced data
#    - F2 Score: When catching fraud is more important than false alarms
#
# 2. ACCURACY vs BALANCED ACCURACY:
#    - accuracy_score: Include for baseline comparison and stakeholder reporting
#    - ⚠️ NEVER use accuracy alone for imbalanced data!
#    - With 3% fraud rate, predicting all non-fraud = 97% accuracy (useless!)
#    - ALWAYS show balanced_accuracy alongside accuracy
#    - Use sample_weight parameter to correct for class imbalance
#    - Large gap (accuracy - balanced_accuracy) indicates imbalance problem
#
# 3. THRESHOLD SELECTION:
#    - Default threshold (0.5) is rarely optimal for imbalanced data
#    - Use confusion_matrix_at_thresholds (new in sklearn 1.8) for optimal
#      cutoff analysis
#    - Use precision_recall_curve to find optimal threshold
#    - Consider business cost of FN vs FP when choosing threshold
#
# 4. PROBABILITY CALIBRATION:
#    - Use brier_score_loss and log_loss to assess calibration
#    - NEW: d2_brier_score measures fraction of Brier explained (like R²)
#    - Well-calibrated probabilities are crucial for threshold tuning
#
# 5. ERROR TRADEOFF ANALYSIS:
#    - det_curve shows FPR vs FNR tradeoff directly
#    - For fraud: FPR = false alarms, FNR = missed fraud
#    - Use DetCurveDisplay for visualization
#
# 6. BATCH vs ONLINE:
#    - sklearn metrics compute on full dataset at once
#    - For online/streaming, use River library instead
#    - Batch metrics are more stable but require all data upfront
#
# 7. METRIC INTERPRETATION FOR TFD:
#    - High Recall: Catching most fraud (minimizing FN)
#    - High Precision: Few false alarms (minimizing FP)
#    - F2 > F1: Indicates model favors recall (fraud detection)
#    - MCC close to 1: Strong balanced performance
#    - Large (Accuracy - Balanced Accuracy) gap: Class imbalance issue
#
# 8. METRICS SUMMARY:
#    - Primary: 6 (roc_auc, avg_precision, recall, precision, f1, f2)
#    - Secondary: 5 (accuracy, balanced_acc, mcc, cohen_kappa, jaccard)
#    - Probabilistic: 4 (log_loss, brier, d2_log_loss, d2_brier)
#    - Analysis/Curve: 9 (confusion_matrix, cm_at_thresholds, classification_report,
#                         pr_curve, roc_curve, det_curve, class_lr, prfs, auc)
#    - Visualization: 4 (ConfusionMatrixDisplay, RocCurveDisplay,
#                        PrecisionRecallDisplay, DetCurveDisplay)
#    - Total: 24 metric functions + 4 display classes = 28 items
#
# =============================================================================

# Per-group counts of the metric functions listed above. The grand total is
# derived from these so it cannot drift from the breakdown (the previous
# hard-coded "28 + 4 = 32" did not match 6 + 5 + 4 + 9 = 24).
summary_counts = {
    "Primary": 6,
    "Secondary": 5,
    "Probabilistic": 4,
    "Analysis/Curve": 9,
}
n_display_classes = 4
n_metric_functions = sum(summary_counts.values())

print("Key insights documented above. Review before using in production.")
print()
print("TOTAL METRICS INVESTIGATED:")
print(f"  - Primary: {summary_counts['Primary']} (roc_auc, avg_precision, recall, precision, f1, f2)")
print(f"  - Secondary: {summary_counts['Secondary']} (accuracy, balanced_acc, mcc, cohen_kappa, jaccard)")
print(f"  - Probabilistic: {summary_counts['Probabilistic']} (log_loss, brier, d2_log_loss, d2_brier)")
print(f"  - Analysis/Curve: {summary_counts['Analysis/Curve']} (confusion_matrix, cm_at_thresholds, classification_report,")
print("                       pr_curve, roc_curve, det_curve, class_lr, prfs, auc)")
print(f"  - Visualization: {n_display_classes} (ConfusionMatrixDisplay, RocCurveDisplay,")
print("                      PrecisionRecallDisplay, DetCurveDisplay)")
print()
print(
    f"TOTAL: {n_metric_functions} metrics/functions + {n_display_classes} display classes"
    f" = {n_metric_functions + n_display_classes} items"
)