In [2]:
import pandas as pd
import numpy as np
import properscoring as ps
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error, brier_score_loss, average_precision_score, roc_auc_score
from common_utils.utils_evaluation_metrics import EvaluationMetrics
from views_forecasts.extensions import *

import warnings
warnings.filterwarnings("ignore")

In [3]:
from dataclasses import dataclass
from typing import Optional
import pandas as pd
from statistics import mean, stdev, median


# NOTE(review): this in-notebook definition rebinds the name `EvaluationMetrics`
# that the first cell imports from common_utils.utils_evaluation_metrics.
@dataclass
class EvaluationMetrics:
    """
    A data class for storing and managing evaluation metrics for time series forecasting models.

    Every field defaults to None, meaning "not computed".

    Attributes:
        MSE (Optional[float]): Mean Squared Error.
        MAE (Optional[float]): Mean Absolute Error.
        MSLE (Optional[float]): Mean Squared Logarithmic Error.
        KLD (Optional[float]): Kullback-Leibler Divergence.
        Jeffreys (Optional[float]): Jeffreys Divergence.
        CRPS (Optional[float]): Continuous Ranked Probability Score.
        Brier (Optional[float]): Brier Score.
        AP (Optional[float]): Average Precision.
        AUC (Optional[float]): Area Under the ROC Curve.
        ensemble_weight_reg (Optional[float]): Weight for regression ensemble models.
        ensemble_weight_class (Optional[float]): Weight for classification ensemble models.
    """

    MSE: Optional[float] = None
    MAE: Optional[float] = None
    MSLE: Optional[float] = None
    KLD: Optional[float] = None
    Jeffreys: Optional[float] = None
    CRPS: Optional[float] = None
    Brier: Optional[float] = None
    AP: Optional[float] = None
    AUC: Optional[float] = None
    ensemble_weight_reg: Optional[float] = None
    ensemble_weight_class: Optional[float] = None

    @classmethod
    def make_evaluation_dict(cls, steps=36) -> dict:
        """
        Generates a dictionary of EvaluationMetrics instances for a specified number of forecasting steps.

        This method facilitates the batch creation of metric containers for multiple forecasting steps, initializing them with None.

        Args:
            steps (int): The number of forecasting steps for which to generate evaluation metrics. Defaults to 36.

        Returns:
            dict: A dictionary where each key is a step label (e.g., 'step01', 'step02', ...) and each value is an instance of EvaluationMetrics.

        Example:
            >>> from utils_evaluation_metrics import EvaluationMetrics
            >>> evaluation_dict = EvaluationMetrics.make_evaluation_dict(steps=36)
            >>> evaluation_dict['step01'].MSE = sklearn.metrics.mean_squared_error(step01_y_true, step01_y_pred)
            >>> evaluation_dict['step02'].MSE = sklearn.metrics.mean_squared_error(step02_y_true, step02_y_pred)
            >>> ...

        """
        return {f"step{str(i).zfill(2)}": cls() for i in range(1, steps + 1)}

    @staticmethod
    def evaluation_dict_to_dataframe(evaluation_dict: dict) -> pd.DataFrame:
        """
        Converts a dictionary of EvaluationMetrics instances into a pandas DataFrame for easy analysis.

        This static method transforms a structured dictionary of evaluation metrics into a DataFrame, where each row corresponds to a forecasting step and columns represent different metrics.

        Args:
            evaluation_dict (dict): A dictionary of EvaluationMetrics instances, typically generated by the make_evaluation_dict class method.

        Returns:
            pd.DataFrame: A pandas DataFrame where each row indexes a forecasting step and columns correspond to the various metrics stored in EvaluationMetrics.

        Example:
            >>> evaluation_df = EvaluationMetrics.evaluation_dict_to_dataframe(evaluation_dict)

        """
        return pd.DataFrame.from_dict(evaluation_dict, orient='index')

    @staticmethod
    def calculate_aggregate_metrics(evaluation_dict: dict) -> dict:
        """
        Summarises every metric across all steps with its mean, sample standard deviation and median.

        Values that are None are ignored. A metric with no values at all gets None for all
        three statistics. A metric with exactly one value gets that value as mean and median
        and None as standard deviation, because a sample standard deviation needs at least
        two data points.

        Args:
            evaluation_dict (dict): A dictionary mapping step labels to EvaluationMetrics instances.

        Returns:
            dict: {'mean': {...}, 'std': {...}, 'median': {...}}, each inner dictionary keyed by metric name.
        """
        metrics_aggregate = {
            'mean': {},
            'std': {},
            'median': {}
        }

        for metric in EvaluationMetrics.__annotations__.keys():
            observed = (getattr(evaluation, metric) for evaluation in evaluation_dict.values())
            metric_values = [value for value in observed if value is not None]

            metrics_aggregate['mean'][metric] = mean(metric_values) if metric_values else None
            # statistics.stdev raises StatisticsError on fewer than two data points.
            metrics_aggregate['std'][metric] = stdev(metric_values) if len(metric_values) > 1 else None
            metrics_aggregate['median'][metric] = median(metric_values) if metric_values else None

        return metrics_aggregate

    @staticmethod
    def output_metrics(evaluation_dict):
        """
        Flattens the per-step metrics into plain dictionaries and appends the aggregate rows.

        Args:
            evaluation_dict (dict): A dictionary mapping step labels to EvaluationMetrics instances.

        Returns:
            dict: One plain dictionary of metric values per step, followed by the keys
            'mean', 'std' and 'median' holding the output of calculate_aggregate_metrics.
        """
        aggregate = EvaluationMetrics.calculate_aggregate_metrics(evaluation_dict)
        # Copy each instance dictionary so edits to the result do not leak back into the instances.
        step_metrics_dict = {step: dict(vars(metrics)) for step, metrics in evaluation_dict.items()}
        step_metrics_dict['mean'] = aggregate['mean']
        step_metrics_dict['std'] = aggregate['std']
        step_metrics_dict['median'] = aggregate['median']
        return step_metrics_dict

In [4]:
# Forecast horizons scored in this notebook: steps 1 through 36 inclusive.
steps = list(range(1, 36 + 1))
# One prediction column per step, in step order.
pred_cols = [f"step_pred_{step}" for step in steps]
# Columns kept from the stored frame: the target first, then every prediction column.
stepcols = ['ged_sb_dep'] + pred_cols

# NOTE(review): the `forecasts` accessor is presumably registered by the
# `views_forecasts.extensions` star import in the first cell — confirm.
df_calib = pd.DataFrame.forecasts.read_store(name='orange_pasta_calib')
# Infinite values are replaced by 0 before scoring. The explicit copy makes the
# column assignment below operate on an independent frame, not a slice.
df_calib = df_calib.replace([np.inf, -np.inf], 0)[stepcols].copy()

# Row-wise MSE across all prediction columns, computed in one vectorised pass
# instead of one sklearn call per row. The divisor is the number of prediction
# columns, so it cannot drift from a separately hard-coded step count.
squared_errors = df_calib[pred_cols].sub(df_calib['ged_sb_dep'], axis=0).pow(2)
# skipna=False: a missing value yields NaN for that row rather than being silently dropped.
df_calib["mse"] = squared_errors.mean(axis=1, skipna=False)

pr_1_orange_pasta_calib.parquet


In [9]:
# Average of the row-wise MSE column over all rows of df_calib.
df_calib["mse"].mean()

164.9208020574185

In [5]:
def generate_metric_dict(df, steps):
    """
    Scores the predictions of every requested step against the observed target.

    For each step, the column ``step_pred_<step>`` is compared with ``ged_sb_dep`` and
    the MSE, MAE and CRPS fields of that step's EvaluationMetrics entry are filled in.
    MSLE, Brier, AUC and AP are not computed here and stay None.

    Args:
        df (pd.DataFrame): Frame holding the ``ged_sb_dep`` column and one
            ``step_pred_<step>`` column per requested step.
        steps (list[int]): Step numbers to score. Entries are created for every
            step from 1 to ``max(steps)``; steps not listed keep None metrics.

    Returns:
        tuple: ``(evaluation_dict, df_evaluation_dict)`` where ``evaluation_dict`` is the
        output of ``EvaluationMetrics.output_metrics`` (per-step dictionaries plus the
        'mean', 'std' and 'median' entries) and ``df_evaluation_dict`` is the same
        content as a DataFrame.

    Raises:
        ValueError: If ``steps`` is empty.
    """
    # max(steps) rather than steps[-1], so an unsorted list still gets an entry
    # for every step that is scored below.
    metrics_by_step = EvaluationMetrics.make_evaluation_dict(steps=max(steps))
    y_true = df["ged_sb_dep"]

    for step in steps:
        y_pred = df[f"step_pred_{step}"]
        step_metrics = metrics_by_step[f"step{str(step).zfill(2)}"]

        step_metrics.MSE = mean_squared_error(y_true, y_pred)
        step_metrics.MAE = mean_absolute_error(y_true, y_pred)
        # NOTE(review): the forecast has the same shape as the observations, i.e. a
        # single deterministic forecast, so this CRPS comes out identical to the MAE.
        step_metrics.CRPS = ps.crps_ensemble(y_true, y_pred).mean()

    evaluation_dict = EvaluationMetrics.output_metrics(metrics_by_step)
    df_evaluation_dict = EvaluationMetrics.evaluation_dict_to_dataframe(evaluation_dict)
    return evaluation_dict, df_evaluation_dict
# Score the calibration predictions for all steps and display the resulting metrics table.
evaluation_dict, df_evaluation_dict = generate_metric_dict(df_calib, steps)
df_evaluation_dict

Unnamed: 0,MSE,MAE,MSLE,KLD,Jeffreys,CRPS,Brier,AP,AUC,ensemble_weight_reg,ensemble_weight_class
step01,421.055895,0.544768,,,,0.544768,,,,,
step02,184.208247,0.503068,,,,0.503068,,,,,
step03,165.793424,0.515745,,,,0.515745,,,,,
step04,143.921008,0.488558,,,,0.488558,,,,,
step05,151.523066,0.516613,,,,0.516613,,,,,
step06,209.712357,0.569543,,,,0.569543,,,,,
step07,250.058933,0.600958,,,,0.600958,,,,,
step08,170.398173,0.54746,,,,0.54746,,,,,
step09,164.33842,0.517319,,,,0.517319,,,,,
step10,227.747797,0.557095,,,,0.557095,,,,,


In [6]:
# Display the metrics dictionary returned by generate_metric_dict.
evaluation_dict

{'step01': {'MSE': 421.0558954480274,
  'MAE': 0.5447678909634467,
  'MSLE': None,
  'KLD': None,
  'Jeffreys': None,
  'CRPS': 0.5447678909634467,
  'Brier': None,
  'AP': None,
  'AUC': None,
  'ensemble_weight_reg': None,
  'ensemble_weight_class': None},
 'step02': {'MSE': 184.20824693629098,
  'MAE': 0.5030680042749763,
  'MSLE': None,
  'KLD': None,
  'Jeffreys': None,
  'CRPS': 0.5030680042749763,
  'Brier': None,
  'AP': None,
  'AUC': None,
  'ensemble_weight_reg': None,
  'ensemble_weight_class': None},
 'step03': {'MSE': 165.79342424048565,
  'MAE': 0.5157452841119792,
  'MSLE': None,
  'KLD': None,
  'Jeffreys': None,
  'CRPS': 0.5157452841119792,
  'Brier': None,
  'AP': None,
  'AUC': None,
  'ensemble_weight_reg': None,
  'ensemble_weight_class': None},
 'step04': {'MSE': 143.92100836849542,
  'MAE': 0.4885582429628628,
  'MSLE': None,
  'KLD': None,
  'Jeffreys': None,
  'CRPS': 0.4885582429628628,
  'Brier': None,
  'AP': None,
  'AUC': None,
  'ensemble_weight_reg': N

In [112]:
# import json
# with open('evaluation_calib.json', 'w') as f:
#     json.dump(evaluation_dict, f, indent=2)
# df_evaluation_dict.to_csv('evaluation_calib.csv')

In [12]:
# List the row labels of the metrics table.
df_evaluation_dict.index

Index(['step01', 'step02', 'step03', 'step04', 'step05', 'step06', 'step07',
       'step08', 'step09', 'step10', 'step11', 'step12', 'step13', 'step14',
       'step15', 'step16', 'step17', 'step18', 'step19', 'step20', 'step21',
       'step22', 'step23', 'step24', 'step25', 'step26', 'step27', 'step28',
       'step29', 'step30', 'step31', 'step32', 'step33', 'step34', 'step35',
       'step36', 'mean', 'std', 'median'],
      dtype='object')

In [14]:
# Mean of the per-step MSE values, read with a single label-based lookup
# instead of chained indexing.
df_evaluation_dict.loc['mean', 'MSE']

164.92080205741857