In [1]:
import pandas as pd
import plotly.express as px
import glob
from pathlib import Path
import matplotlib.pyplot as plt

# Output locations (relative paths, resolved against the current working directory)
OUTPUTS_DIR = Path("../outputs")
METRICS_DIR = OUTPUTS_DIR / "metrics"
ANOMALIES_DIR = OUTPUTS_DIR / "anomalies"

## 1. Model Performance Comparison

We compare the performance of the neural network models (ANN, RNN, GRU, LSTM, BiLSTM) and the SARIMA models, using the summary metrics reported per country and per data split (dev/test).

In [2]:
def load_metrics_summary(path, found_msg, missing_msg):
    """Read a metrics summary CSV and preview its first rows.

    Parameters
    ----------
    path : pathlib.Path
        Location of the summary CSV.
    found_msg : str
        Heading printed above the preview when the file exists.
    missing_msg : str
        Message printed when the file does not exist.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when ``path`` does not exist, so that
        the result name is always bound and never holds a value left over
        from an earlier run of this cell.
    """
    if not path.exists():
        print(missing_msg)
        return None
    metrics = pd.read_csv(path)
    print(found_msg)
    display(metrics.head())
    return metrics


# Load NN metrics
nn_summary_path = METRICS_DIR / "NN_models_summary.csv"
nn_metrics = load_metrics_summary(
    nn_summary_path, "Neural Network Metrics:", "NN metrics not found."
)

# Load SARIMA metrics
sarima_summary_path = METRICS_DIR / "SARIMA_CUML_GPU_summary.csv"
sarima_metrics = load_metrics_summary(
    sarima_summary_path, "\nSARIMA Metrics:", "SARIMA metrics not found."
)

Neural Network Metrics:


Unnamed: 0,country,model,split,MASE,sMAPE,MSE,RMSE,MAPE,MAE,80_PI_coverage
0,DK_cleaned,NN_ann,dev,0.666981,2.873936,30154.296684,173.649926,2.870189,115.227086,
1,DK_cleaned,NN_ann,test,0.854774,3.327613,30028.605432,173.287638,3.344725,123.98876,
2,DK_cleaned,NN_rnn,dev,0.733105,3.174659,35555.213403,188.560901,3.169579,126.650614,
3,DK_cleaned,NN_rnn,test,0.902906,3.550687,34419.856474,185.525892,3.51897,130.970629,
4,DK_cleaned,NN_gru,dev,0.624693,2.734616,26282.18057,162.117798,2.725521,107.921444,



SARIMA Metrics:


Unnamed: 0,country,model,split,MASE,sMAPE,MSE,RMSE,MAPE,80_PI_coverage
0,DK_cleaned,SARIMA_FOURIER_FAST,dev,1.528368,9.980405,193654.3,440.061653,9.51866,
1,DK_cleaned,SARIMA_FOURIER_FAST,test,1.608288,9.206226,158076.3,397.588063,9.073562,
2,FR_cleaned,SARIMA_FOURIER_FAST,dev,1.144536,6.765464,20579670.0,4536.481608,6.610502,
3,FR_cleaned,SARIMA_FOURIER_FAST,test,2.230541,8.525399,826959100.0,28756.895915,12.844023,
4,ES_cleaned,SARIMA_FOURIER_FAST,dev,1.286659,8.847124,8587136.0,2930.381519,8.617038,


In [3]:
# Combine and Visualize
if nn_summary_path.exists() and sarima_summary_path.exists():
    # Standardize columns to Title Case for consistency
    rename_map = {'country': 'Country', 'model': 'Model'}

    df_nn = nn_metrics.rename(columns=rename_map)
    df_sarima = sarima_metrics.rename(columns=rename_map)

    if 'Model' not in df_sarima.columns:
        df_sarima['Model'] = 'SARIMA'

    # Keep the UNION of columns. Restricting to the intersection discards any
    # metric that only one summary reports (the SARIMA summary has no MAE
    # column), which made the MAE chart below unreachable. Missing values are
    # filled with NaN and filtered out per metric before plotting.
    combined = pd.concat([df_nn, df_sarima], ignore_index=True, sort=False)
    # 'Unnamed: 0' is the row index that was written into the CSVs; it carries
    # no information for the comparison.
    combined = combined.drop(columns=['Unnamed: 0'], errors='ignore')

    # Each Country/Model pair has one row per split (dev/test). Draw one panel
    # per split so the bars do not mix dev and test values.
    facet_col = 'split' if 'split' in combined.columns else None

    # RMSE is always plotted; MAE only when at least one summary provides it.
    metrics_to_plot = ['RMSE'] + (['MAE'] if 'MAE' in combined.columns else [])

    for metric in metrics_to_plot:
        # Only models that actually report this metric appear in its chart.
        metric_rows = combined.dropna(subset=[metric])
        fig = px.bar(
            metric_rows,
            x='Country',
            y=metric,
            color='Model',
            barmode='group',
            facet_col=facet_col,
            title=f'{metric} Comparison by Country and Model',
        )
        fig.show()

## 2. Anomaly Detection Results

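We analyze the anomalies in the ensemble output for each country. A point is flagged as an anomaly when the absolute value of its residual z-score (`z_resid`) exceeds 3.5.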

In [4]:
# Residual z-score magnitude above which a point is flagged as an anomaly.
Z_SCORE_THRESHOLD = 3.5

# glob() yields files in an unspecified order; sort so that countries are
# reported and plotted in the same order on every run.
anomaly_files = sorted(ANOMALIES_DIR.glob("*_anomalies_ensemble.csv"))

if not anomaly_files:
    print(f"No anomaly files found in {ANOMALIES_DIR}")

for f in anomaly_files:
    # File names look like "<country>_anomalies_ensemble.csv"
    country = f.stem.split('_')[0]
    df = pd.read_csv(f, parse_dates=['timestamp'])

    # The anomaly flag is derived from the pre-calculated 'z_resid' column.
    # Report files that lack it instead of skipping them silently.
    if 'z_resid' not in df.columns:
        print(f"{country}: skipped, no 'z_resid' column in {f.name}")
        continue

    # The comparison uses the absolute value, so both large positive and large
    # negative residuals are counted.
    anomalies = df[df['z_resid'].abs() > Z_SCORE_THRESHOLD]
    print(f"{country}: Found {len(anomalies)} anomalies (Z-score > {Z_SCORE_THRESHOLD})")

    # Plot the full series, then overlay the flagged points in red
    fig = px.scatter(df, x='timestamp', y='y_true', title=f"{country} Price with Anomalies")
    fig.add_scatter(
        x=anomalies['timestamp'],
        y=anomalies['y_true'],
        mode='markers',
        marker=dict(color='red', size=10),
        name='Anomaly',
    )
    fig.show()

DK: Found 39 anomalies (Z-score > 3.5)


ES: Found 34 anomalies (Z-score > 3.5)


FR: Found 26 anomalies (Z-score > 3.5)
