In [22]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import r2_score

In [25]:
# Load energy data
energy = pd.read_csv('data/Pizzium - Via Buonarroti Milano_electricity_D.csv')
energy['date'] = pd.to_datetime(energy['utc time'])

# Load CDD data
cdd = pd.read_csv('data/LIML_CDD_22C_daily.csv', skiprows=6)
cdd['Date'] = pd.to_datetime(cdd['Date'])
cdd = cdd[['Date', 'CDD 22']]

# Load HDD data
hdd = pd.read_csv('data/LIML_HDD_15.5C_daily.csv', skiprows=6)
hdd['Date'] = pd.to_datetime(hdd['Date'])
hdd = hdd[['Date', 'HDD 15.5']]

# Merge all data on month/date
df = pd.merge(energy, cdd, left_on='date', right_on='Date', how='inner')
df = pd.merge(df, hdd, left_on='date', right_on='Date', how='inner')
# Filter df between 2023 and 2024
df = df[(df['date'] >= '2024-01-01') & (df['date'] <= '2024-12-31')]

# Prepare regression variables
X = df[['HDD 15.5', 'CDD 22']]
y = df['IT012E00135714']
#X = sm.add_constant(X)  # Adds intercept

# Fit regression model
model = sm.OLS(y, X).fit()
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:         IT012E00135714   R-squared (uncentered):                   0.741
Model:                            OLS   Adj. R-squared (uncentered):              0.740
Method:                 Least Squares   F-statistic:                              520.6
Date:                Mon, 15 Sep 2025   Prob (F-statistic):                   1.72e-107
Time:                        15:09:40   Log-Likelihood:                         -4868.9
No. Observations:                 366   AIC:                                      9742.
Df Residuals:                     364   BIC:                                      9750.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [26]:
# Use the model-aligned data and counts
resid = model.resid                     # residuals on the training rows actually used
y_bar = model.model.endog.mean()        # mean of y on those same rows
n = int(model.nobs)                     # number of obs actually used
p = len(model.params)                   # number of estimated parameters (incl. intercept if present)

nmbe = resid.sum() / ((n - p) * y_bar) * 100
rmse = np.sqrt((resid**2).sum() / (n - p))
cvrmse = rmse / y_bar * 100



print("Model Evaluation Metrics")
print(f"R²:      {model.rsquared:.3f}")
print(f"NMBE:    {nmbe:.2f}% (target: |≤10 daily|)")
print(f"CV(RMSE):{cvrmse:.2f}% (target: ≤30% daily)")

Model Evaluation Metrics
R²:      0.741
NMBE:    30.39% (target: |≤10 daily|)
CV(RMSE):52.02% (target: ≤30% daily)


### Weekly

In [27]:
import pandas as pd
import statsmodels.api as sm

# --- Load energy data ---
energy = pd.read_csv('data/Pizzium - Via Buonarroti Milano_electricity_D.csv')
energy['date'] = pd.to_datetime(energy['utc time'])
energy = energy[['date', 'IT012E00135714']]

# --- Load CDD data ---
cdd = pd.read_csv('data/LIML_CDD_22C_daily.csv', skiprows=6)
cdd['Date'] = pd.to_datetime(cdd['Date'])
cdd = cdd[['Date', 'CDD 22']].rename(columns={'Date': 'date'})

# --- Load HDD data ---
hdd = pd.read_csv('data/LIML_HDD_15.5C_daily.csv', skiprows=6)
hdd['Date'] = pd.to_datetime(hdd['Date'])
hdd = hdd[['Date', 'HDD 15.5']].rename(columns={'Date': 'date'})

# --- Merge daily data ---
df = pd.merge(energy, cdd, on='date', how='inner')
df = pd.merge(df, hdd, on='date', how='inner')

# --- Filter between 2024 only ---
df = df[(df['date'] >= '2024-01-01') & (df['date'] <= '2024-12-31')]

# --- Weekly aggregation ---
weekly = df.resample('W', on='date').sum().reset_index()

# --- Add "days" column for proper baseline modelling ---
weekly['days'] = 7  # each resampled period is 7 days

# --- Prepare regression variables ---
X = weekly[['HDD 15.5', 'CDD 22']]
y = weekly['IT012E00135714']

# --- Fit regression model ---
model = sm.OLS(y, X).fit()
print(model.summary())


                                 OLS Regression Results                                
Dep. Variable:         IT012E00135714   R-squared (uncentered):                   0.778
Model:                            OLS   Adj. R-squared (uncentered):              0.769
Method:                 Least Squares   F-statistic:                              89.32
Date:                Mon, 15 Sep 2025   Prob (F-statistic):                    2.17e-17
Time:                        15:09:54   Log-Likelihood:                         -803.46
No. Observations:                  53   AIC:                                      1611.
Df Residuals:                      51   BIC:                                      1615.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [31]:
print(weekly[['IT012E00135714','HDD 15.5','CDD 22']].corr())


                IT012E00135714  HDD 15.5    CDD 22
IT012E00135714        1.000000 -0.082991  0.761561
HDD 15.5             -0.082991  1.000000 -0.492019
CDD 22                0.761561 -0.492019  1.000000


In [38]:
weekly.drop(columns=['days'], inplace=True)

In [39]:
weekly.to_csv('data/pizzium_weekly.csv', index=False)

In [23]:
# --- Cross-validated R² (LOOCV) ---
loo = LeaveOneOut()
y_true, y_pred = [], []

for train_idx, test_idx in loo.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    cv_model = sm.OLS(y_train, X_train).fit()
    y_pred.append(cv_model.predict(X_test).iloc[0])
    y_true.append(y_test.iloc[0])

r2_cv = r2_score(y_true, y_pred)
print(f"Cross-validated R² (LOOCV): {r2_cv:.3f}")

Cross-validated R² (LOOCV): -4.746


In [29]:
# Use the model-aligned data and counts
resid = model.resid                     # residuals on the training rows actually used
y_bar = model.model.endog.mean()        # mean of y on those same rows
n = int(model.nobs)                     # number of obs actually used
p = len(model.params)                   # number of estimated parameters (incl. intercept if present)

nmbe = resid.sum() / ((n - p) * y_bar) * 100
rmse = np.sqrt((resid**2).sum() / (n - p))
cvrmse = rmse / y_bar * 100



print("Model Evaluation Metrics")
print(f"R²:      {model.rsquared:.3f}")
print(f"NMBE:    {nmbe:.2f}% (target: |≤10% daily|)")
print(f"CV(RMSE):{cvrmse:.2f}% (target: ≤30% daily)")

Model Evaluation Metrics
R²:      0.778
NMBE:    27.62% (target: |≤10% daily|)
CV(RMSE):49.03% (target: ≤30% daily)
