In [4]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

### Unweighted day-normalized

In [5]:
# Load energy data
energy = pd.read_csv('data/fasce_monthly_IT012E00314418_full.csv')
energy['date'] = pd.to_datetime(energy['date'])

# Load CDD data
cdd = pd.read_csv('data/LIML_CDD_24C.csv', skiprows=6)
cdd['Month starting'] = pd.to_datetime(cdd['Month starting'])
cdd = cdd[['Month starting', 'CDD 24']]

# Load HDD data
hdd = pd.read_csv('data/LIML_HDD_23C.csv', skiprows=6)
hdd['Month starting'] = pd.to_datetime(hdd['Month starting'])
hdd = hdd[['Month starting', 'HDD 23']]

# Merge all data on month/date
df = pd.merge(energy, cdd, left_on='date', right_on='Month starting', how='inner')
df = pd.merge(df, hdd, left_on='date', right_on='Month starting', how='inner')
# Filter df between 2023 and 2024
df = df[(df['date'] >= '2023-01-01') & (df['date'] <= '2023-12-31')]

# Prepare regression variables
X = df[['days_in_month', 'HDD 23', 'CDD 24']]
y = df['total_kWh']
#X = sm.add_constant(X)  # Adds intercept

# Fit regression model
model = sm.OLS(y, X).fit()
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:              total_kWh   R-squared (uncentered):                   0.978
Model:                            OLS   Adj. R-squared (uncentered):              0.971
Method:                 Least Squares   F-statistic:                              133.4
Date:                Thu, 18 Sep 2025   Prob (F-statistic):                    8.89e-08
Time:                        09:19:26   Log-Likelihood:                         -114.99
No. Observations:                  12   AIC:                                      236.0
Df Residuals:                       9   BIC:                                      237.4
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------

  return hypotest_fun_in(*args, **kwds)


In [6]:
# Use the model-aligned data and counts
resid = model.resid                     # residuals on the training rows actually used
y_bar = model.model.endog.mean()        # mean of y on those same rows
n = int(model.nobs)                     # number of obs actually used
p = len(model.params)                   # number of estimated parameters (incl. intercept if present)

nmbe = resid.sum() / ((n - p) * y_bar) * 100
rmse = np.sqrt((resid**2).sum() / (n - p))
cvrmse = rmse / y_bar * 100



print("Model Evaluation Metrics")
print(f"R²:      {model.rsquared:.3f}")
print(f"NMBE:    {nmbe:.2f}% (target: |≤5% monthly|)")
print(f"CV(RMSE):{cvrmse:.2f}% (target: ≤15% monthly)")

Model Evaluation Metrics
R²:      0.978
NMBE:    0.18% (target: |≤5% monthly|)
CV(RMSE):17.43% (target: ≤15% monthly)


### Validation

In [8]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score

def compute_metrics(y_true, y_pred, p):
    resid = y_true - y_pred
    n = len(y_true)

    # R²
    r2 = r2_score(y_true, y_pred)

    # RMSE
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # CV(RMSE) (%)
    cvrmse = rmse / y_true.mean() * 100

    # NMBE (%)
    nmbe = resid.sum() / ((n - p) * y_true.mean()) * 100

    return {"R²": r2, "CV(RMSE)": cvrmse, "NMBE": nmbe}

results = {}

for year in df['date'].dt.year.unique():
    # Train on all years except this one
    train = df[df['date'].dt.year != year]
    test = df[df['date'].dt.year == year]

    X_train = train[['days_in_month', 'HDD 23', 'CDD 24']]
    y_train = train['total_kWh']

    X_test = test[['days_in_month', 'HDD 23', 'CDD 24']]
    y_test = test['total_kWh']

    # Fit model (no intercept, to match your setup)
    model = sm.OLS(y_train, X_train).fit()

    # Predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Metrics (p = number of predictors = 3 here)
    train_metrics = compute_metrics(y_train, y_pred_train, p=X_train.shape[1])
    test_metrics = compute_metrics(y_test, y_pred_test, p=X_test.shape[1])

    results[year] = {"train": train_metrics, "test": test_metrics}

# Show results
for year, metrics in results.items():
    print(f"\n=== Leaving out {year} ===")
    print("Train:", metrics["train"])
    print(" Test:", metrics["test"])


ValueError: zero-size array to reduction operation maximum which has no identity

In [None]:
summary = []
for year, metrics in results.items():
    row = {
        "Year left out": year,
        "Train R²": metrics["train"]["R²"],
        "Train CV(RMSE)%": metrics["train"]["CV(RMSE)"],
        "Train NMBE%": metrics["train"]["NMBE"],
        "Test R²": metrics["test"]["R²"],
        "Test CV(RMSE)%": metrics["test"]["CV(RMSE)"],
        "Test NMBE%": metrics["test"]["NMBE"]
    }
    summary.append(row)

summary_df = pd.DataFrame(summary)
print(summary_df.round(3))


   Year left out  Train R²  Train CV(RMSE)%  Train NMBE%  Test R²  \
0           2023     0.832            9.189        0.096    0.210   
1           2024     0.364           15.092        0.184    0.706   

   Test CV(RMSE)%  Test NMBE%  
0          16.826      -7.603  
1          12.153       7.955  


In [None]:
summary_df

NameError: name 'summary_df' is not defined

#### Test

In [None]:
test_data = pd.read_csv('data/WAO Romolo C32_electricity_M.csv')


In [None]:
# 1. Get predictions on test data
y_test = test_data[y_col]                # actual test values
y_pred = model.predict(test_data[X_cols]) # predicted test values

# 2. Residuals (errors on test data)
resid = y_test - y_pred

# 3. Needed values
y_bar = y_test.mean()      # mean of y on the test set
n = len(y_test)            # number of test observations
p = len(model.params)      # number of estimated parameters (same as training)

# 4. Metrics
nmbe = resid.sum() / ((n - p) * y_bar) * 100
rmse = np.sqrt((resid**2).sum() / (n - p))
cvrmse = rmse / y_bar * 100

# 5. Print results
print("Test Data Evaluation Metrics")
print(f"R²:      {model.rsquared:.3f} (training R², test doesn't have this by default)")
print(f"NMBE:    {nmbe:.2f}% (target: |≤5% monthly|)")
print(f"CV(RMSE):{cvrmse:.2f}% (target: ≤15% monthly)")

# Optional: Show predictions vs actuals
results = pd.DataFrame({"Actual": y_test, "Predicted": y_pred, "Residual": resid})
print(results.head())

### Model with daily data

In [10]:
# Load energy data
energy = pd.read_csv('data/Viale Cassala, 30_electricity_D.csv')
energy['date'] = pd.to_datetime(energy['utc time'])

# Load CDD data
cdd = pd.read_csv('data/LIML_CDD_22C_daily.csv', skiprows=6)
cdd['Date'] = pd.to_datetime(cdd['Date'])
cdd = cdd[['Date', 'CDD 22']]

# Load HDD data
hdd = pd.read_csv('data/LIML_HDD_15.5C_daily.csv', skiprows=6)
hdd['Date'] = pd.to_datetime(hdd['Date'])
hdd = hdd[['Date', 'HDD 15.5']]

# Merge all data on month/date
df = pd.merge(energy, cdd, left_on='date', right_on='Date', how='inner')
df = pd.merge(df, hdd, left_on='date', right_on='Date', how='inner')
# Filter df between 2023 and 2024
df = df[(df['date'] >= '2024-01-01') & (df['date'] <= '2024-12-31')]

# Prepare regression variables
X = df[['HDD 15.5', 'CDD 22']]
y = df['IT012E00314418']
#X = sm.add_constant(X)  # Adds intercept

# Fit regression model
model = sm.OLS(y, X).fit()
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:         IT012E00314418   R-squared (uncentered):                   0.789
Model:                            OLS   Adj. R-squared (uncentered):              0.787
Method:                 Least Squares   F-statistic:                              660.2
Date:                Thu, 18 Sep 2025   Prob (F-statistic):                   3.56e-120
Time:                        09:42:33   Log-Likelihood:                         -5115.6
No. Observations:                 356   AIC:                                  1.024e+04
Df Residuals:                     354   BIC:                                  1.024e+04
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [11]:
# Use the model-aligned data and counts
resid = model.resid                     # residuals on the training rows actually used
y_bar = model.model.endog.mean()        # mean of y on those same rows
n = int(model.nobs)                     # number of obs actually used
p = len(model.params)                   # number of estimated parameters (incl. intercept if present)

nmbe = resid.sum() / ((n - p) * y_bar) * 100
rmse = np.sqrt((resid**2).sum() / (n - p))
cvrmse = rmse / y_bar * 100



print("Model Evaluation Metrics")
print(f"R²:      {model.rsquared:.3f}")
print(f"NMBE:    {nmbe:.2f}% (target: |≤10 daily|)")
print(f"CV(RMSE):{cvrmse:.2f}% (target: ≤30% daily)")

Model Evaluation Metrics
R²:      0.789
NMBE:    23.89% (target: |≤10 daily|)
CV(RMSE):49.76% (target: ≤30% daily)
