In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

### Unweighted day-normalized

In [22]:
# Load energy data
energy = pd.read_csv('data/fasce_monthly_IT012E00314418.csv')
energy['date'] = pd.to_datetime(energy['date'])

# Load CDD data
cdd = pd.read_csv('data/LIML_CDD_22C+-3C.csv', skiprows=6)
cdd['Month starting'] = pd.to_datetime(cdd['Month starting'])
cdd = cdd[['Month starting', 'CDD 22']]

# Load HDD data
hdd = pd.read_csv('data/LIML_HDD_15.5C+-3C.csv', skiprows=6)
hdd['Month starting'] = pd.to_datetime(hdd['Month starting'])
hdd = hdd[['Month starting', 'HDD 15.5']]

# Merge all data on month/date
df = pd.merge(energy, cdd, left_on='date', right_on='Month starting', how='inner')
df = pd.merge(df, hdd, left_on='date', right_on='Month starting', how='inner')
# Filter df between 2023 and 2024
df = df[(df['date'] >= '2023-01-01') & (df['date'] <= '2024-12-31')]

# Prepare regression variables
X = df[['days_in_month', 'HDD 15.5', 'CDD 22']]
y = df['total_kWh']
#X = sm.add_constant(X)  # Adds intercept

# Fit regression model
model = sm.OLS(y, X).fit()
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:              total_kWh   R-squared (uncentered):                   0.987
Model:                            OLS   Adj. R-squared (uncentered):              0.985
Method:                 Least Squares   F-statistic:                              541.3
Date:                Mon, 15 Sep 2025   Prob (F-statistic):                    4.89e-20
Time:                        11:42:55   Log-Likelihood:                         -224.49
No. Observations:                  24   AIC:                                      455.0
Df Residuals:                      21   BIC:                                      458.5
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------

In [23]:
# Use the model-aligned data and counts
resid = model.resid                     # residuals on the training rows actually used
y_bar = model.model.endog.mean()        # mean of y on those same rows
n = int(model.nobs)                     # number of obs actually used
p = len(model.params)                   # number of estimated parameters (incl. intercept if present)

nmbe = resid.sum() / ((n - p) * y_bar) * 100
rmse = np.sqrt((resid**2).sum() / (n - p))
cvrmse = rmse / y_bar * 100



print("Model Evaluation Metrics")
print(f"R²:      {model.rsquared:.3f}")
print(f"NMBE:    {nmbe:.2f}% (target: |≤5% monthly|)")
print(f"CV(RMSE):{cvrmse:.2f}% (target: ≤15% monthly)")

Model Evaluation Metrics
R²:      0.987
NMBE:    0.09% (target: |≤5% monthly|)
CV(RMSE):12.35% (target: ≤15% monthly)


#### Comparison

In [24]:
# --- Your regression (already fitted above) ---
X = df[['days_in_month', 'HDD 15.5', 'CDD 22']]
y = df['total_kWh']
model = sm.OLS(y, X).fit()

y_pred = model.fittedvalues
resid = y - y_pred
n, p = len(y), X.shape[1]  # 3 regressors

nmbe = resid.sum() / ((n - p) * y.mean()) * 100
rmse = np.sqrt((resid**2).sum() / (n - p))
cvrmse = rmse / y.mean() * 100

# --- Fill one row in the degreedays.net format ---
row = {
    "Regression": "1",  # or ="1 (shortlist)" to mimic exactly
    "Equation": "E = b*days + h*HDD + c*CDD",
    "Station ID": "LIML",   # put your station/source name
    "Readings": n,
    "First day": df["date"].min().strftime("%Y-%m-%d"),
    "Last day": df["date"].max().strftime("%Y-%m-%d"),
    "Total days": int((df["date"].max() - df["date"].min()).days) + 1,
    "Gap days": 0,
    "DD % est": 0.0,  # leave 0 if not estimated
    "HDD base C": 15.5,
    "CDD base C": 22,
    "HDD total": df["HDD 15.5"].sum(),
    "CDD total": df["CDD 22"].sum(),
    "Baseload (b)": model.params["days_in_month"],
    "HDD coef (h)": model.params["HDD 15.5"],
    "CDD coef (c)": model.params["CDD 22"],
    "R2": model.rsquared,
    "R2 adjusted": model.rsquared_adj,
    "R2 cross-validated": np.nan,  # not computed locally
    "S": np.sqrt((resid**2).sum() / (n - p)),  # std. error of estimate
    "CVRMSE": cvrmse
}

# --- Save to CSV ---
cols = [
    "Regression","Equation","Station ID","Readings","First day","Last day","Total days","Gap days",
    "DD % est","HDD base C","CDD base C","HDD total","CDD total",
    "Baseload (b)","HDD coef (h)","CDD coef (c)",
    "R2","R2 adjusted","R2 cross-validated","S","CVRMSE",
    "Baseload S","Baseload P","HDD coef S","HDD coef P","CDD coef S","CDD coef P"
]

my_summary = pd.DataFrame([row], columns=cols)
my_summary.to_csv("data/regression_summary.csv", index=False)

print("Saved summary to data/regression_summary.csv")


Saved summary to data/regression_summary.csv


### Weighted day-normalized

In [38]:
# Per-day variables
df['E_pd']   = df['total_kWh'] / df['days_in_month']
df['HDD_pd'] = df['HDD 15.5']  / df['days_in_month']
df['CDD_pd'] = df['CDD 22']    / df['days_in_month']

X = sm.add_constant(df[['HDD_pd','CDD_pd']])   # intercept = baseload per day (b)
w = df['days_in_month']                        # weights = period length (days)
y = df['E_pd']

res = sm.WLS(y, X, weights=w).fit()

b = res.params['const']        # baseload per day
h = res.params['HDD_pd']       # per HDD
c = res.params['CDD_pd']       # per CDD

# Predictions on totals in the classic form: E = b*days + h*HDD + c*CDD
yhat = b*df['days_in_month'] + h*df['HDD 15.5'] + c*df['CDD 22']

# ASHRAE Guideline 14 metrics (monthly): p=3 (days, HDD, CDD)
resid = df['total_kWh'] - yhat
n, p = len(df), 3
nmbe   = resid.sum() / ((n - p) * df['total_kWh'].mean()) * 100
rmse_d = np.sqrt((resid**2).sum() / (n - p))
cvrmse = rmse_d / df['total_kWh'].mean() * 100

print(f"b={b:.6f}, h={h:.6f}, c={c:.6f}")
print(f"R²={res.rsquared:.3f}, NMBE={nmbe:.2f}%, CV(RMSE)={cvrmse:.2f}%")


b=549.127433, h=41.051138, c=80.577296
R²=0.707, NMBE=-0.00%, CV(RMSE)=12.35%


In [None]:
'''
Model E = b*days + h*HDD + c*CDD Evaluation Metrics
CDD Base Temperature:22.0, HDD Base Temperature:15.5
R²:      0.672
NMBE:    22.54% (target: |≤5% monthly|)
CV(RMSE):32.37% (target: ≤15% monthly)'''

### No day-normalized

In [41]:
# Regression: E = h*HDD + c*CDD (without days)
X2 = df[['HDD 15.5', 'CDD 22']]
X2 = sm.add_constant(X2)  # Adds intercept
y2 = df['total_kWh']

model2 = sm.OLS(y2, X2).fit()
print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:              total_kWh   R-squared:                       0.724
Model:                            OLS   Adj. R-squared:                  0.697
Method:                 Least Squares   F-statistic:                     27.48
Date:                Fri, 12 Sep 2025   Prob (F-statistic):           1.37e-06
Time:                        10:49:47   Log-Likelihood:                -223.69
No. Observations:                  24   AIC:                             453.4
Df Residuals:                      21   BIC:                             456.9
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.671e+04   1169.648     14.283      0.0

In [42]:
# Use the model-aligned data and counts
resid = model2.resid                     # residuals on the training rows actually used
y_bar = model2.model.endog.mean()        # mean of y on those same rows
n = int(model2.nobs)                     # number of obs actually used
p = len(model2.params)                   # number of estimated parameters (incl. intercept if present)

nmbe = resid.sum() / ((n - p) * y_bar) * 100
rmse = np.sqrt((resid**2).sum() / (n - p))
cvrmse = rmse / y_bar * 100



print("Model Evaluation Metrics")
print(f"R²:      {model2.rsquared:.3f}")
print(f"NMBE:    {nmbe:.2f}% (target: |≤5% monthly|)")
print(f"CV(RMSE):{cvrmse:.2f}% (target: ≤15% monthly)")

Model Evaluation Metrics
R²:      0.724
NMBE:    -0.00% (target: |≤5% monthly|)
CV(RMSE):11.94% (target: ≤15% monthly)
