In [42]:
import pandas as pd
from pathlib import Path

# Paths
RAW_DIR = Path("../data/raw/electric")

# Load the integrated daily dataset
df = pd.read_csv(RAW_DIR / "complete_dataset.csv", parse_dates=["date"])
df.head()


Unnamed: 0,date,demand,RRP,demand_pos_RRP,RRP_positive,demand_neg_RRP,RRP_negative,frac_at_neg_RRP,min_temperature,max_temperature,solar_exposure,rainfall,school_day,holiday
0,2015-01-01,99635.03,25.633696,97319.24,26.415953,2315.79,-7.24,0.020833,13.3,26.9,23.6,0.0,N,Y
1,2015-01-02,129606.01,33.138988,121082.015,38.837661,8523.995,-47.809777,0.0625,15.4,38.8,26.8,0.0,N,N
2,2015-01-03,142300.54,34.564855,142300.54,34.564855,0.0,0.0,0.0,20.0,38.2,26.5,0.0,N,N
3,2015-01-04,104330.715,25.00556,104330.715,25.00556,0.0,0.0,0.0,16.3,21.4,25.2,4.2,N,N
4,2015-01-05,118132.2,26.724176,118132.2,26.724176,0.0,0.0,0.0,15.0,22.0,30.7,0.0,N,N


1.1. Time & calendar features

In [43]:
# Time features
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df["dayofweek"] = df["date"].dt.dayofweek       # 0 = Monday
df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)

# Normalize school_day / holiday into 0/1 flags
df["school_day_flag"] = (
    df["school_day"].astype(str).str.strip().str.lower()
      .map({"yes": 1, "y": 1, "true": 1, "no": 0, "n": 0, "false": 0})
)

df["holiday_flag"] = (
    df["holiday"].astype(str).str.strip().str.lower()
      .map({"yes": 1, "y": 1, "true": 1, "no": 0, "n": 0, "false": 0})
)

# Drop rows where flags couldn't be parsed
df = df.dropna(subset=["school_day_flag", "holiday_flag"]).copy()
df["school_day_flag"] = df["school_day_flag"].astype(int)
df["holiday_flag"] = df["holiday_flag"].astype(int)


1.2. Lag features (capture short-term dynamics)


In [44]:
# Sort chronologically to avoid data leakage
df = df.sort_values("date").reset_index(drop=True)

# Lag-1 demand and 7-day rolling mean
df["demand_lag1"] = df["demand"].shift(1)
df["demand_rolling7"] = df["demand"].rolling(7).mean()

# Drop first few rows where lags are NaN
df = df.dropna(subset=["demand_lag1", "demand_rolling7"]).reset_index(drop=True)


1.3. Classification target: high-demand days

In [45]:
threshold = df["demand"].quantile(0.80)
df["high_demand_flag"] = (df["demand"] >= threshold).astype(int)

threshold, df["high_demand_flag"].value_counts(normalize=True)


(np.float64(132815.599),
 high_demand_flag
 0    0.8
 1    0.2
 Name: proportion, dtype: float64)

2. Build feature matrix and time-aware train/test split

In [47]:
from sklearn.model_selection import train_test_split

# Feature columns for both tasks
feature_cols = [
    "year", "month", "dayofweek",
    "min_temperature", "max_temperature",
    "solar_exposure", "rainfall",
    "school_day_flag", "holiday_flag",
    "demand_lag1", "demand_rolling7"
]

X = df[feature_cols]
y_reg = df["demand"]
y_clf = df["high_demand_flag"]


In [48]:
n = len(df)
train_size = int(n * 0.8)

X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_reg_train, y_reg_test = y_reg.iloc[:train_size], y_reg.iloc[train_size:]
y_clf_train, y_clf_test = y_clf.iloc[:train_size], y_clf.iloc[train_size:]


3.1. Pipeline + baseline Linear Regression

In [49]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

numeric_features = feature_cols

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features)
    ],
    remainder="drop"
)

reg_lin = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", LinearRegression())
])

reg_lin.fit(X_train, y_reg_train)
y_pred_lin = reg_lin.predict(X_test)

mae = mean_absolute_error(y_reg_test, y_pred_lin)
mse = mean_squared_error(y_reg_test, y_pred_lin)
rmse = np.sqrt(mse)
r2 = r2_score(y_reg_test, y_pred_lin)

print("Linear Regression performance (test set)")
print(f"MAE  : {mae:,.2f} MW")
print(f"RMSE : {rmse:,.2f} MW")
print(f"R²   : {r2:,.3f}")


Linear Regression performance (test set)
MAE  : 5,683.89 MW
RMSE : 7,334.13 MW
R²   : 0.684


3.2 Random Forest Regressor

In [50]:
from sklearn.ensemble import RandomForestRegressor

reg_rf = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=300,
        max_depth=None,
        min_samples_leaf=5,
        n_jobs=-1,
        random_state=42
    ))
])

reg_rf.fit(X_train, y_reg_train)
y_pred_rf = reg_rf.predict(X_test)

mae_rf = mean_absolute_error(y_reg_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_reg_test, y_pred_rf))
r2_rf = r2_score(y_reg_test, y_pred_rf)

print("Random Forest Regression performance (test set)")
print(f"MAE  : {mae_rf:,.2f} MW")
print(f"RMSE : {rmse_rf:,.2f} MW")
print(f"R²   : {r2_rf:,.3f}")


Random Forest Regression performance (test set)
MAE  : 3,763.58 MW
RMSE : 4,865.16 MW
R²   : 0.861


4.Classification model

In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score
)


4.1. Build & train the classifier


In [52]:
clf_rf = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_leaf=5,
        class_weight="balanced",   # handle imbalance in high-demand days
        n_jobs=-1,
        random_state=42
    ))
])

clf_rf.fit(X_train, y_clf_train)

y_clf_pred = clf_rf.predict(X_test)
y_clf_proba = clf_rf.predict_proba(X_test)[:, 1]


In [53]:
print("Random Forest Classifier – Test Set Performance")
print(classification_report(y_clf_test, y_clf_pred, digits=3))

cm = confusion_matrix(y_clf_test, y_clf_pred)
print("Confusion matrix:\n", cm)

roc_auc = roc_auc_score(y_clf_test, y_clf_proba)
print(f"ROC-AUC: {roc_auc:.3f}")


Random Forest Classifier – Test Set Performance
              precision    recall  f1-score   support

           0      0.994     0.955     0.974       375
           1      0.717     0.956     0.819        45

    accuracy                          0.955       420
   macro avg      0.856     0.955     0.897       420
weighted avg      0.965     0.955     0.958       420

Confusion matrix:
 [[358  17]
 [  2  43]]
ROC-AUC: 0.988


In [54]:
# Extract trained RF model from pipeline
rf_model = clf_rf.named_steps["model"]

importances = rf_model.feature_importances_
for feat, imp in sorted(zip(feature_cols, importances), key=lambda x: -x[1]):
    print(f"{feat:20s} {imp:.3f}")


demand_rolling7      0.236
demand_lag1          0.231
max_temperature      0.197
dayofweek            0.146
min_temperature      0.096
solar_exposure       0.048
month                0.028
year                 0.008
rainfall             0.007
holiday_flag         0.002
school_day_flag      0.002


## 4. Machine Learning – High-Demand Day Classification

### 4.1. Problem Framing

Using the engineered daily dataset (`complete_dataset.csv`), I formulated a **binary classification task**:

- **Target:** `high_demand_flag`
  - 1 = daily demand in the top 20% of the historical distribution (high-demand day)
  - 0 = all other days
- **Features:**
  - Time & calendar: `year`, `month`, `dayofweek`, `is_weekend`, `school_day_flag`, `holiday_flag`
  - Weather: `min_temperature`, `max_temperature`, `solar_exposure`, `rainfall`
  - Demand history: `demand_lag1`, `demand_rolling7`

The data was sorted chronologically and split into **training (first 80%)** and **test (last 20%)** segments to avoid temporal leakage.

A **RandomForestClassifier** with `class_weight="balanced"` was trained on the preprocessed feature set (imputation + scaling pipeline).

---

### 4.2. Test-Set Performance

**Classification report (test set)**

| Class | Meaning          | Precision | Recall | F1-score | Support |
|-------|------------------|-----------|--------|----------|---------|
| 0     | Normal-demand day | 0.994     | 0.955  | 0.974    | 375     |
| 1     | High-demand day   | 0.717     | 0.956  | 0.819    | 45      |

- **Overall accuracy:** **0.955**
- **Macro-average F1:** **0.897**
- **Weighted-average F1:** **0.958**
- **ROC–AUC:** **0.988**

**Confusion matrix (test set)**

\[
\begin{bmatrix}
\text{TN}=358 & \text{FP}=17 \\
\text{FN}=2   & \text{TP}=43
\end{bmatrix}
\]

- Out of **45 actual high-demand days**, the model correctly flags **43** and misses only **2** (false negatives ≈ **4.4%** of peaks).
- Out of **375 normal-demand days**, it incorrectly flags **17** as high (false positives ≈ **4.5%**).

---

### 4.3. Interpretation

From an operational standpoint:

- The model delivers **strong discriminatory power** (ROC–AUC ≈ **0.988**), indicating that the combination of **weather, calendar, and lagged demand** features separates high-demand days from normal days extremely well.
- **Recall for high-demand days is 0.956**, meaning the system successfully anticipates almost all peak days. This is desirable in a planning context where **missing a genuine peak** is more costly than occasionally over-predicting.
- **Precision for high-demand days is 0.717**, implying that roughly **72% of the days flagged as “high demand” are truly high-demand days**, while the remainder are false alarms. This trade-off is acceptable for early-warning use cases (operations can review a manageable number of false positives).
- Normal days (class 0) are classified with **very high precision and recall (≥ 0.955)**, so the model rarely mislabels calm days as peaks.

In short, the Random Forest classifier provides a **reliable peak-demand alert mechanism**: it detects almost all high-demand days with a relatively low false-positive rate, which is a strong result given the inherent class imbalance in the dataset.

---

### 4.4. Implications and Next Steps

- The current threshold (0.5 on predicted probability) already achieves a **good balance** between catching peaks and avoiding unnecessary alarms. For different risk appetites, the decision threshold could be **tuned**:
  - Lower threshold → higher recall (even fewer missed peaks) at the expense of more false positives.
  - Higher threshold → fewer false positives but a higher chance of missing some peaks.
- Feature-importance analysis (from the Random Forest) can be used to:
  - Quantify the relative contribution of **lagged demand**, **temperature**, **solar exposure**, and **school/holiday flags**.
  - Prioritize which signals must be monitored in real time for operational forecasting.

Overall, the model is suitable as a **first-line decision support tool** for identifying days that warrant additional attention in capacity planning, demand-response scheduling, and operational risk management.
