In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import sys

# Make the project's `src` directory importable. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate entry each time.
SRC_DIR = "/kaggle/input/utility-smart-meter/src"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Suppress excessive TensorFlow logging. The variable is set before the project
# modules below are imported (presumably models.lstm_model is what pulls in
# TensorFlow -- confirm), so that it is in place when TensorFlow loads.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

#%% ------------------------------------------------
# 1) IMPORT NECESSARY MODULES
#%% ------------------------------------------------
from data.data_loader import load_all_raw_data
from data.data_cleaner import clean_and_merge_all_data
from features.feature_pipeline import create_comprehensive_features
from features.splitters import prepare_forecasting_data, prepare_weekahead_data
from utils.helpers import reduce_memory_footprint

# build_global_sequences_with_dates is the builder the sequence cells actually
# call; importing it here keeps the notebook runnable top-to-bottom on a fresh
# kernel instead of relying on a definition that only appears in a later cell.
# NOTE(review): assumes the installed utils.sequence_builder exports this name.
from utils.sequence_builder import (
    build_global_sequences,
    build_global_sequences_with_dates,
)
from models.lstm_model import LSTMForecaster
from evaluation.forecast_evaluation import (
    compute_forecast_metrics,
    print_split_summary,
    evaluate_model,
    evaluate_peak_performance,
    evaluate_forecast_residuals
)
from visualization.forecast_plots import (
    plot_feature_importance,
    plot_actual_vs_predicted,
    plot_peak_actual_vs_predicted
)

from sklearn.preprocessing import LabelEncoder


In [None]:
# Root directory that holds all raw data files and folders.
data_path = "/kaggle/input/smart-meters-in-london"

# One call loads every raw dataset; the loader returns them keyed by name.
raw_data = load_all_raw_data(data_path)

# Pull the individual tables out of the returned mapping for easier access:
# consumption records, household metadata, weather data and holiday dates.
df_consumption, df_household, df_weather, df_holiday = (
    raw_data[table_name]
    for table_name in ("consumption", "household", "weather", "holiday")
)

# Preview the consumption table to confirm that loading succeeded.
df_consumption.head()

In [None]:
# Hand the dictionary of raw tables (from the loading cell) to the project's
# clean-and-merge step; see data.data_cleaner for the exact cleaning rules.
df_final = clean_and_merge_all_data(raw_data)

# Preview the first rows of the merged frame as a quick sanity check.
df_final.head()

In [None]:
# Run the full feature pipeline on the cleaned, merged frame.
df_features = create_comprehensive_features(df_final)

# Integer-code the household IDs. The encoder is fitted on every household
# present at this point, i.e. before any household subset is selected, so the
# codes span the full household population.
df_features['household_code'] = LabelEncoder().fit_transform(df_features['LCLid'])

print("✅ Feature engineering completed:")
print(f"   📊 Total samples: {len(df_features):,}")
print(f"   📅 Date range: {df_features['day'].min()} to {df_features['day'].max()}")
print(f"   🏠 Households: {df_features['LCLid'].nunique()}")
print(f"   🔧 Features created: {len(df_features.columns)} total columns")

# Preview the result. A bare expression is only rendered when it is the last
# statement of a cell, so the preview goes at the end rather than mid-cell.
df_features.head()

In [None]:
# Rebind df_features to the output of the project's memory-reduction helper.
# NOTE(review): presumably this downcasts column dtypes -- confirm in
# utils.helpers that values are not altered.
df_features = reduce_memory_footprint(df_features)

In [None]:
# Columns to turn into integer category codes (extend this list as needed).
cat_cols = [
    "Acorn",
    "Acorn_grouped",
    "stdorToU",
    "season",
    "holiday_category",
]

for col in cat_cols:
    # Skip names that are absent from the frame.
    if col not in df_features.columns:
        continue
    # Only category/object columns are encoded; anything else (including a
    # column already encoded by a previous run of this cell) is left untouched.
    if df_features[col].dtype.name not in ("category", "object"):
        continue
    df_features[col] = df_features[col].astype("category").cat.codes

In [None]:
# LSTM forecasting configuration, assembled one section at a time.
CONFIG = {}

# --- Data scope & splitting ---------------------------------------------------
CONFIG.update(
    forecast_scope='subset',         # 'single', 'subset', or 'all'
    sample_household='MAC000001',    # used if forecast_scope='single'
    subset_households=100,           # int (number of random households) or list of LCLid
    seed=42,                         # random seed for reproducibility
    test_days=90,                    # number of days for test split
    val_days=60,                     # number of days for validation split
)

# --- Sequence & model hyperparameters -----------------------------------------
CONFIG.update(
    seq_length=14,                   # number of time steps per LSTM input window
    hidden_units=[128, 64],          # LSTM layer sizes (list of integers)
    dropout=0.2,                     # dropout rate for regularization
    learning_rate=0.001,             # Adam optimizer learning rate
    use_embedding=True,              # whether to include household embedding
    embedding_dim=8,                 # dimension of embedding for household_code
    scale_features=True,             # standard-scale input features
    scale_target=True,               # standard-scale target variable
    batch_size=64,                   # batch size for training
    epochs=50,                       # maximum number of training epochs
    early_stopping_patience=7,       # patience for early stopping (in epochs)
    forecast_horizon='day',          # 'day', 'week', or 'both'
)

# --- Column names & grouping --------------------------------------------------
CONFIG.update(
    group_col='LCLid',               # household identifier column
    household_col='household_code',  # integer-coded household ID column
    date_col='day',
)


In [None]:
# Determine which households to include, driven by CONFIG['forecast_scope'].
all_households = df_features[CONFIG['group_col']].unique()

if CONFIG['forecast_scope'] == 'single':
    hh_filter = [CONFIG['sample_household']]
elif CONFIG['forecast_scope'] == 'subset':
    sh = CONFIG['subset_households']
    if isinstance(sh, int):
        # Seed the global NumPy RNG so the random household draw is repeatable.
        np.random.seed(CONFIG['seed'])
        hh_filter = np.random.choice(all_households, size=sh, replace=False).tolist()
    elif isinstance(sh, list):
        hh_filter = sh
    else:
        raise ValueError("subset_households must be an int or a list of LCLid")
elif CONFIG['forecast_scope'] == 'all':
    hh_filter = all_households.tolist()
else:
    # Fail loudly on a typo instead of silently training on every household.
    raise ValueError("forecast_scope must be 'single', 'subset', or 'all'")

print(f"🏠 Forecasting only these households ({len(hh_filter)}): {hh_filter[:5]}{'...' if len(hh_filter)>5 else ''}")

# Filter df_features to include only selected households
df_sel = df_features[df_features[CONFIG['group_col']].isin(hh_filter)].copy()

#%% ------------------------------------------------
# 2) SPLIT DATA FOR DAY‐AHEAD FORECASTING (ON FILTERED DATA)
#%% ------------------------------------------------

print("\n📊 Preparing day‐ahead data split (filtered households)…")
# Split sizes come from CONFIG (as in the week-ahead cell) so that both
# horizons use the same test/validation windows when the config is edited.
train_df, val_df, test_df, feature_cols, target_col, _ = prepare_forecasting_data(
    df_sel,
    target_col="total_kwh",
    test_days=CONFIG['test_days'],
    val_days=CONFIG['val_days']
)
print(f"   ▶ Training samples:   {len(train_df):,}")
print(f"   ▶ Validation samples: {len(val_df):,}")
print(f"   ▶ Test samples:       {len(test_df):,}")
print(f"   ▶ Features:           {len(feature_cols)}")
print(f"   ▶ Target:             {target_col}\n")

In [None]:
#%% ------------------------------------------------
# BUILD SEQUENCES FOR GLOBAL LSTM
#%% ------------------------------------------------

# Window length and column names are read from CONFIG (as the week-ahead cell
# does) so that editing the config actually changes the day-ahead run too.
seq_len = CONFIG['seq_length']
print(f"🔄 Building sequences with seq_len = {seq_len}…")

# NOTE(review): the split cell returns `target_col`, but the sequences are built
# on the literal "label_1" column -- confirm this is the intended day-ahead label.
(
 X_train, y_train, hh_train, date_train,
 X_val,   y_val,   hh_val,   date_val,
 X_test,  y_test,  hh_test,  date_test
) = build_global_sequences_with_dates(
    train_df, val_df, test_df,
    feature_cols=feature_cols,
    target_col="label_1",   # day‐ahead
    seq_len=seq_len,
    group_col=CONFIG['group_col'],
    household_col=CONFIG['household_col'],
    date_col=CONFIG['date_col']
)


print(f"   ▶ X_train shape: {X_train.shape}")
print(f"   ▶ y_train shape: {y_train.shape}")
print(f"   ▶ X_val   shape: {X_val.shape}")
print(f"   ▶ y_val   shape: {y_val.shape}")
print(f"   ▶ X_test  shape: {X_test.shape}")
print(f"   ▶ y_test  shape: {y_test.shape}\n")

In [None]:
# ------------------------------------------------
# Instantiate, build and train the day-ahead LSTM
# ------------------------------------------------

# Constructor arguments: window geometry first, then the hyperparameters that
# are copied verbatim from CONFIG, and finally the random seed.
lstm_config = {"seq_length": seq_len, "n_features": len(feature_cols)}
lstm_config.update(
    (option, CONFIG[option])
    for option in (
        "hidden_units",
        "dropout",
        "learning_rate",
        "use_embedding",
        "embedding_dim",
        "scale_features",
        "scale_target",
    )
)
lstm_config["random_state"] = CONFIG['seed']

print("🚀 Building LSTMForecaster…")
model = LSTMForecaster(**lstm_config)

# Build explicitly so the architecture summary can be inspected before
# training. The household count is taken over the full feature frame.
model.build_model(
    n_features=len(feature_cols),
    n_households=df_features['household_code'].nunique(),
)
model.get_model_summary()

# Train with early stopping, validating on the validation split.
print("\n🏋️ Training LSTM model…")
history = model.fit(
    X_train,
    y_train,
    X_val=X_val,
    y_val=y_val,
    household_train=hh_train,
    household_val=hh_val,
    epochs=CONFIG['epochs'],
    batch_size=CONFIG['batch_size'],
    early_stopping=True,
    patience=CONFIG['early_stopping_patience'],
    verbose=2,
)

In [None]:
# Predictions for each split, produced in train -> val -> test order.
y_train_pred, y_val_pred, y_test_pred = (
    model.predict(X_split, hh_split)
    for X_split, hh_split in (
        (X_train, hh_train),
        (X_val, hh_val),
        (X_test, hh_test),
    )
)

# Metrics returned by the model's own evaluate method, in the same split order.
train_metrics, val_metrics, test_metrics = (
    model.evaluate(X_split, y_split, hh_split)
    for X_split, y_split, hh_split in (
        (X_train, y_train, hh_train),
        (X_val, y_val, hh_val),
        (X_test, y_test, hh_test),
    )
)

print("\n📊 OVERALL LSTM FORECAST EVALUATION")
print("----------------------------------------")
for summary_args in (
    ("Train", y_train, y_train_pred),
    ("Val", y_val, y_val_pred),
    ("Test", y_test, y_test_pred),
):
    print_split_summary(*summary_args)

In [None]:
#%% ------------------------------------------------
# PLOT ACTUAL vs PREDICTED – SAMPLE HOUSEHOLDS
#%% ------------------------------------------------

# Pick up to three random households from the test set. The sample size is
# capped at the number of available households so the cell also works when
# fewer than three are present (e.g. forecast_scope='single'); sampling without
# replacement would otherwise raise.
unique_hh = np.unique(hh_test)
sample_hhs = np.random.choice(unique_hh, size=min(3, len(unique_hh)), replace=False)

for hh in sample_hhs:
    # Mask on hh_test, not on test_df: the sequence arrays are aligned with
    # hh_test / date_test, not with the rows of the original frame.
    hh_mask = (hh_test == hh)

    dates = date_test[hh_mask]
    y_true_hh = y_test[hh_mask]
    y_pred_hh = y_test_pred[hh_mask]

    plt.figure(figsize=(10, 3))
    plt.plot(dates, y_true_hh, label="Actual", marker='o', linestyle='-')
    plt.plot(dates, y_pred_hh, label="Predicted", marker='x', linestyle='--')
    plt.title(f"Household {hh}: Actual vs Predicted (Day‐Ahead)")
    plt.xlabel("Date")
    plt.ylabel("kWh")
    plt.legend()
    plt.tight_layout()
    plt.show()


#%% ------------------------------------------------
# PLOT RESIDUALS – TEST SPLIT
#%% ------------------------------------------------

# Residuals vs. time; the red line marks zero error.
residuals = y_test - y_test_pred
plt.figure(figsize=(10, 3))
plt.scatter(date_test, residuals, alpha=0.3)
plt.hlines(0, xmin=date_test.min(), xmax=date_test.max(), colors="red")
plt.title("Residuals vs. Time (Test Split)")
plt.xlabel("Date")
plt.ylabel("Residual (Actual − Predicted)")
plt.tight_layout()
plt.show()


In [None]:
print("\n📊 Preparing week‐ahead data split…")

# Both positional arguments receive the household-filtered frame df_sel.
# NOTE(review): confirm against features.splitters what the two frames mean.
train_df7, val_df7, test_df7, feature_cols7, target_col7, _ = prepare_weekahead_data(
    df_sel,
    df_sel,
    val_days=CONFIG['val_days'],
    test_days=CONFIG['test_days'],
)

# Summary of the resulting split, printed as one block of text.
print("\n".join([
    f"   ▶ Training samples (week‐ahead):   {len(train_df7):,}",
    f"   ▶ Validation samples (week‐ahead): {len(val_df7):,}",
    f"   ▶ Test samples (week‐ahead):       {len(test_df7):,}",
    f"   ▶ Features:                        {len(feature_cols7)}",
    f"   ▶ Target:                          {target_col7}\n",
]))

In [None]:
# Build week-ahead sequences (with target dates) for the train/val/test splits.

print(f"🔄 Building week‐ahead sequences with seq_len = {seq_len}…")

# Each split yields four aligned arrays: inputs, targets, household codes and
# target dates. The window length is reused from the day-ahead cell; column
# names come from CONFIG.
(
    X_train7, y_train7, hh_train7, date_train7,
    X_val7,   y_val7,   hh_val7,   date_val7,
    X_test7,  y_test7,  hh_test7,  date_test7
) = build_global_sequences_with_dates(
    train_df7,
    val_df7,
    test_df7,
    seq_len=seq_len,
    target_col="label_7",
    feature_cols=feature_cols7,
    date_col=CONFIG['date_col'],
    group_col=CONFIG['group_col'],
    household_col=CONFIG['household_col'],
)

# Report the array shapes as one block of text.
print("\n".join([
    f"   ▶ X_train7 shape: {X_train7.shape}",
    f"   ▶ y_train7 shape: {y_train7.shape}",
    f"   ▶ X_val7   shape: {X_val7.shape}",
    f"   ▶ y_val7   shape: {y_val7.shape}",
    f"   ▶ X_test7  shape: {X_test7.shape}",
    f"   ▶ y_test7  shape: {y_test7.shape}\n",
]))


In [None]:
# ### Instantiate LSTM Model (Week‐Ahead)

lstm_config_week = {
    "seq_length": seq_len,
    "n_features": len(feature_cols7),
    "hidden_units": CONFIG['hidden_units'],
    "dropout": CONFIG['dropout'],
    "learning_rate": CONFIG['learning_rate'],
    "use_embedding": CONFIG['use_embedding'],
    "embedding_dim": CONFIG['embedding_dim'],
    "scale_features": CONFIG['scale_features'],
    "scale_target": CONFIG['scale_target'],
    "random_state": CONFIG['seed']
}

print("🚀 Building Week‐Ahead LSTMForecaster…")
model_week = LSTMForecaster(**lstm_config_week)

# Build explicitly to inspect summary.
# household_code was label-encoded over ALL households in df_features before the
# subset df_sel was taken, so the codes in df_sel are not contiguous and can be
# as large as the full household count minus one. The household count therefore
# has to come from df_features (as in the day-ahead cell), not from df_sel;
# otherwise codes could fall outside the range the model was sized for.
model_week.build_model(
    n_features=len(feature_cols7),
    n_households=df_features[CONFIG['household_col']].nunique()
)
model_week.get_model_summary()

# ### Train LSTM Model (Week‐Ahead)


print("\n🏋️ Training Week‐Ahead LSTM model…")
history_week = model_week.fit(
    X_train7, y_train7,
    X_val=X_val7, y_val=y_val7,
    household_train=hh_train7, household_val=hh_val7,
    epochs=CONFIG['epochs'],
    batch_size=CONFIG['batch_size'],
    early_stopping=True,
    patience=CONFIG['early_stopping_patience'],
    verbose=2
)


In [None]:
# Predict and evaluate the week-ahead model on every split.

# Predictions, produced in train -> val -> test order.
y_train_pred_week, y_val_pred_week, y_test_pred_week = (
    model_week.predict(X_split, hh_split)
    for X_split, hh_split in (
        (X_train7, hh_train7),
        (X_val7, hh_val7),
        (X_test7, hh_test7),
    )
)

# Metrics returned by the model's own evaluate method, in the same split order.
metrics_train_week, metrics_val_week, metrics_test_week = (
    model_week.evaluate(X_split, y_split, hh_split)
    for X_split, y_split, hh_split in (
        (X_train7, y_train7, hh_train7),
        (X_val7, y_val7, hh_val7),
        (X_test7, y_test7, hh_test7),
    )
)

print("\n📊 OVERALL Week‐Ahead LSTM Evaluation")
print("------------------------------------------------")
for summary_args in (
    ("Train7", y_train7, y_train_pred_week),
    ("Val7", y_val7, y_val_pred_week),
    ("Test7", y_test7, y_test_pred_week),
):
    print_split_summary(*summary_args)


In [None]:
# ### Plot Actual vs Predicted (Week‐Ahead) for Sample Households

#%%
# Sample up to three households; the size is capped at the number available so
# sampling without replacement cannot fail when fewer than three households are
# in the test set (e.g. forecast_scope='single').
unique_hh_week = np.unique(hh_test7)
sample_hhs_week = np.random.choice(
    unique_hh_week, size=min(3, len(unique_hh_week)), replace=False
)

for hh in sample_hhs_week:
    # The mask is built on hh_test7 because the sequence arrays are aligned
    # with it (and with date_test7), not with the rows of test_df7.
    hh_mask = (hh_test7 == hh)
    dates = date_test7[hh_mask]
    y_true_hh = y_test7[hh_mask]
    y_pred_hh = y_test_pred_week[hh_mask]

    plt.figure(figsize=(10, 3))
    plt.plot(dates, y_true_hh, label="Actual", marker='o', linestyle='-')
    plt.plot(dates, y_pred_hh, label="Predicted", marker='x', linestyle='--')
    plt.title(f"Household {hh}: Actual vs Predicted (Week‐Ahead)")
    plt.xlabel("Date")
    plt.ylabel("7‐Day Avg kWh")
    plt.legend()
    plt.tight_layout()
    plt.show()


# ### Plot Residuals vs Time (Week‐Ahead)

#%%
# Residuals over the test period; the red line marks zero error.
residuals_week = y_test7 - y_test_pred_week
plt.figure(figsize=(10, 3))
plt.scatter(date_test7, residuals_week, alpha=0.3)
plt.hlines(0, xmin=date_test7.min(), xmax=date_test7.max(), colors="red")
plt.title("Residuals vs Time (Week‐Ahead Test)")
plt.xlabel("Date")
plt.ylabel("Residual (Actual − Predicted)")
plt.tight_layout()
plt.show()


In [None]:
"""
src/utils/sequence_builder.py

Utility functions to convert a feature‐engineered DataFrame into sliding‐window
(“sequence”) inputs for LSTM training. Supports both global (all households)
and per‐household sequence construction. Handles optional household_code alignment,
and can optionally return the target dates for plotting.

Author: Shruthi Simha Chippagiri
Date: 2025
"""

import numpy as np
import pandas as pd


def build_sequences(
    df: pd.DataFrame,
    feature_cols: list,
    target_col: str,
    seq_len: int,
    group_col: str = "LCLid",
    include_household: bool = False,
    household_col: str = "household_code"
):
    """
    Build sliding-window sequences for LSTM input from a long DataFrame.

    For every group, each window consists of the feature rows
    ``i .. i + seq_len - 1``; its target (and household code) is read from row
    ``i + seq_len``, i.e. the row immediately after the window.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing time series data. Rows are consumed in their
        existing order within each group, so the caller must sort them
        chronologically beforehand; no sorting is done here.
    feature_cols : list of str
        List of feature column names. The values must be numeric, because the
        window is checked with ``np.isnan``.
    target_col : str
        Name of the target column (e.g., 'label_1' or 'label_7').
    seq_len : int
        Number of consecutive time steps per sequence (e.g., 14).
    group_col : str, default "LCLid"
        Column name indicating household or group identifier.
    include_household : bool, default False
        If True, also return an array of household codes aligned to each sequence.
    household_col : str, default "household_code"
        Column name containing the integer household codes. Only read when
        include_household=True.

    Returns
    -------
    X : np.ndarray, shape (N_samples, seq_len, n_features)
        Sequence input array.
    y : np.ndarray, shape (N_samples,)
        Target values corresponding to each sequence.
    hh_codes : np.ndarray, shape (N_samples,), optional
        Household code taken from the target row of each sequence
        (only returned if include_household=True).

    Notes
    -----
    - A group with ``seq_len`` rows or fewer produces no sequences.
    - A sequence is skipped if any feature value in the window, or the target
      value, is NaN.
    - When no sequence qualifies at all, empty arrays with the shapes above
      (N_samples = 0) are returned instead of raising.
    """
    X_list = []
    y_list = []
    hh_list = []

    for _, group_df in df.groupby(group_col):
        features_array = group_df[feature_cols].values
        target_array = group_df[target_col].values
        hh_series = group_df[household_col].values if include_household else None

        for i in range(len(group_df) - seq_len):
            window_feats = features_array[i : i + seq_len]
            window_target = target_array[i + seq_len]

            # Drop incomplete samples rather than feeding NaNs to the model.
            if np.isnan(window_feats).any() or np.isnan(window_target):
                continue

            X_list.append(window_feats)
            y_list.append(window_target)
            if include_household:
                hh_list.append(hh_series[i + seq_len])

    if X_list:
        X = np.stack(X_list, axis=0)
    else:
        # np.stack rejects an empty list; return a correctly shaped empty array
        # so callers can still inspect .shape or concatenate the result.
        X = np.empty((0, seq_len, len(feature_cols)), dtype=float)
    y = np.array(y_list, dtype=float)

    if include_household:
        hh_codes = np.array(hh_list, dtype=int)
        return X, y, hh_codes

    return X, y


def build_global_sequences(
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    test_df: pd.DataFrame,
    feature_cols: list,
    target_col: str,
    seq_len: int,
    group_col: str = "LCLid",
    household_col: str = "household_code"
):
    """
    Build sequences for the train, validation and test splits in one call.

    Each split is passed to ``build_sequences`` with ``include_household=True``
    and the same feature/target/window settings.

    Parameters
    ----------
    train_df, val_df, test_df : pd.DataFrame
        Pre-split DataFrames, processed in this order.
    feature_cols : list of str
        List of feature column names.
    target_col : str
        Name of the target column.
    seq_len : int
        Sequence length.
    group_col : str, default "LCLid"
        Household identifier column.
    household_col : str, default "household_code"
        Integer-coded household ID column.

    Returns
    -------
    tuple of nine arrays
        (X_train, y_train, hh_train,
         X_val,   y_val,   hh_val,
         X_test,  y_test,  hh_test)
    """
    collected = []
    for split_df in (train_df, val_df, test_df):
        # Each call contributes the (X, y, hh_codes) triple for one split.
        collected.extend(
            build_sequences(
                split_df,
                feature_cols,
                target_col,
                seq_len,
                group_col=group_col,
                include_household=True,
                household_col=household_col,
            )
        )
    return tuple(collected)


def build_sequences_with_dates(
    df: pd.DataFrame,
    feature_cols: list,
    target_col: str,
    seq_len: int,
    group_col: str = "LCLid",
    include_household: bool = False,
    household_col: str = "household_code",
    date_col: str = "day"
):
    """
    Like build_sequences, but also returns the target date for each window.

    For every group, each window consists of the feature rows
    ``i .. i + seq_len - 1``; its target, date and household code are read from
    row ``i + seq_len``. Rows are consumed in their existing order within each
    group, so the caller must sort them chronologically beforehand.

    Parameters
    ----------
    df : pd.DataFrame
        Long DataFrame holding all groups.
    feature_cols : list of str
        Feature column names; values must be numeric (checked with np.isnan).
    target_col : str
        Name of the target column.
    seq_len : int
        Number of time steps per window.
    group_col : str, default "LCLid"
        Column identifying the group (household).
    include_household : bool, default False
        If True, also return the household code of each sequence.
    household_col : str, default "household_code"
        Column holding integer household codes; only read when
        include_household=True.
    date_col : str, default "day"
        Column holding the dates; converted with pd.to_datetime.

    Returns
    -------
    X : np.ndarray (N, seq_len, n_features)
    y : np.ndarray (N,)
    hh_codes : np.ndarray (N,)
        Only if include_household=True.
    dates : np.ndarray (N,), dtype datetime64[ns]
        The date of the target row of each sequence.

    Notes
    -----
    - Sequences whose window features or target contain NaN are skipped.
    - When no sequence qualifies, empty arrays with the shapes above (N = 0)
      are returned instead of raising.
    """
    X_list = []
    y_list = []
    hh_list = []
    date_list = []

    for _, group_df in df.groupby(group_col):
        features_array = group_df[feature_cols].values
        target_array = group_df[target_col].values
        date_array = pd.to_datetime(group_df[date_col]).values
        hh_series = group_df[household_col].values if include_household else None

        for i in range(len(group_df) - seq_len):
            window_feats = features_array[i : i + seq_len]
            window_target = target_array[i + seq_len]

            # Drop incomplete samples rather than feeding NaNs to the model.
            if np.isnan(window_feats).any() or np.isnan(window_target):
                continue

            X_list.append(window_feats)
            y_list.append(window_target)
            date_list.append(date_array[i + seq_len])

            if include_household:
                hh_list.append(hh_series[i + seq_len])

    if X_list:
        X = np.stack(X_list, axis=0)
    else:
        # np.stack rejects an empty list; return a correctly shaped empty array
        # so callers can still inspect .shape or concatenate the result.
        X = np.empty((0, seq_len, len(feature_cols)), dtype=float)
    y = np.array(y_list, dtype=float)
    dates = np.array(date_list, dtype="datetime64[ns]")

    if include_household:
        hh_codes = np.array(hh_list, dtype=int)
        return X, y, hh_codes, dates

    return X, y, dates


def build_global_sequences_with_dates(
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    test_df: pd.DataFrame,
    feature_cols: list,
    target_col: str,
    seq_len: int,
    group_col: str = "LCLid",
    household_col: str = "household_code",
    date_col: str = "day"
):
    """
    Run build_sequences_with_dates on the train, validation and test splits.

    Every split is processed with ``include_household=True`` and the same
    feature/target/window/column settings, in train -> val -> test order.

    Returns a flat tuple of twelve arrays:
    (X_train, y_train, hh_train, date_train,
     X_val,   y_val,   hh_val,   date_val,
     X_test,  y_test,  hh_test,  date_test)
    """
    collected = []
    for split_df in (train_df, val_df, test_df):
        # Each call contributes the (X, y, hh_codes, dates) quadruple of a split.
        collected.extend(
            build_sequences_with_dates(
                split_df,
                feature_cols,
                target_col,
                seq_len,
                group_col=group_col,
                include_household=True,
                household_col=household_col,
                date_col=date_col,
            )
        )
    return tuple(collected)
