In [1]:
# ============================================================
# SmartMeterX (AMPds) — Auto-load CSVs, train 1D-CNN disaggregator,
# save: preprocessor.pkl, model_disagg.h5, model_config.yaml, metrics.json
# ============================================================
import os, re, csv, json, math, pickle, warnings, glob, random
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# --------------------------
# Paths (YOUR EXACT FOLDERS)
# --------------------------
# DATA_DIR is searched for CSV inputs; OUTPUT_DIR receives every saved artifact
# and is created up front if it does not exist yet.
DATA_DIR   = r"C:\Users\sagni\Downloads\SmartMeterX\archive (2)"
OUTPUT_DIR = r"C:\Users\sagni\Downloads\SmartMeterX"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --------------------------
# Reproducibility
# --------------------------
# One shared seed for the Python, NumPy and TensorFlow random generators.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# --------------------------
# Helpers: robust CSV reader & timestamp inference
# --------------------------
def robust_read_csv(path, expect_min_cols=2):
    """Read a delimited text file, trying several encodings and separators.

    A delimiter is first sniffed from the first 8 KiB of the file; if it is one
    of the supported separators it is tried before the others. Every
    (encoding, separator) pair is then attempted with the pandas python engine
    and the first parse with at least ``expect_min_cols`` columns is returned.

    Args:
        path: Path of the file to read.
        expect_min_cols: Minimum number of columns a parse must yield to be
            accepted.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        RuntimeError: If no combination produced an acceptable frame. The
            message carries the last parser error and, when some parse
            succeeded but was too narrow, the widest column count seen.
    """
    encs = ["utf-8", "utf-8-sig", "cp1252", "latin1"]
    seps = [",", ";", "\t", "|"]

    # Sniffing is only an ordering hint, so an unreadable or ambiguous preview
    # is not fatal: the fixed separator list is still tried below.
    try:
        with open(path, "rb") as f:
            head = f.read(8192).decode("latin1", errors="ignore")
        sniffed = csv.Sniffer().sniff(head).delimiter
    except (OSError, csv.Error):
        sniffed = None
    if sniffed in seps:
        seps.remove(sniffed)
        seps.insert(0, sniffed)

    last_err = None
    widest = None  # most columns seen in a parse that succeeded but was rejected
    for enc in encs:
        for sep in seps:
            try:
                df = pd.read_csv(path, encoding=enc, sep=sep, engine="python")
            except Exception as e:  # decoder/parser error types vary with pandas version
                last_err = e
                continue
            if df.shape[1] >= expect_min_cols:
                return df
            widest = df.shape[1] if widest is None else max(widest, df.shape[1])

    reason = f"Last error: {last_err}"
    if widest is not None:
        # Without this, a file that parses fine but has too few columns would
        # be reported as "Last error: None".
        reason += (
            f" (widest successful parse had {widest} column(s); "
            f"need at least {expect_min_cols})"
        )
    raise RuntimeError(f"Could not parse {path}. {reason}")

def pick_time_column(cols):
    """Choose which of ``cols`` holds the timestamps.

    Column names are compared case-insensitively against a priority-ordered
    list of common timestamp names; the first name on that list that is present
    wins. If none is present, the first column is returned.
    """
    known_names = ("timestamp", "time", "datetime", "date", "ts", "utc", "localtime", "index")

    # Lower-cased name -> original spelling (a later duplicate overrides an earlier one).
    by_lower = {}
    for original in cols:
        by_lower[original.lower()] = original

    for wanted in known_names:
        if wanted in by_lower:
            return by_lower[wanted]

    # Nothing recognised: fall back to the first column.
    return cols[0]

def parse_time_column(df, tcol):
    """Convert column ``tcol`` of ``df`` to pandas datetimes.

    Numeric columns are treated as Unix epochs when their median is large
    enough: above 1e12 the values are read as milliseconds, above 1e9 as
    seconds. Smaller numeric values and all non-numeric columns are passed to
    ``pd.to_datetime`` without a unit. Unparseable entries become ``NaT``.

    Args:
        df: Frame containing the column.
        tcol: Name of the column to convert.

    Returns:
        The converted column as returned by ``pd.to_datetime``.
    """
    s = df[tcol]

    if pd.api.types.is_numeric_dtype(s):
        # Heuristic on the median: large numbers look like epoch ms / epoch s.
        m = s.median()
        if m > 10**12:
            unit = "ms"
        elif m > 10**9:
            unit = "s"
        else:
            unit = None  # e.g. a small offset index; handled by the generic path
        if unit is not None:
            try:
                return pd.to_datetime(s, unit=unit, errors="coerce")
            except (ValueError, OverflowError, TypeError):
                # Epoch interpretation failed; fall through to generic parsing.
                pass

    # `infer_datetime_format` is intentionally not passed: it is deprecated
    # since pandas 2.0, where format inference is the default behaviour.
    return pd.to_datetime(s, errors="coerce")

def add_prefix(df, prefix):
    """Return ``df`` with non-time columns renamed to ``<prefix>__<column>``.

    Columns whose lower-cased name is one of the common timestamp names keep
    their name; all others are prefixed so that frames from different files
    do not collide on column names.
    """
    time_names = ("timestamp", "time", "datetime", "date", "ts", "utc", "localtime", "index")
    renames = {
        col: (col if col.lower() in time_names else f"{prefix}__{col}")
        for col in df.columns
    }
    return df.rename(columns=renames)

# --------------------------
# 1) Load every CSV in folder (ignore HTML), outer-join on minute index
# --------------------------
# Every *.csv below DATA_DIR (recursive). Sorted so that the column order of the
# combined frame does not depend on the filesystem's enumeration order.
all_csvs = sorted(
    p for p in glob.glob(os.path.join(DATA_DIR, "**", "*"), recursive=True)
    if os.path.isfile(p) and p.lower().endswith(".csv")
)

if not all_csvs:
    raise FileNotFoundError(f"No CSVs found under: {DATA_DIR}")


def agg_func(colname):
    """Return the resample aggregation for a column: "sum" or "mean".

    Columns whose lower-cased name contains an energy-like keyword are summed;
    everything else is averaged. The name passed in is the prefixed column
    name, so the file stem takes part in the substring match as well.
    """
    ln = colname.lower()
    if any(k in ln for k in ["wh","kwh","energy","consumption"]):
        return "sum"
    return "mean"


frames = []
for path in all_csvs:
    # Best-effort per file: a file that cannot be parsed is reported and skipped
    # rather than aborting the whole load.
    try:
        df = robust_read_csv(path, expect_min_cols=2)
        tcol = pick_time_column(list(df.columns))
        df = df.dropna(subset=[tcol]).copy()
        df[tcol] = parse_time_column(df, tcol)
        # Second dropna removes rows whose timestamp failed to parse (NaT).
        df = df.dropna(subset=[tcol]).copy()
        df = df.set_index(tcol).sort_index()

        # Keep only numeric channels
        num_df = df.select_dtypes(include=["number"]).copy()
        if num_df.empty:
            continue
        # Prefix with the file stem to keep column names unique across files
        stem = os.path.splitext(os.path.basename(path))[0]
        num_df = add_prefix(num_df, stem)

        # Resample to a 1-minute grid, aggregating each column per agg_func.
        # "1min" is used instead of the "1T" alias, which pandas deprecated in 2.2.
        aggmap = {c: agg_func(c) for c in num_df.columns}
        num_df = num_df.resample("1min").agg(aggmap)

        frames.append(num_df)
        print(f"[INFO] Loaded {os.path.basename(path)} -> cols={num_df.shape[1]}, rows={num_df.shape[0]}")
    except Exception as e:
        print(f"[WARN] Skipped {os.path.basename(path)}: {e}")

if not frames:
    raise RuntimeError("No usable numeric CSVs could be parsed.")

# Column-wise concat aligns all frames on the union of their timestamps.
data = pd.concat(frames, axis=1).sort_index()
# Drop columns entirely empty
data = data.dropna(axis=1, how="all")
# Fill gaps (forward/back fill then zeros for anything still missing)
data = data.ffill().bfill().fillna(0.0)

print("[INFO] Combined frame:", data.shape)

# --------------------------
# 2) Choose mains + appliances automatically
#    - mains: column with the highest total energy/mean that looks like aggregate
#    - appliances: top-N by total energy excluding mains
# --------------------------
def looks_like_mains(colname):
    """Return True if the lower-cased column name contains a mains-like keyword.

    This is a plain substring test, so a keyword embedded in a longer word
    (for example "net" inside "internet") also counts as a match.
    """
    keywords = (
        "mains", "aggregate", "total", "house", "net",
        "use", "whole", "summed", "site", "mainspower",
    )
    lowered = colname.lower()
    for keyword in keywords:
        if keyword in lowered:
            return True
    return False

# Column totals over the whole frame, used as the ranking score below.
energy_by_col = data.sum()

# Mains: the highest-scoring column among those whose name looks like an
# aggregate; if no name matches, simply the highest-scoring column overall.
mains_candidates = [name for name in data.columns if looks_like_mains(name)]
if not mains_candidates:
    mains_col = energy_by_col.idxmax()
else:
    mains_col = max(mains_candidates, key=lambda name: energy_by_col.get(name, 0))

# Appliances: every other column ranked by score, keeping at most the top 5
# for a fast baseline.
appliance_pool = [name for name in data.columns if name != mains_col]
appliance_pool_sorted = sorted(
    appliance_pool,
    key=lambda name: energy_by_col.get(name, 0),
    reverse=True,
)
APPLIANCE_COLS = appliance_pool_sorted[:5]

if not APPLIANCE_COLS:
    raise RuntimeError("No appliance-like columns found after parsing. Check CSV contents.")

print("[INFO] Selected mains:", mains_col)
print("[INFO] Selected appliances:", APPLIANCE_COLS)

# --------------------------
# 3) Build features: mains + calendar (sin/cos hour & day)
# --------------------------
def calendar_features(index):
    """Encode hour-of-day and day-of-week of a datetime index cyclically.

    Returns a float32 array with one row per timestamp and four columns:
    sin/cos of the hour (period 24) followed by sin/cos of the weekday
    (period 7).
    """
    hours = index.hour.values          # 0-23
    weekdays = index.dayofweek.values  # 0-6
    encoded = [
        np.sin(2*np.pi*hours/24.0),
        np.cos(2*np.pi*hours/24.0),
        np.sin(2*np.pi*weekdays/7.0),
        np.cos(2*np.pi*weekdays/7.0),
    ]
    return np.stack(encoded, axis=1).astype("float32")

# Model inputs: the mains signal plus the four cyclic calendar columns,
# all on the same index as the combined data frame.
features = pd.DataFrame(index=data.index)
features["mains"] = data[mains_col].astype("float32")
cal = calendar_features(features.index)
for name, column in zip(["h_sin","h_cos","d_sin","d_cos"], cal.T):
    features[name] = column

# Model outputs: one column per selected appliance.
targets = data[APPLIANCE_COLS].astype("float32").copy()

# --------------------------
# 4) Time-based split: 70/15/15
# --------------------------
# Rows are cut by position (no shuffling): first 70% train, next 15%
# validation, remainder test.
n = len(features)
i1, i2 = int(n * 0.70), int(n * 0.85)

X_train_df, y_train_df = features.iloc[:i1],   targets.iloc[:i1]
X_val_df,   y_val_df   = features.iloc[i1:i2], targets.iloc[i1:i2]
X_test_df,  y_test_df  = features.iloc[i2:],   targets.iloc[i2:]

# --------------------------
# 5) Windowed sequences for 1D-CNN (seq2seq)
# --------------------------
WINDOW = 256   # ~ 4.3 hours at 1-minute sampling
STRIDE = 16    # ~ 16 minutes hop

def make_windows(Xdf, ydf, window=WINDOW, stride=STRIDE):
    """Cut aligned sliding windows out of a feature frame and a target frame.

    Windows start at rows 0, ``stride``, 2*``stride``, ... and each covers
    ``window`` consecutive rows; a trailing remainder shorter than ``window``
    is dropped.

    Args:
        Xdf: Feature frame (anything exposing ``.values``).
        ydf: Target frame with the same number of rows as ``Xdf``.
        window: Number of rows per window.
        stride: Row offset between the starts of consecutive windows.

    Returns:
        ``(X, Y)`` arrays stacked along a new leading window axis, or
        ``(None, None)`` when the input is shorter than one window.

    Raises:
        ValueError: If ``Xdf`` and ``ydf`` have different lengths.
    """
    X = Xdf.values
    Y = ydf.values
    if len(X) != len(Y):
        # Windows are cut from both arrays at the same offsets, so unequal
        # lengths would pair inputs with the wrong (or missing) targets.
        raise ValueError(
            f"Feature and target lengths differ: {len(X)} != {len(Y)}"
        )

    starts = range(0, len(X) - window + 1, stride)
    if not starts:
        return None, None
    xs = np.stack([X[s:s + window] for s in starts])
    ys = np.stack([Y[s:s + window] for s in starts])
    return xs, ys

# Window each split independently so no window spans a split boundary.
Xtr_win, ytr_win = make_windows(X_train_df, y_train_df)
Xva_win, yva_win = make_windows(X_val_df,   y_val_df)
Xte_win, yte_win = make_windows(X_test_df,  y_test_df)

# make_windows returns None when a split is shorter than one window.
for nm, arr in zip(
    ("Xtr", "ytr", "Xva", "yva", "Xte", "yte"),
    (Xtr_win, ytr_win, Xva_win, yva_win, Xte_win, yte_win),
):
    if arr is None:
        raise RuntimeError(f"{nm} windows are empty. Try reducing WINDOW/STRIDE or check data.")
print(
    "[INFO] Windowed shapes:",
    *(w.shape for w in (Xtr_win, ytr_win, Xva_win, yva_win, Xte_win, yte_win)),
)

# --------------------------
# 6) Scale features and targets (fit on train windows)
# --------------------------
# Flatten windows for scaler fit
# Collapse the (window, time) axes so each row is one time step, which is the
# 2-D layout StandardScaler expects.
Xtr_flat = Xtr_win.reshape(-1, Xtr_win.shape[-1])
ytr_flat = ytr_win.reshape(-1, ytr_win.shape[-1])

# Both scalers are fitted on training windows only.
scaler_X = StandardScaler()
scaler_X.fit(Xtr_flat)
scaler_Y = StandardScaler()
scaler_Y.fit(ytr_flat)

def scale_xy(Xw, Yw):
    """Standardise windowed inputs and targets with the fitted scalers.

    Each array is flattened to 2-D for the transform, restored to its original
    shape and returned as float32.
    """
    def _standardise(scaler, windows):
        flat = windows.reshape(-1, windows.shape[-1])
        return scaler.transform(flat).reshape(windows.shape).astype("float32")

    return _standardise(scaler_X, Xw), _standardise(scaler_Y, Yw)

Xtr, ytr = scale_xy(Xtr_win, ytr_win)
Xva, yva = scale_xy(Xva_win, yva_win)
Xte, yte = scale_xy(Xte_win, yte_win)

# --------------------------
# 7) Build 1D-CNN seq2seq disaggregator
# --------------------------
Din  = Xtr.shape[-1]             # input channels (mains + 4 calendar features)
Dout = ytr.shape[-1]             # output channels (one per appliance)

def build_disagg_cnn(window=WINDOW, in_ch=Din, out_ch=Dout):
    """Build and compile the 1D-CNN sequence-to-sequence disaggregator.

    The network maps a ``(window, in_ch)`` input to a ``(window, out_ch)``
    output: all convolutions use "same" padding, and the last layer is a
    kernel-size-1 convolution with no activation. The model is compiled with
    Adam (learning rate 1e-3), MAE loss and an MAE metric named "mae".
    """
    stack = [
        layers.Conv1D(64, 5, padding="same", activation="relu"),
        layers.Conv1D(64, 5, padding="same", activation="relu"),
        layers.BatchNormalization(),
        layers.Conv1D(96, 5, padding="same", activation="relu"),
        layers.Dropout(0.2),
        # Linear head; non-negativity is enforced by clipping after prediction.
        layers.Conv1D(out_ch, 1, padding="same", activation=None),
    ]

    inp = keras.Input(shape=(window, in_ch))
    x = inp
    for layer in stack:
        x = layer(x)

    model = keras.Model(inp, x, name="smartmeterx_disagg_cnn")
    model.compile(
        optimizer=keras.optimizers.Adam(1e-3),
        loss="mae",
        metrics=[keras.metrics.MeanAbsoluteError(name="mae")],
    )
    return model

model = build_disagg_cnn()

# Stop once validation MAE has not improved for 5 epochs and roll back to the
# best weights seen.
early = keras.callbacks.EarlyStopping(
    monitor="val_mae",
    mode="min",
    patience=5,
    restore_best_weights=True,
)

hist = model.fit(
    Xtr,
    ytr,
    batch_size=64,
    epochs=40,
    validation_data=(Xva, yva),
    callbacks=[early],
    verbose=1,
)

# --------------------------
# 8) Evaluate on test (MAE per appliance, sMAPE)
# --------------------------
# Predict (scaled), invert scaling
# Predictions come out in scaled units with the windowed shape.
yte_pred_scaled = model.predict(Xte, batch_size=128, verbose=0)

# Flatten to (time steps, appliances) so scaler_Y can undo the standardisation.
yte_pred_flat = yte_pred_scaled.reshape(-1, Dout)
yte_true_flat = yte.reshape(-1, Dout)
yte_true_unscaled = scaler_Y.inverse_transform(yte_true_flat)

# Invert the scaling of the predictions, then clamp negatives to zero.
yte_pred_unscaled = np.clip(scaler_Y.inverse_transform(yte_pred_flat), 0.0, None)

def smape(a, f, eps=1e-6):
    """Symmetric mean absolute percentage error of forecast ``f`` against ``a``.

    Computed as ``100 * mean(2*|f - a| / (|a| + |f| + eps))``; ``eps`` keeps
    the denominator non-zero when both values are zero.
    """
    abs_err = np.abs(f - a)
    scale = np.abs(a) + np.abs(f) + eps
    return 100.0 * np.mean(2.0 * abs_err / scale)

# Per-appliance test errors in unscaled units, keyed by appliance column name.
mae_per_appliance = {
    ap: float(mean_absolute_error(yte_true_unscaled[:, k], yte_pred_unscaled[:, k]))
    for k, ap in enumerate(APPLIANCE_COLS)
}
smape_per_appliance = {
    ap: float(smape(yte_true_unscaled[:, k], yte_pred_unscaled[:, k]))
    for k, ap in enumerate(APPLIANCE_COLS)
}

# Overall figures: unweighted mean of the per-appliance values.
mae_avg = float(np.mean(list(mae_per_appliance.values())))
smape_avg = float(np.mean(list(smape_per_appliance.values())))

metrics = {
    "window": int(WINDOW),
    "stride": int(STRIDE),
    "num_appliances": int(Dout),
    "appliances": APPLIANCE_COLS,
    "mains_column": mains_col,
    "mae_per_appliance": mae_per_appliance,
    "smape_per_appliance": smape_per_appliance,
    "mae_mean": mae_avg,
    "smape_mean": smape_avg,
}

metrics_path = os.path.join(OUTPUT_DIR, "metrics.json")
with open(metrics_path, "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)
print("[INFO] Saved metrics.json")

# --------------------------
# 9) Save artifacts (.pkl, .h5, .yaml)
# --------------------------
# Everything needed to rebuild the model inputs at inference time: column
# choices, window geometry and both fitted scalers.
preproc = {
    "mains_column": mains_col,
    "appliance_columns": APPLIANCE_COLS,
    "feature_columns": ["mains","h_sin","h_cos","d_sin","d_cos"],
    "window": WINDOW,
    "stride": STRIDE,
    "scaler_X": scaler_X,
    "scaler_Y": scaler_Y,
    "sampling": "1T",
    "notes": "X = [mains + calendar]; Y = appliances; scalers fitted on flattened train windows.",
}
pkl_path = os.path.join(OUTPUT_DIR, "preprocessor.pkl")
with open(pkl_path, "wb") as f:
    pickle.dump(preproc, f)
print("[INFO] Saved preprocessor.pkl")

# The .h5 suffix makes Keras write the legacy HDF5 format (as requested).
h5_path = os.path.join(OUTPUT_DIR, "model_disagg.h5")
model.save(h5_path)
print("[INFO] Saved model_disagg.h5 ->", h5_path)

# YAML config dump (paths + hyperparams)
config = {
    "project": "SmartMeterX — NILM Disaggregation (AMPds auto-loader)",
    "paths": {
        "data_dir": DATA_DIR,
        "output_dir": OUTPUT_DIR
    },
    "selection": {
        "mains_column": mains_col,
        "appliance_columns": APPLIANCE_COLS
    },
    "preprocessing": {
        "resample": "1T",
        "calendar_features": ["h_sin","h_cos","d_sin","d_cos"],
        "fill": "ffill->bfill->0",
        "scalers": {"X": "StandardScaler", "Y": "StandardScaler"}
    },
    "windows": {"length": WINDOW, "stride": STRIDE},
    "model": {
        "type": "1D-CNN seq2seq",
        "layers": [
            {"Conv1D": {"filters": 64, "kernel_size": 5, "activation": "relu", "padding": "same"}},
            {"Conv1D": {"filters": 64, "kernel_size": 5, "activation": "relu", "padding": "same"}},
            {"BatchNorm": {}},
            {"Conv1D": {"filters": 96, "kernel_size": 5, "activation": "relu", "padding": "same"}},
            {"Dropout": 0.2},
            {"Conv1D": {"filters": len(APPLIANCE_COLS), "kernel_size": 1, "activation": "linear", "padding": "same"}}
        ],
        "optimizer": "adam",
        "learning_rate": 1e-3,
        "loss": "mae",
        "metrics": ["mae"],
        "epochs": 40,
        "batch_size": 64,
        "early_stopping": {"monitor":"val_mae","mode":"min","patience":5}
    }
}
yaml_path = os.path.join(OUTPUT_DIR, "model_config.yaml")


def to_yaml(d, indent=0):
    """Render nested dicts/lists as block-style YAML lines (PyYAML-free fallback).

    Args:
        d: A dict or list; any other value yields no lines.
        indent: Nesting depth; each level is indented by two spaces.

    Returns:
        A list of text lines without trailing newlines.
    """
    pad = "  " * indent
    if isinstance(d, dict):
        entries = [(f"{pad}{k}:", v) for k, v in d.items()]
    elif isinstance(d, list):
        entries = [(f"{pad}-", v) for v in d]
    else:
        entries = []

    lines = []
    for head, v in entries:
        if isinstance(v, (dict, list)):
            if v:
                lines.append(head)
                lines.extend(to_yaml(v, indent + 1))
            else:
                # An empty container must be written inline; a bare "key:"
                # would be read back as null.
                empty = "{}" if isinstance(v, dict) else "[]"
                lines.append(head + " " + empty)
        else:
            # JSON scalars are valid YAML: strings come out double-quoted with
            # proper escapes (repr() produced single-quoted strings whose
            # doubled backslashes YAML keeps literally, corrupting Windows
            # paths), and True/None become true/null.
            lines.append(head + " " + json.dumps(v, ensure_ascii=False, default=str))
    return lines


try:
    import yaml
    with open(yaml_path, "w", encoding="utf-8") as f:
        yaml.safe_dump(config, f, sort_keys=False)
except Exception as exc:
    # Deliberate best-effort: PyYAML missing or unable to dump -> minimal writer.
    print(f"[WARN] PyYAML dump failed ({exc!r}); writing minimal YAML fallback")
    with open(yaml_path, "w", encoding="utf-8") as f:
        f.write("\n".join(to_yaml(config)) + "\n")
print("[INFO] Saved model_config.yaml ->", yaml_path)

# Final summary of what was written and where.
print("\n[DONE] Artifacts saved in:", OUTPUT_DIR)
for _artifact_line in (
    " - preprocessor.pkl",
    " - model_disagg.h5",
    " - model_config.yaml",
    " - metrics.json",
):
    print(_artifact_line)


[INFO] Loaded Climate_HourlyWeather.csv -> cols=15, rows=1381
[INFO] Loaded Electricity_B1E.csv -> cols=11, rows=1051200
[INFO] Loaded Electricity_B2E.csv -> cols=11, rows=1051200
[INFO] Loaded Electricity_Billing.csv -> cols=14, rows=1056961
[INFO] Loaded Electricity_BME.csv -> cols=11, rows=1051200
[INFO] Loaded Electricity_CDE.csv -> cols=11, rows=1051200
[INFO] Loaded Electricity_CWE.csv -> cols=11, rows=1051200
[INFO] Loaded Electricity_DNE.csv -> cols=11, rows=1051200
[INFO] Loaded Electricity_DWE.csv -> cols=11, rows=1051200
[INFO] Loaded Electricity_EBE.csv -> cols=11, rows=1051200
[INFO] Loaded Electricity_EQE.csv -> cols=11, rows=1051200
[INFO] Loaded Electricity_FGE.csv -> cols=11, rows=1051200
[INFO] Loaded Electricity_FRE.csv -> cols=11, rows=1051200
[INFO] Loaded Electricity_GRE.csv -> cols=11, rows=1051200
[INFO] Loaded Electricity_HPE.csv -> cols=11, rows=1051200
[INFO] Loaded Electricity_HTE.csv -> cols=11, rows=1051200
[INFO] Loaded Electricity_I.csv -> cols=23, rows=



[INFO] Saved metrics.json
[INFO] Saved preprocessor.pkl
[INFO] Saved model_disagg.h5 -> C:\Users\sagni\Downloads\SmartMeterX\model_disagg.h5
[INFO] Saved model_config.yaml -> C:\Users\sagni\Downloads\SmartMeterX\model_config.yaml

[DONE] Artifacts saved in: C:\Users\sagni\Downloads\SmartMeterX
 - preprocessor.pkl
 - model_disagg.h5
 - model_config.yaml
 - metrics.json
