오성사 전처리

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import io

# ============================================================
# 0) Base configuration (1-hour resolution) - single unified period
# ============================================================
SMARTCARE_DIR = Path("/home/hd/Desktop/LOG_SMARTCARE_20250930_OS")
EREPORT_DIR   = Path("/home/hd/Desktop/LOG_EREPORT_OS")
WEATHER_CSV   = Path("/home/hd/Desktop/LG/OBS_ASOS_TIM_20250701_20250930_OS.csv")

OUT_CSV = "./preprocessed_1h_master_with_weather_delta_20250701_20250930_OS.csv"

start = pd.Timestamp("2025-07-01 00:00:00")
end   = pd.Timestamp("2025-09-30 23:00:00")
# Lowercase "h" is the hourly alias used everywhere else in this file
# (ceil / resample / date_range); the uppercase "H" alias is deprecated
# since pandas 2.2.
master_index = pd.date_range(start=start, end=end, freq="1h")

# User-defined holidays (dates only, no time component)
HOLIDAY_DATES = pd.to_datetime([
    # July - August
    "2025-07-05",
    "2025-07-06", "2025-07-12",
    "2025-07-13", "2025-07-19",
    "2025-07-20",
    "2025-07-26", "2025-07-27",
    "2025-08-02", "2025-08-03",
    "2025-08-09", "2025-08-10",

    # 2025-08-12 is the missing-data event; it is listed here so that the
    # day is flagged like a weekend/holiday. Remove it if that is not wanted.
    "2025-08-12",

    # August - September
    "2025-08-15", "2025-08-16", "2025-08-17",
    "2025-08-23", "2025-08-24", "2025-08-30", "2025-08-31",
    "2025-09-06", "2025-09-07", "2025-09-13", "2025-09-14",
    "2025-09-20", "2025-09-21", "2025-09-27", "2025-09-28"
]).date

# ------------------------------------------------------------
# Coverage thresholds (power log / Smartcare log)
# ------------------------------------------------------------
POWER_COVERAGE_THRESHOLD = 0.85   # min. fraction of expected power rows per hour
SMART_PER_UNIT_MIN_RATIO = 0.70   # min. fraction of expected samples per unit per hour
N_UNITS = 7                       # number of distinct "Auto Id" units required per slot
SMART_SAMPLE_SEC = 5              # nominal Smartcare sampling interval in seconds

# ---- Fourier (rolling fit at every midnight) settings
FOURIER_S = 168.0                    # base period in hours (one week)
FOURIER_K = 10                       # number of harmonics -> 1 + 2K = 21 coefficients
FOURIER_FIT_HOURS = 14 * 24          # 2 weeks = 336
FOURIER_RIDGE_L2 = 1e-2              # ridge penalty used when solving for the coefficients

# ------------------------------------------------------------
# Ranges that are forced to OFF (power = 0), consistent with ceil() slotting.
# - Intent: OFF after 21:00 on 08-11 (i.e. from the 22:00 slot) and all of 08-12.
# - With ceil("1h"), the hour ending at 08-13 00:00 (08-12 23:00-24:00) is
#   assigned to the 08-13 00:00 slot, so the forced range must include
#   08-13 00:00 as its (inclusive) end.
# ------------------------------------------------------------
FORCED_OFF_RANGES = [
    ("2025-08-11 22:00:00", "2025-08-13 00:00:00"),
]

# ============================================================
# 1) 파일 이름에서 날짜(YYYYMMDD) 뽑기
# ============================================================
def extract_date_from_filename(filename: str) -> pd.Timestamp:
    """Parse the first run of eight digits in ``filename`` as a YYYYMMDD date.

    Args:
        filename: File name (not a full path is required, any string works).

    Returns:
        The date as a ``pd.Timestamp`` at midnight.

    Raises:
        ValueError: If ``filename`` contains no eight consecutive digits.
    """
    found = re.search(r"(\d{8})", filename)
    if found is None:
        raise ValueError(f"파일명에서 날짜(YYYYMMDD)를 찾을 수 없습니다: {filename}")
    date_token = found.group(1)
    return pd.to_datetime(date_token, format="%Y%m%d")

# ============================================================
# 2) Time 컬럼과 파일명 날짜로 datetime 만들기
# ============================================================
def make_datetime_from_time_and_filename(df: pd.DataFrame, file_path: Path) -> pd.DataFrame:
    """Index ``df`` by a datetime built from the file's date and its ``Time`` column.

    The calendar date comes from the YYYYMMDD token in ``file_path.name``; the
    time of day comes from each row's ``Time`` value. Rows whose combined
    string cannot be parsed are dropped.

    Args:
        df: Raw log frame containing a ``Time`` column.
        file_path: Path of the file the frame was read from.

    Returns:
        A copy of ``df`` sorted by, and indexed on, the new ``datetime`` column.

    Raises:
        ValueError: If ``df`` has no ``Time`` column, or the file name carries
            no date.
    """
    if "Time" not in df.columns:
        raise ValueError(f"'Time' 컬럼이 없습니다. 파일: {file_path}")

    day_prefix = extract_date_from_filename(file_path.name).strftime("%Y-%m-%d")

    stamped = df.copy()
    combined = day_prefix + " " + stamped["Time"].astype(str)
    # errors="coerce" turns unparsable rows into NaT so they can be dropped below
    stamped["datetime"] = pd.to_datetime(combined, errors="coerce")

    stamped = stamped.dropna(subset=["datetime"])
    return stamped.sort_values("datetime").set_index("datetime")

# ============================================================
# 3) NULL(\x00) 제거 후 read_csv
# ============================================================
def read_csv_remove_nulls(path: Path) -> pd.DataFrame:
    """Read a CSV file after stripping any NUL (``\\x00``) bytes from it.

    The whole file is read into memory; if NUL bytes are present a warning is
    printed and they are removed before the bytes are handed to
    ``pd.read_csv``.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame.
    """
    with open(path, mode="rb") as stream:
        payload = stream.read()

    contains_nul = payload.find(b"\x00") != -1
    if contains_nul:
        print(f"[WARN] NULL 바이트 감지 → 제거 후 로드: {path.name}")
        payload = payload.replace(b"\x00", b"")

    buffer = io.BytesIO(payload)
    return pd.read_csv(buffer)

# ============================================================
# 4) 폴더 안 CSV를 모두 읽어서 concat
# ============================================================
def load_all_csv_time_from_filename(folder: Path, pattern: str) -> pd.DataFrame | None:
    files = sorted(folder.glob(pattern))
    if not files:
        print(f"[WARN] {folder} 에 {pattern}에 해당하는 파일이 없습니다.")
        return None

    dfs = []
    for f in files:
        try:
            df_raw = read_csv_remove_nulls(f)
            df = make_datetime_from_time_and_filename(df_raw, f)
            df["src_file"] = f.name
            dfs.append(df)
        except Exception as e:
            print(f"[ERROR] {f} 로드 실패: {e}")

    if not dfs:
        return None

    return pd.concat(dfs, axis=0).sort_index()

# ============================================================
# 5) 시간 feature 생성 (1시간 기준)
#    - 요구: "주말 마크" -> 토/일도 is_holiday=1로 포함
# ============================================================
def create_time_features_1h(index: pd.DatetimeIndex) -> pd.DataFrame:
    """Build calendar features for every timestamp in ``index``.

    Columns produced, in order:
      * ``is_holiday`` - 1 when the date is in ``HOLIDAY_DATES`` or falls on a
        Saturday/Sunday, else 0.
      * ``day_sin`` / ``day_cos`` - 24-hour cycle encoded from the hour of day.
      * ``week_sin`` / ``week_cos`` - 168-step cycle whose phase is the row
        position, i.e. phase 0 is the first element of ``index``.

    Args:
        index: Hourly timestamps to describe.

    Returns:
        DataFrame indexed by ``index`` with the five feature columns.
    """
    custom_holidays = set(HOLIDAY_DATES)
    on_custom_holiday = pd.Series(index.date).isin(custom_holidays).to_numpy()
    on_weekend = index.dayofweek.to_numpy() >= 5  # Mon=0 .. Sun=6

    hour = index.hour.astype(int)
    position = np.arange(len(index), dtype=float)  # phase counted from the first row

    features = pd.DataFrame(index=index)
    features["is_holiday"] = (on_custom_holiday | on_weekend).astype(int)
    features["day_sin"] = np.sin(2 * np.pi * hour / 24.0)
    features["day_cos"] = np.cos(2 * np.pi * hour / 24.0)
    features["week_sin"] = np.sin(2 * np.pi * position / 168.0)
    features["week_cos"] = np.cos(2 * np.pi * position / 168.0)
    return features

# ============================================================
# 6) EREPORT 전처리 (1시간 Power 합산 + 결측 보정) + 강제 OFF(0)
#    - slot = ceil("1h") 유지 (이전 1시간을 다음 정각 slot로 귀속)
# ============================================================
def preprocess_ereport_power_1h(
    df_ereport: pd.DataFrame,
    master_index: pd.DatetimeIndex,
    freq: str = "1h",
    coverage_threshold: float = 0.85,
    forced_zero_ranges: list[tuple[str | pd.Timestamp, str | pd.Timestamp]] | None = None,
) -> pd.DataFrame:
    """Aggregate the EREPORT ``Power`` log into per-slot sums with gap compensation.

    Every row is assigned to ``slot = index.ceil(freq)``, i.e. the window
    ``(slot - freq, slot]`` is attributed to ``slot``. The sampling step is
    estimated as the median spacing of the whole log and converted to the
    expected number of rows per slot.

    Per slot:
      * fewer rows than ``expected_rows * coverage_threshold`` -> NaN;
      * at least ``expected_rows`` rows -> the raw sum;
      * otherwise the raw sum is adjusted depending on whether the in-slot
        observations around the missing grid times are non-zero:
        both sides non-zero -> scaled by ``expected_rows / n``;
        exactly one side    -> raw sum + 0.5 * mean observed * missing count;
        neither             -> raw sum unchanged.

    The result is reindexed onto ``master_index`` first, and only then are
    the forced-zero ranges written, so slots with no data at all are also
    set to 0 inside those ranges.

    Args:
        df_ereport: Log indexed by datetime, containing a ``Power`` column.
        master_index: Target hourly index of the output.
        freq: Slot width passed to ``ceil`` and ``pd.Timedelta``.
        coverage_threshold: Minimum fraction of expected rows for a slot to
            get a value.
        forced_zero_ranges: Optional ``(start, end)`` pairs; slots with
            ``start <= slot <= end`` (both inclusive) are set to 0.0.

    Returns:
        DataFrame with the single column ``Power_1h`` indexed by
        ``master_index``.

    Raises:
        ValueError: If ``Power`` is not a column of ``df_ereport``.
        RuntimeError: If the log has fewer than two rows, so no spacing can
            be estimated.
    """
    POWER_COL = "Power"
    if POWER_COL not in df_ereport.columns:
        raise ValueError(f"EREPORT에 '{POWER_COL}' 컬럼이 없습니다. 현재 컬럼: {df_ereport.columns.tolist()}")

    df = df_ereport.copy()
    df["slot"] = df.index.ceil(freq)  # keep the ceil convention: window (slot - freq, slot]

    # Estimate the nominal sampling step from the median spacing of the log.
    diffs = df.index.to_series().diff().dropna().dt.total_seconds()
    if len(diffs) == 0:
        raise RuntimeError("EREPORT 로그에서 시간 간격을 추정할 수 없습니다.")
    median_step = diffs.median()
    if median_step <= 0 or np.isnan(median_step):
        median_step = 60.0  # fallback step (seconds) when the median is unusable

    # NOTE(review): 3600 is hard-coded, so expected_rows assumes 1-hour slots
    # regardless of `freq` - confirm before using another frequency.
    expected_rows = int(round(3600 / median_step))
    expected_rows = max(expected_rows, 1)
    min_required_rows = int(expected_rows * coverage_threshold)

    grp_sum = df.groupby("slot")[POWER_COL].sum()
    grp_n   = df.groupby("slot")[POWER_COL].size()  # size() counts rows per slot

    out = pd.DataFrame(index=grp_sum.index)
    out["Power_1h_rawsum"] = grp_sum
    out["n_rows"] = grp_n
    out["Power_1h"] = np.nan

    for slot, row in out.iterrows():
        n = int(row["n_rows"])
        if n < min_required_rows:
            # insufficient coverage: leave the slot as NaN
            continue

        P_obs = float(row["Power_1h_rawsum"])
        m = expected_rows - n  # number of rows considered missing
        if m <= 0:
            # fully covered slot: the raw sum is used as is
            out.at[slot, "Power_1h"] = P_obs
            continue

        sub = df[df["slot"] == slot][[POWER_COL]].copy()
        if sub.empty:
            continue

        # with ceil, the slot's window is (slot - freq, slot]
        w_end = pd.Timestamp(slot)
        w_start = w_end - pd.Timedelta(freq)

        # Regular grid of expected sample times inside the window.
        step = pd.Timedelta(seconds=float(median_step))
        grid = pd.date_range(start=w_start + step, end=w_end, freq=step)

        # A grid time counts as missing when no observation has exactly that
        # timestamp. NOTE(review): if the log is not aligned to this grid,
        # every grid time is treated as missing - confirm that is acceptable.
        obs_set = set(sub.index)
        missing_times = [t for t in grid if t not in obs_set]
        m_cnt = m  # the adjustment uses expected_rows - n, not len(missing_times)

        obs_series = sub[POWER_COL].sort_index()
        past_nonzero = False
        future_nonzero = False

        # Thin the probe list out (every len//10-th element) to bound the work.
        probe = missing_times
        if len(probe) > 10:
            probe = probe[:: max(1, len(probe)//10)]

        for tmiss in probe:
            # last in-slot observation at or before the missing time
            past = obs_series.loc[:tmiss]
            if len(past) > 0 and float(past.iloc[-1]) != 0.0:
                past_nonzero = True

            # first in-slot observation at or after the missing time
            fut = obs_series.loc[tmiss:]
            if len(fut) > 0 and float(fut.iloc[0]) != 0.0:
                future_nonzero = True

            if past_nonzero and future_nonzero:
                break

        if past_nonzero and future_nonzero:
            # non-zero on both sides of a gap: scale up to full coverage
            P_adj = P_obs * (expected_rows / n)
        elif past_nonzero ^ future_nonzero:
            # non-zero on one side only: credit half of the mean observed value
            # for each missing row
            avg_obs = P_obs / max(n, 1)
            P_adj = P_obs + 0.5 * avg_obs * m_cnt
        else:
            # zero (or nothing) on both sides: no compensation
            P_adj = P_obs

        out.at[slot, "Power_1h"] = P_adj

    # =========================================================
    # Key step: reindex onto master_index FIRST so that rows exist for
    # slots that had no data at all
    # =========================================================
    out = out.reindex(master_index)

    # =========================================================
    # Only then apply the forced OFF ranges, so that the NaN slots created
    # above are actually overwritten with 0
    # =========================================================
    if forced_zero_ranges:
        for a, b in forced_zero_ranges:
            a = pd.Timestamp(a)
            b = pd.Timestamp(b)
            mask = (out.index >= a) & (out.index <= b)  # both ends inclusive
            out.loc[mask, "Power_1h"] = 0.0

    return out[["Power_1h"]]

# ============================================================
# 7) SMARTCARE 전처리 (1시간 Tod)
#    - 기존과 동일 (ceil 유지)
# ============================================================
def preprocess_smartcare_tod_1h(
    df_smart: pd.DataFrame,
    master_index: pd.DatetimeIndex,
    n_units: int = 7,
    sample_interval_sec: int = 5,
    freq: str = "1h",
    per_unit_min_ratio: float = 0.70
) -> pd.DataFrame:
    """Aggregate the Smartcare ``Tod`` signal into one value per slot.

    Rows are assigned to ``slot = index.ceil(freq)``. A slot is valid only
    when exactly ``n_units`` distinct ``Auto Id`` values reported in it and
    every one of them has at least
    ``floor(3600 / sample_interval_sec * per_unit_min_ratio)`` rows. For a
    valid slot the value is the mean over units of each unit's mean ``Tod``;
    every other slot is NaN.

    Args:
        df_smart: Log indexed by datetime with ``Auto Id`` and ``Tod`` columns.
        master_index: Target hourly index of the output.
        n_units: Number of units that must be present in a slot.
        sample_interval_sec: Nominal seconds between samples of one unit.
        freq: Slot width passed to ``ceil``.
        per_unit_min_ratio: Minimum fraction of expected samples per unit.

    Returns:
        DataFrame with the single column ``Tod_1h`` indexed by ``master_index``.

    Raises:
        ValueError: If a required column is missing.
    """
    absent = [name for name in ("Auto Id", "Tod") if name not in df_smart.columns]
    if absent:
        raise ValueError(f"SMARTCARE에 필요한 컬럼이 없습니다: {absent}")

    samples = df_smart.copy()
    samples["slot"] = samples.index.ceil(freq)

    nominal_count = int(round(3600 / sample_interval_sec))
    count_floor = int(np.floor(nominal_count * per_unit_min_ratio))

    # One groupby serves both the per-unit row counts and the per-unit means.
    per_unit = samples.groupby(["slot", "Auto Id"])["Tod"]
    counts = per_unit.size().unstack("Auto Id")
    means = per_unit.mean().unstack("Auto Id")

    every_unit_present = counts.notna().sum(axis=1) == n_units
    every_unit_dense = (counts >= count_floor).all(axis=1)
    usable = (every_unit_present & every_unit_dense).reindex(means.index).fillna(False)

    # mean across units, blanked out wherever the slot is not usable
    hourly = means.mean(axis=1).where(usable)

    return pd.DataFrame({"Tod_1h": hourly}).reindex(master_index)

# ============================================================
# 8) WEATHER 전처리 (Temperature/Humidity 1시간 정렬)
# ============================================================
def preprocess_weather_1h(weather_csv: Path, master_index: pd.DatetimeIndex) -> pd.DataFrame:
    """Load the weather CSV and align Temperature/Humidity to ``master_index``.

    The ``Time`` column is parsed to datetimes (unparsable rows are dropped),
    the two value columns are averaged per hour, and the result is reindexed
    onto ``master_index`` (hours without data become NaN).

    Args:
        weather_csv: Path of the weather CSV.
        master_index: Target hourly index of the output.

    Returns:
        DataFrame with columns ``Temperature`` and ``Humidity``.

    Raises:
        ValueError: If ``Time``, ``Temperature`` or ``Humidity`` is missing.
    """
    raw = pd.read_csv(weather_csv)

    if "Time" not in raw.columns:
        raise ValueError(f"날씨 CSV에 'Time' 컬럼이 없습니다. columns={raw.columns.tolist()}")
    value_cols = ["Temperature", "Humidity"]
    for col in value_cols:
        if col not in raw.columns:
            raise ValueError(f"날씨 CSV에 '{col}' 컬럼이 없습니다. columns={raw.columns.tolist()}")

    stamps = pd.to_datetime(raw["Time"], errors="coerce")
    timed = (
        raw.assign(datetime=stamps)
        .dropna(subset=["datetime"])
        .set_index("datetime")
        .sort_index()
    )

    hourly = timed[value_cols].resample("1h").mean()
    return hourly.reindex(master_index)

# ============================================================
# 9) Fourier feature (자정 롤링)
# ============================================================
def make_fourier_design(T: int, K: int, s: float) -> np.ndarray:
    """Return the ``(T, 1 + 2K)`` Fourier design matrix for ``t = 0 .. T-1``.

    Column 0 is the constant 1, columns ``1..K`` hold ``sin(2*pi*k*t/s)`` and
    columns ``K+1..2K`` hold ``cos(2*pi*k*t/s)`` for ``k = 1..K``.

    Args:
        T: Number of time steps (rows).
        K: Number of harmonics.
        s: Base period, in the same unit as the time step.
    """
    t = np.arange(T, dtype=float)
    harmonics = range(1, K + 1)
    sin_cols = [np.sin(2*np.pi*k*t/s) for k in harmonics]
    cos_cols = [np.cos(2*np.pi*k*t/s) for k in harmonics]
    return np.column_stack([np.ones(T, dtype=float), *sin_cols, *cos_cols])

def fit_fourier_ridge(y: np.ndarray, K: int, s: float, l2: float) -> np.ndarray:
    """Fit ridge-regularised Fourier coefficients to the series ``y``.

    Solves ``(X'X + l2 * I) theta = X'y`` where ``X`` is the Fourier design
    matrix for ``len(y)`` steps, ``K`` harmonics and base period ``s``.

    Returns:
        Coefficient vector of length ``1 + 2K``.
    """
    target = np.asarray(y, dtype=float).reshape(-1)
    design = make_fourier_design(len(target), K, s)
    n_coef = design.shape[1]
    normal_matrix = design.T @ design + l2*np.eye(n_coef)
    rhs = design.T @ target
    return np.linalg.solve(normal_matrix, rhs)

def build_midnight_fourier_feature(
    power_1h: pd.Series,
    index: pd.DatetimeIndex,
    fit_hours: int,
    pred_hours: int,
    K: int,
    s: float,
    l2: float
) -> pd.Series:
    """Build a Fourier baseline that is refitted at every midnight in ``index``.

    For each timestamp at 00:00 the previous ``fit_hours`` values of
    ``power_1h`` are fitted with a ridge Fourier model; the model is then
    evaluated on the ``pred_hours`` steps that follow the fit window and the
    values are stored for the timestamps starting at that midnight. A day is
    left as NaN when its fit window is incomplete or contains non-finite
    values.

    Returns:
        Float Series indexed by ``index``.
    """
    baseline = pd.Series(index=index, dtype=float)
    aligned = power_1h.reindex(index)
    lookback = pd.Timedelta(hours=fit_hours)

    at_midnight = (index.hour == 0) & (index.minute == 0)
    for day_start in index[at_midnight]:
        in_window = (aligned.index >= day_start - lookback) & (aligned.index < day_start)
        history = aligned.loc[in_window].values

        # skip days without a complete, fully finite fit window
        if len(history) != fit_hours:
            continue
        if not np.all(np.isfinite(history)):
            continue

        coef = fit_fourier_ridge(history, K=K, s=s, l2=l2)

        # evaluate on the steps right after the fit window: t = fit_hours ..
        design = make_fourier_design(fit_hours + pred_hours, K, s)
        forecast = (design @ coef)[fit_hours:fit_hours+pred_hours]

        targets = pd.date_range(start=day_start, periods=pred_hours, freq="1h").intersection(index)
        baseline.loc[targets] = forecast[:len(targets)]
    return baseline

# ============================================================
# 10) 전체 파이프라인
# ============================================================
def main():
    """Run the full OS-site preprocessing pipeline and save the hourly CSV.

    Builds calendar features, aggregates the SMARTCARE ``Tod`` and EREPORT
    ``Power`` logs to hourly slots (forced-OFF ranges are applied once,
    inside the power preprocessing), aligns the weather data, adds the
    midnight-rolling Fourier baseline, joins everything on ``master_index``
    and writes the result to ``OUT_CSV``.

    Raises:
        RuntimeError: If no SMARTCARE or no EREPORT log could be loaded.
    """
    # 1) calendar features
    time_feats = create_time_features_1h(master_index)

    # 2) SMARTCARE logs (whole period)
    smart_logs = load_all_csv_time_from_filename(SMARTCARE_DIR, "LOG_SMARTCARE_*.csv")
    if smart_logs is None:
        raise RuntimeError("SMARTCARE 로그를 읽지 못했습니다. 경로/패턴 확인 필요")

    # keep one extra hour before `start`: ceil() maps it onto the first slot
    earliest = start - pd.Timedelta("1h")
    in_period = (smart_logs.index >= earliest) & (smart_logs.index <= end)
    smart_logs = smart_logs.loc[in_period]
    tod_hourly = preprocess_smartcare_tod_1h(
        smart_logs,
        master_index=master_index,
        n_units=N_UNITS,
        sample_interval_sec=SMART_SAMPLE_SEC,
        freq="1h",
        per_unit_min_ratio=SMART_PER_UNIT_MIN_RATIO
    )

    # 3) EREPORT logs (whole period)
    ereport_logs = load_all_csv_time_from_filename(EREPORT_DIR, "DBG_EREPORT_*.csv")
    if ereport_logs is None:
        raise RuntimeError("EREPORT 로그를 읽지 못했습니다. 경로/패턴 확인 필요")

    # forced OFF is applied exactly once, inside the preprocessing function
    power_hourly = preprocess_ereport_power_1h(
        ereport_logs,
        master_index=master_index,
        freq="1h",
        coverage_threshold=POWER_COVERAGE_THRESHOLD,
        forced_zero_ranges=FORCED_OFF_RANGES,
    )

    # 4) hourly weather
    weather_hourly = preprocess_weather_1h(WEATHER_CSV, master_index)

    # 5) Fourier baseline fitted on the hourly power
    fourier_baseline = build_midnight_fourier_feature(
        power_hourly["Power_1h"],
        index=master_index,
        fit_hours=FOURIER_FIT_HOURS,
        pred_hours=24,
        K=FOURIER_K,
        s=float(FOURIER_S),
        l2=FOURIER_RIDGE_L2
    )

    # 6) merge everything on the master index
    merged = time_feats.join(tod_hourly, how="left")
    merged = merged.join(power_hourly, how="left")
    merged = merged.join(weather_hourly, how="left")
    merged["fourier_base_midnight"] = fourier_baseline

    # 7) debug output around the forced-OFF period
    print("[INFO] Final shape:", merged.shape)
    debug_cols = ["is_holiday","Power_1h","fourier_base_midnight"]
    print(merged.loc["2025-08-11 18:00:00":"2025-08-13 06:00:00", debug_cols].head(60))

    # 8) save
    merged.to_csv(OUT_CSV, index_label="datetime")
    print(f"[INFO] Saved -> {OUT_CSV}")

if __name__ == "__main__":
    main()


서울대 전처리

In [None]:
서울대 전처리

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import io

# ============================================================
# 0) Base configuration (1-hour resolution)
# ============================================================
SMARTCARE_DIR = Path("/home/hd/Desktop/LOG_SMARTCARE_20250930_snu")
EREPORT_DIR   = Path("/home/hd/Desktop/LOG_EREPORT_snu")

WEATHER_CSV = Path("/home/hd/Desktop/LG/OBS_ASOS_TIM_20250701_20250930_snu.csv")

# NOTE(review): the file name says 20250813-20250930 but the index below
# starts on 2025-07-01 - confirm which one is intended.
OUT_CSV = "./preprocessed_1h_master_with_weather_delta_20250813_20250930_snu.csv"

start = pd.Timestamp("2025-07-01 00:00:00")
end   = pd.Timestamp("2025-09-30 23:00:00")
# Lowercase "h" is the hourly alias used everywhere else in this file
# (ceil / resample / date_range); the uppercase "H" alias is deprecated
# since pandas 2.2.
master_index = pd.date_range(start=start, end=end, freq="1h")

# User-defined holidays (dates only, no time component)
HOLIDAY_DATES = pd.to_datetime([
    # July - August
    "2025-07-05",
    "2025-07-06", "2025-07-12",
    "2025-07-13", "2025-07-19",
    "2025-07-20",
    "2025-07-26", "2025-07-27",
    "2025-08-02", "2025-08-03",
    "2025-08-09", "2025-08-10",

    # August - September
    "2025-08-15", "2025-08-16", "2025-08-17",
    "2025-08-23", "2025-08-24", "2025-08-30", "2025-08-31",
    "2025-09-06", "2025-09-07", "2025-09-13", "2025-09-14",
    "2025-09-20", "2025-09-21", "2025-09-27", "2025-09-28"
]).date

# ------------------------------------------------------------
# Power coverage / Smartcare per-unit coverage
# ------------------------------------------------------------
POWER_COVERAGE_THRESHOLD = 0.85   # min. fraction of expected power rows per hour
SMART_PER_UNIT_MIN_RATIO = 0.70   # min. fraction of expected samples per unit per hour
N_UNITS = 7
SMART_SAMPLE_SEC = 5

# ---- Fourier (rolling fit at every midnight) settings
FOURIER_S = 168.0              # weekly period in hours (may be changed to 24.0)
FOURIER_K = 10                # number of harmonics -> 1 + 2K = 21 coefficients
FOURIER_FIT_HOURS = 14 * 24    # 2 weeks = 336
FOURIER_RIDGE_L2 = 1e-2        # ridge penalty for stability (matters most for large K)

# ============================================================
# 1) 파일 이름에서 날짜(YYYYMMDD) 뽑기
# ============================================================
def extract_date_from_filename(filename: str) -> pd.Timestamp:
    """Return the date encoded as the first 8-digit (YYYYMMDD) run in ``filename``.

    Raises:
        ValueError: If ``filename`` contains no eight consecutive digits.
    """
    match = re.search(r"(\d{8})", filename)
    if match is not None:
        return pd.to_datetime(match.group(1), format="%Y%m%d")
    raise ValueError(f"파일명에서 날짜(YYYYMMDD)를 찾을 수 없습니다: {filename}")

# ============================================================
# 2) Time 컬럼과 파일명 날짜로 datetime 만들기
# ============================================================
def make_datetime_from_time_and_filename(df: pd.DataFrame, file_path: Path) -> pd.DataFrame:
    """Return a copy of ``df`` indexed by full datetimes.

    The date part is taken from the YYYYMMDD token of ``file_path.name`` and
    the time part from the ``Time`` column. Rows that do not parse are
    dropped and the result is sorted by time.

    Raises:
        ValueError: If ``df`` has no ``Time`` column, or the file name carries
            no date.
    """
    if "Time" not in df.columns:
        raise ValueError(f"'Time' 컬럼이 없습니다. 파일: {file_path}")

    file_date = extract_date_from_filename(file_path.name)
    date_text = file_date.strftime("%Y-%m-%d")

    # assign() works on a copy, so the caller's frame is left untouched
    with_stamp = df.assign(
        datetime=pd.to_datetime(date_text + " " + df["Time"].astype(str), errors="coerce")
    )
    parsed_only = with_stamp.dropna(subset=["datetime"])
    ordered = parsed_only.sort_values("datetime")
    return ordered.set_index("datetime")

# ============================================================
# 3) NULL(\x00) 제거 후 read_csv
# ============================================================
def read_csv_remove_nulls(path: Path) -> pd.DataFrame:
    """Parse the CSV at ``path``, first removing any NUL (``\\x00``) bytes.

    A warning naming the file is printed when NUL bytes were found.
    """
    with open(path, "rb") as source:
        content = source.read()

    if content.count(b"\x00") > 0:
        print(f"[WARN] NULL 바이트 감지 → 제거 후 로드: {path.name}")
        content = content.replace(b"\x00", b"")

    return pd.read_csv(io.BytesIO(content))

# ============================================================
# 4) 폴더 안 CSV를 모두 읽어서 concat
# ============================================================
def load_all_csv_time_from_filename(folder: Path, pattern: str) -> pd.DataFrame | None:
    files = sorted(folder.glob(pattern))
    if not files:
        print(f"[WARN] {folder} 에 {pattern}에 해당하는 파일이 없습니다.")
        return None

    dfs = []
    for f in files:
        try:
            df_raw = read_csv_remove_nulls(f)
            df = make_datetime_from_time_and_filename(df_raw, f)
            df["src_file"] = f.name
            dfs.append(df)
        except Exception as e:
            print(f"[ERROR] {f} 로드 실패: {e}")

    if not dfs:
        return None

    return pd.concat(dfs, axis=0).sort_index()

# ============================================================
# 5) 시간 feature 생성 (1시간 기준)
# ============================================================
def create_time_features_1h(index: pd.DatetimeIndex) -> pd.DataFrame:
    """Build calendar features for every timestamp in ``index``.

    Columns produced, in order:
      * ``is_holiday`` - 1 when the date is listed in ``HOLIDAY_DATES``, else 0
        (weekends count only if they are listed there).
      * ``day_sin`` / ``day_cos`` - 24-hour cycle from the hour of day.
      * ``week_sin`` / ``week_cos`` - 168-step cycle whose phase is the row
        position, i.e. phase 0 is the first element of ``index``.
    """
    features = pd.DataFrame(index=index)

    calendar_day = pd.Series(index.date, index=index)
    features["is_holiday"] = calendar_day.isin(set(HOLIDAY_DATES)).astype(int)

    hour = index.hour.astype(int)
    features["day_sin"] = np.sin(2 * np.pi * hour / 24.0)
    features["day_cos"] = np.cos(2 * np.pi * hour / 24.0)

    # weekly cycle: phase is counted from the first row of the index
    step = np.arange(len(index), dtype=float)
    features["week_sin"] = np.sin(2 * np.pi * step / 168.0)
    features["week_cos"] = np.cos(2 * np.pi * step / 168.0)

    return features

# ============================================================
# 6) EREPORT 전처리 (1시간 Power 합산 + 결측 보정)
# ============================================================
def preprocess_ereport_power_1h(df_ereport: pd.DataFrame,
                               freq: str = "1h",
                               coverage_threshold: float = 0.85) -> pd.DataFrame:
    """Aggregate the EREPORT ``Power`` log into per-slot sums with gap compensation.

    Every row is assigned to ``slot = index.ceil(freq)``, i.e. the window
    ``(slot - freq, slot]`` is attributed to ``slot``. The sampling step is
    estimated as the median spacing of the log and converted to the expected
    number of rows per slot.

    Per slot:
      * fewer rows than ``expected_rows * coverage_threshold`` -> NaN;
      * at least ``expected_rows`` rows -> the raw sum;
      * otherwise the raw sum is adjusted depending on whether the in-slot
        observations around the missing grid times are non-zero:
        both sides non-zero -> scaled by ``expected_rows / n``;
        exactly one side    -> raw sum + 0.5 * mean observed * missing count;
        neither             -> raw sum unchanged.

    NOTE: the result is reindexed onto the module-level ``master_index``;
    this function does not take the target index as a parameter.

    Args:
        df_ereport: Log indexed by datetime, containing a ``Power`` column.
        freq: Slot width passed to ``ceil`` and ``pd.Timedelta``.
        coverage_threshold: Minimum fraction of expected rows for a slot to
            get a value.

    Returns:
        DataFrame with the single column ``Power_1h`` indexed by
        ``master_index``.

    Raises:
        ValueError: If ``Power`` is not a column of ``df_ereport``.
        RuntimeError: If the log has fewer than two rows, so no spacing can
            be estimated.
    """
    POWER_COL = "Power"
    if POWER_COL not in df_ereport.columns:
        raise ValueError(f"EREPORT에 '{POWER_COL}' 컬럼이 없습니다. 현재 컬럼: {df_ereport.columns.tolist()}")

    df = df_ereport.copy()
    df["slot"] = df.index.ceil(freq)  # window (slot - freq, slot] belongs to slot

    # Estimate the nominal sampling step from the median spacing of the log.
    diffs = df.index.to_series().diff().dropna().dt.total_seconds()
    if len(diffs) == 0:
        raise RuntimeError("EREPORT 로그에서 시간 간격을 추정할 수 없습니다.")
    median_step = diffs.median()
    if median_step <= 0 or np.isnan(median_step):
        median_step = 60.0  # fallback step (seconds) when the median is unusable

    # NOTE(review): 3600 is hard-coded, so expected_rows assumes 1-hour slots
    # regardless of `freq` - confirm before using another frequency.
    expected_rows = int(round(3600 / median_step))
    expected_rows = max(expected_rows, 1)
    min_required_rows = int(expected_rows * coverage_threshold)

    grp_sum = df.groupby("slot")[POWER_COL].sum()
    grp_n   = df.groupby("slot")[POWER_COL].size()  # size() counts rows per slot

    out = pd.DataFrame(index=grp_sum.index)
    out["Power_1h_rawsum"] = grp_sum
    out["n_rows"] = grp_n
    out["Power_1h"] = np.nan

    for slot, row in out.iterrows():
        n = int(row["n_rows"])
        if n < min_required_rows:
            # insufficient coverage: leave the slot as NaN
            continue

        P_obs = float(row["Power_1h_rawsum"])
        m = expected_rows - n  # number of rows considered missing
        if m <= 0:
            # fully covered slot: the raw sum is used as is
            out.at[slot, "Power_1h"] = P_obs
            continue

        sub = df[df["slot"] == slot][[POWER_COL]].copy()
        if sub.empty:
            continue

        # the slot's window is (slot - freq, slot]
        w_end = pd.Timestamp(slot)
        w_start = w_end - pd.Timedelta(freq)

        # Regular grid of expected sample times inside the window.
        step = pd.Timedelta(seconds=float(median_step))
        grid = pd.date_range(start=w_start + step, end=w_end, freq=step)

        # A grid time counts as missing when no observation has exactly that
        # timestamp. NOTE(review): if the log is not aligned to this grid,
        # every grid time is treated as missing - confirm that is acceptable.
        obs_set = set(sub.index)
        missing_times = [t for t in grid if t not in obs_set]
        m_cnt = m  # per the requested definition: use N - n, not len(missing_times)

        obs_series = sub[POWER_COL].sort_index()
        past_nonzero = False
        future_nonzero = False

        # Thin the probe list out (every len//10-th element) to bound the work.
        probe = missing_times
        if len(probe) > 10:
            probe = probe[:: max(1, len(probe)//10)]

        for tmiss in probe:
            # last in-slot observation at or before the missing time
            past = obs_series.loc[:tmiss]
            if len(past) > 0 and float(past.iloc[-1]) != 0.0:
                past_nonzero = True

            # first in-slot observation at or after the missing time
            fut = obs_series.loc[tmiss:]
            if len(fut) > 0 and float(fut.iloc[0]) != 0.0:
                future_nonzero = True

            if past_nonzero and future_nonzero:
                break

        if past_nonzero and future_nonzero:
            # non-zero on both sides of a gap: scale up to full coverage
            P_adj = P_obs * (expected_rows / n)
        elif past_nonzero ^ future_nonzero:
            # non-zero on one side only: credit half of the mean observed value
            # for each missing row
            avg_obs = P_obs / max(n, 1)
            P_adj = P_obs + 0.5 * avg_obs * m_cnt
        else:
            # zero (or nothing) on both sides: no compensation
            P_adj = P_obs

        out.at[slot, "Power_1h"] = P_adj

    # align to the module-level master_index; slots without data become NaN
    return out[["Power_1h"]].reindex(master_index)

# ============================================================
# 7) SMARTCARE 전처리 (1시간 Tod)
# ============================================================
def preprocess_smartcare_tod_1h(df_smart: pd.DataFrame,
                               n_units: int = 11,
                               sample_interval_sec: int = 5,
                               freq: str = "1h",
                               per_unit_min_ratio: float = 0.70) -> pd.DataFrame:
    """Aggregate the Smartcare ``Tod`` signal into one value per slot.

    Rows are assigned to ``slot = index.ceil(freq)``. A slot is valid only
    when exactly ``n_units`` distinct ``Auto Id`` values reported in it and
    each has at least ``floor(3600 / sample_interval_sec * per_unit_min_ratio)``
    rows. A valid slot gets the mean over units of each unit's mean ``Tod``;
    all other slots are NaN.

    NOTE: the result is reindexed onto the module-level ``master_index``.

    Returns:
        DataFrame with the single column ``Tod_1h``.

    Raises:
        ValueError: If ``Auto Id`` or ``Tod`` is missing from ``df_smart``.
    """
    needed = ("Auto Id", "Tod")
    not_found = [col for col in needed if col not in df_smart.columns]
    if not_found:
        raise ValueError(f"SMARTCARE에 필요한 컬럼이 없습니다: {not_found}")

    log = df_smart.copy()
    log["slot"] = log.index.ceil(freq)

    samples_per_hour = int(round(3600 / sample_interval_sec))
    min_samples = int(np.floor(samples_per_hour * per_unit_min_ratio))

    by_slot_unit = log.groupby(["slot", "Auto Id"])["Tod"]
    n_samples = by_slot_unit.size().unstack("Auto Id")
    unit_avg = by_slot_unit.mean().unstack("Auto Id")

    units_seen = n_samples.notna().sum(axis=1)
    slot_is_valid = (units_seen == n_units) & (n_samples >= min_samples).all(axis=1)
    keep = slot_is_valid.reindex(unit_avg.index).fillna(False)

    # average across units; slots that fail the checks are blanked to NaN
    tod_hourly = unit_avg.mean(axis=1).where(keep)

    return pd.DataFrame({"Tod_1h": tod_hourly}).reindex(master_index)

# ============================================================
# 8) WEATHER 전처리 (Temperature/Humidity 원본값을 1시간 row에 붙임)
#    - "그 시간에 맞는 값"을 들고 오기 위해
#      1) Time 컬럼을 datetime으로 파싱
#      2) 1시간 그리드(master_index)에 reindex
#      3) 같은 시간에 여러 샘플이 있으면 시간별 평균
#    - 원본을 그대로 쓰되, 시간 정렬을 위해 resample/mean만 사용
# ============================================================
def preprocess_weather_1h(weather_csv: Path,
                          master_index: pd.DatetimeIndex) -> pd.DataFrame:
    """Load the weather CSV and align Temperature/Humidity to ``master_index``.

    ``Time`` is parsed to datetimes (unparsable rows are dropped). The values
    are averaged per hour, which only matters when one hour has several
    records, and then reindexed onto ``master_index``.

    Returns:
        DataFrame with columns ``Temperature`` and ``Humidity``.

    Raises:
        ValueError: If ``Time``, ``Temperature`` or ``Humidity`` is missing.
    """
    table = pd.read_csv(weather_csv)

    # Only these three columns are required; anything else in the CSV is ignored.
    if "Time" not in table.columns:
        raise ValueError(f"날씨 CSV에 'Time' 컬럼이 없습니다. columns={table.columns.tolist()}")
    for name in ("Temperature", "Humidity"):
        if name not in table.columns:
            raise ValueError(f"날씨 CSV에 '{name}' 컬럼이 없습니다. columns={table.columns.tolist()}")

    table = table.copy()
    table["datetime"] = pd.to_datetime(table["Time"], errors="coerce")
    table = table.dropna(subset=["datetime"])
    table = table.set_index("datetime").sort_index()

    # hourly mean, then alignment to the master grid
    per_hour = table[["Temperature", "Humidity"]].resample("1h").mean()
    return per_hour.reindex(master_index)

# ============================================================
# 9) 자정마다 2주 파워로 Fourier(K=168) 피팅 후,
#    그 자정의 24시간에 대해 baseline 예측값 생성하여 저장
# ============================================================
def make_fourier_design(T: int, K: int, s: float) -> np.ndarray:
    """Return the ``(T, 1 + 2K)`` Fourier design matrix for ``t = 0 .. T-1``.

    Layout: column 0 is the intercept, columns ``1..K`` are
    ``sin(2*pi*k*t/s)`` and columns ``K+1..2K`` are ``cos(2*pi*k*t/s)``.
    """
    steps = np.arange(T, dtype=float)
    n_cols = 1 + 2*K
    basis = np.empty((T, n_cols), dtype=float)
    basis[:, 0] = 1.0  # intercept column
    for harmonic in range(1, K+1):
        basis[:, harmonic] = np.sin(2*np.pi*harmonic*steps/s)
        basis[:, K + harmonic] = np.cos(2*np.pi*harmonic*steps/s)
    return basis

def fit_fourier_ridge(y: np.ndarray, K: int, s: float, l2: float) -> np.ndarray:
    """Return ridge-regularised Fourier coefficients fitted to ``y``.

    Solves the normal equations ``(X'X + l2 * I) theta = X'y`` for the
    Fourier design matrix ``X`` with ``len(y)`` rows.
    """
    observed = np.asarray(y, dtype=float).reshape(-1)
    X = make_fourier_design(len(observed), K, s)
    lhs = X.T @ X + l2*np.eye(X.shape[1])
    rhs = X.T @ observed
    return np.linalg.solve(lhs, rhs)

def build_midnight_fourier_feature(power_1h: pd.Series,
                                   index: pd.DatetimeIndex,
                                   fit_hours: int = 336,
                                   pred_hours: int = 24,
                                   K: int = 168,
                                   s: float = 168.0,
                                   l2: float = 1e-2) -> pd.Series:
    """Build a Fourier baseline that is refitted at every midnight in ``index``.

    At each 00:00 timestamp the previous ``fit_hours`` values of ``power_1h``
    are fitted with a ridge Fourier model, which is then evaluated on the
    ``pred_hours`` steps following the fit window; those values are stored
    for the timestamps starting at that midnight. Days whose fit window is
    incomplete or contains non-finite values stay NaN.

    Returns:
        Float Series with the same index as ``index``.
    """
    result = pd.Series(index=index, dtype=float)
    series = power_1h.reindex(index)
    span = pd.Timedelta(hours=fit_hours)

    # only midnights that are part of the index itself
    day_starts = index[(index.hour == 0) & (index.minute == 0)]

    for midnight in day_starts:
        # fit window is [midnight - span, midnight): the end is exclusive
        selector = (series.index >= midnight - span) & (series.index < midnight)
        window_values = series.loc[selector].values

        window_complete = len(window_values) == fit_hours
        if not window_complete or not np.all(np.isfinite(window_values)):
            continue

        theta = fit_fourier_ridge(window_values, K=K, s=s, l2=l2)

        # prediction continues the fit time axis: t = fit_hours .. fit_hours+pred_hours-1
        full_design = make_fourier_design(fit_hours + pred_hours, K, s)
        day_forecast = (full_design @ theta)[fit_hours:fit_hours+pred_hours]

        slots = pd.date_range(start=midnight, periods=pred_hours, freq="1h").intersection(index)
        result.loc[slots] = day_forecast[:len(slots)]

    return result

# ============================================================
# 10) 전체 파이프라인
# ============================================================
def main():
    """Run the full SNU-site preprocessing pipeline and save the hourly CSV.

    Builds calendar features, aggregates the SMARTCARE ``Tod`` and EREPORT
    ``Power`` logs to hourly slots, aligns the weather data, adds the
    midnight-rolling Fourier baseline, joins everything on ``master_index``
    and writes the result to ``OUT_CSV``.

    Raises:
        RuntimeError: If no SMARTCARE or no EREPORT log could be loaded.
    """
    # slot = ceil, so the hour before `start` still feeds the first slot
    raw_start = start - pd.Timedelta("1h")

    def _clip_to_period(frame):
        """Keep only the rows between raw_start and end (both inclusive)."""
        return frame.loc[(frame.index >= raw_start) & (frame.index <= end)]

    # 1) calendar features
    time_feats = create_time_features_1h(master_index)

    # 2) SMARTCARE: load, clip, aggregate
    smart_logs = load_all_csv_time_from_filename(SMARTCARE_DIR, "LOG_SMARTCARE_*.csv")
    if smart_logs is None:
        raise RuntimeError("SMARTCARE 로그를 읽지 못했습니다. 경로/패턴 확인 필요")
    tod_hourly = preprocess_smartcare_tod_1h(
        _clip_to_period(smart_logs),
        n_units=N_UNITS,
        sample_interval_sec=SMART_SAMPLE_SEC,
        freq="1h",
        per_unit_min_ratio=SMART_PER_UNIT_MIN_RATIO
    )

    # 3) EREPORT: load, clip, aggregate
    ereport_logs = load_all_csv_time_from_filename(EREPORT_DIR, "DBG_EREPORT_*.csv")
    if ereport_logs is None:
        raise RuntimeError("EREPORT 로그를 읽지 못했습니다. 경로/패턴 확인 필요")
    power_hourly = preprocess_ereport_power_1h(
        _clip_to_period(ereport_logs),
        freq="1h",
        coverage_threshold=POWER_COVERAGE_THRESHOLD
    )

    # 4) hourly weather (Temperature, Humidity)
    weather_hourly = preprocess_weather_1h(WEATHER_CSV, master_index)

    # 5) midnight-rolling Fourier baseline fitted on the hourly power
    fourier_baseline = build_midnight_fourier_feature(
        power_hourly["Power_1h"],
        index=master_index,
        fit_hours=FOURIER_FIT_HOURS,
        pred_hours=24,
        K=FOURIER_K,
        s=float(FOURIER_S),
        l2=FOURIER_RIDGE_L2
    )

    # 6) merge everything on the master index
    merged = time_feats.join(tod_hourly, how="left")
    merged = merged.join(power_hourly, how="left")
    merged = merged.join(weather_hourly, how="left")
    merged["fourier_base_midnight"] = fourier_baseline

    print("[INFO] Final shape:", merged.shape)
    print(merged.head(3))
    print(merged.tail(3))

    merged.to_csv(OUT_CSV, index_label="datetime")
    print(f"[INFO] Saved -> {OUT_CSV}")

if __name__ == "__main__":
    main()


n주 학습 1주 발리드 윈도우 (전처리 파일 경로 바꿔야 함)

In [None]:
# ============================================================
# Rolling (weekly step) training (UPDATED: "latest run only" support)
# - buffer + train + valid windows, weekly shift
# - Train model per window and save checkpoints:
#     (1) EXP_DIR/run_xxxx.../best.pt, last.pt
#     (2) EXP_DIR/ckpts_flat/best_wXXXX_...pt, last_wXXXX_...pt  (sortable)
# - Write RUNS_DIR/LATEST_RUN.json so eval code can load ONLY the newest run
# ============================================================

import os
import json
from datetime import datetime

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# -----------------------------
# Config
# -----------------------------
# Preprocessed hourly CSV to train on. This path points at the OS-site file;
# change it when training on another site's preprocessing output.
CSV_PATH = "./preprocessed_1h_master_with_weather_delta_20250701_20250930_OS.csv"
DT_COL = "datetime"  # timestamp column, used as the index after loading

TARGET_COL = "Power_1h"  # column to predict

# Exogenous inputs: calendar features, Fourier baseline and weather columns.
EXOG_COLS_BASE = ["is_holiday", "day_sin", "day_cos"]
FOURIER_COL = "fourier_base_midnight"
WEATHER_COLS = ["Temperature", "Humidity"]

# When True, TOD_COL is required in the CSV and added to the exogenous inputs.
USE_TOD = False
TOD_COL = "Tod_1h"

# NOTE(review): presumably the input history length and the forecast length
# in hourly rows - confirm against the dataset code.
LOOKBACK = 168
HORIZON  = 24

# Rolling window configuration (all in hours)
BUFFER_HOURS = 14 * 24        # 2 weeks buffer
TRAIN_HOURS  = 28 * 24        # 4 weeks of training data
VALID_HOURS  =  7 * 24        # 1 week of validation data
STEP_HOURS   =  7 * 24        # windows shift by 1 week

# Optimisation hyper-parameters
BATCH_SIZE = 64
EPOCHS = 400
LR = 1e-3
WEIGHT_DECAY = 1e-4
GRAD_CLIP = 5.0
SEED = 42

# ---- Loss weights (the loss terms themselves are defined further down)
LAMBDA_DIFF  = 1.0
LAMBDA_COS   = 1.0
LAMBDA_SHARE = 0.0

# Small constants / thresholds consumed by the loss code further down.
COS_EPS = 1e-8
SHARE_EPS = 1e-6
DAY_SUM_MASK_EPS = 1e-3
TOP_ALPHA = 0.20

# Root folder for all runs; created at import time if it does not exist.
RUNS_DIR = "./runs_lstm24_roll"
os.makedirs(RUNS_DIR, exist_ok=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def seed_all(seed: int):
    """Seed the NumPy and PyTorch random number generators with ``seed``.

    The CUDA generators are seeded as well, but only when CUDA is available.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

seed_all(SEED)

# ============================================================
# Per-execution experiment folder + "latest run" pointer
# ============================================================
# Every execution gets its own timestamped folder under RUNS_DIR.
RUN_ID = datetime.now().strftime("%Y%m%d_%H%M%S")
EXP_DIR = os.path.join(RUNS_DIR, f"exp_{RUN_ID}")
CKPT_FLAT_DIR = os.path.join(EXP_DIR, "ckpts_flat")
LATEST_JSON = os.path.join(RUNS_DIR, "LATEST_RUN.json")

# Creating the nested flat-checkpoint folder also creates EXP_DIR itself.
os.makedirs(CKPT_FLAT_DIR, exist_ok=True)

# Manifest later dumped to LATEST_JSON; each window appends entries of the
# form {"type": "last"/"best", "wi": int, "path": abs_path} to "ckpts".
manifest = dict(
    run_id=RUN_ID,
    created_at=datetime.now().isoformat(timespec="seconds"),
    exp_dir=os.path.abspath(EXP_DIR),
    ckpts=[],
)

print("\n".join([
    f"[INFO] RUN_ID={RUN_ID}",
    f"[INFO] EXP_DIR={EXP_DIR}",
    f"[INFO] CKPT_FLAT_DIR={CKPT_FLAT_DIR}",
    f"[INFO] LATEST_JSON will be written to: {LATEST_JSON}",
]))

# ============================================================
# 1) Load CSV (NO global dropna)
# ============================================================
# Read the table, index it by the datetime column and sort chronologically.
# Rows containing NaN are kept on purpose; they are filtered per sample later.
df = pd.read_csv(CSV_PATH, parse_dates=[DT_COL])
df = df.set_index(DT_COL).sort_index()

# The time-of-day column is only required/used when USE_TOD is enabled.
tod_cols = [TOD_COL] if USE_TOD else []

required_cols = [TARGET_COL, *EXOG_COLS_BASE, FOURIER_COL, *WEATHER_COLS, *tod_cols]
missing = [col for col in required_cols if col not in df.columns]
if missing:
    raise ValueError(f"CSV에 필요한 컬럼이 없습니다: {missing}")

# Column order defines the exogenous feature order seen by the model.
EXOG_COLS = [FOURIER_COL, *WEATHER_COLS, *tod_cols, *EXOG_COLS_BASE]

df_feat = df.loc[:, [TARGET_COL] + EXOG_COLS].copy()
idx = df_feat.index
N = len(df_feat)

# float32 arrays shared by every dataset/window below.
y = df_feat[TARGET_COL].values.astype(np.float32)
xe = df_feat[EXOG_COLS].values.astype(np.float32)

# Row-wise finiteness flags used to reject samples that touch NaN/inf values.
is_finite_y = np.isfinite(y)
is_finite_xe = np.isfinite(xe).all(axis=1)

# ============================================================
# 2) Dataset
# ============================================================
class Power24Dataset(Dataset):
    """Windowed view over a target series and its exogenous features.

    Each item is built around a forecast origin ``t`` taken from ``t_list``:

    * ``x_power``: the ``lookback`` target values before ``t``, shape (L, 1)
    * ``x_exog``:  the exogenous rows ``t .. t+horizon-1``, shape (H, d)
    * ``y_out``:   the target values ``t .. t+horizon-1``, shape (H,)
    """

    def __init__(self, y: np.ndarray, xe: np.ndarray, t_list: np.ndarray, lookback: int, horizon: int):
        self.y, self.xe, self.t_list = y, xe, t_list
        self.lookback, self.horizon = int(lookback), int(horizon)

    def __len__(self):
        return len(self.t_list)

    def __getitem__(self, idx_):
        origin = int(self.t_list[idx_])
        past = slice(origin - self.lookback, origin)
        future = slice(origin, origin + self.horizon)

        history = self.y[past].reshape(-1, 1)   # (L, 1)
        exog_future = self.xe[future, :]        # (H, d)
        target = self.y[future]                 # (H,)
        return (
            torch.from_numpy(history),
            torch.from_numpy(exog_future),
            torch.from_numpy(target),
        )

# ============================================================
# 3) Model
# ============================================================
class LSTM24(nn.Module):
    """Single-layer LSTM encoder plus an MLP over future exogenous features.

    The final hidden and cell states of the LSTM are concatenated with a
    64-dim embedding of the flattened future exogenous block and mapped to
    ``horizon`` outputs; a Softplus keeps the outputs non-negative.

    Submodule names (``lstm``, ``exog_mlp``, ``head``, ``out_act``) are part
    of the checkpoint format and must not be renamed.
    """

    def __init__(self, exog_dim: int, horizon: int, hidden: int = 32):
        super().__init__()
        self.horizon = horizon
        self.hidden = hidden

        flat_exog_dim = horizon * exog_dim
        fused_dim = 2 * hidden + 64  # [h, c] from the LSTM + exogenous embedding

        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden, num_layers=1, batch_first=True)
        self.exog_mlp = nn.Sequential(
            nn.Linear(flat_exog_dim, 128), nn.ReLU(),
            nn.Linear(128, 64), nn.ReLU(),
        )
        self.head = nn.Sequential(
            nn.Linear(fused_dim, 128), nn.ReLU(),
            nn.Linear(128, horizon),
        )
        self.out_act = nn.Softplus(beta=1.0, threshold=20.0)

    def forward(self, x_power, x_exog_future):
        """Map (B, L, 1) history and (B, H, d) future exog to a (B, H) forecast."""
        _, (h_n, c_n) = self.lstm(x_power)

        batch = x_exog_future.size(0)
        exog_emb = self.exog_mlp(x_exog_future.reshape(batch, -1))

        # Last-layer hidden state, last-layer cell state, exogenous embedding.
        fused = torch.cat([h_n[-1], c_n[-1], exog_emb], dim=1)
        return self.out_act(self.head(fused))

# ============================================================
# 4) Loss + Metrics
# ============================================================
# Plain element-wise MSE used as the base term of total_loss().
mse_loss = nn.MSELoss()

def diff_mse(y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
    """Mean squared error between the first differences along dim 1.

    Both inputs are (B, H); the step-to-step changes of the prediction are
    compared with those of the target.
    """
    step_error = torch.diff(y_pred, dim=1) - torch.diff(y_true, dim=1)
    return step_error.pow(2).mean()

def cos_centered(y_pred: torch.Tensor, y_true: torch.Tensor, eps: float = COS_EPS) -> torch.Tensor:
    """Batch mean of the cosine similarity between row-centered curves.

    Each row of the (B, H) inputs has its own mean removed before the cosine
    is taken; ``eps`` is added to the denominator to avoid division by zero.
    """
    pred_c = y_pred - y_pred.mean(dim=1, keepdim=True)
    true_c = y_true - y_true.mean(dim=1, keepdim=True)

    dot = torch.sum(pred_c * true_c, dim=1)
    norm_product = pred_c.norm(p=2, dim=1) * true_c.norm(p=2, dim=1)
    return (dot / (norm_product + eps)).mean()

def share_tv_loss(y_pred: torch.Tensor, y_true: torch.Tensor,
                  eps: float = SHARE_EPS,
                  day_sum_mask_eps: float = DAY_SUM_MASK_EPS) -> torch.Tensor:
    """Masked mean total-variation distance between per-row share profiles.

    Values are clamped at zero, offset by ``eps`` and normalised per row so
    each row sums to one. Rows whose (offset) true sum is below
    ``day_sum_mask_eps`` are excluded from the average; if every row is
    excluded the result is 0.
    """
    true_mass = y_true.clamp(min=0.0) + eps
    pred_mass = y_pred.clamp(min=0.0) + eps

    true_total = true_mass.sum(dim=1, keepdim=True)
    pred_total = pred_mass.sum(dim=1, keepdim=True)

    share_gap = true_mass / true_total - pred_mass / pred_total
    tv = 0.5 * share_gap.abs().sum(dim=1)

    keep = (true_total.squeeze(1) >= day_sum_mask_eps).float()
    return (tv * keep).sum() / keep.sum().clamp(min=1.0)

def total_loss(y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
    """Training objective: MSE plus weighted shape terms.

    loss = MSE + LAMBDA_DIFF * diffMSE + LAMBDA_COS * (1 - cosCentered)
               + LAMBDA_SHARE * shareTV
    """
    loss = mse_loss(y_pred, y_true)
    step_term = diff_mse(y_pred, y_true)
    shape_term = 1.0 - cos_centered(y_pred, y_true)
    share_term = share_tv_loss(y_pred, y_true)

    loss = loss + LAMBDA_DIFF * step_term
    loss = loss + LAMBDA_COS * shape_term
    loss = loss + LAMBDA_SHARE * share_term
    return loss

@torch.no_grad()
def topk_iou(y_pred: torch.Tensor, y_true: torch.Tensor, alpha: float = TOP_ALPHA, eps: float = 1e-12) -> torch.Tensor:
    """Batch mean IoU between the top-K positions of prediction and target.

    K is ``ceil(alpha * H)`` for (B, H) inputs; ``eps`` guards the division.
    """
    n_rows, n_cols = y_true.shape
    k = int(np.ceil(alpha * n_cols))

    def _top_mask(values: torch.Tensor) -> torch.Tensor:
        # Boolean (B, H) mask that is True at each row's k largest entries.
        mask = torch.zeros((n_rows, n_cols), device=y_true.device, dtype=torch.bool)
        top_idx = torch.topk(values, k=k, dim=1, largest=True, sorted=False).indices
        return mask.scatter_(1, top_idx, True)

    true_mask = _top_mask(y_true)
    pred_mask = _top_mask(y_pred)

    overlap = (true_mask & pred_mask).sum(dim=1).float()
    combined = (true_mask | pred_mask).sum(dim=1).float()
    return (overlap / (combined + eps)).mean()

@torch.no_grad()
def share_overlap_percent(y_pred: torch.Tensor, y_true: torch.Tensor, eps: float = SHARE_EPS) -> torch.Tensor:
    """Batch mean overlap (in percent) between per-row share profiles.

    Rows are clamped at zero, offset by ``eps`` and normalised to sum to one;
    the overlap of a row is the sum of the element-wise minimum of both
    profiles.
    """
    true_mass = y_true.clamp(min=0.0) + eps
    pred_mass = y_pred.clamp(min=0.0) + eps

    true_share = true_mass / true_mass.sum(dim=1, keepdim=True)
    pred_share = pred_mass / pred_mass.sum(dim=1, keepdim=True)

    row_overlap = torch.minimum(true_share, pred_share).sum(dim=1)
    return 100.0 * row_overlap.mean()

@torch.no_grad()
def eval_all_metrics(model, dataloader, device):
    """Evaluate ``model`` on every batch of ``dataloader``.

    Returns a tuple of floats
    ``(rmse, diff_rmse, cos_mean, iou_mean, sh_ov, sh_tv)``.

    RMSE and diff-RMSE are computed from squared-error sums over all elements.
    The remaining metrics are returned by their helpers as per-batch means, so
    they are re-weighted here by the number of rows they were averaged over.
    Without that weighting a short final batch (``drop_last=False``) would
    count as much as a full one and the result would differ from a
    sample-level mean over the whole loader.
    """
    model.eval()
    se_sum, n_elem = 0.0, 0
    dse_sum, d_n = 0.0, 0
    cos_sum, iou_sum, sh_ov_sum = 0.0, 0.0, 0.0
    n_rows = 0
    sh_tv_sum, sh_tv_rows = 0.0, 0

    for x_power, x_exog, y_true in dataloader:
        x_power = x_power.to(device)
        x_exog = x_exog.to(device)
        y_true = y_true.to(device)

        y_pred = model(x_power, x_exog)
        batch_rows = int(y_true.size(0))

        diff = y_pred - y_true
        se_sum += float((diff ** 2).sum().item())
        n_elem += int(y_true.numel())

        dy_pred = y_pred[:, 1:] - y_pred[:, :-1]
        dy_true = y_true[:, 1:] - y_true[:, :-1]
        dd = dy_pred - dy_true
        dse_sum += float((dd ** 2).sum().item())
        d_n += int(dd.numel())

        # Helpers return batch means -> turn them back into batch sums.
        cos_sum += float(cos_centered(y_pred, y_true).item()) * batch_rows
        iou_sum += float(topk_iou(y_pred, y_true).item()) * batch_rows
        sh_ov_sum += float(share_overlap_percent(y_pred, y_true).item()) * batch_rows
        n_rows += batch_rows

        # share_tv_loss averages only over rows passing its day-sum mask
        # (default SHARE_EPS / DAY_SUM_MASK_EPS), so weight by that row count.
        true_share_sum = (torch.clamp(y_true, min=0.0) + SHARE_EPS).sum(dim=1)
        active_rows = int((true_share_sum >= DAY_SUM_MASK_EPS).sum().item())
        sh_tv_sum += float(share_tv_loss(y_pred, y_true).item()) * active_rows
        sh_tv_rows += active_rows

    rmse = float(np.sqrt(se_sum / max(n_elem, 1)))
    diff_rmse = float(np.sqrt(dse_sum / max(d_n, 1)))
    cos_mean = float(cos_sum / max(n_rows, 1))
    iou_mean = float(iou_sum / max(n_rows, 1))
    sh_ov = float(sh_ov_sum / max(n_rows, 1))
    sh_tv = float(sh_tv_sum / max(sh_tv_rows, 1))
    return rmse, diff_rmse, cos_mean, iou_mean, sh_ov, sh_tv

# ============================================================
# 5) t_list builder
# ============================================================
def make_t_list_in_range(t_start_inclusive: int, t_end_exclusive: int) -> np.ndarray:
    """Return the usable forecast origins ``t`` in ``[t_start, t_end)``.

    An origin is usable when the ``LOOKBACK`` target rows before it and the
    ``HORIZON`` target rows from it are all finite, and the exogenous rows
    ``t .. t+HORIZON-1`` are all finite. Origins are additionally limited to
    ``LOOKBACK <= t <= N - HORIZON`` so every slice stays inside the data.

    NOTE(review): the target slice of origins near ``t_end_exclusive`` reads
    up to ``HORIZON - 1`` rows past the range end; for a training range this
    overlaps the start of the following validation range.
    """
    lo = max(t_start_inclusive, LOOKBACK)
    # t = N - HORIZON is the last origin with a complete horizon, so the
    # exclusive bound is N - HORIZON + 1 (the previous bound dropped it).
    hi = min(t_end_exclusive, N - HORIZON + 1)

    t_list = [
        t
        for t in range(lo, hi)
        # past and future target rows form one contiguous slice
        if is_finite_y[t - LOOKBACK:t + HORIZON].all()
        and is_finite_xe[t:t + HORIZON].all()
    ]
    return np.array(t_list, dtype=int)

def fmt_dt(ts: pd.Timestamp) -> str:
    """Format ``ts`` as a compact ``YYYYMMDD`` date string (time part dropped)."""
    compact_date = ts.strftime("%Y%m%d")
    return compact_date

# ============================================================
# 6) Build rolling windows
# ============================================================
# Each window is (train_start, train_end, val_start, val_end) in row positions,
# end-exclusive. The first training window starts after BUFFER_HOURS rows and
# every following window is shifted by STEP_HOURS; a window is kept only if
# its validation range still fits inside the N available rows.
windows = [
    (
        train_begin,
        train_begin + TRAIN_HOURS,
        train_begin + TRAIN_HOURS,
        train_begin + TRAIN_HOURS + VALID_HOURS,
    )
    for train_begin in range(BUFFER_HOURS, N - TRAIN_HOURS - VALID_HOURS + 1, STEP_HOURS)
]

if not windows:
    raise RuntimeError("가능한 rolling window가 0개입니다. (데이터 길이/버퍼/윈도우 설정 확인)")

first_tr_s, first_tr_e, first_va_s, first_va_e = windows[0]
print(f"[INFO] total rows={N}, total windows={len(windows)}")
print(f"[INFO] first window train={idx[first_tr_s]}~{idx[first_tr_e-1]} | "
      f"val={idx[first_va_s]}~{idx[first_va_e-1]}")

# ============================================================
# 7) Train per window + save best/last (EXP_DIR only)
# ============================================================
# Train one independent model per rolling window; `wi` is the 1-based window number.
for wi, (tr_s, tr_e, va_s, va_e) in enumerate(windows, start=1):
    # Forecast origins inside each range whose input/target slices are fully finite.
    t_list_train = make_t_list_in_range(tr_s, tr_e)
    t_list_val   = make_t_list_in_range(va_s, va_e)

    # A window without any usable train or validation sample is skipped entirely.
    if len(t_list_train) == 0 or len(t_list_val) == 0:
        print(f"[WARN] window {wi:04d}: skip (train_t={len(t_list_train)}, val_t={len(t_list_val)})")
        continue

    # Window boundaries as timestamps; the *_end values are inclusive (last row).
    tr_start_dt = idx[tr_s]
    tr_end_dt   = idx[tr_e-1]
    va_start_dt = idx[va_s]
    va_end_dt   = idx[va_e-1]

    run_name = (
        f"run_{wi:04d}_"
        f"TR{fmt_dt(tr_start_dt)}-{fmt_dt(tr_end_dt)}_"
        f"VA{fmt_dt(va_start_dt)}-{fmt_dt(va_end_dt)}"
    )

    # Per-window output folder lives under EXP_DIR (not the RUNS_DIR root).
    run_dir = os.path.join(EXP_DIR, run_name)
    os.makedirs(run_dir, exist_ok=True)

    print(f"\n========== [{wi:04d}/{len(windows):04d}] {run_name} ==========")
    print(f"  train_t={len(t_list_train)} | val_t={len(t_list_val)}")
    print(f"  train_range: {tr_start_dt} ~ {tr_end_dt}")
    print(f"  valid_range: {va_start_dt} ~ {va_end_dt}")

    ds_train = Power24Dataset(y, xe, t_list_train, LOOKBACK, HORIZON)
    ds_val   = Power24Dataset(y, xe, t_list_val,   LOOKBACK, HORIZON)
    # NOTE(review): with drop_last=True a window holding fewer than BATCH_SIZE
    # training samples yields no batches, i.e. the model would not be updated.
    dl_train = DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True,  drop_last=True)
    dl_val   = DataLoader(ds_val,   batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

    # Re-seed before building the model so every window starts from the same RNG state.
    seed_all(SEED)
    model = LSTM24(exog_dim=len(EXOG_COLS), horizon=HORIZON, hidden=32).to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

    # Best checkpoint is selected by the lowest validation RMSE seen so far.
    best_val_rmse = float("inf")
    best_state = None
    best_epoch = None

    for epoch in range(1, EPOCHS + 1):
        model.train()
        total_loss_sum, n_batches = 0.0, 0

        for x_power, x_exog, y_true in dl_train:
            x_power = x_power.to(DEVICE)
            x_exog  = x_exog.to(DEVICE)
            y_true  = y_true.to(DEVICE)

            opt.zero_grad()
            y_pred = model(x_power, x_exog)
            loss = total_loss(y_pred, y_true)
            loss.backward()
            # Clip the global gradient norm before the optimizer step.
            nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
            opt.step()

            total_loss_sum += float(loss.item())
            n_batches += 1

        # Metrics are recomputed in eval mode after every epoch. The train
        # metrics go through dl_train, i.e. shuffled and without the last
        # partial batch.
        train_rmse, train_drmse, train_cos, train_iou, train_shov, train_shtv = eval_all_metrics(model, dl_train, DEVICE)
        val_rmse,   val_drmse,   val_cos,   val_iou,   val_shov,   val_shtv   = eval_all_metrics(model, dl_val,   DEVICE)

        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            best_epoch = epoch
            # Snapshot the weights on CPU so later epochs cannot overwrite them.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

        avg_train_loss = total_loss_sum / max(n_batches, 1)

        # Log on the first epoch, every 10th epoch and the final epoch.
        if (epoch == 1) or (epoch % 10 == 0) or (epoch == EPOCHS):
            print(
                f"[Epoch {epoch:03d}] "
                f"train_loss={avg_train_loss:.6f} | "
                f"train_RMSE={train_rmse:.6f}, dRMSE={train_drmse:.6f}, cosC={train_cos:.4f}, "
                f"IoU={train_iou:.4f}, shareOv={train_shov:.2f}%, shareTV={train_shtv:.4f} | "
                f"val_RMSE={val_rmse:.6f}, dRMSE={val_drmse:.6f}, cosC={val_cos:.4f}, "
                f"IoU={val_iou:.4f}, shareOv={val_shov:.2f}%, shareTV={val_shtv:.4f} | "
                f"best_val_RMSE={best_val_rmse:.6f} (epoch={best_epoch})"
            )

    # -----------------------------
    # Save (run_dir + ckpts_flat under EXP_DIR)
    # -----------------------------
    tr_s_ymd = fmt_dt(tr_start_dt)
    tr_e_ymd = fmt_dt(tr_end_dt)
    va_s_ymd = fmt_dt(va_start_dt)
    va_e_ymd = fmt_dt(va_end_dt)

    # Flat file names embed the zero-padded window number so they sort by window.
    flat_last_name = f"last_w{wi:04d}_TR{tr_s_ymd}-{tr_e_ymd}_VA{va_s_ymd}-{va_e_ymd}.pt"
    flat_best_name = f"best_w{wi:04d}_TR{tr_s_ymd}-{tr_e_ymd}_VA{va_s_ymd}-{va_e_ymd}.pt"

    last_path = os.path.join(run_dir, "last.pt")
    best_path = os.path.join(run_dir, "best.pt")

    flat_last_path = os.path.join(CKPT_FLAT_DIR, flat_last_name)
    flat_best_path = os.path.join(CKPT_FLAT_DIR, flat_best_name)

    # Window description stored inside every checkpoint; the evaluation code
    # reads "val_start"/"val_end" (inclusive timestamps, as strings) from it.
    window_meta = {
        "wi": wi,
        "train_start": str(tr_start_dt),
        "train_end":   str(tr_end_dt),
        "val_start":   str(va_start_dt),
        "val_end":     str(va_end_dt),
        "train_hours": TRAIN_HOURS,
        "val_hours":   VALID_HOURS,
        "step_hours":  STEP_HOURS,
        "buffer_hours": BUFFER_HOURS,
    }
    # Training configuration stored inside every checkpoint; the evaluation
    # code reads "LOOKBACK", "HORIZON" and "EXOG_COLS" from it.
    config_meta = {
        "LOOKBACK": LOOKBACK,
        "HORIZON": HORIZON,
        "TARGET_COL": TARGET_COL,
        "EXOG_COLS": EXOG_COLS,
        "USE_TOD": USE_TOD,
        "LOSS": f"MSE + {LAMBDA_DIFF}*diffMSE + {LAMBDA_COS}*(1-cosCentered) + {LAMBDA_SHARE}*shareTV",
        "TOP_ALPHA": TOP_ALPHA,
        "SHARE_EPS": SHARE_EPS,
        "DAY_SUM_MASK_EPS": DAY_SUM_MASK_EPS,
        "N_TRAIN_T": int(len(t_list_train)),
        "N_VAL_T": int(len(t_list_val)),
        "SEED": SEED,
        "LR": LR,
        "WEIGHT_DECAY": WEIGHT_DECAY,
        "EPOCHS": EPOCHS,
        "BATCH_SIZE": BATCH_SIZE,
        "GRAD_CLIP": GRAD_CLIP,
        "MODEL": "LSTM24(hidden=32)+exogMLP+Softplus",
    }

    # "last" checkpoint: weights and optimizer state after the final epoch.
    last_payload = {
        "epoch": EPOCHS,
        "model_state": model.state_dict(),
        "opt_state": opt.state_dict(),
        "best_val_rmse": best_val_rmse,
        "best_epoch": best_epoch,
        "run_name": run_name,
        "window": window_meta,
        "config": config_meta,
    }
    torch.save(last_payload, last_path)
    torch.save(last_payload, flat_last_path)

    # Only the flat copies are registered in the manifest.
    manifest["ckpts"].append({"type": "last", "wi": int(wi), "path": os.path.abspath(flat_last_path)})

    # "best" checkpoint: weights of the epoch with the lowest validation RMSE.
    if best_state is not None:
        best_payload = {
            "epoch": best_epoch,
            "model_state": best_state,
            "best_val_rmse": best_val_rmse,
            "run_name": run_name,
            "window": window_meta,
            "config": config_meta,
        }
        torch.save(best_payload, best_path)
        torch.save(best_payload, flat_best_path)
        manifest["ckpts"].append({"type": "best", "wi": int(wi), "path": os.path.abspath(flat_best_path)})
        print(f"[SAVE] best -> {best_path}")
        print(f"[SAVE] best(flat) -> {flat_best_path}")
    else:
        # Reached when no epoch improved on the initial +inf (e.g. val RMSE was never a smaller number).
        print(f"[WARN] {run_name}: best_state is None -> best.pt not saved")

    print(f"[SAVE] last -> {last_path}")
    print(f"[SAVE] last(flat) -> {flat_last_path}")

# ============================================================
# ✅ NEW: Write latest pointer JSON (so eval loads ONLY newest run)
# ============================================================
# Serialise the manifest of this run and store it as the "latest run" pointer
# that the evaluation code reads.
manifest_text = json.dumps(manifest, indent=2)
with open(LATEST_JSON, "w") as f:
    f.write(manifest_text)

n_ckpts = len(manifest["ckpts"])
print(f"\n[INFO] wrote latest run pointer -> {LATEST_JSON}")
print(f"[INFO] this run ckpts count = {n_ckpts}")


테스트

In [None]:
# ============================================================
# Jupyter-friendly evaluation (UPDATED: ONLY latest run)
# - Reads RUNS_DIR/LATEST_RUN.json written by the UPDATED training code
# - Evaluates ONLY the "last" ckpts created in that latest run
# - On each model: evaluate on its OWN val range
# - midnight-only samples (00:00 start -> next 24h)
# - SHOW plots inline (VS Code Jupyter)
# ============================================================

import os, json, math, glob
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from IPython.display import display

# -----------------------------
# Config
# -----------------------------
# Same preprocessed table the training code reads; indexed by DT_COL below.
CSV_PATH = "./preprocessed_1h_master_with_weather_delta_20250701_20250930_OS.csv"
DT_COL = "datetime"
TARGET_COL = "Power_1h"

# Training output root and the manifest ("latest run" pointer) written by the training code.
RUNS_DIR = "./runs_lstm24_roll"
LATEST_JSON = os.path.join(RUNS_DIR, "LATEST_RUN.json")  # latest-run pointer

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Default figure size for every plot created below.
plt.rcParams["figure.figsize"] = (10, 4)

SHOW_PLOTS = True        # call plt.show() for each plotted day
SAVE_PLOTS = False       # additionally write each plot as a PNG under PLOTS_DIR
MAX_DAYS_PER_MODEL = 7   # upper bound on the number of days plotted per checkpoint

# Output folders for the summary CSV and optional plots; PLOTS_DIR (and thus
# OUT_DIR) is created immediately, even when SAVE_PLOTS is False.
OUT_DIR = os.path.join(RUNS_DIR, "eval_last_midnight_on_own_val_inline_latest")
PLOTS_DIR = os.path.join(OUT_DIR, "plots")
os.makedirs(PLOTS_DIR, exist_ok=True)

# ============================================================
# Load dataset once
# ============================================================
# Parsed datetime column becomes the (sorted) index shared by all checkpoints.
df = pd.read_csv(CSV_PATH, parse_dates=[DT_COL]).set_index(DT_COL).sort_index()

# ============================================================
# Model (must match training)
# ============================================================
class LSTM24(nn.Module):
    """Evaluation-side copy of the forecasting network.

    Checkpoints are loaded with ``strict=True``, so the submodule names
    (``lstm``, ``exog_mlp``, ``head``) and layer sizes must stay in sync with
    the model used for training.
    """

    def __init__(self, exog_dim: int, horizon: int, hidden: int = 32):
        super().__init__()
        flat_exog_dim = horizon * exog_dim
        fused_dim = 2 * hidden + 64  # [h, c] from the LSTM + exogenous embedding

        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden, num_layers=1, batch_first=True)
        self.exog_mlp = nn.Sequential(
            nn.Linear(flat_exog_dim, 128), nn.ReLU(),
            nn.Linear(128, 64), nn.ReLU(),
        )
        self.head = nn.Sequential(
            nn.Linear(fused_dim, 128), nn.ReLU(),
            nn.Linear(128, horizon),
        )
        self.out_act = nn.Softplus(beta=1.0, threshold=20.0)

    def forward(self, x_power, x_exog_future):
        """Map (B, L, 1) history and (B, H, d) future exog to a (B, H) forecast."""
        _, (h_n, c_n) = self.lstm(x_power)

        batch = x_exog_future.size(0)
        exog_emb = self.exog_mlp(x_exog_future.reshape(batch, -1))

        # Last-layer hidden state, last-layer cell state, exogenous embedding.
        fused = torch.cat([h_n[-1], c_n[-1], exog_emb], dim=1)
        y_raw = self.head(fused)
        return self.out_act(y_raw)

# ============================================================
# Helpers (numpy metrics)
# ============================================================
# Defaults of the numpy metric helpers below.
TOP_ALPHA = 0.20          # fraction of the horizon treated as "top" in topk_iou_np
COS_EPS = 1e-8            # added to the cosine denominator
SHARE_EPS = 1e-6          # added to every value before normalising to shares
DAY_SUM_MASK_EPS = 1e-3   # rows whose true share-sum is below this are masked in share_tv_np

def diff_rmse_np(y_pred, y_true):
    """RMSE between the first differences (along axis 1) of two (M, H) arrays."""
    step_error = np.diff(y_pred, axis=1) - np.diff(y_true, axis=1)
    return float(np.sqrt(np.mean(step_error ** 2)))

def cos_centered_np(y_pred, y_true, eps=COS_EPS):
    """Mean over rows of the cosine similarity between row-centered curves.

    Each row of the (M, H) inputs has its own mean removed first; ``eps`` is
    added to the denominator to avoid division by zero.
    """
    pred_c = y_pred - y_pred.mean(axis=1, keepdims=True)
    true_c = y_true - y_true.mean(axis=1, keepdims=True)

    dot = np.sum(pred_c * true_c, axis=1)
    norm_product = np.linalg.norm(pred_c, axis=1) * np.linalg.norm(true_c, axis=1)
    return float(np.mean(dot / (norm_product + eps)))

def topk_iou_np(y_pred, y_true, alpha=TOP_ALPHA, eps=1e-12):
    """Mean over rows of the IoU between top-K positions of prediction and target.

    K is ``ceil(alpha * H)`` for (M, H) inputs; ``eps`` guards the division.
    """
    _, horizon = y_true.shape
    k = int(math.ceil(alpha * horizon))

    def _top_positions(row):
        # Positions of the k largest entries of one row (order irrelevant).
        return set(np.argpartition(-row, k - 1)[:k].tolist())

    ious = []
    for true_row, pred_row in zip(y_true, y_pred):
        top_true = _top_positions(true_row)
        top_pred = _top_positions(pred_row)
        ious.append(len(top_true & top_pred) / (len(top_true | top_pred) + eps))
    return float(np.mean(ious))

def share_overlap_percent_np(y_pred, y_true, eps=SHARE_EPS):
    """Mean overlap (in percent) between per-row share profiles.

    Rows are clipped at zero, offset by ``eps`` and normalised to sum to one;
    the overlap of a row is the sum of the element-wise minimum of both
    profiles.
    """
    true_mass = np.clip(y_true, 0.0, None) + eps
    pred_mass = np.clip(y_pred, 0.0, None) + eps

    true_share = true_mass / np.sum(true_mass, axis=1, keepdims=True)
    pred_share = pred_mass / np.sum(pred_mass, axis=1, keepdims=True)

    row_overlap = np.sum(np.minimum(true_share, pred_share), axis=1)
    return float(100.0 * np.mean(row_overlap))

def share_tv_np(y_pred, y_true, eps=SHARE_EPS, day_sum_mask_eps=DAY_SUM_MASK_EPS):
    """Masked mean total-variation distance between per-row share profiles.

    Rows are clipped at zero, offset by ``eps`` and normalised to sum to one.
    Rows whose (offset) true sum is below ``day_sum_mask_eps`` are excluded
    from the average; if every row is excluded the result is 0.
    """
    true_mass = np.clip(y_true, 0.0, None) + eps
    pred_mass = np.clip(y_pred, 0.0, None) + eps

    true_total = np.sum(true_mass, axis=1, keepdims=True)
    pred_total = np.sum(pred_mass, axis=1, keepdims=True)

    share_gap = true_mass / true_total - pred_mass / pred_total
    tv = 0.5 * np.sum(np.abs(share_gap), axis=1)

    keep = (true_total.squeeze(1) >= day_sum_mask_eps).astype(np.float32)
    kept_rows = max(float(np.sum(keep)), 1.0)
    return float(np.sum(tv * keep) / kept_rows)

def _strict_index_loc(idx_dt: pd.DatetimeIndex, ts: pd.Timestamp) -> int:
    """
    Make sure val_start/val_end are exactly on idx_dt.
    If not present (e.g. timezone mismatch), fallback to nearest but warn.
    """
    try:
        loc = idx_dt.get_loc(ts)
        if isinstance(loc, slice):
            return int(loc.start)
        if isinstance(loc, (np.ndarray, list)):
            return int(loc[0])
        return int(loc)
    except KeyError:
        # fallback
        loc = int(idx_dt.get_indexer([ts], method="nearest")[0])
        print(f"[WARN] timestamp not exactly in index: {ts} -> using nearest {idx_dt[loc]}")
        return loc

def build_midnight_val_t_list(idx_dt, y, xe, lookback, horizon, val_start_ts, val_end_ts):
    """Return midnight forecast origins inside ``[val_start_ts, val_end_ts]``.

    Both timestamps are inclusive and are located with ``_strict_index_loc``.
    An origin ``t`` is kept when

    * ``idx_dt[t]`` falls on hour 0,
    * the ``lookback`` target rows before ``t`` and the ``horizon`` target
      rows from ``t`` are all finite, and
    * the exogenous rows ``t .. t+horizon-1`` are all finite.

    Origins are limited to ``lookback <= t <= len(idx_dt) - horizon`` so that
    every slice stays inside the data.

    Returns:
        1-D integer array of row positions (possibly empty).
    """
    va_s = _strict_index_loc(idx_dt, pd.Timestamp(val_start_ts))
    va_e = _strict_index_loc(idx_dt, pd.Timestamp(val_end_ts)) + 1  # exclusive

    n_rows = len(idx_dt)
    finite_y = np.isfinite(y)
    finite_xe = np.isfinite(xe).all(axis=1)

    lo = max(va_s, lookback)
    # t = n_rows - horizon still has a complete horizon, so the exclusive
    # bound is n_rows - horizon + 1 (the previous bound dropped that origin).
    hi = min(va_e, n_rows - horizon + 1)

    t_list = [
        t
        for t in range(lo, hi)
        if idx_dt[t].hour == 0
        # past and future target rows form one contiguous slice
        and finite_y[t - lookback:t + horizon].all()
        and finite_xe[t:t + horizon].all()
    ]
    return np.array(t_list, dtype=int)

@torch.no_grad()
def infer_one(model, y, xe, t, lookback, horizon, device):
    """Run ``model`` for the single forecast origin ``t``.

    The model receives the ``lookback`` target values before ``t`` as a
    (1, L, 1) tensor and the exogenous rows ``t .. t+horizon-1`` as a
    (1, H, d) tensor.

    Returns:
        ``(y_true, y_pred)`` as float32 numpy arrays; ``y_true`` is
        ``y[t:t+horizon]`` and ``y_pred`` is the model output with the batch
        dimension removed.
    """
    history = y[t - lookback:t].reshape(-1, 1)
    future_exog = xe[t:t + horizon, :]

    x_power = torch.from_numpy(history).float().unsqueeze(0).to(device)
    x_exog = torch.from_numpy(future_exog).float().unsqueeze(0).to(device)

    prediction = model(x_power, x_exog).squeeze(0)
    y_pred = prediction.detach().cpu().numpy().astype(np.float32)
    y_true = y[t:t + horizon].astype(np.float32)
    return y_true, y_pred

def plot_one_day(run_name, day_start_ts, y_true_24, y_pred_24, save_path=None):
    """Plot the true and predicted curves of one forecast day.

    The figure is saved to ``save_path`` when one is given, shown when
    ``SHOW_PLOTS`` is set, and always closed afterwards.
    """
    hours_ahead = np.arange(len(y_true_24))

    fig = plt.figure()
    ax = fig.gca()
    ax.plot(hours_ahead, y_true_24, label="True")
    ax.plot(hours_ahead, y_pred_24, label="Pred")
    ax.set_title(f"{run_name} | {day_start_ts} | midnight->24h")
    ax.set_xlabel("Hour ahead (0~23)")
    ax.set_ylabel(TARGET_COL)
    ax.grid(True, alpha=0.3)
    ax.legend()
    fig.tight_layout()

    if save_path is not None:
        fig.savefig(save_path, dpi=150)
    if SHOW_PLOTS:
        plt.show()
    plt.close(fig)

# ============================================================
# ✅ Load ONLY latest run ckpts via LATEST_RUN.json
# ============================================================
# The manifest must have been written by a previous run of the training code.
manifest_found = os.path.exists(LATEST_JSON)
if not manifest_found:
    raise RuntimeError(f"LATEST_RUN.json not found: {LATEST_JSON}\n"
                       f"먼저 '수정된 학습 코드'를 실행해서 LATEST_RUN.json을 생성하세요.")

with open(LATEST_JSON, "r") as f:
    latest = json.loads(f.read())

exp_dir = latest.get("exp_dir")
ckpt_items = latest.get("ckpts", [])

if not (exp_dir and ckpt_items):
    raise RuntimeError(f"LATEST_RUN.json 내용이 비어있습니다. exp_dir={exp_dir}, ckpts={len(ckpt_items)}")

# Keep only the "last" checkpoints, ordered by window number.
last_ckpts = sorted(
    (entry for entry in ckpt_items if entry.get("type") == "last"),
    key=lambda entry: int(entry.get("wi", 0)),
)

if len(last_ckpts) == 0:
    raise RuntimeError("LATEST_RUN.json에 last ckpt가 없습니다. (학습 코드에서 last 저장 확인)")

print(f"[INFO] Latest RUN_ID={latest.get('run_id')} | exp_dir={exp_dir}")
print(f"[INFO] last ckpts in latest run: {len(last_ckpts)}")

# ============================================================
# Run evaluation
# ============================================================
# One summary row per evaluated checkpoint.
summary_rows = []

for entry in last_ckpts:
    ckpt_path = entry["path"]
    wi = int(entry.get("wi", -1))

    if not os.path.exists(ckpt_path):
        print(f"[WARN] ckpt path missing -> skip: {ckpt_path}")
        continue

    ckpt = torch.load(ckpt_path, map_location="cpu")
    run_name = ckpt.get("run_name", os.path.basename(ckpt_path).replace(".pt", ""))

    window = ckpt.get("window", {})
    cfg = ckpt.get("config", {})

    # Sample geometry and feature list exactly as stored by the training code.
    lookback = int(cfg.get("LOOKBACK", 168))
    horizon = int(cfg.get("HORIZON", 24))
    exog_cols = cfg.get("EXOG_COLS", None)
    if exog_cols is None:
        raise ValueError(f"[{run_name}] ckpt에 EXOG_COLS가 없습니다.")
    exog_list = list(exog_cols)

    val_start = pd.to_datetime(window.get("val_start"))
    val_end = pd.to_datetime(window.get("val_end"))

    # ---- arrays built with this checkpoint's own feature columns
    df_feat_local = df[[TARGET_COL] + exog_list].copy()
    idx_dt = df_feat_local.index
    y_arr = df_feat_local[TARGET_COL].values.astype(np.float32)
    xe_arr = df_feat_local[exog_list].values.astype(np.float32)

    # ---- midnight-only origins inside the checkpoint's OWN validation range
    t_list = build_midnight_val_t_list(idx_dt, y_arr, xe_arr, lookback, horizon, val_start, val_end)
    n_midnight = len(t_list)

    print(f"\n[w={wi:04d}] {run_name}")
    print(f"  val_range={val_start} ~ {val_end}")

    # Columns shared by the "no samples" row and the regular row.
    row_head = {
        "wi": wi, "run_name": run_name, "ckpt_path": ckpt_path,
        "val_start": str(val_start), "val_end": str(val_end),
    }

    if n_midnight == 0:
        print("  midnight_samples=0")
        summary_rows.append({
            **row_head,
            "midnight_samples": 0,
            "rmse": np.nan, "diff_rmse": np.nan, "cos_centered": np.nan,
            "top_iou": np.nan, "share_overlap_pct": np.nan, "share_tv": np.nan,
            "note": "no valid midnight samples",
        })
        continue

    print(f"  midnight_samples={n_midnight} (first={idx_dt[t_list[0]]}, last={idx_dt[t_list[-1]]})")

    # Rebuild the network and load the stored weights (names must match exactly).
    model = LSTM24(exog_dim=len(exog_cols), horizon=horizon, hidden=32).to(DEVICE)
    model.load_state_dict(ckpt["model_state"], strict=True)
    model.eval()

    # One forward pass per midnight origin -> (M, H) matrices.
    day_results = [
        infer_one(model, y_arr, xe_arr, t, lookback, horizon, DEVICE)
        for t in t_list
    ]
    y_true_mat = np.stack([true_24 for true_24, _ in day_results], axis=0)
    y_pred_mat = np.stack([pred_24 for _, pred_24 in day_results], axis=0)

    # Metrics over all midnight samples of this checkpoint.
    rmse = float(np.sqrt(np.mean((y_pred_mat - y_true_mat) ** 2)))
    drmse = diff_rmse_np(y_pred_mat, y_true_mat)
    cosc = cos_centered_np(y_pred_mat, y_true_mat)
    tiou = topk_iou_np(y_pred_mat, y_true_mat, alpha=TOP_ALPHA)
    shov = share_overlap_percent_np(y_pred_mat, y_true_mat)
    shtv = share_tv_np(y_pred_mat, y_true_mat)

    summary_rows.append({
        **row_head,
        "midnight_samples": int(n_midnight),
        "rmse": rmse, "diff_rmse": drmse, "cos_centered": cosc,
        "top_iou": tiou, "share_overlap_pct": shov, "share_tv": shtv,
        "note": "",
    })

    print(f"  metrics: RMSE={rmse:.4f}, dRMSE={drmse:.4f}, cosC={cosc:.4f}, IoU={tiou:.4f}, shareOv={shov:.2f}%")

    # Plot at most MAX_DAYS_PER_MODEL days, in chronological order.
    for i, t in enumerate(t_list[:max(MAX_DAYS_PER_MODEL, 0)]):
        day_ts = idx_dt[t]
        if SAVE_PLOTS:
            safe = run_name.replace(":", "").replace(" ", "_")
            model_dir = os.path.join(PLOTS_DIR, safe)
            os.makedirs(model_dir, exist_ok=True)
            save_path = os.path.join(model_dir, f"w{wi:04d}_day_{i:03d}_{day_ts.strftime('%Y%m%d')}.png")
        else:
            save_path = None
        plot_one_day(run_name, day_ts, y_true_mat[i], y_pred_mat[i], save_path=save_path)

# summary table inline
# Summary table, shown inline and ordered by window number.
summary_df = pd.DataFrame(summary_rows)
if summary_rows:
    summary_df = summary_df.sort_values(["wi"])
else:
    # Every checkpoint was skipped (e.g. all paths missing): the frame has no
    # "wi" column, so sorting by it would raise KeyError.
    print("[WARN] no checkpoints were evaluated; summary is empty")
display(summary_df)

# Persist the summary next to the other evaluation outputs.
os.makedirs(OUT_DIR, exist_ok=True)
out_csv = os.path.join(OUT_DIR, "summary_midnight_last_latest.csv")
summary_df.to_csv(out_csv, index=False)
print(f"[INFO] saved summary -> {out_csv}")
