In [10]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit


# -----------------------------
# 1) CME preprocessing: load every monthly txt -> keep Halo only -> daily sum/count -> fill missing days with 0
# -----------------------------
def load_and_preprocess_cme(year: int, cme_dir: str) -> pd.DataFrame:
    """Build a gap-free daily table of Halo CME activity for one calendar year.

    Every existing ``univ{year}_{MM}.txt`` file in ``cme_dir`` is read as a
    fixed-width table (first four lines skipped, no header row). Column 0 is
    parsed as the event date, the first three characters of column 10 as the
    event strength, and an event counts as Halo when column 12 contains the
    text "Halo".

    Args:
        year: Calendar year; used for the file names and the output date range.
        cme_dir: Directory that holds the monthly text files.

    Returns:
        One row per day from Jan 1 to Dec 31 of ``year`` with the columns
        ``date``, ``cme_strength_sum`` (sum of Halo strengths on that day) and
        ``cme_count`` (number of events with strength > 0 on that day). Days
        without any event carry 0 in both value columns.

    Raises:
        FileNotFoundError: If none of the twelve monthly files exists.
    """
    base = Path(cme_dir)
    monthly_paths = (base / f"univ{year}_{month:02d}.txt" for month in range(1, 13))
    monthly_frames = [
        pd.read_fwf(path, skiprows=4, header=None)
        for path in monthly_paths
        if path.exists()
    ]
    if len(monthly_frames) == 0:
        raise FileNotFoundError("CME txt files not found. Check cme_dir and filenames.")

    events = pd.concat(monthly_frames, ignore_index=True)

    halo_mask = events.iloc[:, 12].astype(str).str.contains("Halo", na=False)

    # Non-Halo events and unparseable strengths both end up as 0.0.
    halo_strength = (
        pd.to_numeric(events.iloc[:, 10].astype(str).str[:3], errors="coerce")
        .where(halo_mask, 0)
        .fillna(0)
        .astype(float)
    )

    per_event = pd.DataFrame(
        {
            "date": pd.to_datetime(events.iloc[:, 0], errors="coerce"),
            "strength": halo_strength,
        }
    ).dropna(subset=["date"])  # rows whose date could not be parsed are discarded

    def _positive_count(values):
        # Strength was zeroed for everything that is not Halo, so "> 0"
        # counts Halo events with a usable strength.
        return (values > 0).sum()

    per_day = per_event.groupby("date").agg(
        cme_strength_sum=("strength", "sum"),
        cme_count=("strength", _positive_count),
    )

    # Always span the whole year so the result has 365/366 rows.
    calendar = pd.date_range(
        pd.Timestamp(year=year, month=1, day=1),
        pd.Timestamp(year=year, month=12, day=31),
        freq="D",
    )

    return (
        per_day.reindex(calendar, fill_value=0)
        .rename_axis("date")
        .reset_index()
    )


# -----------------------------
# 2) hproton preprocessing: tolerate interleaved metadata lines -> build the date -> keep only p_gt_100
# -----------------------------
import pandas as pd
import numpy as np

def load_and_preprocess_hproton(dpd_path: str) -> pd.DataFrame:
    """Load the daily >100 MeV proton column from a whitespace-separated text file.

    A line is accepted as data only when it has at least six
    whitespace-separated fields, its first three fields parse as numbers
    (year, month, day) and those three form a valid calendar date. Every
    other line (blank, header, metadata, malformed date) is skipped, so a
    single stray line cannot abort the whole load.

    Expected field order on a data line, of which only the sixth is kept:
    year, month, day, p_gt_1MeV, p_gt_10MeV, p_gt_100MeV, ...

    Args:
        dpd_path: Path to the text file.

    Returns:
        DataFrame sorted by date with the columns ``date`` (datetime) and
        ``p_gt_100MeV`` (numeric). Values that cannot be parsed as numbers
        are replaced with 0.0.
    """
    records = []
    with open(dpd_path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            parts = line.split()
            if len(parts) < 6:  # need at least year month day p1 p10 p100
                continue

            try:
                y, m, d = (int(float(tok)) for tok in parts[:3])
            except (ValueError, OverflowError):
                # Non-numeric token, NaN, or +/-inf: not a data line.
                continue

            # Zero-padded ISO text is parsed with an explicit format below, so
            # out-of-range components can never be re-read as another date.
            records.append((f"{y:04d}-{m:02d}-{d:02d}", parts[5]))

    hp = pd.DataFrame(records, columns=["date", "p_gt_100MeV"])

    # Impossible dates (e.g. Feb 30, month 13) become NaT and are dropped
    # instead of raising.
    hp["date"] = pd.to_datetime(hp["date"], format="%Y-%m-%d", errors="coerce")
    hp["p_gt_100MeV"] = pd.to_numeric(hp["p_gt_100MeV"], errors="coerce")

    hp = hp.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)

    # Unparseable flux values are treated as 0.
    hp["p_gt_100MeV"] = hp["p_gt_100MeV"].fillna(0.0)

    return hp



# -----------------------------
# 3) Align CME and hproton to the same length n (reindex both onto the same date index)
# -----------------------------
def align_daily_series(cme_daily: pd.DataFrame, hp_daily: pd.DataFrame) -> pd.DataFrame:
    """Put both daily tables on one shared, gap-free calendar and join them.

    The calendar runs from the later of the two first dates to the earlier of
    the two last dates, one entry per day. A day missing from either input is
    inserted with 0 in every value column of that input.

    Args:
        cme_daily: Table with a ``date`` column plus CME value columns.
        hp_daily: Table with a ``date`` column plus proton value columns.

    Returns:
        One row per calendar day: ``date``, then the value columns of
        ``cme_daily``, then those of ``hp_daily``.
    """
    first_day = max(cme_daily["date"].min(), hp_daily["date"].min())
    last_day = min(cme_daily["date"].max(), hp_daily["date"].max())
    calendar = pd.date_range(first_day, last_day, freq="D")

    def _on_calendar(frame):
        # Re-express one table on the shared calendar, zero-filling gaps.
        indexed = frame.set_index("date")
        regular = indexed.reindex(calendar, fill_value=0)
        return regular.rename_axis("date").reset_index()

    return _on_calendar(cme_daily).merge(_on_calendar(hp_daily), on="date", how="inner")


# -----------------------------
# 4) Build the training table: (CME over the past k days) -> (today's SEP)
# -----------------------------
def make_supervised_dataset(df: pd.DataFrame, k_lag: int = 3, use_log_target: bool = True):
    """Turn the merged daily table into lagged-feature / target arrays.

    For each row the features are ``cme_strength_sum`` and ``cme_count`` from
    the preceding ``k_lag`` rows (lag 1 .. ``k_lag``) plus the sum of each over
    those same rows. The target is the row's own ``p_gt_100MeV``. The same-row
    CME values are never used as features.

    Lags are positional shifts, so ``df`` is expected to hold one row per day
    in chronological order.

    Args:
        df: Table with the columns ``date``, ``cme_strength_sum``,
            ``cme_count`` and ``p_gt_100MeV``. It is not modified.
        k_lag: Number of preceding rows to use.
        use_log_target: If True the target is ``log10(p_gt_100MeV + 1)``;
            otherwise the raw value.

    Returns:
        Tuple ``(out_model, feature_cols, X, y)``:
        ``out_model`` is the table of date, features and raw target with the
        leading rows that lack a full lag history removed; ``feature_cols`` is
        the alphabetically sorted list of feature names; ``X`` and ``y`` are
        the matching NumPy arrays.
    """
    out = df.copy()

    cme_columns = ("cme_strength_sum", "cme_count")
    for col in cme_columns:
        # lag1 .. lagk: yesterday .. k days ago
        for lag in range(1, k_lag + 1):
            out[f"{col}_lag{lag}"] = out[col].shift(lag)
        # Rolling summary: shift(1) first so the window ends at yesterday.
        out[f"{col}_roll{k_lag}"] = out[col].shift(1).rolling(k_lag).sum()

    feature_prefixes = tuple(
        f"{col}_{kind}" for col in cme_columns for kind in ("lag", "roll")
    )
    feature_cols = sorted({c for c in out.columns if c.startswith(feature_prefixes)})

    # The first rows have NaN lags; drop them so X and y stay aligned.
    out_model = (
        out[["date"] + feature_cols + ["p_gt_100MeV"]]
        .dropna()
        .reset_index(drop=True)
    )

    X = out_model[feature_cols].values
    # The target is derived once, from the already-filtered rows.
    y = out_model["p_gt_100MeV"].values
    if use_log_target:
        y = np.log10(y + 1.0)

    return out_model, feature_cols, X, y


# -----------------------------
# 5) Train and evaluate the model (time order preserved)
# -----------------------------
def train_and_evaluate(X, y):
    """Fit a random-forest baseline on the first 80% of rows and score the rest.

    The split is positional with no shuffling: rows ``[0, int(0.8 * n))`` are
    used for training and the remaining rows for evaluation, so the input
    order is preserved.

    Args:
        X: Feature matrix, one row per sample.
        y: Target values aligned with the rows of ``X``.

    Returns:
        Tuple ``(model, metrics)`` where ``model`` is the fitted
        ``RandomForestRegressor`` and ``metrics`` is a dict with the keys
        ``"rmse"`` and ``"r2"`` computed on the held-out rows.
    """
    cutoff = int(len(X) * 0.8)
    train_X, train_y = X[:cutoff], y[:cutoff]
    test_X, test_y = X[cutoff:], y[cutoff:]

    # Simple baseline; the fixed seed keeps runs reproducible.
    forest = RandomForestRegressor(
        n_estimators=500,
        max_depth=8,
        random_state=0,
        n_jobs=-1,
    )
    forest.fit(train_X, train_y)
    predicted = forest.predict(test_X)

    scores = {
        "rmse": np.sqrt(mean_squared_error(test_y, predicted)),
        "r2": r2_score(test_y, predicted),
    }
    return forest, scores


# -----------------------------
# Example run
# -----------------------------
if __name__ == "__main__":
    # End-to-end example for a single year: load both sources, align them on
    # a common daily calendar, build lagged features, then fit and score.
    year = 2002
    cme_dir = f"./{year}cme"          # adjust to your folder layout
    dpd_path = f"./{year}_DPD.txt"    # adjust to your file name

    k_lag = 3  # predict today's SEP from the previous 3 days of CME

    cme_daily = load_and_preprocess_cme(year, cme_dir)
    hp_daily  = load_and_preprocess_hproton(dpd_path)

    merged = align_daily_series(cme_daily, hp_daily)

    ds, feature_cols, X, y = make_supervised_dataset(merged, k_lag=k_lag, use_log_target=True)

    model, metrics = train_and_evaluate(X, y)

    # Report the feature names, the hold-out metrics and the number of usable rows.
    print("Features:", feature_cols)
    print("Metrics:", metrics)
    print("Dataset length:", len(ds))


Features: ['cme_count_lag1', 'cme_count_lag2', 'cme_count_lag3', 'cme_count_roll3', 'cme_strength_sum_lag1', 'cme_strength_sum_lag2', 'cme_strength_sum_lag3', 'cme_strength_sum_roll3']
Metrics: {'rmse': np.float64(0.06759536261175762), 'r2': -0.2755896133365727}
Dataset length: 362
