In [19]:
import numpy as np
import pandas as pd
from pathlib import Path
import joblib

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    mean_squared_error, r2_score,
    roc_auc_score, average_precision_score
)

In [20]:
def load_and_preprocess_cme_year(year: int, cme_data_dir: str) -> pd.DataFrame:
    cme_data_dir = Path(cme_data_dir)
    files = [cme_data_dir / f"univ{year}_{m:02d}.txt" for m in range(1, 13)]
    full_dates = pd.date_range(pd.Timestamp(year, 1, 1), pd.Timestamp(year, 12, 31), freq="D")

    dfs = []
    for f in files:
        if f.exists():
            dfs.append(pd.read_fwf(f, skiprows=4, header=None))

    if not dfs:
        daily = pd.DataFrame({"date": full_dates})
        cols = [
            "cme_count","halo_count","partial_halo_count",
            "width_max","width_mean",
            "speed_linear_max","speed_linear_mean",
            "speed_init_max","speed_final_max","speed_20R_max",
            "accel_max","accel_min",
            "mass_sum","mass_max",
            "ke_sum","ke_max"
        ]
        for c in cols:
            daily[c] = 0.0
        return daily

    cme = pd.concat(dfs, ignore_index=True)

    if cme.shape[1] < 13:
        for _ in range(13 - cme.shape[1]):
            cme[cme.shape[1]] = np.nan

    out = pd.DataFrame()
    out["date"] = pd.to_datetime(cme.iloc[:, 0], errors="coerce")

    central = cme.iloc[:, 2].astype(str)
    out["is_halo"] = central.str.contains("Halo", case=False, na=False).astype(int)
    out["is_partial_halo"] = cme.iloc[:, 12].astype(str).str.contains("Partial", case=False, na=False).astype(int)

    def num(col):
        return pd.to_numeric(
            cme.iloc[:, col].astype(str).str.replace("*", "", regex=False),
            errors="coerce"
        )

    out["width"] = num(3)
    out["speed_linear"] = num(4)
    out["speed_init"] = num(5)
    out["speed_final"] = num(6)
    out["speed_20R"] = num(7)
    out["accel"] = num(8)
    out["mass"] = num(9)
    out["kinetic_energy"] = num(10)

    out = out.dropna(subset=["date"])

    daily = (
        out.groupby("date")
        .agg(
            cme_count=("date", "count"),
            halo_count=("is_halo", "sum"),
            partial_halo_count=("is_partial_halo", "sum"),
            width_max=("width", "max"),
            width_mean=("width", "mean"),
            speed_linear_max=("speed_linear", "max"),
            speed_linear_mean=("speed_linear", "mean"),
            speed_init_max=("speed_init", "max"),
            speed_final_max=("speed_final", "max"),
            speed_20R_max=("speed_20R", "max"),
            accel_max=("accel", "max"),
            accel_min=("accel", "min"),
            mass_sum=("mass", "sum"),
            mass_max=("mass", "max"),
            ke_sum=("kinetic_energy", "sum"),
            ke_max=("kinetic_energy", "max"),
        )
        .reset_index()
    )

    daily = (
        daily.set_index("date")
        .reindex(full_dates)
        .rename_axis("date")
        .reset_index()
        .fillna(0.0)
    )
    return daily


def load_and_preprocess_hproton_year(year: int, hproton_data_dir: str) -> pd.DataFrame:
    dpd_path = Path(hproton_data_dir) / f"{year}_DPD.txt"
    if not dpd_path.exists():
        raise FileNotFoundError(f"Missing DPD file: {dpd_path}")

    rows = []
    with open(dpd_path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 6:
                continue
            try:
                y = int(float(parts[0])); m = int(float(parts[1])); d = int(float(parts[2]))
            except ValueError:
                continue

            vals = parts[:9] + [np.nan] * (9 - len(parts[:9]))
            rows.append([y, m, d] + vals[3:9])

    hp = pd.DataFrame(
        rows,
        columns=[
            "year","month","day",
            "p_gt_1MeV","p_gt_10MeV","p_gt_100MeV",
            "e_gt_0p6MeV","e_gt_2MeV",
            "neutron_pct"
        ]
    )

    hp["p_gt_100MeV"] = pd.to_numeric(hp["p_gt_100MeV"], errors="coerce")
    hp["p_gt_100MeV"] = hp["p_gt_100MeV"].where(hp["p_gt_100MeV"] >= 0)
    hp["p_gt_100MeV"] = hp["p_gt_100MeV"].fillna(0.0).astype(float)
    hp["date"] = pd.to_datetime(hp[["year","month","day"]])

    daily = hp[["date","p_gt_100MeV"]].sort_values("date").reset_index(drop=True)
    return daily


def build_merged_all_years(start_year: int, end_year: int, cme_dir: str, hproton_dir: str) -> pd.DataFrame:
    cme_all = []
    hp_all = []

    for year in range(start_year, end_year + 1):
        cme_all.append(load_and_preprocess_cme_year(year, cme_dir))

        try:
            hp_all.append(load_and_preprocess_hproton_year(year, hproton_dir))
        except FileNotFoundError:
            pass

    cme_all = pd.concat(cme_all, ignore_index=True).sort_values("date").reset_index(drop=True)
    hp_all = pd.concat(hp_all, ignore_index=True).sort_values("date").reset_index(drop=True)

    start = max(cme_all["date"].min(), hp_all["date"].min())
    end = min(cme_all["date"].max(), hp_all["date"].max())
    full_dates = pd.date_range(start, end, freq="D")

    cme_aligned = (
        cme_all.set_index("date")
        .reindex(full_dates, fill_value=0.0)
        .rename_axis("date")
        .reset_index()
    )
    hp_aligned = (
        hp_all.set_index("date")
        .reindex(full_dates, fill_value=0.0)
        .rename_axis("date")
        .reset_index()
    )

    merged = pd.merge(cme_aligned, hp_aligned, on="date", how="inner")
    return merged


def make_supervised_dataset(df: pd.DataFrame, k_lag: int = 3, future_window: int = 3):
    out = df.sort_values("date").reset_index(drop=True).copy()
    base_cols = [c for c in out.columns if c not in ["date", "p_gt_100MeV"]]

    feat_cols = []
    for col in base_cols:
        for lag in range(1, k_lag + 1):
            name = f"{col}_lag{lag}"
            out[name] = out[col].shift(lag)
            feat_cols.append(name)

        name = f"{col}_rollsum{k_lag}"
        out[name] = out[col].shift(1).rolling(k_lag).sum()
        feat_cols.append(name)

        name = f"{col}_rollmax{k_lag}"
        out[name] = out[col].shift(1).rolling(k_lag).max()
        feat_cols.append(name)

    out["target_sep"] = out["p_gt_100MeV"].shift(-1).rolling(future_window).max()

    ds = out[["date"] + feat_cols + ["target_sep"]].dropna().reset_index(drop=True)

    return ds, feat_cols, base_cols


def train_two_stage(ds: pd.DataFrame, feature_cols: list, threshold: float, split_ratio: float = 0.8, random_state: int = 0):
    ds = ds.sort_values("date").reset_index(drop=True)
    n = len(ds)
    split = int(n * split_ratio)

    train_df = ds.iloc[:split].copy()
    test_df = ds.iloc[split:].copy()

    X_train = train_df[feature_cols].values
    X_test = test_df[feature_cols].values

    y_train_cls = (train_df["target_sep"].values > threshold).astype(int)
    y_test_cls = (test_df["target_sep"].values > threshold).astype(int)

    clf = RandomForestClassifier(
        n_estimators=500, max_depth=8,
        random_state=random_state, n_jobs=-1,
        class_weight="balanced"
    )
    clf.fit(X_train, y_train_cls)

    if 1 in clf.classes_:
        idx1 = int(np.where(clf.classes_ == 1)[0][0])
        p_test = clf.predict_proba(X_test)[:, idx1]
    else:
        p_test = np.zeros(len(X_test), dtype=float)

    auc = roc_auc_score(y_test_cls, p_test) if len(np.unique(y_test_cls)) > 1 else np.nan
    pr_auc = average_precision_score(y_test_cls, p_test) if len(np.unique(y_test_cls)) > 1 else np.nan

    train_event = train_df[train_df["target_sep"] > threshold].copy()
    test_event = test_df[test_df["target_sep"] > threshold].copy()

    if len(train_event) < 10:
        raise ValueError(f"Too few train events: {len(train_event)} (threshold={threshold})")

    X_train_reg = train_event[feature_cols].values
    y_train_reg = np.log10(train_event["target_sep"].values + 1.0)

    reg = RandomForestRegressor(
        n_estimators=800, max_depth=10,
        random_state=random_state, n_jobs=-1
    )
    reg.fit(X_train_reg, y_train_reg)

    if len(test_event) > 0:
        X_test_reg = test_event[feature_cols].values
        y_test_reg = np.log10(test_event["target_sep"].values + 1.0)
        pred_reg = reg.predict(X_test_reg)
        rmse = float(np.sqrt(mean_squared_error(y_test_reg, pred_reg)))
        r2 = float(r2_score(y_test_reg, pred_reg)) if len(y_test_reg) >= 2 else np.nan
    else:
        rmse, r2 = np.nan, np.nan

    pred_reg_all = reg.predict(X_test)
    intensity_all = np.maximum(0.0, 10**pred_reg_all - 1.0)
    expected_sep = p_test * intensity_all

    pred_df = pd.DataFrame({
        "date": test_df["date"].values,
        "target_sep": test_df["target_sep"].values,
        "event_true": y_test_cls,
        "p_event": p_test,
        "intensity_pred": intensity_all,
        "expected_sep": expected_sep
    })

    results = {
        "classifier_auc": auc,
        "classifier_pr_auc": pr_auc,
        "regressor_rmse": rmse,
        "regressor_r2": r2,
        "n_train": len(train_df),
        "n_test": len(test_df),
        "n_train_event": int((train_df["target_sep"] > threshold).sum()),
        "n_test_event": int((test_df["target_sep"] > threshold).sum()),
    }

    return clf, reg, results, pred_df


def save_model(path: str, clf, reg, meta: dict):
    joblib.dump({"clf": clf, "reg": reg, "meta": meta}, path)


def load_model(path: str):
    pkg = joblib.load(path)
    return pkg["clf"], pkg["reg"], pkg["meta"]


def build_feature_row_asof(merged: pd.DataFrame, base_cols: list, feature_cols: list, k_lag: int, asof_date: str):
    df = merged[["date"] + base_cols].sort_values("date").reset_index(drop=True).copy()
    asof_date = pd.to_datetime(asof_date)

    for col in base_cols:
        for lag in range(1, k_lag + 1):
            df[f"{col}_lag{lag}"] = df[col].shift(lag)
        df[f"{col}_rollsum{k_lag}"] = df[col].shift(1).rolling(k_lag).sum()
        df[f"{col}_rollmax{k_lag}"] = df[col].shift(1).rolling(k_lag).max()

    row = df[df["date"] == asof_date]
    if row.empty:
        raise ValueError(f"asof_date not found in merged date range: {asof_date.date()}")

    row = row[["date"] + feature_cols].copy()
    if row.isna().any(axis=None):
        raise ValueError("Not enough history before asof_date to build lag/rolling features.")

    return row


def predict_two_stage(clf, reg, meta: dict, feature_row: pd.DataFrame):
    feature_cols = meta["feature_cols"]
    X = feature_row[feature_cols].values

    if 1 in clf.classes_:
        idx1 = int(np.where(clf.classes_ == 1)[0][0])
        p_event = float(clf.predict_proba(X)[:, idx1][0])
    else:
        p_event = 0.0

    pred_log_int = float(reg.predict(X)[0])
    intensity = max(0.0, 10**pred_log_int - 1.0)
    expected = p_event * intensity

    return {
        "asof_date": pd.to_datetime(feature_row["date"].iloc[0]),
        "p_event": p_event,
        "intensity_if_event": intensity,
        "expected_intensity": expected
    }


if __name__ == "__main__":
    cme_dir = "./cme_data"
    hproton_dir = "./hproton_data"
    start_year, end_year = 1996, 2018
    k_lag, future_window = 3, 3

    merged = build_merged_all_years(start_year, end_year, cme_dir, hproton_dir)
    ds, feature_cols, base_cols = make_supervised_dataset(merged, k_lag=k_lag, future_window=future_window)

    thr = float(ds["target_sep"].quantile(0.995))

    clf, reg, results, pred_df = train_two_stage(ds, feature_cols, threshold=thr)

    print("threshold:", thr)
    print("event_rate:", float((ds["target_sep"] > thr).mean()))
    print("results:", results)

    meta = {
        "feature_cols": feature_cols,
        "base_cols": base_cols,
        "k_lag": k_lag,
        "future_window": future_window,
        "threshold": thr,
        "use_log_intensity": True,
        "start_year": start_year,
        "end_year": end_year
    }

    model_path = "sep_two_stage_1996_2018.joblib"
    save_model(model_path, clf, reg, meta)

    clf2, reg2, meta2 = load_model(model_path)

    test_date = str(pred_df["date"].iloc[0].date())
    feature_row = build_feature_row_asof(merged, meta2["base_cols"], meta2["feature_cols"], meta2["k_lag"], test_date)
    pred = predict_two_stage(clf2, reg2, meta2, feature_row)

    print("test_date:", test_date)
    print("prediction:", pred)
    print("true_target_sep:", float(ds.loc[ds["date"] == pd.to_datetime(test_date), "target_sep"].iloc[0]))


threshold: 1185000.0000000363
event_rate: 0.005105217283028265
results: {'classifier_auc': np.float64(0.9912718204488778), 'classifier_pr_auc': np.float64(0.46883468834688347), 'regressor_rmse': 0.1169556244015193, 'regressor_r2': 0.0, 'n_train': 6424, 'n_test': 1607, 'n_train_event': 38, 'n_test_event': 3}
test_date: 2014-08-07
prediction: {'asof_date': Timestamp('2014-08-07 00:00:00'), 'p_event': 0.007710076687263456, 'intensity_if_event': 2990336.304347603, 'expected_intensity': 23055.722227228012}
true_target_sep: 2600.0


In [21]:
from __future__ import annotations

from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import joblib

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


# -----------------------------
# CME txt(월별/단일파일) -> 일별 집계(daily)로 변환
# -----------------------------
def load_cme_txt_to_daily(cme_txt_path: Union[str, Path], year: int) -> pd.DataFrame:
    cme_txt_path = Path(cme_txt_path)

    # fixed width
    df = pd.read_fwf(cme_txt_path, skiprows=4, header=None)

    # 열 부족 방어(포맷 변형)
    if df.shape[1] < 13:
        for _ in range(13 - df.shape[1]):
            df[df.shape[1]] = np.nan

    out = pd.DataFrame()
    out["date"] = pd.to_datetime(df.iloc[:, 0], errors="coerce")

    central = df.iloc[:, 2].astype(str)
    out["is_halo"] = central.str.contains("Halo", case=False, na=False).astype(int)
    out["is_partial_halo"] = df.iloc[:, 12].astype(str).str.contains("Partial", case=False, na=False).astype(int)

    def num(col: int):
        return pd.to_numeric(
            df.iloc[:, col].astype(str).str.replace("*", "", regex=False),
            errors="coerce"
        )

    out["width"] = num(3)
    out["speed_linear"] = num(4)
    out["speed_init"] = num(5)
    out["speed_final"] = num(6)
    out["speed_20R"] = num(7)
    out["accel"] = num(8)
    out["mass"] = num(9)
    out["kinetic_energy"] = num(10)

    out = out.dropna(subset=["date"])

    daily = (
        out.groupby("date")
        .agg(
            cme_count=("date", "count"),
            halo_count=("is_halo", "sum"),
            partial_halo_count=("is_partial_halo", "sum"),

            width_max=("width", "max"),
            width_mean=("width", "mean"),

            speed_linear_max=("speed_linear", "max"),
            speed_linear_mean=("speed_linear", "mean"),

            speed_init_max=("speed_init", "max"),
            speed_final_max=("speed_final", "max"),
            speed_20R_max=("speed_20R", "max"),

            accel_max=("accel", "max"),
            accel_min=("accel", "min"),

            mass_sum=("mass", "sum"),
            mass_max=("mass", "max"),

            ke_sum=("kinetic_energy", "sum"),
            ke_max=("kinetic_energy", "max"),
        )
        .reset_index()
    )

    # 예측할 때도 "연도 전체 날짜축"으로 맞춰주는 게 편함
    full_dates = pd.date_range(pd.Timestamp(year, 1, 1), pd.Timestamp(year, 12, 31), freq="D")
    daily = (
        daily.set_index("date")
        .reindex(full_dates)
        .rename_axis("date")
        .reset_index()
        .fillna(0.0)
    )

    return daily


# -----------------------------
# daily CME -> (학습과 동일한 방식) lag/rolling feature 생성 (target 없음)
# -----------------------------
def make_features_only(
    daily_cme: pd.DataFrame,
    base_feature_cols: Optional[List[str]] = None,
    k_lag: int = 3,
) -> Tuple[pd.DataFrame, List[str]]:
    out = daily_cme.sort_values("date").reset_index(drop=True).copy()

    if base_feature_cols is None:
        base_feature_cols = [c for c in out.columns if c != "date"]

    feat_cols: List[str] = []
    for col in base_feature_cols:
        for lag in range(1, k_lag + 1):
            name = f"{col}_lag{lag}"
            out[name] = out[col].shift(lag)
            feat_cols.append(name)

        name = f"{col}_rollsum{k_lag}"
        out[name] = out[col].shift(1).rolling(k_lag).sum()
        feat_cols.append(name)

        name = f"{col}_rollmax{k_lag}"
        out[name] = out[col].shift(1).rolling(k_lag).max()
        feat_cols.append(name)

    feat_df = out[["date"] + feat_cols].dropna().reset_index(drop=True)
    return feat_df, feat_cols


# -----------------------------
# 모델 저장/불러오기
# -----------------------------
def save_two_stage_model(
    path: Union[str, Path],
    clf: RandomForestClassifier,
    reg: RandomForestRegressor,
    feature_cols: List[str],
    *,
    threshold: float,
    k_lag: int,
    use_log_intensity: bool = True,
) -> None:
    payload = {
        "clf": clf,
        "reg": reg,
        "feature_cols": feature_cols,
        "threshold": float(threshold),
        "k_lag": int(k_lag),
        "use_log_intensity": bool(use_log_intensity),
    }
    joblib.dump(payload, Path(path))


def load_two_stage_model(path: Union[str, Path]) -> Dict:
    return joblib.load(Path(path))


# -----------------------------
# 저장된 모델로 예측
#   - p_event: "미래 window max SEP > threshold"일 확률(위험도)
#   - intensity_pred: event일 때 강도 예측(학습과 동일한 log 스케일 변환을 되돌림)
#   - expected_sep = p_event * intensity_pred
# -----------------------------
def predict_sep_from_cme_daily(
    model_bundle: Dict,
    daily_cme: pd.DataFrame,
    *,
    base_feature_cols: Optional[List[str]] = None,
) -> pd.DataFrame:
    clf: RandomForestClassifier = model_bundle["clf"]
    reg: RandomForestRegressor = model_bundle["reg"]

    feature_cols: List[str] = model_bundle["meta"]["feature_cols"]
    k_lag: int = model_bundle["meta"]["k_lag"]
    use_log_intensity: bool = model_bundle["meta"]["use_log_intensity"]

    feat_df, _ = make_features_only(daily_cme, base_feature_cols=base_feature_cols, k_lag=k_lag)

    # 학습 당시 feature_cols 순서/이름과 반드시 일치해야 함
    X = feat_df[feature_cols].values

    proba = clf.predict_proba(X)
    # 어떤 경우(학습 데이터에 class가 1종류만 있는 경우) shape이 (n,1)일 수 있어 방어
    if proba.shape[1] == 2:
        p_event = proba[:, 1]
    else:
        # class가 1개뿐이면 "항상 그 class"라고 보면 됨
        only_class = int(getattr(clf, "classes_", [0])[0])
        p_event = np.ones(len(X)) if only_class == 1 else np.zeros(len(X))

    pred_reg = reg.predict(X)
    if use_log_intensity:
        intensity = np.maximum(0.0, (10 ** pred_reg) - 1.0)
    else:
        intensity = np.maximum(0.0, pred_reg)

    expected_sep = p_event * intensity

    out = pd.DataFrame({
        "date": feat_df["date"].values,
        "p_event": p_event,
        "intensity_pred": intensity,
        "expected_sep": expected_sep,
    })
    return out


# -----------------------------
# (예시) "이런 CME txt 1개"로 바로 테스트
# -----------------------------
if __name__ == "__main__":
    # (A) 학습이 끝난 뒤 저장할 때:
    # save_two_stage_model(
    #     "sep_two_stage.joblib",
    #     clf, reg, feature_cols,
    #     threshold=thr, k_lag=3,
    #     use_log_intensity=True
    # )

    # (B) 저장된 모델 불러오기
    bundle = load_two_stage_model("sep_two_stage_1996_2018.joblib")

    # (C) 지금 받은 CME txt(예: 업로드된 파일) -> daily -> 예측
    cme_txt = "./cme_data/univ2002_04.txt"
    daily = load_cme_txt_to_daily(cme_txt, year=2002)

    pred = predict_sep_from_cme_daily(bundle, daily)

    # 상위 위험일(확률 높은 날) 확인
    print(pred.sort_values("p_event", ascending=False).head(10))

    # 특정 날짜만 보고 싶으면:
    # print(pred[pred["date"].between("2002-03-10", "2002-03-20")])


          date   p_event  intensity_pred  expected_sep
108 2002-04-22  0.758742    2.056767e+06  1.560555e+06
107 2002-04-21  0.633779    1.802825e+06  1.142593e+06
106 2002-04-20  0.628227    2.441486e+06  1.533807e+06
110 2002-04-24  0.621117    3.576100e+06  2.221176e+06
109 2002-04-23  0.575246    3.725235e+06  2.142927e+06
105 2002-04-19  0.350141    3.805157e+06  1.332340e+06
104 2002-04-18  0.191780    3.023989e+06  5.799405e+05
116 2002-04-30  0.100704    2.488224e+06  2.505743e+05
115 2002-04-29  0.073505    2.805780e+06  2.062376e+05
117 2002-05-01  0.068787    5.024932e+06  3.456483e+05


In [22]:
def make_multiple_cme_txts(
    txt_list,
    year
):
    
    # clf, reg, meta = bundle
    # 1) 각 txt → daily
    daily_list = []
    for p in txt_list:
        d = load_cme_txt_to_daily(p, year=year)
        daily_list.append(d)

    # 2) 날짜 기준으로 합치기
    daily_all = (
        pd.concat(daily_list, ignore_index=True)
        .groupby("date", as_index=False)
        .sum()
        .sort_values("date")
        .reset_index(drop=True)
    )

    return daily_all


In [None]:
# from glob import glob

# 저장된 모델 로드
clf, reg, meta = load_model("sep_two_stage_1996_2018.joblib")

bundle = {"clf": clf, "reg": reg, "meta": meta}

year = 2000

txt_list = [f"./cme_data/univ{year}_01.txt",
            f"./cme_data/univ{year}_02.txt",
            f"./cme_data/univ{year}_03.txt",
            f"./cme_data/univ{year}_04.txt",
            f"./cme_data/univ{year}_05.txt",
            f"./cme_data/univ{year}_06.txt",
            f"./cme_data/univ{year}_07.txt",
            f"./cme_data/univ{year}_08.txt",
            f"./cme_data/univ{year}_09.txt",
            f"./cme_data/univ{year}_10.txt",
            f"./cme_data/univ{year}_11.txt",
            f"./cme_data/univ{year}_12.txt"]

data_list = make_multiple_cme_txts(
    txt_list,
    year=year
)

pred = predict_sep_from_cme_daily(bundle, data_list)

print(pred.head())
print(pred.sort_values("p_event", ascending=False).head(10))


        date   p_event  intensity_pred  expected_sep
0 2000-01-04  0.001787    3.175243e+06   5672.771331
1 2000-01-05  0.003502    3.702144e+06  12966.091915
2 2000-01-06  0.000000    4.413691e+06      0.000000
3 2000-01-07  0.005675    6.517853e+06  36987.197899
4 2000-01-08  0.002000    3.116318e+06   6232.636113
          date   p_event  intensity_pred  expected_sep
193 2000-07-15  0.867414    7.660187e+06  6.644554e+06
194 2000-07-16  0.817765    4.411257e+06  3.607371e+06
191 2000-07-13  0.754831    8.360004e+06  6.310386e+06
311 2000-11-10  0.753068    8.102358e+06  6.101623e+06
192 2000-07-14  0.728511    6.803753e+06  4.956611e+06
130 2000-05-13  0.728029    4.383728e+06  3.191481e+06
131 2000-05-14  0.692193    5.799346e+06  4.014264e+06
310 2000-11-09  0.666109    7.968035e+06  5.307582e+06
190 2000-07-12  0.656959    6.909205e+06  4.539064e+06
309 2000-11-08  0.633626    7.016172e+06  4.445628e+06


In [27]:
pred.to_csv("pred_2000.csv", index=False, encoding="utf-8-sig")