In [1]:
import pandas as pd
# Load the weekly potato table and keep only the rows whose item name is '수미'.
weekly_all = pd.read_csv("../data/전처리데이터/감자_filled_weekly_item.csv")
df = weekly_all.loc[weekly_all["품목명"] == "수미"]
df

Unnamed: 0,품목명,연도,주차,주차_일수,주간_평균단가,금액_합,반입량_합
1280,수미,2015,27,5,750.066841,1.608976e+09,2132969.0
1281,수미,2015,28,6,849.298310,1.588329e+09,1880199.0
1282,수미,2015,29,6,849.919122,1.201872e+09,1411665.0
1283,수미,2015,30,6,869.357803,1.340159e+09,1537356.0
1284,수미,2015,31,6,895.237946,1.109608e+09,1247207.0
...,...,...,...,...,...,...,...
1798,수미,2025,23,6,1309.562632,2.423299e+09,1897918.0
1799,수미,2025,24,6,1045.136702,2.097496e+09,2050022.0
1800,수미,2025,25,6,1417.806468,2.191902e+09,1550722.0
1801,수미,2025,26,6,1135.717153,1.693074e+09,1480916.0


In [10]:
# -*- coding: utf-8 -*-
# "한 달 뒤(다음 4주)" 예측을 품목별로 생성하고, 표로 확인/CSV 저장
# - 베이스라인: HW-Mul(가용 시) → HW-Add → SeasonalNaive → RollMean(8w) → WeekMean 우선순위
# - 짧은 시계열/0·음수 존재 시 자동 폴백
# - 노트북에서 재실행하면 최신 데이터로 즉시 갱신됨

import pandas as pd
import numpy as np
import warnings

# (옵션) HW 최적화 경고 감소
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter("ignore", ConvergenceWarning)

from statsmodels.tsa.holtwinters import ExponentialSmoothing
from IPython.display import display

# =========================
# 0) 파라미터
# =========================
PATH = "../data/전처리데이터/감자_filled_weekly_item.csv"  # input CSV path
H = 4                 # forecast horizon: the next 4 weeks (about one month)
PERIOD = 52           # seasonal period length, in weeks
MIN_TRAIN_RATIO = 0.5 # minimum training length as a fraction of the series (50%)
CLIP_MIN = 0          # lower bound applied to forecasts to prevent negatives (KRW/kg)

# =========================
# 1) 로드 & 전처리 (품목명×주차 단위)
# =========================
# Read the weekly table, then coerce whichever amount/volume/price columns are
# present to numeric dtype (cells that cannot be parsed become NaN).
df = pd.read_csv(PATH)

numeric_candidates = ("금액_합", "반입량_합", "주간_평균단가", "금액", "반입량")
for name in (c for c in numeric_candidates if c in df.columns):
    df[name] = pd.to_numeric(df[name], errors="coerce")

def build_week_start(y, w):
    """Return the Monday that starts ISO week ``w`` of ISO year ``y``.

    Parameters
    ----------
    y, w : number-like
        ISO year and ISO week number. Both are truncated with ``int``.

    Returns
    -------
    pandas.Timestamp or NaT
        NaT when either input cannot be converted to an integer (NaN, None, ...)
        or when the resulting year/week string cannot be parsed.
    """
    # int(NaN) / int(None) raise before to_datetime's errors="coerce" can apply,
    # so unusable inputs are mapped to NaT here to honour the coerce intent.
    try:
        year, week = int(y), int(w)
    except (TypeError, ValueError, OverflowError):
        return pd.NaT
    return pd.to_datetime(f"{year}-W{week:02d}-1", format="%G-W%V-%u", errors="coerce")

# Monday timestamp for every (year, ISO week) row.
df["week_start"] = [build_week_start(y, w) for y, w in zip(df["연도"], df["주차"])]

# Target: use the weekly average unit price when the column exists, otherwise
# derive it as total amount / total volume.
if "주간_평균단가" in df.columns:
    df["target"] = df["주간_평균단가"]
else:
    # A zero volume would produce +/-inf, which is not removed by dropna() below
    # and would contaminate the per-week means; map zero to NaN so the row drops.
    denom = df["반입량_합"].replace(0, np.nan)
    df["target"] = df["금액_합"] / denom

# Average per (item, week) so duplicate rows collapse to a single observation.
base = (
    df.dropna(subset=["품목명","week_start","target"])
      .groupby(["품목명","week_start"], as_index=False)["target"].mean()
      .sort_values(["품목명","week_start"])
)

# =========================
# 2) 유틸 & 베이스라인 함수
# =========================
def create_future_weeks(last_week_start, H):
    """Return ``H`` weekly (Monday-anchored) timestamps following ``last_week_start``.

    The range starts seven days after ``last_week_start`` and uses the
    ``W-MON`` frequency.
    """
    one_week = pd.Timedelta(weeks=1)
    first_forecast_day = pd.to_datetime(last_week_start) + one_week
    return pd.date_range(start=first_forecast_day, periods=H, freq="W-MON")

def seasonal_naive(train, horizon=4, period=52):
    """Seasonal-naive forecast from ``train["target"]``.

    With at least ``period`` observations, the last ``period`` values are
    repeated cyclically and the first ``horizon`` of them returned. With fewer
    observations, the last observed value is repeated ``horizon`` times.
    """
    observed = train["target"].values
    if len(observed) < period:
        # Not enough history for one full season: carry the last value forward.
        return np.repeat(observed[-1], horizon)

    latest_cycle = observed[-period:]
    n_cycles = int(np.ceil(horizon / period))
    return np.tile(latest_cycle, n_cycles)[:horizon]

def weekmean_forecast(train, future_week_start):
    """
    Forecast each future week with the training-period mean of the same ISO week.

    Parameters
    ----------
    train : DataFrame with ``week_start`` (datetime-like) and ``target`` columns.
    future_week_start : a single datetime-like value, or a
        Series / Index / ndarray / list of them.

    Returns
    -------
    numpy.ndarray with one value per future week. Weeks whose ISO week number
    never occurs in ``train`` receive the mean of the per-week means.
    """
    # 1) Training span: mean target per ISO week number.
    wk_train = pd.to_datetime(train["week_start"]).dt.isocalendar().week.astype(int)
    wk_mean = train.groupby(wk_train)["target"].mean().to_dict()

    # 2) Normalise the future input to a Series of timestamps.
    if isinstance(future_week_start, (pd.Series, pd.Index, np.ndarray, list)):
        fw = pd.Series(pd.to_datetime(future_week_start))
    else:
        fw = pd.Series([pd.to_datetime(future_week_start)])

    # 3) ISO week of each future date. The former `.dt.week` fallback was dead
    #    code: step 1 already requires `.dt.isocalendar()`, so an environment
    #    lacking it would fail before ever reaching the fallback.
    wk_future = fw.dt.isocalendar().week.astype(int).to_numpy()

    # 4) Look up each week; unseen weeks get the mean of the weekly means.
    default = float(np.mean(list(wk_mean.values()))) if len(wk_mean) else float(train["target"].mean())
    return np.array([wk_mean.get(int(w), default) for w in wk_future])


def rolling_mean_forecast(train, horizon=4, window=8):
    """Flat forecast: the latest ``window``-row rolling mean of ``target``, repeated ``horizon`` times.

    ``min_periods=1`` means series shorter than ``window`` still yield a value.
    """
    smoothed = train["target"].rolling(window, min_periods=1).mean()
    latest_level = smoothed.iloc[-1]
    return np.repeat(latest_level, horizon)

def hw_forecast_safe(train, horizon=4, seasonal="add", period=52):
    """
    Guarded Holt-Winters forecast of ``train["target"]``.

    - The seasonal component is used only when ``seasonal`` is "add"/"mul",
      ``period`` is not None and there are at least ``2*period`` observations;
      otherwise an additive-trend-only model is fitted.
    - A requested multiplicative season is replaced by an additive one when any
      target value is <= 0.
    - If building, fitting or forecasting raises, fall back to ``seasonal_naive``
      when there are at least ``period`` observations, otherwise to
      ``rolling_mean_forecast`` with an 8-row window.
    """
    series = train["target"].astype(float).values
    n_obs = len(series)

    seasonal_kind = None
    if seasonal in ["add", "mul"] and period is not None and n_obs >= 2 * period:
        seasonal_kind = seasonal
        if seasonal_kind == "mul" and (train["target"] <= 0).any():
            seasonal_kind = "add"

    try:
        fitted = ExponentialSmoothing(
            series,
            trend="add",
            seasonal=seasonal_kind,
            seasonal_periods=period if seasonal_kind else None,
            initialization_method="estimated",
        ).fit(optimized=True)
        return fitted.forecast(horizon)
    except Exception:
        # Deliberate best-effort: any failure degrades to a simpler baseline.
        if n_obs < period:
            return rolling_mean_forecast(train, horizon, window=8)
        return seasonal_naive(train, horizon, period)

def choose_and_forecast(train, future_weeks, period=52, horizon=4):
    """
    Compute every baseline forecast and pick one by fixed priority.

    Priority: HW-Mul → HW-Add → SeasonalNaive → RollMean(8w) → WeekMean.
    The first model whose forecast is entirely finite wins.

    Returns
    -------
    (name, yhat, preds): the chosen model name, its forecast clipped below at
    ``CLIP_MIN``, and the dict holding all models' raw forecasts.
    """
    # Key order is kept as-is because callers expand this dict into table columns.
    preds = {
        "WeekMean": weekmean_forecast(train, future_weeks),
        "RollMean(8w)": rolling_mean_forecast(train, horizon, 8),
        "HW-Mul": hw_forecast_safe(train, horizon, "mul", period),
        "HW-Add": hw_forecast_safe(train, horizon, "add", period),
        "SeasonalNaive": seasonal_naive(train, horizon, period),
    }

    priority = ("HW-Mul", "HW-Add", "SeasonalNaive", "RollMean(8w)", "WeekMean")
    for candidate in priority:
        values = np.asarray(preds[candidate], dtype=float)
        if not np.isfinite(values).all():
            continue
        return candidate, values.clip(min=CLIP_MIN), preds

    # No candidate was fully finite: return the rolling mean as a last resort.
    return "RollMean(8w)", preds["RollMean(8w)"].clip(min=CLIP_MIN), preds

# =========================
# 3) 품목별 한 달 뒤 예측 생성
# =========================
rows = []
details = []  # optional: every model's forecast, for side-by-side comparison

for item, wdf in base.groupby("품목명", sort=False):
    wdf = wdf.sort_values("week_start").reset_index(drop=True)
    n = len(wdf)
    if n < max(PERIOD, int(n*MIN_TRAIN_RATIO)) + 1:
        # Series too short: skip this item.
        continue

    last_week = wdf["week_start"].iloc[-1]
    future_weeks = create_future_weeks(last_week, H)

    model_used, yhat, all_preds = choose_and_forecast(wdf, future_weeks, PERIOD, H)
    # Convert each model's forecast to a float array once per item instead of
    # once per forecast week.
    all_preds_float = {k: np.asarray(v, dtype=float) for k, v in all_preds.items()}

    for i, ws in enumerate(future_weeks):
        # Summary row: only the selected model's forecast.
        rows.append({
            "품목명": item,
            "forecast_week_start": ws,
            "model_used": model_used,
            "y_hat": float(yhat[i]),
        })
        # Comparison row: one column per model.
        details.append({
            "품목명": item,
            "forecast_week_start": ws,
            **{k: float(v[i]) for k, v in all_preds_float.items()}
        })

# When every item is skipped the record lists are empty; a column-less frame
# cannot be sorted by name (KeyError), so fall back to an empty DataFrame.
sort_keys = ["품목명","forecast_week_start"]
pred_table = pd.DataFrame(rows).sort_values(sort_keys).reset_index(drop=True) if rows else pd.DataFrame()
pred_details = pd.DataFrame(details).sort_values(sort_keys).reset_index(drop=True) if details else pd.DataFrame()

# =========================
# 4) Inspect / save
# =========================
print("▶ 한 달 뒤(다음 4주) 예측표 (요약 상위 20행):")
display(pred_table.head(20))

print("\n▶ (옵션) 모델별 예측 비교표 (상위 20행):")
display(pred_details.head(20))

# Uncomment to save the tables.
# pred_table.to_csv("./forecast_next_4w_by_item.csv", index=False)
# pred_details.to_csv("./forecast_next_4w_by_item_details.csv", index=False)


▶ 한 달 뒤(다음 4주) 예측표 (요약 상위 20행):


Unnamed: 0,품목명,forecast_week_start,model_used,y_hat
0,감자(수입),2025-03-24,HW-Mul,2413.224241
1,감자(수입),2025-03-31,HW-Mul,2701.053879
2,감자(수입),2025-04-07,HW-Mul,3058.951836
3,감자(수입),2025-04-14,HW-Mul,2059.273473
4,기타,2025-07-07,HW-Mul,971.38537
5,기타,2025-07-14,HW-Mul,1035.481749
6,기타,2025-07-21,HW-Mul,952.264555
7,기타,2025-07-28,HW-Mul,1074.96312
8,대지,2025-06-30,HW-Mul,931.127439
9,대지,2025-07-07,HW-Mul,1013.778022



▶ (옵션) 모델별 예측 비교표 (상위 20행):


Unnamed: 0,품목명,forecast_week_start,WeekMean,RollMean(8w),HW-Mul,HW-Add,SeasonalNaive
0,감자(수입),2025-03-24,1786.233571,1587.874429,2413.224241,2533.334467,2338.731944
1,감자(수입),2025-03-31,1888.818037,1587.874429,2701.053879,2731.10916,2674.081005
2,감자(수입),2025-04-07,1961.152986,1587.874429,3058.951836,2545.548203,3149.4
3,감자(수입),2025-04-14,1950.774911,1587.874429,2059.273473,2017.984563,2138.888889
4,기타,2025-07-07,930.959884,1557.355791,971.38537,950.265351,956.985486
5,기타,2025-07-14,976.551527,1557.355791,1035.481749,993.118864,959.959306
6,기타,2025-07-21,922.182676,1557.355791,952.264555,976.797139,911.365754
7,기타,2025-07-28,1042.139876,1557.355791,1074.96312,1117.129089,1063.303106
8,대지,2025-06-30,773.481544,1225.864332,931.127439,988.965276,1166.299551
9,대지,2025-07-07,532.151343,1225.864332,1013.778022,1206.221052,758.66671


In [11]:
# -*- coding: utf-8 -*-
# 전체 품목별 예측(자동 모델 선택 포함)
# - 품목별로 Rolling 미니 CV(최대 3폴드, H=4주)로 sMAPE 최저 모델 선택
# - 선택 모델로 다음 4주 예측 생성
# - 모든 품목 합본 테이블(pred_all_items)과 품목별 리더보드(leaders_by_item) 산출/저장 옵션
#
# ⚠️ 전제: 아래 변수/함수는 기존 노트북의 셀에서 이미 정의되어 있다고 가정합니다.
#   PATH, H, PERIOD, MIN_TRAIN_RATIO, CLIP_MIN,
#   base (품목명×week_start×target 집계 DataFrame),
#   create_future_weeks, seasonal_naive, weekmean_forecast,
#   rolling_mean_forecast, hw_forecast_safe
#
# 없다면, 바로 위 코드 셀(데이터 로드/함수 정의)부터 먼저 실행해 주세요.

import numpy as np
import pandas as pd
from IPython.display import display

# ---------- 지표 ----------
def rmse(y, yhat):
    """Root mean squared error between actuals ``y`` and forecasts ``yhat`` (array-likes)."""
    y, yhat = np.asarray(y, dtype=float), np.asarray(yhat, dtype=float)
    return float(np.sqrt(np.mean((y - yhat) ** 2)))


def mae(y, yhat):
    """Mean absolute error between actuals ``y`` and forecasts ``yhat`` (array-likes)."""
    y, yhat = np.asarray(y, dtype=float), np.asarray(yhat, dtype=float)
    return float(np.mean(np.abs(y - yhat)))


def mape(y, yhat):
    """Mean absolute percentage error, in percent.

    The denominator is ``|y|`` floored at 1e-9. Flooring the signed value would
    turn every negative actual into a 1e-9 denominator and blow the score up.
    """
    y, yhat = np.asarray(y, dtype=float), np.asarray(yhat, dtype=float)
    return float(np.mean(np.abs(y - yhat) / np.maximum(np.abs(y), 1e-9)) * 100)


def smape(y, yhat):
    """Symmetric MAPE, in percent: mean of 2|y - yhat| / (|y| + |yhat| + 1e-9)."""
    y, yhat = np.asarray(y, dtype=float), np.asarray(yhat, dtype=float)
    return float(np.mean(2 * np.abs(y - yhat) / (np.abs(y) + np.abs(yhat) + 1e-9)) * 100)

# ---------- 단일 폴드 평가: 주어진 train/test에 대해 모든 베이스라인 예측 ----------
def evaluate_fold(train_df, test_df, period=52, horizon=4):
    """Score every baseline on one train/test split.

    Returns
    -------
    (metrics, preds): a DataFrame with one row per model (columns ``model``,
    ``RMSE``, ``MAE``, ``MAPE``, ``sMAPE``) and the dict of raw forecasts.
    """
    preds = {
        "SeasonalNaive": seasonal_naive(train_df, horizon, period),
        "WeekMean": weekmean_forecast(train_df, test_df["week_start"]),
        "RollMean(8w)": rolling_mean_forecast(train_df, horizon, 8),
        "HW-Add": hw_forecast_safe(train_df, horizon, "add", period),
        "HW-Mul": hw_forecast_safe(train_df, horizon, "mul", period),
    }

    actual = test_df["target"].values
    scorers = (("RMSE", rmse), ("MAE", mae), ("MAPE", mape), ("sMAPE", smape))

    records = []
    for label, forecast in preds.items():
        forecast = np.asarray(forecast, dtype=float)
        record = {"model": label}
        for metric_name, scorer in scorers:
            record[metric_name] = scorer(actual, forecast)
        records.append(record)
    return pd.DataFrame(records), preds

# ---------- 미니 CV: 최근 구간을 중심으로 최대 last_k_folds 폴드 ----------
def backtest_item(item_df, period=52, horizon=4, last_k_folds=3):
    """
    Rolling mini-CV over the most recent part of one item's series.

    Candidate split points are ``n - horizon*k`` for ``k = last_k_folds..1``.
    A split is used when it leaves at least
    ``max(period, int(n * MIN_TRAIN_RATIO))`` training rows and a full test
    window. Each fold uses train=item_df[:end], test=item_df[end:end+horizon].

    Returns
    -------
    (leaderboard, preds_by_fold): per-model mean RMSE/MAE/MAPE/sMAPE sorted by
    sMAPE then MAE, and the forecasts keyed by fold end index. Returns
    ``(pd.DataFrame(), {})`` when no fold is usable.
    """
    n_rows = len(item_df)
    min_train = max(period, int(n_rows * MIN_TRAIN_RATIO))

    def _usable(cut):
        return cut >= min_train and cut + horizon <= n_rows

    fold_ends = [
        cut
        for cut in (n_rows - horizon * k for k in range(last_k_folds, 0, -1))
        if _usable(cut)
    ]
    if not fold_ends and _usable(n_rows - horizon):
        # Nothing selected above: still try the single most recent fold.
        fold_ends = [n_rows - horizon]

    per_fold_metrics = []
    preds_by_fold = {}
    for cut in fold_ends:
        fold_train = item_df.iloc[:cut].copy()
        fold_test = item_df.iloc[cut:cut + horizon].copy()
        fold_metrics, fold_preds = evaluate_fold(fold_train, fold_test, period=period, horizon=horizon)
        fold_metrics["fold_end_idx"] = cut
        per_fold_metrics.append(fold_metrics)
        preds_by_fold[cut] = fold_preds

    if not per_fold_metrics:
        return pd.DataFrame(), {}

    # Leaderboard: average each metric over folds, best (lowest sMAPE) first.
    all_metrics = pd.concat(per_fold_metrics, ignore_index=True)
    score_cols = ["RMSE","MAE","MAPE","sMAPE"]
    leaderboard = all_metrics.groupby("model")[score_cols].mean().sort_values(["sMAPE","MAE"])
    return leaderboard, preds_by_fold

# ---------- 선택 모델로 다음 4주 예측 ----------
def forecast_with_selected(train_df, future_weeks, selected_model, period=52, horizon=4):
    """Forecast with the model named ``selected_model`` and return a float ndarray.

    Unrecognised names fall back to ``seasonal_naive``.
    """
    # (name, thunk) pairs; only the matching thunk is ever evaluated.
    builders = (
        ("SeasonalNaive", lambda: seasonal_naive(train_df, horizon, period)),
        ("WeekMean", lambda: weekmean_forecast(train_df, future_weeks)),
        ("RollMean(8w)", lambda: rolling_mean_forecast(train_df, horizon, 8)),
        ("HW-Add", lambda: hw_forecast_safe(train_df, horizon, "add", period)),
        ("HW-Mul", lambda: hw_forecast_safe(train_df, horizon, "mul", period)),
    )
    for label, build in builders:
        if selected_model == label:
            return np.asarray(build(), dtype=float)
    # Safe fallback for unknown model names.
    return np.asarray(seasonal_naive(train_df, horizon, period), dtype=float)

# ---------- 메인 루프: 모든 품목 ----------
pred_rows = []
leader_rows = []
detail_rows = []  # optional: forecasts vs. actuals for the most recent CV fold

for item, wdf in base.groupby("품목명", sort=False):
    wdf = wdf.sort_values("week_start").reset_index(drop=True)
    n = len(wdf)
    MIN_TRAIN = max(PERIOD, int(n * MIN_TRAIN_RATIO))
    if n < MIN_TRAIN + H:
        # Too short for even one train/test split: skip this item.
        continue

    # 1) Per-item mini CV -> leaderboard.
    leaderboard, preds_by_fold = backtest_item(wdf, period=PERIOD, horizon=H, last_k_folds=3)

    if leaderboard.empty:
        # CV was not possible: simple fallback model.
        selected_model = "SeasonalNaive"
    else:
        selected_model = leaderboard.index[0]  # lowest mean sMAPE

    # 2) Forecast the next H weeks with the selected model.
    last_week = wdf["week_start"].iloc[-1]
    future_weeks = create_future_weeks(last_week, H)
    yhat = forecast_with_selected(wdf, future_weeks, selected_model, period=PERIOD, horizon=H)

    # 3) Collect forecast rows (all items in one table).
    for i, ws in enumerate(future_weeks):
        pred_rows.append({
            "품목명": item,
            "forecast_week_start": ws,
            "model_used": selected_model,
            "y_hat": float(np.clip(yhat[i], a_min=CLIP_MIN, a_max=None)),
        })

    # 4) Leaderboard summary: top-3 models with their mean sMAPE / MAE.
    leader_row = {"품목명": item, "selected_model": selected_model}
    if not leaderboard.empty:
        for rank, (m, row) in enumerate(leaderboard.head(3).iterrows(), start=1):
            leader_row[f"rank{rank}_model"] = m
            leader_row[f"rank{rank}_sMAPE"] = float(row["sMAPE"])
            leader_row[f"rank{rank}_MAE"]   = float(row["MAE"])
    leader_rows.append(leader_row)

    # 5) Optional details: keep only the most recent fold.
    if preds_by_fold:
        last_end = max(preds_by_fold.keys())
        test_last = wdf.iloc[last_end:last_end+H].copy()
        for name, arr in preds_by_fold[last_end].items():
            # Convert once per model instead of once per week.
            arr = np.asarray(arr, dtype=float)
            for i, ws in enumerate(test_last["week_start"].values):
                detail_rows.append({
                    "품목명": item,
                    "fold_end_idx": last_end,
                    "week_start": ws,
                    "model": name,
                    "y_true": float(test_last["target"].iloc[i]),
                    "y_hat": float(arr[i]),
                })

# ---------- Combined tables ----------
# When every item is skipped the record lists are empty; a column-less frame
# cannot be sorted by column name (KeyError), so fall back to an empty DataFrame.
pred_all_items   = pd.DataFrame(pred_rows).sort_values(["품목명","forecast_week_start"]).reset_index(drop=True) if pred_rows else pd.DataFrame()
leaders_by_item  = pd.DataFrame(leader_rows).sort_values(["품목명"]).reset_index(drop=True) if leader_rows else pd.DataFrame()
lastfold_details = pd.DataFrame(detail_rows).sort_values(["품목명","week_start","model"]).reset_index(drop=True) if detail_rows else pd.DataFrame()

print("▶ 예측 미리보기(상위 20행)")
display(pred_all_items.head(20))

print("\n▶ 품목별 리더보드 요약(상위 20행)")
display(leaders_by_item.head(20))

print("\n▶ (옵션) 마지막 폴드 상세(상위 20행)")
display(lastfold_details.head(20))

# ---------- Save (uncomment if wanted) ----------
# pred_all_items.to_csv("./forecast_next_4w_ALL_ITEMS.csv", index=False)
# leaders_by_item.to_csv("./model_leaderboard_by_item.csv", index=False)
# lastfold_details.to_csv("./lastfold_details_by_item.csv", index=False)


▶ 예측 미리보기(상위 20행)


Unnamed: 0,품목명,forecast_week_start,model_used,y_hat
0,감자(수입),2025-03-24,WeekMean,1786.233571
1,감자(수입),2025-03-31,WeekMean,1888.818037
2,감자(수입),2025-04-07,WeekMean,1961.152986
3,감자(수입),2025-04-14,WeekMean,1950.774911
4,기타,2025-07-07,HW-Mul,971.38537
5,기타,2025-07-14,HW-Mul,1035.481749
6,기타,2025-07-21,HW-Mul,952.264555
7,기타,2025-07-28,HW-Mul,1074.96312
8,대지,2025-06-30,RollMean(8w),1225.864332
9,대지,2025-07-07,RollMean(8w),1225.864332



▶ 품목별 리더보드 요약(상위 20행)


Unnamed: 0,품목명,selected_model,rank1_model,rank1_sMAPE,rank1_MAE,rank2_model,rank2_sMAPE,rank2_MAE,rank3_model,rank3_sMAPE,rank3_MAE
0,감자(수입),WeekMean,WeekMean,34.621681,369.522169,HW-Mul,57.843685,725.361345,SeasonalNaive,63.928019,765.541347
1,기타,HW-Mul,HW-Mul,14.123483,250.110004,HW-Add,16.885437,279.954085,SeasonalNaive,20.871778,355.83544
2,대지,RollMean(8w),RollMean(8w),33.619376,388.549346,WeekMean,38.771517,468.895825,HW-Mul,50.315112,639.691973
3,두백,RollMean(8w),RollMean(8w),14.696881,177.454597,WeekMean,19.168228,244.611657,HW-Mul,19.761713,231.095608
4,수미,HW-Add,HW-Add,16.310219,276.230605,HW-Mul,16.810284,285.912309,SeasonalNaive,18.815061,321.449351
5,조풍,WeekMean,WeekMean,38.247193,618.270848,RollMean(8w),42.336817,653.69797,HW-Add,44.856388,793.483431
6,추백감자,RollMean(8w),RollMean(8w),40.831605,363.020175,WeekMean,43.5366,390.245061,HW-Add,48.336391,422.348944
7,홍감자,RollMean(8w),RollMean(8w),29.973053,330.776095,SeasonalNaive,34.813737,451.86491,WeekMean,48.457213,759.606536



▶ (옵션) 마지막 폴드 상세(상위 20행)


Unnamed: 0,품목명,fold_end_idx,week_start,model,y_true,y_hat
0,감자(수입),118,2024-05-20,HW-Add,742.804027,1846.08077
1,감자(수입),118,2024-05-20,HW-Mul,742.804027,661.795741
2,감자(수입),118,2024-05-20,RollMean(8w),742.804027,1368.20693
3,감자(수입),118,2024-05-20,SeasonalNaive,742.804027,438.586957
4,감자(수입),118,2024-05-20,WeekMean,742.804027,1355.624207
5,감자(수입),118,2024-05-27,HW-Add,150.0,2964.481484
6,감자(수입),118,2024-05-27,HW-Mul,150.0,2485.958369
7,감자(수입),118,2024-05-27,RollMean(8w),150.0,1368.20693
8,감자(수입),118,2024-05-27,SeasonalNaive,150.0,2306.125
9,감자(수입),118,2024-05-27,WeekMean,150.0,1153.671039


In [10]:
# -*- coding: utf-8 -*-
# 전체 품목 대상: "다음 4주" 예측 + (옵션) 미니 CV로 모델 자동 선택
# - 안전한 Holt-Winters 래퍼 포함 (수렴 실패/음수값/짧은 시계열 폴백)
# - 우선순위: HW-Mul → HW-Add → SeasonalNaive → RollMean(8w) → WeekMean
# - 미니 CV(최대 3폴드, H=4)로 sMAPE 최저 모델을 품목별로 선택 가능

import pandas as pd
import numpy as np
import warnings
import os

# (선택) 노트북에서 표 보기
try:
    from IPython.display import display
except Exception:
    def display(x):
        """Plain-text stand-in used when IPython's display cannot be imported."""
        print(x)

# Holt-Winters 경고 완화
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from statsmodels.tsa.holtwinters import ExponentialSmoothing
warnings.simplefilter("ignore", ConvergenceWarning)

# =========================
# 0) 파라미터
# =========================
PATH = "../도메인전처리/raw/preprocess/merged_all_filled_weekly_item.csv"  # input CSV path
H = 4                   # forecast horizon: the next 4 weeks (about one month)
PERIOD = 52             # seasonal period length, in weeks
MIN_TRAIN_RATIO = 0.5   # minimum training length as a fraction of the series (50%)
CLIP_MIN = 0            # lower bound applied to forecasts to prevent negatives (KRW/kg)
RUN_AUTO_SELECTION = True  # True: pick the model per item by mini CV / False: pick by the fixed priority rule

SAVE_PRED_TABLE = True        # whether to write the combined forecast table
SAVE_DETAILS = True           # whether to write the last-fold detail table (can be large)
SAVE_LEADERBOARD = True       # whether to write the per-item leaderboard

OUT_PRED_PATH = "../도메인전처리/raw/preprocess/forecast_next_4w_ALL_ITEMS.csv"
OUT_DETAILS_PATH = "../도메인전처리/raw/preprocess/forecast_next_4w_by_item_details.csv"
OUT_LEADERBOARD_PATH = "../도메인전처리/raw/preprocess/model_leaderboard_by_item.csv"

# =========================
# 1) 로드 & 전처리 (품목명×주차 단위)
# =========================
# Fail early with an explicit message when the input file is missing.
if os.path.exists(PATH):
    df = pd.read_csv(PATH)
else:
    raise FileNotFoundError(f"데이터 경로를 확인하세요: {PATH}")

# Coerce whichever amount/volume/price columns are present to numeric dtype
# (cells that cannot be parsed become NaN).
numeric_candidates = ("금액_합", "반입량_합", "주간_평균단가", "금액", "반입량")
for name in (c for c in numeric_candidates if c in df.columns):
    df[name] = pd.to_numeric(df[name], errors="coerce")

def build_week_start(y, w):
    """Return the Monday that starts ISO week ``w`` of ISO year ``y``.

    Parameters
    ----------
    y, w : number-like
        ISO year and ISO week number. Both are truncated with ``int``.

    Returns
    -------
    pandas.Timestamp or NaT
        NaT when either input cannot be converted to an integer (NaN, None, ...)
        or when the resulting year/week string cannot be parsed.
    """
    # int(NaN) / int(None) raise before to_datetime's errors="coerce" can apply,
    # so unusable inputs are mapped to NaT here to honour the coerce intent.
    try:
        year, week = int(y), int(w)
    except (TypeError, ValueError, OverflowError):
        return pd.NaT
    return pd.to_datetime(f"{year}-W{week:02d}-1", format="%G-W%V-%u", errors="coerce")

# Monday timestamp for every (year, ISO week) row.
df["week_start"] = list(map(build_week_start, df["연도"], df["주차"]))

# Target: weekly average unit price when the column exists; otherwise derive it
# from total amount / total volume, with zero volumes mapped to NaN.
has_unit_price = "주간_평균단가" in df.columns
if not has_unit_price:
    nonzero_volume = df["반입량_합"].replace(0, np.nan)
    df["target"] = df["금액_합"] / nonzero_volume
else:
    df["target"] = df["주간_평균단가"]

# Average per (item, week) so duplicate rows collapse to a single observation.
key_cols = ["품목명", "week_start"]
complete_rows = df.dropna(subset=key_cols + ["target"])
weekly_mean = complete_rows.groupby(key_cols, as_index=False)["target"].mean()
base = weekly_mean.sort_values(key_cols).reset_index(drop=True)

# =========================
# 2) 유틸 & 베이스라인 함수
# =========================
def create_future_weeks(last_week_start, H):
    """Return ``H`` weekly (Monday-anchored) timestamps following ``last_week_start``.

    The range starts seven days after ``last_week_start`` and uses the
    ``W-MON`` frequency.
    """
    one_week = pd.Timedelta(weeks=1)
    first_forecast_day = pd.to_datetime(last_week_start) + one_week
    return pd.date_range(start=first_forecast_day, periods=H, freq="W-MON")

def seasonal_naive(train, horizon=4, period=52):
    """Seasonal-naive forecast from ``train["target"]`` (cast to float).

    With at least ``period`` observations, the last ``period`` values are
    repeated cyclically and the first ``horizon`` of them returned. With fewer
    observations, the last observed value is repeated ``horizon`` times.
    """
    observed = train["target"].astype(float).values
    if len(observed) < period:
        # Not enough history for one full season: carry the last value forward.
        return np.repeat(observed[-1], horizon)

    latest_cycle = observed[-period:]
    n_cycles = int(np.ceil(horizon / period))
    return np.tile(latest_cycle, n_cycles)[:horizon]

def weekmean_forecast(train, future_week_start):
    """Forecast each future week with the training-period mean of the same ISO week.

    ``future_week_start`` may be a single datetime-like value or a
    Series / Index / ndarray / list of them. Returns a numpy array with one
    value per future week. Weeks whose ISO week number never occurs in
    ``train`` receive the mean of the per-week means.
    """
    # Mean target per ISO week number over the training span.
    wk_train = pd.to_datetime(train["week_start"]).dt.isocalendar().week.astype(int)
    wk_mean = train.groupby(wk_train)["target"].mean().to_dict()

    # Normalise the future input to a Series of timestamps.
    if isinstance(future_week_start, (pd.Series, pd.Index, np.ndarray, list)):
        fw = pd.Series(pd.to_datetime(future_week_start))
    else:
        fw = pd.Series([pd.to_datetime(future_week_start)])

    # The former `.dt.week` fallback was dead code: the training-side call above
    # already requires `.dt.isocalendar()`, so an environment lacking it would
    # fail before ever reaching the fallback.
    wk_future = fw.dt.isocalendar().week.astype(int).to_numpy()

    default = float(np.mean(list(wk_mean.values()))) if len(wk_mean) else float(train["target"].mean())
    return np.array([wk_mean.get(int(w), default) for w in wk_future])

def rolling_mean_forecast(train, horizon=4, window=8):
    """Flat forecast: the latest ``window``-row rolling mean of ``target`` (as a float), repeated ``horizon`` times.

    ``min_periods=1`` means series shorter than ``window`` still yield a value.
    """
    smoothed = train["target"].rolling(window, min_periods=1).mean()
    latest_level = float(smoothed.iloc[-1])
    return np.repeat(latest_level, horizon)

def hw_forecast_safe(train, horizon=4, seasonal="add", period=52):
    """
    Guarded Holt-Winters forecast of ``train["target"]``.

    - The seasonal component is used only when ``seasonal`` is "add"/"mul",
      ``period`` is not None and there are at least ``2*period`` observations;
      otherwise an additive-trend-only model is fitted.
    - A requested multiplicative season is replaced by an additive one when any
      target value is <= 0.
    - If building, fitting or forecasting raises, fall back to ``seasonal_naive``
      when there are at least ``period`` observations, otherwise to
      ``rolling_mean_forecast`` with an 8-row window.
    """
    series = train["target"].astype(float).values
    n_obs = len(series)

    seasonal_kind = None
    if seasonal in ["add", "mul"] and period is not None and n_obs >= 2 * period:
        seasonal_kind = seasonal
        if seasonal_kind == "mul" and (train["target"] <= 0).any():
            seasonal_kind = "add"

    try:
        # If convergence needs help, fit(..., use_brute=True) is an option.
        fitted = ExponentialSmoothing(
            series,
            trend="add",
            seasonal=seasonal_kind,
            seasonal_periods=period if seasonal_kind else None,
            initialization_method="estimated",
        ).fit(optimized=True)
        return fitted.forecast(horizon)
    except Exception:
        # Deliberate best-effort: any failure degrades to a simpler baseline.
        if n_obs < period:
            return rolling_mean_forecast(train, horizon, window=8)
        return seasonal_naive(train, horizon, period)

def choose_and_forecast_by_priority(train, future_weeks, period=52, horizon=4):
    """
    Rule-based single-model choice.

    Priority: HW-Mul → HW-Add → SeasonalNaive → RollMean(8w) → WeekMean.
    Returns ``(name, yhat, preds)``: the first model whose forecast is entirely
    finite, its forecast clipped below at ``CLIP_MIN``, and all raw forecasts.
    """
    # Key order is kept as-is in case callers expand this dict into columns.
    preds = {
        "WeekMean": weekmean_forecast(train, future_weeks),
        "RollMean(8w)": rolling_mean_forecast(train, horizon, 8),
        "HW-Mul": hw_forecast_safe(train, horizon, "mul", period),
        "HW-Add": hw_forecast_safe(train, horizon, "add", period),
        "SeasonalNaive": seasonal_naive(train, horizon, period),
    }

    priority = ("HW-Mul", "HW-Add", "SeasonalNaive", "RollMean(8w)", "WeekMean")
    for candidate in priority:
        values = np.asarray(preds[candidate], dtype=float)
        if not np.isfinite(values).all():
            continue
        return candidate, values.clip(min=CLIP_MIN), preds

    # No candidate was fully finite: return the rolling mean as a last resort.
    last_resort = np.asarray(preds["RollMean(8w)"], dtype=float)
    return "RollMean(8w)", last_resort.clip(min=CLIP_MIN), preds

# =========================
# 3) (옵션) 미니 CV로 모델 자동 선택
# =========================
def rmse(y, yhat):
    """Root mean squared error between actuals ``y`` and forecasts ``yhat`` (array-likes)."""
    y, yhat = np.asarray(y, dtype=float), np.asarray(yhat, dtype=float)
    return float(np.sqrt(np.mean((y - yhat) ** 2)))


def mae(y, yhat):
    """Mean absolute error between actuals ``y`` and forecasts ``yhat`` (array-likes)."""
    y, yhat = np.asarray(y, dtype=float), np.asarray(yhat, dtype=float)
    return float(np.mean(np.abs(y - yhat)))


def mape(y, yhat):
    """Mean absolute percentage error, in percent.

    The denominator is ``|y|`` floored at 1e-9. Flooring the signed value would
    turn every negative actual into a 1e-9 denominator and blow the score up.
    """
    y, yhat = np.asarray(y, dtype=float), np.asarray(yhat, dtype=float)
    return float(np.mean(np.abs(y - yhat) / np.maximum(np.abs(y), 1e-9)) * 100)


def smape(y, yhat):
    """Symmetric MAPE, in percent: mean of 2|y - yhat| / (|y| + |yhat| + 1e-9)."""
    y, yhat = np.asarray(y, dtype=float), np.asarray(yhat, dtype=float)
    return float(np.mean(2 * np.abs(y - yhat) / (np.abs(y) + np.abs(yhat) + 1e-9)) * 100)

def evaluate_fold(train_df, test_df, period=52, horizon=4):
    """Score every baseline on one train/test split.

    Returns
    -------
    (metrics, preds): a DataFrame with one row per model (columns ``model``,
    ``RMSE``, ``MAE``, ``MAPE``, ``sMAPE``) and the dict of raw forecasts.
    """
    preds = {
        "SeasonalNaive": seasonal_naive(train_df, horizon, period),
        "WeekMean": weekmean_forecast(train_df, test_df["week_start"]),
        "RollMean(8w)": rolling_mean_forecast(train_df, horizon, 8),
        "HW-Add": hw_forecast_safe(train_df, horizon, "add", period),
        "HW-Mul": hw_forecast_safe(train_df, horizon, "mul", period),
    }

    actual = test_df["target"].astype(float).values
    scorers = (("RMSE", rmse), ("MAE", mae), ("MAPE", mape), ("sMAPE", smape))

    records = []
    for label, forecast in preds.items():
        forecast = np.asarray(forecast, dtype=float)
        record = {"model": label}
        for metric_name, scorer in scorers:
            record[metric_name] = scorer(actual, forecast)
        records.append(record)
    return pd.DataFrame(records), preds

def backtest_item(item_df, period=52, horizon=4, last_k_folds=3):
    """
    Rolling mini-CV over the most recent part of one item's series.

    Candidate split points are ``n - horizon*k`` for ``k = last_k_folds..1``.
    A split is used when it leaves at least
    ``max(period, int(n * MIN_TRAIN_RATIO))`` training rows and a full test
    window. Each fold uses train=item_df[:end], test=item_df[end:end+horizon].

    Returns ``(leaderboard, preds_by_fold)``: per-model mean
    RMSE/MAE/MAPE/sMAPE sorted by sMAPE then MAE, and the forecasts keyed by
    fold end index. Returns ``(pd.DataFrame(), {})`` when no fold is usable.
    """
    n_rows = len(item_df)
    min_train = max(period, int(n_rows * MIN_TRAIN_RATIO))

    def _usable(cut):
        return cut >= min_train and cut + horizon <= n_rows

    fold_ends = [
        cut
        for cut in (n_rows - horizon * k for k in range(last_k_folds, 0, -1))
        if _usable(cut)
    ]
    if not fold_ends and _usable(n_rows - horizon):
        # Nothing selected above: still try the single most recent fold.
        fold_ends = [n_rows - horizon]

    per_fold_metrics = []
    preds_by_fold = {}
    for cut in fold_ends:
        fold_train = item_df.iloc[:cut].copy()
        fold_test = item_df.iloc[cut:cut + horizon].copy()
        fold_metrics, fold_preds = evaluate_fold(fold_train, fold_test, period=period, horizon=horizon)
        fold_metrics["fold_end_idx"] = cut
        per_fold_metrics.append(fold_metrics)
        preds_by_fold[cut] = fold_preds

    if not per_fold_metrics:
        return pd.DataFrame(), {}

    all_metrics = pd.concat(per_fold_metrics, ignore_index=True)
    score_cols = ["RMSE","MAE","MAPE","sMAPE"]
    leaderboard = all_metrics.groupby("model")[score_cols].mean().sort_values(["sMAPE","MAE"])
    return leaderboard, preds_by_fold

def forecast_with_selected(train_df, future_weeks, selected_model, period=52, horizon=4):
    """Forecast with the model named ``selected_model`` and return a float ndarray.

    Unrecognised names fall back to ``seasonal_naive``.
    """
    # (name, thunk) pairs; only the matching thunk is ever evaluated.
    builders = (
        ("SeasonalNaive", lambda: seasonal_naive(train_df, horizon, period)),
        ("WeekMean", lambda: weekmean_forecast(train_df, future_weeks)),
        ("RollMean(8w)", lambda: rolling_mean_forecast(train_df, horizon, 8)),
        ("HW-Add", lambda: hw_forecast_safe(train_df, horizon, "add", period)),
        ("HW-Mul", lambda: hw_forecast_safe(train_df, horizon, "mul", period)),
    )
    for label, build in builders:
        if selected_model == label:
            return np.asarray(build(), dtype=float)
    return np.asarray(seasonal_naive(train_df, horizon, period), dtype=float)

# =========================
# 4) 메인 루프: 품목별 4주 예측
# =========================
pred_rows = []
leader_rows = []
detail_rows = []

for item, item_frame in base.groupby("품목명", sort=False):
    wdf = item_frame.sort_values("week_start").reset_index(drop=True)
    n = len(wdf)

    # Require MIN_TRAIN training weeks plus one full horizon; otherwise skip.
    MIN_TRAIN = max(PERIOD, int(n * MIN_TRAIN_RATIO))
    if n < MIN_TRAIN + H:
        continue

    last_week = wdf["week_start"].iloc[-1]
    future_weeks = create_future_weeks(last_week, H)

    if not RUN_AUTO_SELECTION:
        # Rule-based choice by fixed priority; all_preds is available for comparison.
        selected_model, yhat, all_preds = choose_and_forecast_by_priority(wdf, future_weeks, PERIOD, H)
    else:
        # 1) Mini CV picks the model with the lowest mean sMAPE.
        leaderboard, preds_by_fold = backtest_item(wdf, period=PERIOD, horizon=H, last_k_folds=3)
        selected_model = "SeasonalNaive" if leaderboard.empty else leaderboard.index[0]
        # 2) Forecast the next H weeks with that model.
        yhat = forecast_with_selected(wdf, future_weeks, selected_model, period=PERIOD, horizon=H)

        # Leaderboard summary: top-3 models and their mean metrics.
        leader_row = {"품목명": item, "selected_model": selected_model}
        if not leaderboard.empty:
            for rank, (m, row) in enumerate(leaderboard.head(3).iterrows(), start=1):
                leader_row.update({
                    f"rank{rank}_model": m,
                    f"rank{rank}_sMAPE": float(row["sMAPE"]),
                    f"rank{rank}_MAE": float(row["MAE"]),
                    f"rank{rank}_RMSE": float(row["RMSE"]),
                    f"rank{rank}_MAPE": float(row["MAPE"]),
                })
        leader_rows.append(leader_row)

        # Optional details: forecasts vs. actuals for the most recent fold only.
        if preds_by_fold:
            last_end = max(preds_by_fold)
            test_last = wdf.iloc[last_end:last_end+H].copy()
            for name, arr in preds_by_fold[last_end].items():
                arr = np.asarray(arr, dtype=float)
                detail_rows.extend(
                    {
                        "품목명": item,
                        "fold_end_idx": last_end,
                        "week_start": ws,
                        "model": name,
                        "y_true": float(test_last["target"].iloc[i]),
                        "y_hat": float(arr[i]),
                    }
                    for i, ws in enumerate(test_last["week_start"].values)
                )

    # 3) Collect forecast rows (all items in one table), clipped below at CLIP_MIN.
    pred_rows.extend(
        {
            "품목명": item,
            "forecast_week_start": ws,
            "model_used": selected_model,
            "y_hat": float(np.clip(yhat[i], a_min=CLIP_MIN, a_max=None)),
        }
        for i, ws in enumerate(future_weeks)
    )

# =========================
# 5) 결과 테이블 & 저장
# =========================
# Build the result tables. Each one falls back to an empty DataFrame when no
# rows were collected: a column-less frame cannot be sorted by column name
# (KeyError), and pred_all_items previously lacked the guard the other two had.
pred_all_items   = pd.DataFrame(pred_rows).sort_values(["품목명","forecast_week_start"]).reset_index(drop=True) if pred_rows else pd.DataFrame()
leaders_by_item  = pd.DataFrame(leader_rows).sort_values(["품목명"]).reset_index(drop=True) if leader_rows else pd.DataFrame()
lastfold_details = pd.DataFrame(detail_rows).sort_values(["품목명","week_start","model"]).reset_index(drop=True) if detail_rows else pd.DataFrame()

print("▶ 예측 미리보기(상위 20행)")
display(pred_all_items.head(20))

if not leaders_by_item.empty:
    print("\n▶ 품목별 리더보드 요약(상위 20행)")
    display(leaders_by_item.head(20))

if not lastfold_details.empty:
    print("\n▶ (옵션) 마지막 폴드 상세(상위 20행)")
    display(lastfold_details.head(20))

# Save each table only when its SAVE_* flag is set and the table has rows.
if SAVE_PRED_TABLE and not pred_all_items.empty:
    pred_all_items.to_csv(OUT_PRED_PATH, index=False)
if SAVE_LEADERBOARD and not leaders_by_item.empty:
    leaders_by_item.to_csv(OUT_LEADERBOARD_PATH, index=False)
if SAVE_DETAILS and not lastfold_details.empty:
    lastfold_details.to_csv(OUT_DETAILS_PATH, index=False)


▶ 예측 미리보기(상위 20행)


Unnamed: 0,품목명,forecast_week_start,model_used,y_hat
0,깐양파,2025-07-07,WeekMean,1298.89997
1,깐양파,2025-07-14,WeekMean,1310.631804
2,깐양파,2025-07-21,WeekMean,1310.340815
3,깐양파,2025-07-28,WeekMean,1322.71154
4,깐쪽파,2025-07-07,WeekMean,7615.520744
5,깐쪽파,2025-07-14,WeekMean,7837.897508
6,깐쪽파,2025-07-21,WeekMean,7257.58434
7,깐쪽파,2025-07-28,WeekMean,8491.627666
8,깻잎(일반),2025-07-07,HW-Mul,6337.012094
9,깻잎(일반),2025-07-14,HW-Mul,7160.607558



▶ 품목별 리더보드 요약(상위 20행)


Unnamed: 0,품목명,selected_model,rank1_model,rank1_sMAPE,rank1_MAE,rank1_RMSE,rank1_MAPE,rank2_model,rank2_sMAPE,rank2_MAE,rank2_RMSE,rank2_MAPE,rank3_model,rank3_sMAPE,rank3_MAE,rank3_RMSE,rank3_MAPE
0,깐양파,WeekMean,WeekMean,16.702287,282.447502,302.354971,14.58946,HW-Add,19.170685,292.575658,311.465924,21.129607,HW-Mul,19.179385,289.186443,308.294064,20.202986
1,깐쪽파,WeekMean,WeekMean,19.343125,1300.816282,1477.376267,20.875939,RollMean(4w),29.152203,1823.182024,2185.417976,23.499662,HW-Mul,31.43947,1993.8993,2466.504265,30.166524
2,깻잎(일반),HW-Mul,HW-Mul,18.12913,1105.370441,1237.39064,19.352589,HW-Add,18.705121,1128.8503,1279.930299,20.406915,RollMean(4w),21.133815,1270.403908,1441.078471,23.47536
3,노랑파프리카,WeekMean,WeekMean,22.898481,702.526226,890.963318,20.026061,SeasonalNaive,28.785808,998.782893,1254.450793,30.891504,HW-Mul,29.639275,1055.800637,1208.066481,35.286069
4,녹광,SeasonalNaive,SeasonalNaive,24.303726,1119.604467,1359.084412,23.934322,RollMean(4w),26.6303,1395.064289,1558.431423,32.106906,WeekMean,26.705216,1200.876111,1466.823883,22.602034
5,뉴그린,HW-Mul,HW-Mul,32.610147,1090.693946,1440.511323,42.796029,SeasonalNaive,34.565095,840.952488,1101.756079,28.268101,HW-Add,35.880809,1215.467602,1575.645225,47.548769
6,느타리버섯(일반),HW-Add,HW-Add,11.923678,616.125969,769.495329,11.896535,RollMean(4w),12.463111,643.11883,780.839427,12.44749,HW-Mul,13.019466,672.203178,839.223036,12.793709
7,단호박,RollMean(4w),RollMean(4w),44.506141,911.238922,1216.674947,51.747314,WeekMean,47.051227,900.181237,1082.443314,54.48514,HW-Add,51.154033,1134.202171,1335.438348,115.31373
8,당근(일반),HW-Add,HW-Add,14.323082,198.092143,228.614916,16.04446,HW-Mul,20.967297,279.597229,299.201924,24.139559,RollMean(4w),24.535277,360.229965,381.75787,29.648923
9,대추방울,HW-Mul,HW-Mul,15.35877,533.626608,618.663323,15.017213,HW-Add,15.695859,566.49187,660.985682,15.950124,WeekMean,17.839448,621.341877,698.161249,16.274258



▶ (옵션) 마지막 폴드 상세(상위 20행)


Unnamed: 0,품목명,fold_end_idx,week_start,model,y_true,y_hat
0,깐양파,519,2025-06-09,HW-Add,1229.754719,1110.991745
1,깐양파,519,2025-06-09,HW-Mul,1229.754719,1092.767565
2,깐양파,519,2025-06-09,RollMean(4w),1229.754719,1284.92478
3,깐양파,519,2025-06-09,SeasonalNaive,1229.754719,1841.600417
4,깐양파,519,2025-06-09,WeekMean,1229.754719,1246.440682
5,깐양파,519,2025-06-16,HW-Add,1213.943858,1089.776248
6,깐양파,519,2025-06-16,HW-Mul,1213.943858,1063.71703
7,깐양파,519,2025-06-16,RollMean(4w),1213.943858,1284.92478
8,깐양파,519,2025-06-16,SeasonalNaive,1213.943858,1777.690416
9,깐양파,519,2025-06-16,WeekMean,1213.943858,1229.333233


In [7]:
# Report the shape of each result table.
for label, frame in (
    ("pred_all_items", pred_all_items),
    ("leaders_by_item", leaders_by_item),
    ("lastfold_details", lastfold_details),
):
    print(f"{label}:", frame.shape)


pred_all_items: (212, 4)
leaders_by_item: (53, 11)
lastfold_details: (1060, 6)


In [8]:
import os
# Show the working directory and the absolute location of each output file.
print("cwd:", os.getcwd())
for label, target in (
    ("OUT_PRED_PATH", OUT_PRED_PATH),
    ("OUT_LEADERBOARD_PATH", OUT_LEADERBOARD_PATH),
    ("OUT_DETAILS_PATH", OUT_DETAILS_PATH),
):
    print(f"{label}:", os.path.abspath(target))


cwd: C:\Users\soomi\Desktop\GDF_Final_G3\soomin
OUT_PRED_PATH: C:\Users\soomi\Desktop\GDF_Final_G3\soomin\forecast_next_4w_ALL_ITEMS.csv
OUT_LEADERBOARD_PATH: C:\Users\soomi\Desktop\GDF_Final_G3\soomin\model_leaderboard_by_item.csv
OUT_DETAILS_PATH: C:\Users\soomi\Desktop\GDF_Final_G3\soomin\forecast_next_4w_by_item_details.csv


In [13]:
# Forecast rows for the item '단호박'.
pred_all_items.loc[pred_all_items["품목명"].eq("단호박")]

Unnamed: 0,품목명,forecast_week_start,model_used,y_hat
28,단호박,2025-07-07,RollMean(4w),1544.444444
29,단호박,2025-07-14,RollMean(4w),1150.0
30,단호박,2025-07-21,RollMean(4w),696.73913
31,단호박,2025-07-28,RollMean(4w),8250.0


In [16]:
# Source rows for the item '단호박'.
df.loc[df["품목명"].eq("단호박")]

Unnamed: 0,품목명,연도,주차,주차_일수,주간_평균단가,금액_합,반입량_합,week_start,target
3661,단호박,2015,27,5,1111.776057,203900000.0,184391.0,2015-06-29,1111.776057
3662,단호박,2015,28,6,1052.955108,430506300.0,408286.0,2015-07-06,1052.955108
3663,단호박,2015,29,6,911.145758,392323100.0,431745.0,2015-07-13,911.145758
3664,단호박,2015,30,6,905.007491,425744800.0,469034.0,2015-07-20,905.007491
3665,단호박,2015,31,6,799.864011,319874600.0,388290.0,2015-07-27,799.864011
...,...,...,...,...,...,...,...,...,...
4126,단호박,2025,23,2,6615.000000,13500000.0,2050.0,2025-06-02,6615.000000
4127,단호박,2025,24,1,2302.197802,1257000.0,546.0,2025-06-09,2302.197802
4128,단호박,2025,25,4,3063.958931,17065500.0,5191.0,2025-06-16,3063.958931
4129,단호박,2025,26,6,2811.590020,38345000.0,14380.0,2025-06-23,2811.590020
