# LightGBM 기반 7일 수요예측 파이프라인 (제출까지)
# - 단일 모델 + horizon(1~7) 멀티태스크
# - 캘린더/시계열 lag & rolling 피처
# - 담하/미라시아 샘플 가중치 반영
# - 0실적 제외 SMAPE


In [None]:
# Mount Google Drive (Colab only); the data paths used by this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:


import os, re, glob, random, warnings, math
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import torch
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb


# Input / output locations on the mounted Drive.
TRAIN_PATH = "/content/drive/MyDrive/lgaimers7기/data/train/train.csv"
TEST_DIR   = "/content/drive/MyDrive/lgaimers7기/data/test"
SAMPLE_PATH= "/content/drive/MyDrive/lgaimers7기/data/sample_submission.csv"
SAVE_PATH  = "/content/drive/MyDrive/lgaimers7기/submission/4_0824_submission.csv"

# Create the output folder up front so the final to_csv does not fail on a missing directory.
os.makedirs(os.path.dirname(SAVE_PATH), exist_ok=True)


def set_seed(seed=42):
    """Seed every random number generator this notebook touches.

    Sets ``PYTHONHASHSEED`` and seeds ``random``, NumPy and PyTorch. When CUDA
    is available the CUDA generators are seeded as well and cuDNN is switched
    to deterministic mode with benchmarking disabled.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)

In [None]:
# Load the training table, the sorted list of per-window test files, and the submission template.
train = pd.read_csv(TRAIN_PATH)
test_files = sorted(glob.glob(os.path.join(TEST_DIR, "TEST_*.csv")))
sample_submission = pd.read_csv(SAMPLE_PATH)

# Quick sanity output: table shapes and the number of test files found.
print(f"📌 Train shape: {train.shape}")
print(f"📌 Sample shape: {sample_submission.shape}")
print(f"📌 Test 파일 개수: {len(test_files)}")

유틸함수 & 피처 추가

In [None]:
import ast, unicodedata

def normalize_key(s):
    """Return a canonical string form of a store/menu key.

    Steps:
      1. A tuple is unwrapped to its first element. A string that looks like a
         stringified tuple (``"('a_b',)"``) is parsed and unwrapped the same
         way; any other parenthesised text (e.g. ``"(주)식당_메뉴(대)"``) is
         left untouched.
      2. The value is converted to ``str``, ideographic spaces become ASCII
         spaces, and the text is NFKC-normalised (full-width -> half-width).
      3. Runs of whitespace collapse to one space and the ends are stripped.

    Args:
        s: Key value (str, tuple, or anything ``str()`` accepts).

    Returns:
        The normalised key as ``str``.
    """
    if isinstance(s, tuple):
        s = s[0]
    elif isinstance(s, str) and s.startswith("(") and s.endswith(")"):
        try:
            parsed = ast.literal_eval(s)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: an ordinary name that happens to be
            # wrapped in parentheses. Keep it as is.
            parsed = None
        # Only unwrap real, non-empty tuples; indexing a parsed plain string
        # would wrongly return its first character.
        if isinstance(parsed, tuple) and parsed:
            s = parsed[0]
    s = str(s)
    s = s.replace('\u3000', ' ')  # ideographic (full-width) space -> ASCII space
    s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

def ensure_store_menu(df):
    """Standardise the date and key columns of a raw train/test frame.

    The frame is modified in place and also returned. After the call it has a
    normalised combined key ``영업장명_메뉴명`` plus the split columns ``업장명``
    (store) and ``메뉴명`` (menu), whichever of the two input layouts was
    provided.

    Args:
        df: Frame containing either ``영업장명_메뉴명`` or both ``영업장명`` and
            ``메뉴명``, together with ``매출수량``. ``영업일자`` is parsed to
            datetime when present (unparseable values become NaT).

    Returns:
        The same ``df`` object with the standardised columns.

    Raises:
        ValueError: If the key columns or the ``매출수량`` column are missing.
    """
    cols = df.columns

    # Standardise the date column.
    if '영업일자' in cols:
        df['영업일자'] = pd.to_datetime(df['영업일자'], errors='coerce')

    if '영업장명_메뉴명' in cols:
        # Combined key already present: normalise it and split on the LAST
        # underscore so store names containing '_' stay intact.
        df['영업장명_메뉴명'] = df['영업장명_메뉴명'].apply(normalize_key)
        parts = df['영업장명_메뉴명'].str.rsplit('_', n=1, expand=True)
        if parts.shape[1] == 2:
            df['업장명'] = parts[0]
            # Rows without an underscore get a blank menu instead of None,
            # matching the all-rows fallback below.
            df['메뉴명'] = parts[1].fillna('')
        else:
            # No row could be split: whole key is the store, menu is blank.
            df['업장명'] = df['영업장명_메뉴명']
            df['메뉴명'] = ''
    elif {'영업장명', '메뉴명'}.issubset(cols):
        # Separate columns: normalise each and build the combined key.
        df['영업장명'] = df['영업장명'].apply(normalize_key)
        df['메뉴명'] = df['메뉴명'].apply(normalize_key)
        df['영업장명_메뉴명'] = (df['영업장명'] + '_' + df['메뉴명']).apply(normalize_key)
        # Downstream code reads '업장명'; expose it in this layout as well.
        df['업장명'] = df['영업장명']
    else:
        raise ValueError("필수 키 컬럼이 없습니다. (영업장명_메뉴명 또는 영업장명/메뉴명)")

    # The target column must exist under its canonical name.
    if '매출수량' not in cols:
        raise ValueError("train/test에 '매출수량' 컬럼이 필요합니다.")

    return df

# Normalise the training frame's date/key columns before any feature engineering.
train = ensure_store_menu(train)


 def add_calendar_features(df):
     """Return a sorted copy of *df* with calendar columns derived from ``영업일자``.

     Rows are ordered by (``영업장명_메뉴명``, ``영업일자``). Added columns:
     ``dow`` (0 = Monday), ``is_weekend``, ``week`` (ISO week), ``month``,
     ``quarter``, ``day`` and the 7-day cyclic pair ``dow_sin`` / ``dow_cos``.
     The input frame is not modified.
     """
     out = df.sort_values(['영업장명_메뉴명', '영업일자']).copy()
     dates = out['영업일자'].dt

     out['dow'] = dates.weekday.astype(np.int16)
     out['is_weekend'] = out['dow'].isin([5, 6]).astype(np.int8)
     out['week'] = dates.isocalendar().week.astype(np.int16)
     for column, values in (('month', dates.month),
                            ('quarter', dates.quarter),
                            ('day', dates.day)):
         out[column] = values.astype(np.int8)

     # Cyclic encoding of the weekday (period of 7 days).
     angle = 2*np.pi*out['dow']/7
     out['dow_sin'] = np.sin(angle).astype(np.float32)
     out['dow_cos'] = np.cos(angle).astype(np.float32)
     return out

def add_ts_features(df, target='매출수량', group='영업장명_메뉴명'):
    """Return a sorted copy of *df* with per-series lag / rolling features.

    All features are computed independently inside each ``group`` after the
    rows are ordered by (``group``, ``영업일자``):

    * ``lag_1``, ``lag_7``, ``lag_14``: target shifted by that many rows.
    * ``rmean_3``, ``rmean_7``, ``rmean_14``, ``rstd_7``: rolling statistics
      of the target shifted by one row, so a window never contains the row's
      own value. ``min_periods`` is 2 / 3 / 5 / 3 respectively; shorter
      histories yield NaN.
    * ``diff_1``: first difference of the target (current row minus previous).

    The work is done with group-wise ``shift`` / ``transform`` rather than
    ``groupby.apply`` on the whole frame, so the result does not depend on
    whether pandas hands the grouping column to the applied function.

    Args:
        df: Frame with ``영업일자``, ``group`` and ``target`` columns.
        target: Name of the numeric column to build features from.
        group: Name of the series-identifier column.

    Returns:
        A new frame (input untouched) with the eight feature columns appended.
        Rows whose ``group`` value is NaN are kept and get NaN features.
    """
    df = df.sort_values([group, '영업일자']).copy()
    per_group = df.groupby(group, sort=False)[target]

    for lag in (1, 7, 14):
        df[f'lag_{lag}'] = per_group.shift(lag)

    # Shift first, then roll within each series: windows only see past values.
    previous = per_group.shift(1)
    previous_per_group = previous.groupby(df[group].to_numpy(), sort=False)

    def _rolling(window, min_periods, stat):
        # Rolling statistic `stat` over the previous-values series, per group.
        return previous_per_group.transform(
            lambda s: getattr(s.rolling(window, min_periods=min_periods), stat)()
        )

    df['rmean_3'] = _rolling(3, 2, 'mean')
    df['rmean_7'] = _rolling(7, 3, 'mean')
    df['rmean_14'] = _rolling(14, 5, 'mean')
    df['rstd_7'] = _rolling(7, 3, 'std')

    # First difference inside each series.
    df['diff_1'] = per_group.diff(1)
    return df

# Build calendar features first, then the per-series lag / rolling features.
train = add_calendar_features(train)
train = add_ts_features(train)

인코딩 및 특정 업장 가중치

In [None]:
# Forecast horizons (days ahead) and the key / target column names.
HORIZONS = [1,2,3,4,5,6,7]
GROUP_COL = '영업장명_메뉴명'
TARGET = '매출수량'

# One target column per horizon: the series value h rows later within each group.
for h in HORIZONS:
    train[f'target_h{h}'] = train.groupby(GROUP_COL)[TARGET].shift(-h)

# Calendar + lag/rolling features produced by the feature functions above.
base_feats = [
    'dow','is_weekend','week','month','quarter','day','dow_sin','dow_cos',
    'lag_1','lag_7','lag_14','rmean_3','rmean_7','rmean_14','rstd_7','diff_1'
]

from sklearn.preprocessing import LabelEncoder
le_store = LabelEncoder()
le_menu  = LabelEncoder()

# NOTE(review): astype(str) runs before fillna('NA'), so a missing value is
# already the string 'nan'/'None' and the fillna calls below are no-ops. Left
# unchanged because make_test_features encodes test names with the same
# astype(str) convention.
train['업장명'] = train['업장명'].astype(str)
train['메뉴명'] = train['메뉴명'].astype(str)

le_store.fit(train['업장명'].fillna('NA'))
le_menu.fit(train['메뉴명'].fillna('NA'))

train['업장_code'] = le_store.transform(train['업장명'].fillna('NA')).astype(np.int32)
train['메뉴_code'] = le_menu.transform(train['메뉴명'].fillna('NA')).astype(np.int32)

feat_cols = base_feats + ['업장_code','메뉴_code']

# Stack the data in long form: one copy of the rows per horizon, with the
# horizon itself as feature 'h' and the matching shifted value as 'target'.
stack_list = []
keep_cols = ['영업일자'] + feat_cols + ['업장명','메뉴명', GROUP_COL]

for h in HORIZONS:
    tmp = train.dropna(subset=[f'target_h{h}']).copy()
    tmp['h'] = h
    tmp['target'] = tmp[f'target_h{h}']
    stack_list.append(tmp[keep_cols + ['h','target']])

train_stack = pd.concat(stack_list, axis=0, ignore_index=True)
train_stack = train_stack.dropna(subset=feat_cols)  # drop rows whose lag/rolling features are missing

 def norm_name(s):
     """Return *s* converted to ``str`` and passed through ``normalize_key``."""
     as_text = str(s)
     return normalize_key(as_text)

# Stores that receive extra sample weight and their own post-hoc scale factor.
# The names are normalised with normalize_key, like the data keys, so the
# substring checks against store names compare like with like.
FOCUS_STORES = ['담하', '미라시아']
FOCUS_STORES_NORM = [normalize_key(s) for s in FOCUS_STORES]

def make_weight(row):
    """Return the sample weight for one stacked training row.

    The base weight is 1.0. It is doubled when the normalised store name
    contains any focus-store name, and halved when the row's target is 0.

    Args:
        row: Mapping-like row exposing ``'업장명'`` and ``'target'``.

    Returns:
        The weight as a float (one of 0.5, 1.0 or 2.0).
    """
    store = norm_name(row['업장명'])
    is_focus = any(fs in store for fs in FOCUS_STORES_NORM)
    weight = 2.0 if is_focus else 1.0
    if row['target'] == 0:
        weight *= 0.5
    return weight

# Per-row sample weights (row-wise application of make_weight).
train_stack['weight'] = train_stack.apply(make_weight, axis=1).astype(np.float32)

# Time-based hold-out: rows whose base date is at or after the 0.9 quantile of
# all base dates form the validation set.
# NOTE(review): the split uses the base date only; a training row dated just
# before the cutoff still has a target up to 7 days later, so training targets
# can fall inside the validation period. Confirm this is acceptable.
train_stack['영업일자'] = pd.to_datetime(train_stack['영업일자'], errors='coerce')
cutoff_date = train_stack['영업일자'].dropna().quantile(0.9)
train_stack['is_val'] = train_stack['영업일자'] >= cutoff_date

# Model matrices: features plus the horizon 'h', target and weights.
X = train_stack[feat_cols + ['h']].astype(np.float32)
y = train_stack['target'].astype(np.float32)
w = train_stack['weight'].astype(np.float32).values

X_train = X[~train_stack['is_val']]; y_train = y[~train_stack['is_val']]; w_train = w[~train_stack['is_val']]
X_valid = X[ train_stack['is_val']]; y_valid = y[ train_stack['is_val']]
print("🧱 Train/Valid shapes:", X_train.shape, X_valid.shape)
print("📅 cutoff_date:", cutoff_date)

#lgbm + blending

In [None]:
def smape_ignore_zero(y_true, y_pred, eps=1e-9):
    """Compute SMAPE over the entries whose true value is non-zero.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predictions, same length as ``y_true``.
        eps: Small constant added to the denominator to avoid division by zero.

    Returns:
        Mean of ``2 * |pred - true| / (|true| + |pred| + eps)`` over the
        non-zero-truth entries, or ``np.nan`` when every true value is zero.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    keep = actual != 0
    if not keep.sum():
        return np.nan

    actual_nz = actual[keep]
    predicted_nz = predicted[keep]
    numerator = 2.0 * np.abs(predicted_nz - actual_nz)
    denominator = np.abs(actual_nz) + np.abs(predicted_nz) + eps
    return np.mean(numerator / denominator)


# Model 1: direct regressor over all stacked rows (horizon 'h' is a feature),
# fit with sample weights and early-stopped on the validation L1 metric.
lgbm = lgb.LGBMRegressor(
    n_estimators=5000, learning_rate=0.03,
    num_leaves=64, max_depth=-1,
    subsample=0.9, colsample_bytree=0.8,
    min_child_samples=40, reg_alpha=1.0, reg_lambda=2.0,
    n_jobs=-1, random_state=42
)
lgbm.fit(
    X_train, y_train,
    sample_weight=w_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='l1',
    callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)]
)
# Predictions are clipped at zero before scoring.
valid_pred = lgbm.predict(X_valid, num_iteration=lgbm.best_iteration_)
valid_pred = np.clip(valid_pred, 0, None)
print(f"🔎 LGBM valid SMAPE(0제외): {smape_ignore_zero(y_valid.values, valid_pred):.6f}")

# Model 2 (two-part "ZI" model): a classifier for target > 0 and a regressor
# fit only on rows with positive targets.
train_stack['nonzero'] = (train_stack['target'] > 0).astype(np.int8)
X_all = train_stack[feat_cols + ['h']].astype(np.float32)

# Part A: classifier for the 'nonzero' flag.
clf = lgb.LGBMClassifier(
    n_estimators=3000, learning_rate=0.03,
    num_leaves=63, subsample=0.9, colsample_bytree=0.8,
    min_child_samples=40, reg_alpha=0.5, reg_lambda=1.0, random_state=42
)
clf.fit(
    X_all[~train_stack['is_val']], train_stack['nonzero'][~train_stack['is_val']],
    sample_weight=train_stack.loc[~train_stack['is_val'], 'weight'],
    eval_set=[(X_all[train_stack['is_val']], train_stack['nonzero'][train_stack['is_val']])],
    callbacks=[lgb.early_stopping(200, verbose=False)]
)

# Part B: regressor on log1p(target) for positive-target rows only.
# NOTE(review): a Tweedie objective is combined with a log1p-transformed
# target here; confirm this combination is intended.
reg = lgb.LGBMRegressor(
    n_estimators=5000, learning_rate=0.03,
    num_leaves=127, subsample=0.9, colsample_bytree=0.8,
    min_child_samples=30, reg_alpha=1.0, reg_lambda=2.0,
    objective='tweedie', tweedie_variance_power=1.2, random_state=42
)
nz = train_stack['nonzero'] == 1
y_reg = np.log1p(train_stack.loc[nz, 'target'])
reg.fit(
    X_all[nz & (~train_stack['is_val'])], y_reg[nz & (~train_stack['is_val'])],
    sample_weight=train_stack.loc[nz & (~train_stack['is_val']), 'weight'],
    eval_set=[(X_all[nz & (train_stack['is_val'])], y_reg[nz & (train_stack['is_val'])])],
    eval_metric='l1', callbacks=[lgb.early_stopping(300, verbose=False)]
)

# ZI validation prediction: class-1 probability times the back-transformed
# (expm1) regressor output, clipped at zero.
p_val  = clf.predict_proba(X_valid)[:, 1]
mu_val = np.expm1(reg.predict(X_valid))
zi_val = np.clip(p_val * mu_val, 0, None)

# Day-of-week naive baseline: mean sales per (store_menu, weekday), computed
# from pre-cutoff rows only so the validation period does not leak into it.
train_nonval = train[train['영업일자'] < cutoff_date].copy()
train_nonval['dow'] = train_nonval['영업일자'].dt.weekday
grp_dow = train_nonval.groupby(['영업장명_메뉴명','dow'])['매출수량'].mean()


# A stacked row's target lies h days AFTER its base date, so the baseline must
# be looked up with the weekday of (base date + h), not of the base date. This
# matches the test-time lookup, which uses (last_dow + h) % 7.
valid_meta = train_stack.loc[train_stack['is_val'], [GROUP_COL, '영업일자', 'h']].copy()
base_dow = valid_meta['영업일자'].dt.weekday
# Keep the weekday dtype identical to grp_dow's index level for the lookup.
valid_meta['dow'] = ((base_dow + valid_meta['h']) % 7).astype(base_dow.dtype)
naive_val = valid_meta.set_index([GROUP_COL, 'dow']).index.map(grp_dow).to_series(index=valid_meta.index).values
# (store_menu, weekday) pairs unseen before the cutoff fall back to the mean LGBM validation prediction.
naive_val = np.where(np.isnan(naive_val), valid_pred.mean(), naive_val)



def optimize_blend_weights(y_true, preds, step=0.02):
    """Grid-search convex blend weights for three prediction vectors.

    Every pair ``(w1, w2)`` on a grid of spacing ``step`` with
    ``w1 + w2 <= 1`` is tried, with ``w3 = 1 - w1 - w2``. Each blend is clipped
    at zero and scored with ``smape_ignore_zero``. The search starts from the
    first model alone, so the result is never worse than that baseline.

    Args:
        y_true: Array of actual values.
        preds: Tuple ``(lgbm_pred, zi_pred, naive_pred)`` of arrays aligned
            with ``y_true``.
        step: Grid spacing for ``w1`` and ``w2``.

    Returns:
        ``((w1, w2, w3), score)`` for the best blend found.
    """
    lgbm_v, zi_v, nv = preds
    grid = np.arange(0.0, 1.0 + 1e-9, step)
    # Grid values carry float round-off (e.g. 0.3 + 0.7 can exceed 1 by ~1e-16).
    # Without a tolerance, blends on the w3 == 0 boundary would be skipped.
    tol = 1e-9
    best_w, best = (1.0, 0.0, 0.0), smape_ignore_zero(y_true, lgbm_v)
    for w1 in grid:
        for w2 in grid:
            w3 = 1.0 - w1 - w2
            if w3 < -tol:
                continue
            w3 = max(w3, 0.0)  # snap round-off negatives to exactly zero
            blend = w1*lgbm_v + w2*zi_v + w3*nv
            score = smape_ignore_zero(y_true, np.clip(blend, 0, None))
            if not np.isnan(score) and score < best:
                best, best_w = score, (w1, w2, w3)
    return best_w, best

# Grid-search the three blend weights on the validation rows and report them.
best_w, best_score = optimize_blend_weights(y_valid.values, (valid_pred, zi_val, naive_val), step=0.02)
w1, w2, w3 = best_w
print(f"🏁 Best weights: LGBM={w1:.2f} ZI={w2:.2f} Naive={w3:.2f} | SMAPE={best_score:.6f}")


# ------------------------------------------
# Search a scale factor alpha for the focus stores only (담하 / 미라시아)
# ------------------------------------------
# Boolean mask over validation rows: True where the normalised store name
# contains one of the focus-store names.
valid_store = train_stack.loc[train_stack['is_val'], '업장명'].map(norm_name).values
focus_mask = np.array([any(fs in st for fs in FOCUS_STORES_NORM) for st in valid_store])


# Blended validation prediction before any focus-store scaling.
base_blend = np.clip(w1*valid_pred + w2*zi_val + w3*naive_val, 0, None)

def search_focus_alpha(y_true, blend, mask, alphas=np.round(np.arange(0.70, 1.31, 0.02),2)):
    """Find the multiplier for the masked predictions that minimises SMAPE.

    Each candidate ``a`` in ``alphas`` scales only the entries selected by
    ``mask`` (clipped at zero); the rest of ``blend`` is left untouched. The
    unscaled blend (alpha 1.00) is the baseline a candidate must strictly
    beat.

    Args:
        y_true: Array of actual values.
        blend: Array of blended predictions aligned with ``y_true``.
        mask: Boolean array selecting the entries to scale.
        alphas: Candidate multipliers.

    Returns:
        ``(best_alpha, best_score)``.
    """
    best_a = 1.00
    best = smape_ignore_zero(y_true, blend)
    for a in alphas:
        candidate = blend.copy()
        candidate[mask] = np.clip(a * candidate[mask], 0, None)
        candidate_score = smape_ignore_zero(y_true, candidate)
        if candidate_score < best:
            best_a, best = a, candidate_score
    return best_a, best

# Pick the focus-store multiplier on the validation blend and report it.
alpha, best_blend = search_focus_alpha(y_valid.values, base_blend, focus_mask)
print(f"🎯 Best focus alpha: {alpha:.2f} | SMAPE={best_blend:.6f}")

# Final validation score: blend with the focus-store rows scaled by alpha.
final_valid = base_blend.copy()
final_valid[focus_mask] = np.clip(alpha*final_valid[focus_mask], 0, None)
print(f"🌀 Final blended valid SMAPE(0제외): {smape_ignore_zero(y_valid.values, final_valid):.6f}")

# Module-level parameters read later by the test-time prediction code.
BLEND_WEIGHTS = (w1, w2, w3)
FOCUS_ALPHA = alpha

##test

In [None]:
def make_test_features(test_df):
    """Build model features for one raw test frame.

    Runs the same key normalisation, calendar features and lag/rolling
    features as training, then encodes store and menu names with the label
    encoders fitted on train. A name not seen in training gets the code
    ``len(encoder.classes_)``, one past the largest training code.

    Args:
        test_df: Raw test frame as read from a TEST_xx.csv file.

    Returns:
        The feature frame, including ``업장_code`` and ``메뉴_code``.
    """
    def safe_transform(le, series):
        # Map known classes to their index; unseen names get the next free code.
        lookup = dict(zip(le.classes_, range(len(le.classes_))))
        unseen_code = len(le.classes_)
        encoded = series.map(lookup).astype('Int64')  # pandas nullable int
        return encoded.fillna(unseen_code).astype(np.int32)

    test_df = add_ts_features(add_calendar_features(ensure_store_menu(test_df)))

    for name_col in ('업장명', '메뉴명'):
        test_df[name_col] = test_df[name_col].astype(str)
    for name_col, code_col, encoder in (('업장명', '업장_code', le_store),
                                        ('메뉴명', '메뉴_code', le_menu)):
        test_df[code_col] = safe_transform(encoder, test_df[name_col])
    return test_df

# The (store_menu, weekday) means come from train only, so the `grp_dow` table
# built for the validation baseline is reused here.
def predict_test_file(path):
    """Predict all horizons for every store_menu series in one test file.

    The features of each series' last observed row are fed to the three
    predictors (direct LGBM, two-part ZI model, weekday-naive mean) once per
    horizon in ``HORIZONS``. The outputs are blended with ``BLEND_WEIGHTS``,
    focus-store rows are scaled by ``FOCUS_ALPHA``, and the result is clipped
    at zero.

    Args:
        path: Path to a test CSV whose file name contains ``TEST_<number>``.

    Returns:
        Long-form frame with columns ``영업장명_메뉴명``, ``영업일자`` (label
        ``"TEST_xx+h일"``) and ``매출수량``. Series whose last row has missing
        features are omitted; if none remain an empty frame is returned.

    Raises:
        ValueError: If no ``TEST_<number>`` prefix can be parsed from the file
            name (previously an opaque AttributeError on ``None``).
    """
    filename = os.path.basename(path)
    match = re.search(r'(TEST_\d+)', filename)
    if match is None:
        raise ValueError(f"Cannot find a 'TEST_<number>' prefix in file name: {filename!r}")
    prefix = match.group(1)

    test_df = make_test_features(pd.read_csv(path))

    # Weekday (0-6) of each series' last observed date; horizon h then falls
    # on weekday (last_dow + h) % 7.
    last_date = test_df.groupby('영업장명_메뉴명')['영업일자'].max()
    last_dow = last_date.dt.weekday

    # Features of the last observed row of every series.
    test_df = test_df.sort_values(['영업장명_메뉴명','영업일자'])
    last_idx = test_df.groupby('영업장명_메뉴명')['영업일자'].idxmax()
    feats_last = test_df.loc[last_idx, ['영업장명_메뉴명','업장명','메뉴명'] + feat_cols].copy()
    feats_last = feats_last.dropna(subset=feat_cols)
    if len(feats_last) == 0:
        return pd.DataFrame(columns=['영업일자','영업장명_메뉴명','매출수량'])

    # Everything below is the same for all horizons, so compute it once per
    # file rather than once per horizon.
    keys = feats_last['영업장명_메뉴명'].values
    base_dow = last_dow.loc[keys].values
    stores = feats_last['업장명'].map(norm_name).values
    focus_mask = np.array([any(fs in st for fs in FOCUS_STORES_NORM) for st in stores])
    w1, w2, w3 = BLEND_WEIGHTS

    # Fallback for (store_menu, weekday) pairs missing from grp_dow: the
    # series' overall train mean, else the mean of all series means.
    menu_mean = train.groupby('영업장명_메뉴명')['매출수량'].mean()
    fallback = pd.Series(keys).map(menu_mean).to_numpy()
    fallback = np.where(pd.isna(fallback), float(menu_mean.mean()), fallback)

    def infer_one_h(h):
        # Predict horizon `h` for all retained series.
        tmp = feats_last.copy()
        tmp['h'] = h
        Xh = tmp[feat_cols + ['h']].astype(np.float32)

        # Direct LGBM regressor.
        lgbh = np.clip(lgbm.predict(Xh, num_iteration=lgbm.best_iteration_), 0, None)
        # Two-part ZI model: P(target > 0) times the back-transformed regressor output.
        ph = clf.predict_proba(Xh)[:, 1]
        muh = np.expm1(reg.predict(Xh))
        zih = np.clip(ph * muh, 0, None)

        # Weekday-naive baseline for the weekday the horizon lands on.
        dow_h = (base_dow + h) % 7
        idx = pd.MultiIndex.from_arrays([keys, dow_h])
        naive_vals = grp_dow.reindex(idx).to_numpy()
        naive_vals = np.where(pd.isna(naive_vals), fallback, naive_vals)

        # Blend with the validation-tuned weights, then apply the focus-store scale.
        blend = w1*lgbh + w2*zih + w3*naive_vals
        blend[focus_mask] = FOCUS_ALPHA * blend[focus_mask]
        blend = np.clip(blend, 0, None)

        out = tmp[['영업장명_메뉴명']].copy()
        out['영업일자'] = f"{prefix}+{h}일"
        out['매출수량'] = blend
        return out

    outs = [infer_one_h(h) for h in HORIZONS]
    return pd.concat(outs, ignore_index=True)

# Predict every test file and stack the long-form results into one frame.
all_preds = [predict_test_file(test_path) for test_path in test_files]
pred_full = pd.concat(all_preds, ignore_index=True)


##submission

In [None]:
# -----------------------------
# Convert predictions to the submission layout
# -----------------------------
def convert_to_submission(pred_df, sample_df):
    """Fill the wide submission template with long-form predictions.

    Each template row is identified by its ``영업일자`` label and each other
    column by a store_menu key. Keys on both sides go through
    ``normalize_key`` before matching. Cells with no prediction are set to 0.
    When a (date, key) pair occurs more than once in ``pred_df``, the last
    occurrence wins.

    Columns are written whole instead of cell by cell. This avoids assigning
    floats into integer template cells one at a time (a deprecated dtype
    upcast in recent pandas) and does not require a 0..n-1 index.

    Args:
        pred_df: Frame with ``영업일자``, ``영업장명_메뉴명`` and ``매출수량``.
        sample_df: Submission template with ``영업일자`` plus one column per key.

    Returns:
        A filled copy of ``sample_df``; neither input is modified.
    """
    out = sample_df.copy()

    # Lookup table: (date label, normalised key) -> predicted quantity.
    pred_dates = pred_df['영업일자'].astype(str)
    pred_keys = pred_df['영업장명_메뉴명'].apply(normalize_key)
    pred_dict = dict(zip(zip(pred_dates, pred_keys), pred_df['매출수량']))

    row_dates = out['영업일자'].astype(str).tolist()
    # Select value columns by name so the template's column order is irrelevant.
    for c in out.columns:
        if c == '영업일자':
            continue
        key = normalize_key(c)
        out[c] = [pred_dict.get((date, key), 0) for date in row_dates]
    return out

# Build the submission table and write it as UTF-8 with BOM ("utf-8-sig").
submission = convert_to_submission(pred_full, sample_submission)
submission.to_csv(SAVE_PATH, index=False, encoding="utf-8-sig")
print("최종 제출 파일 저장 완료:", SAVE_PATH)
# Preview the first rows, truncated to 1000 characters to keep the output short.
print(submission.head(3).to_string()[:1000])