<a href="https://colab.research.google.com/github/satou409/SIGNATE-.ipynb/blob/main/SIGNATE%E3%81%8A%E5%BC%81%E5%BD%93.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

# ---------- 0) 読み込み ----------
import pandas as pd
# ファイル読み込み
train = pd.read_csv('/content/drive/MyDrive/SIGNATE/お弁当/train.csv')
test = pd.read_csv('/content/drive/MyDrive/SIGNATE/お弁当/test.csv')
sample = pd.read_csv('/content/drive/MyDrive/SIGNATE/お弁当/sample.csv',header=None)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# =============================
# SIGNATE「お弁当」 二段階残差モデル（統合版）
# - 天気4分類（快晴/晴れ/曇り系/悪天候）
# - 指数減衰トレンド（前半10日除外でfit）
# - 人気スコア：残差の平均（連続値）＋薄いtop/botフラグ
# - menu_category / meat_category / 交互作用 menu_meat
# =============================

# === 0) 準備 ===
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error as mse
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from scipy.optimize import curve_fit

RANDOM_SEED = 42
N_SPLITS    = 5

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# === 1) 読み込み・基本前処理 ===
for df in (train, test):
    df["datetime"] = pd.to_datetime(df["datetime"])
    df["payday"]   = df["payday"].fillna(0)
    df["event"]    = df["event"].fillna("なし")
    df["remarks"]  = df["remarks"].fillna("なし")
    df["kcal"]     = df["kcal"].fillna(train["kcal"].mean()).astype(int) # kcal の欠損を平均値で埋めた後に整数に変換

train = train.sort_values("datetime").reset_index(drop=True)
test  = test.sort_values("datetime").reset_index(drop=True)

# === 2) 天気4分類 ===
def bin_weather(s):
    if pd.isna(s): return "不明"
    s = str(s)
    if s == "快晴": return "快晴"
    if s == "晴れ": return "晴れ"
    if s in ["曇", "薄曇", "薄雲"]: return "曇り系"
    if s in ["雨", "雪", "雷電"]:  return "悪天候"
    return "その他"

for df in (train, test):
    df["weather_bin"] = df["weather"].apply(bin_weather)

In [4]:
# === 3) 指数減衰トレンド（前半10日除外でfit） ===
t0 = train["datetime"].min()
train["t"] = (train["datetime"] - t0).dt.days.astype(int)

mask = train["t"] > 10
t = train.loc[mask, "t"].astype(float).values
y = train.loc[mask, "y"].astype(float).values

def exp_decay(t, a, b, c):
    # y = a * exp(-b t) + c
    return a * np.exp(-b * t) + c

c0 = np.median(train["y"].tail(30))
a0 = max(y) - c0 if (max(y) - c0) > 1 else (y.mean() + 1)
b0 = 0.01
bounds = ([0.0, 1e-6, 0.0], [np.inf, 1.0, np.inf])

popt, _ = curve_fit(exp_decay, t, y, p0=(a0, b0, c0), bounds=bounds, maxfev=10000)
a, b, c = popt
print(f"[Trend] a={a:.2f}, b={b:.4f}, c={c:.2f}")

train["trend_decay"]    = exp_decay(train["t"].values, a, b, c)
train["residual_decay"] = train["y"] - train["trend_decay"]

# test側のトレンド付与
test["t"] = (test["datetime"] - t0).dt.days.astype(int)
test["trend_decay"] = exp_decay(test["t"].values, a, b, c)


[Trend] a=117.02, b=0.0069, c=40.76


In [5]:
# === 4) 人気スコア（連続値）＋薄いフラグ ===
TOP_N, BOT_N = 5, 5        # popular/unpopular を弱く入れる

menu_scores = train.groupby("name")["residual_decay"].mean()
top_names = set(menu_scores.sort_values(ascending=False).head(TOP_N).index)
bot_names = set(menu_scores.sort_values(ascending=True).head(BOT_N).index)

for df in (train, test):
    df["menu_score"] = df["name"].map(menu_scores).fillna(0.0)
    df["popular_flag"]   = df["name"].isin(top_names).astype(int)
    df["unpopular_flag"] = df["name"].isin(bot_names).astype(int)


In [6]:
# === 5) メニューカテゴリ（定義） ===
def meat_category(name):
    if any(x in name for x in ["牛", "ビーフ" ,"プルコギ"]):
        return "beef"
    if any(x in name for x in ["豚", "ポーク", "カツ" ,"サムジョン" ,"ロース" , "かつ"]):
        return "pork"
    if any(x in name for x in ["鳥", "チキン", "鶏" ]):
        return "chicken"
    if any(x in name for x in ["肉", "麻婆", "ハンバーグ", "ロコモコ", "メンチ", "マーボ" ]):
        return "meat"
    return "other"

def menu_category(name):
    if any(x in name for x in ["カレー", "かれー"]):
        return "curry"
    if any(x in name for x in ["かつ", "ヒレカツ", "ロースカツ", "野菜あん", "味噌", "すき焼", "生姜", "南部焼き", "山賊", "丼",
                               "ピリ辛焼", "照り", "ご飯", "御飯", "肉じゃが", "唐揚", "から揚", "さんま", "和風", "親子", "ゴーヤ", "しゃぶ", "甘辛煮",
                               "天ぷら", "てんぷら", "南蛮", "シャブ", "味噌カツ", "筑前煮" ,"うどん", "そば", "コロッケ" ,"筍", "スタミナ炒め",
                               "スキヤキ", "塩焼き", "治部煮" ,"梅肉", "柳川", "はさみ揚"]):
        return "japanese"
    if any(x in name for x in ["フライ", "マスタード", "シチュー", "カツ", "ロコモコ", "ハンバーグ", "ステーキ", "フリカッセ", "ムニエル", "タンドリー",
                               "クリーム", "チリソース" ,"ハヤシ" ,"クリーミー", "クリーミ―", "ビーフ", "チキン", "ポーク", "ピザ", "レモンペッパー", "ビュッフェ",
                               "ミックス"]):
        return "western"
    if any(x in name for x in ["回鍋肉", "チャプチェ", "カッシュナッツ", "カシューナッツ", "ニンニクの芽", "キムチ", "マーボ", "麻婆", "青椒肉絲" , "チンジャオ", "サムジョン",
                               "酢豚", "青梗菜牛肉炒め", "八宝菜", "プルコギ", "韓国", "中華", "焼き肉", "炒め" ,"黒酢"]):
        return "chinese"
    return "other"

In [7]:
# === 5) メニューカテゴリ（適用） ===
for df in (train, test):
    df["menu_category"] = df["name"].apply(menu_category)
    df["meat_category"] = df["name"].apply(meat_category)

menu_vocab = sorted(set(train["menu_category"]) | set(test["menu_category"]))
meat_vocab = sorted(set(train["meat_category"]) | set(test["meat_category"]))
for df in (train, test):
    df["menu_category"] = pd.Categorical(df["menu_category"], categories=menu_vocab)
    df["meat_category"] = pd.Categorical(df["meat_category"], categories=meat_vocab)
    df["menu_meat"] = df["menu_category"].astype(str) + "_" + df["meat_category"].astype(str)

In [8]:
# === 6) 残差を目的変数にして学習（時系列CV） ===
y_trend    = train["trend_decay"].values
y_residual = (train["y"] - train["trend_decay"]).values

cat_cols = ["weather_bin","menu_category","meat_category","menu_meat"]
num_cols = ["payday","kcal","menu_score","popular_flag","unpopular_flag"]
use_cols = cat_cols + num_cols

X_all = pd.get_dummies(pd.concat([train[use_cols], test[use_cols]], axis=0),
                       columns=cat_cols, dummy_na=False)
X_tr  = X_all.iloc[:len(train)].reset_index(drop=True)
X_te  = X_all.iloc[len(train):].reset_index(drop=True)

tscv = TimeSeriesSplit(n_splits=N_SPLITS)
oof_resid = np.zeros(len(train))

for fold, (tr_idx, va_idx) in enumerate(tscv.split(X_tr)):
    X_trn, X_val = X_tr.iloc[tr_idx], X_tr.iloc[va_idx]
    y_trn, y_val = y_residual[tr_idx], y_residual[va_idx]

    # 木 or Ridge を選んでOK（木が効きやすい）
    model = RandomForestRegressor(
        n_estimators=400, max_depth=6, min_samples_leaf=4,
        random_state=RANDOM_SEED, n_jobs=-1
    )
    # model = Ridge(alpha=1.0)

    model.fit(X_trn, y_trn)
    oof_resid[va_idx] = model.predict(X_val)
    rmse_fold = mse(train["y"].values[va_idx], y_trend[va_idx] + oof_resid[va_idx])**0.5
    print(f"[Fold {fold}] RMSE = {rmse_fold:.3f}")

rmse_cv = mse(train["y"].values, y_trend + oof_resid)**0.5
print(f"[CV AVG] RMSE = {rmse_cv:.3f}")

[Fold 0] RMSE = 9.177
[Fold 1] RMSE = 11.769
[Fold 2] RMSE = 12.802
[Fold 3] RMSE = 6.690
[Fold 4] RMSE = 7.842
[CV AVG] RMSE = 13.839


In [8]:
'''
# === 7) 全学習 → test予測 & 提出 ===
final_model = RandomForestRegressor(
    n_estimators=600, max_depth=6, min_samples_leaf=3,
    random_state=RANDOM_SEED, n_jobs=-1
)
final_model.fit(X_tr, y_residual)

pred_resid_tr = final_model.predict(X_tr)
pred_resid_te = final_model.predict(X_te)

pred_train = y_trend + pred_resid_tr
pred_test  = test["trend_decay"].values + pred_resid_te

print("Train RMSE (re-fit):", mse(train["y"].values, pred_train)**0.5)

# クリップ＆整数化（必要なら外してOK）
pred_test = np.clip(pred_test, 0, None)
pred_test_rounded = np.rint(pred_test).astype(int)

# sample.csvが1列想定（SIGNATE）
submission = sample.copy()
submission.iloc[:, 0] = pred_test_rounded

out_path = "/content/submission_residual_two_stage.csv"
submission.to_csv(out_path, index=False, header=False)
print("Saved:", out_path)
'''