# In [None]:


######################################################################
# 1. Load
######################################################################
from pathlib import Path
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
sns.set_style("whitegrid")

# Location of the raw CSV that the rest of the script works on.
DATA_PATH = Path('/content/4class.csv')
# Validate with an explicit exception instead of `assert`: assertions are
# removed when Python runs with -O, which would lose the "upload it first"
# hint and leave only the lower-level error from read_csv.
if not DATA_PATH.exists():
    raise FileNotFoundError(f"{DATA_PATH} not found—upload it first.")
df = pd.read_csv(DATA_PATH)
print(f"✅ raw: {df.shape[0]:,} rows × {df.shape[1]} cols")

######################################################################
# 2. Leakage & ID removal
######################################################################
# Columns removed before modelling (leakage / ID columns, per the section
# header). Names that are absent from this export are skipped silently.
purge_cols = [
    "DateOfCancel", "CanceledQty", "net_qty", "days_to_cancel",
    "Unnamed: 0", "TransactionId", "BookingNr", "OrderId",
]
df = df.drop(columns=purge_cols, errors="ignore")
print("Remaining after leakage/ID purge:", df.shape[1])

######################################################################
# 3. Feature engineering
######################################################################
# Parse both timestamps once. The raw date columns are not kept: only the
# features derived from them below survive this section.
ordered_at = pd.to_datetime(df["DateOfOrder"])
srv = pd.to_datetime(df["DateOfService"])
srv_weekday = srv.dt.weekday  # Monday=0 ... Sunday=6

# Amount paid after the subsidy is deducted from the menu price.
price_paid = df["MenuPrice"] - df["MenuSubsidy"]

df = df.drop(columns=["DateOfOrder", "DateOfService"]).assign(
    # Whole days between ordering and service; negative gaps are floored at 0.
    lead_time=(srv - ordered_at).dt.days.clip(lower=0),
    # Calendar components of the service date.
    srv_year=srv.dt.year,
    srv_month=srv.dt.month,
    srv_dayofmonth=srv.dt.day,
    srv_weekday=srv_weekday,
    # 1 when the service date falls on Saturday (5) or Sunday (6), else 0.
    srv_is_weekend=srv_weekday.isin([5, 6]).astype(int),
    price_paid=price_paid,
)

######################################################################
# 4. Trim low info columns
######################################################################
# Step 1: discard every column with more than 30 % missing values.
miss_pct = df.isna().mean()
too_sparse = miss_pct.loc[miss_pct.gt(0.30)].index
df = df.drop(columns=too_sparse)

from sklearn.feature_selection import VarianceThreshold
# Step 2: discard numeric columns whose variance does not exceed 0.01.
# NaN is treated as 0 for this check only; the fill is not written to df.
num_df = df.select_dtypes(include=[np.number]).fillna(0)
selector = VarianceThreshold(threshold=0.01).fit(num_df)
keep = selector.get_support()
near_constant = num_df.columns[~keep]
df = df.drop(columns=near_constant)
print("Final working columns:", df.shape[1])

######################################################################
# 5. Order level classification
######################################################################
from sklearn.model_selection import train_test_split
from sklearn.compose  import ColumnTransformer
from sklearn.pipeline  import Pipeline
from sklearn.metrics   import classification_report, confusion_matrix
from category_encoders import TargetEncoder
import lightgbm as lgb
from imblearn.over_sampling import SMOTENC

# Target vector and feature matrix for the order-level classifier.
y = df["cancel_timing"]
X = df.drop(columns=["cancel_timing"])

# Hold out 5 % of all rows as the final test set, stratified by class.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.05,
    stratify=y,
    random_state=42,
)

# Split what is left into 80 % training / 20 % validation, again stratified.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.20,
    stratify=y_temp,
    random_state=42,
)

# Any column without a numeric dtype is handled as categorical.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = sorted(set(X.columns).difference(num_cols))
# SMOTENC receives the categorical columns as positional indices.
cat_idx = [X.columns.get_loc(name) for name in cat_cols]

print("\nBefore SMOTE:", y_tr.value_counts())

# Oversample the training fold only; validation and test rows stay untouched.
smote = SMOTENC(
    sampling_strategy="not majority",
    categorical_features=cat_idx,
    random_state=42,
)
X_tr_bal, y_tr_bal = smote.fit_resample(X_tr, y_tr)

print("After  SMOTE:", pd.Series(y_tr_bal).value_counts())

# category_encoders' TargetEncoder replaces each category with a (smoothed)
# mean of the target, which is only defined for binary or continuous targets.
# `cancel_timing` is a multiclass string label, so the bare encoder cannot
# average it. PolynomialWrapper is the library's multiclass extension: it
# one-hot encodes the label, fits one copy of the inner encoder per class
# (dropping one), and emits one encoded column per (feature, class) pair.
# NOTE(review): PolynomialWrapper is flagged experimental upstream and may not
# implement get_feature_names_out; verify feature-name lookups on `pre`
# against the installed category_encoders version.
from category_encoders.wrapper import PolynomialWrapper

# Numeric columns pass through unchanged; categorical columns are
# target-encoded per class.
pre = ColumnTransformer([
    ("num", "passthrough", num_cols),
    ("cat",
     PolynomialWrapper(
         TargetEncoder(handle_missing="value", min_samples_leaf=20)),
     cat_cols)],
    verbose_feature_names_out=False)

# Multiclass gradient-boosted trees; hyperparameters unchanged.
clf = lgb.LGBMClassifier(
    objective="multiclass", n_estimators=800, learning_rate=0.05,
    num_leaves=64, subsample=0.8, colsample_bytree=0.8,
    class_weight="balanced", min_gain_to_split=0.001, random_state=42)

# Fit encoder and classifier together on the SMOTE-balanced training fold so
# the target statistics are learned from training rows only.
pipe_cls = Pipeline([("prep", pre), ("clf", clf)]).fit(X_tr_bal, y_tr_bal)

# Per-class precision/recall/F1 on the validation fold.
print("\n=== Validation set ===")
pred_val = pipe_cls.predict(X_val)
print(classification_report(y_val, pred_val))

# Same report on the 5 % hold-out.
print("\n=== FINAL Test set (5 %) ===")
pred_test = pipe_cls.predict(X_test)
print(classification_report(y_test, pred_test))

# Validation confusion matrix, with rows and columns in the classifier's own
# label order so the tick labels line up with the counts.
class_labels = pipe_cls.classes_
cm = confusion_matrix(y_val, pred_val, labels=class_labels)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix (validation)")
plt.xlabel("Pred")
plt.ylabel("Actual")
plt.show()

import shap

# Best-effort SHAP summary for the order-level classifier: any failure is
# reported on stdout and the script carries on.
try:
    prep_cls = pipe_cls["prep"]
    # Explain the model on the encoded validation features it actually sees.
    X_val_enc = prep_cls.transform(X_val)
    explainer_cls = shap.TreeExplainer(pipe_cls["clf"])
    shap_cls = explainer_cls(X_val_enc, check_additivity=False)
    shap.summary_plot(
        shap_cls,
        feature_names=prep_cls.get_feature_names_out(),
        show=False,
    )
    plt.title("SHAP – order level")
    plt.show()
except Exception as e:
    print("SHAP skipped:", e)

######################################################################
# 6. Group daily Poisson regression
######################################################################
# 1 for every order whose cancel_timing is anything other than "no_cancel".
df["is_cancel"] = df["cancel_timing"].ne("no_cancel").astype(int)

# Always-present aggregates: number of orders and number of cancellations.
agg_dict = {
    "order_cnt":  ("is_cancel", "size"),
    "cancel_cnt": ("is_cancel", "sum"),
}

# Optional aggregates, added only for columns that survived the earlier
# trimming steps. The output column is named "<column>_<function>".
optional_aggs = {
    "lead_time": "mean",
    "price_paid": "mean",
    "tavg_C": "mean",
    "prcp_mm": "mean",
    "temp_dev": "mean",
    "is_holiday": "max",
}
agg_dict.update({
    f"{name}_{how}": (name, how)
    for name, how in optional_aggs.items()
    if name in df.columns
})

print("\nAggregating with cols:", list(agg_dict))

# One row per site, menu and service date.
grp_keys = ["Site", "MenuName", "srv_year", "srv_month", "srv_dayofmonth"]
agg_df = df.groupby(grp_keys, as_index=False).agg(**agg_dict)

# Regression target (cancellations per group-day) and its features.
y_r = agg_df["cancel_cnt"]
X_r = agg_df.drop(columns="cancel_cnt")

# Time-based split: service dates up to and including June 2023 are used for
# training; everything later is held out.
before_2023 = X_r["srv_year"] < 2023
first_half_2023 = (X_r["srv_year"] == 2023) & (X_r["srv_month"] <= 6)
mask = before_2023 | first_half_2023

X_tr_r, y_tr_r = X_r[mask], y_r[mask]
X_val_r_orig, y_val_r_orig = X_r[~mask], y_r[~mask]

# Randomly set aside 5 % of the held-out period as the final test set.
X_val_r, X_test_r, y_val_r, y_test_r = train_test_split(
    X_val_r_orig, y_val_r_orig,
    test_size=0.05,
    random_state=42,
)

# Column roles for the group-level model: numeric dtypes pass through, the
# rest are target-encoded against the cancellation count.
num_cols_r = X_r.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_r = sorted(set(X_r.columns).difference(num_cols_r))

encoder_r = TargetEncoder(handle_missing="value", min_samples_leaf=10)
pre_r = ColumnTransformer(
    [
        ("num", "passthrough", num_cols_r),
        ("cat", encoder_r, cat_cols_r),
    ],
    verbose_feature_names_out=False,
)

# Gradient-boosted trees with a Poisson objective for count data.
reg = lgb.LGBMRegressor(
    objective="poisson",
    n_estimators=600,
    learning_rate=0.05,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    min_gain_to_split=0.001,
    random_state=42,
)

pipe_reg = Pipeline([("prep", pre_r), ("reg", reg)])
pipe_reg = pipe_reg.fit(X_tr_r, y_tr_r)

# Predictions are floored at zero before scoring.
pred_val_r = np.clip(pipe_reg.predict(X_val_r), 0, None)
pred_test_r = np.clip(pipe_reg.predict(X_test_r), 0, None)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def _reg_scores(y_true, y_pred):
    """Return (MAE, RMSE, R²) for one set of predictions."""
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )


# Same three metrics for the validation split and the 5 % test split.
mae, rmse, r2 = _reg_scores(y_val_r, pred_val_r)
mae_t, rmse_t, r2_t = _reg_scores(y_test_r, pred_test_r)

print("\n=== Validation (group daily) ===")
print(f"MAE: {mae:.4f} | RMSE: {rmse:.4f} | R²: {r2:.4f}")

print("\n=== FINAL Test (group daily, 5 %) ===")
print(f"MAE: {mae_t:.4f} | RMSE: {rmse_t:.4f} | R²: {r2_t:.4f}")

# Best-effort SHAP summary for the group-daily regressor: any failure is
# reported on stdout and the script carries on.
try:
    prep_reg = pipe_reg["prep"]
    # Explain the model on the encoded validation features it actually sees.
    X_val_r_enc = prep_reg.transform(X_val_r)
    explainer_reg = shap.TreeExplainer(pipe_reg["reg"])
    shap_reg = explainer_reg(X_val_r_enc, check_additivity=False)
    shap.summary_plot(
        shap_reg,
        feature_names=prep_reg.get_feature_names_out(),
        show=False,
    )
    plt.title("SHAP – group daily")
    plt.show()
except Exception as e:
    print("SHAP skipped:", e)

######################################################################
# 7. Save cleaned datasets
######################################################################
# Output locations for the two cleaned datasets.
order_out = Path("/content/4class_order_clean.csv")
group_out = Path("/content/4class_group_daily.csv")

# The is_cancel helper flag was only needed for the aggregation, so it is
# left out of the order-level export.
order_export = df.drop(columns=["is_cancel"])
order_export.to_csv(order_out, index=False)
agg_df.to_csv(group_out, index=False)

print(f"📁 Saved order level CSV → {order_out}")
print(f"📁 Saved group daily CSV → {group_out}")


