In [2]:
# =============================================================================
# 1. IMPORTS & SETUP
# =============================================================================
import numpy as np
import pandas as pd
from IPython.core.display_functions import display

# Sklearn & related imports
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Other libraries
from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import make_scorer, balanced_accuracy_score

# Load data
df_weather2 = pd.read_csv('/Users/shayan/Desktop/IDS2/Stattkueche/df_weather3.csv', parse_dates=['DateOfCancel', 'DateOfService'])



In [3]:
# =============================================================================
# 2. CUSTOM TRANSFORMER DEFINITIONS
# =============================================================================

class InCVTargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols, smoothing=1.0):
        self.cols = cols
        self.smoothing = smoothing

    def fit(self, X, y):
        self.global_mean_ = y.mean()
        self.mapping_ = {}
        for c in self.cols:
            df = pd.DataFrame({c: X[c], 'target': y})
            agg = df.groupby(c)['target'].agg(['mean', 'count'])
            # smoothing formula
            agg['enc'] = (
                (agg['count'] * agg['mean'] +
                 self.smoothing * self.global_mean_)
                / (agg['count'] + self.smoothing)
            )
            self.mapping_[c] = agg['enc']
        return self

    def transform(self, X):
        X = X.copy()
        for c in self.cols:
            X[c + '_te'] = X[c]\
                .map(self.mapping_.get(c))\
                .fillna(self.global_mean_)
        return X

class HistCancelRateTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, group_keys=('Site','MenuBase'), value_col='net_qty', out_col='hist_cancel_rate'):
        self.group_keys = group_keys
        self.value_col  = value_col
        self.out_col    = out_col
    def fit(self, X, y=None):
        keys = list(self.group_keys)
        self.hist_    = X.groupby(keys)[self.value_col].mean()
        self.default_ = self.hist_.median()
        return self
    def transform(self, X):
        keys   = list(self.group_keys)
        tuples = [tuple(r) for r in X[keys].values]
        X      = X.copy()
        X[self.out_col] = [self.hist_.get(t, self.default_) for t in tuples]
        return X

class ClusterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, profile_feats, group_keys=('Site','MenuBase'),
                 n_clusters=5, out_col='cluster_id'):
        self.profile_feats = profile_feats
        self.group_keys    = group_keys
        self.n_clusters    = n_clusters
        self.out_col       = out_col
    def fit(self, X, y=None):
        keys = list(self.group_keys)
        prof = (X.groupby(keys)[self.profile_feats].mean().reset_index())
        prof[self.profile_feats] = prof[self.profile_feats].fillna(prof[self.profile_feats].median())
        self.scaler_ = StandardScaler().fit(prof[self.profile_feats])
        scaled      = self.scaler_.transform(prof[self.profile_feats])
        self.kmeans_ = KMeans(n_clusters=self.n_clusters, random_state=42, n_init=10).fit(scaled) # Set n_init explicitly
        tuples      = [tuple(r) for r in prof[keys].values]
        self.cluster_map_ = dict(zip(tuples, self.kmeans_.labels_))
        self.default_     = int(np.median(self.kmeans_.labels_))
        return self
    def transform(self, X):
        X = X.copy()
        keys   = list(self.group_keys)
        tuples = [tuple(r) for r in X[keys].values]
        X[self.out_col] = [self.cluster_map_.get(t, self.default_) for t in tuples]
        return X

class MissingFlagImputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy='median'):
        self.strategy = strategy
    def fit(self, X, y=None):
        self.num_cols = X.select_dtypes(include=[np.number]).columns
        clean = X[self.num_cols].replace([np.inf,-np.inf], np.nan)
        self.imputer_ = SimpleImputer(strategy=self.strategy).fit(clean)
        return self
    def transform(self, X):
        X = X.copy()
        X[self.num_cols] = X[self.num_cols].replace([np.inf,-np.inf], np.nan)
        for c in self.num_cols:
            X[c + '_missing'] = X[c].isna().astype(int)
        X[self.num_cols] = self.imputer_.transform(X[self.num_cols])
        return X

class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self, cols_to_drop):
        self.cols_to_drop = cols_to_drop
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.drop(columns=self.cols_to_drop, errors='ignore')


In [4]:
# =============================================================================
# 3. VIF CHECK
# =============================================================================
num_cols = df_weather2.select_dtypes(include=[np.number]).columns.to_list()

drop_col = ['DateOfOrder', 'DateOfService','days_to_cancel','CanceledQty','OrderId', 'TransactionId','cancel_timing','BookingNr','DateOfCancel','net_qty']
vif_col_1 = [c for c in num_cols if c not in drop_col]

# FIX 1: Changed df_encoded to df_weather2
vif_dat_1 = df_weather2[vif_col_1].copy()
vif_dat_1.replace([np.inf,-np.inf],np.nan,inplace=True)
vif_dat_1 = vif_dat_1.dropna()

vif_scores_1 = [variance_inflation_factor(vif_dat_1.values, i) for i in range(vif_dat_1.shape[1])]

vif_table_1 = (pd.DataFrame({'feature':vif_col_1,'VIF':vif_scores_1}).sort_values('VIF',ascending=False).reset_index(drop=True))
print('VIF values')
display(vif_table_1)

hg_vif_1 = vif_table_1.loc[vif_table_1['VIF']>10,'feature'].to_list()
print('the high vif columns dropped')
df_without_vif_1 = vif_dat_1.drop(columns=hg_vif_1)
print(df_without_vif_1.columns)


  vif = 1. / (1. - r_squared_i)


VIF values


Unnamed: 0,feature,VIF
0,afternoon,inf
1,morning,inf
2,evening,inf
3,day_of_year,31028.98
4,month,30864.69
5,day_of_month,130.0759
6,quarter,22.09046
7,MenuSubsidy,6.593575
8,MenuPrice,6.495096
9,order_hour,5.359547


the high vif columns dropped
Index(['Unnamed: 0', 'OrderQty', 'MenuPrice', 'MenuSubsidy', 'weekday',
       'is_weekend', 'sin_doy', 'cos_doy', 'is_month_end', 'is_month_start',
       'order_hour', 'hist_cancel_rate', 'is_holiday', 'tavg_C', 'prcp_mm',
       'rain_flag', 'temp_dev'],
      dtype='object')


In [5]:
# =============================================================================
# 4. STAGE A: RANDOM FOREST
# =============================================================================
df_A    = df_weather2.copy()
y_A     = (df_A['CanceledQty'] > 0).astype(int)
X_A     = df_A.drop(columns=[
    'CanceledQty', 'cancel_timing', 'DateOfOrder', 'DateOfService', 'DateOfCancel',
    'OrderId', 'TransactionId', 'BookingNr', 'hist_cancel_rate', 'GroupName', 'SchoolID'
])

drop_cols_A = [
    'Site','MenuBase','MenuName','GroupName','MenuNorm','MenuCode','net_qty','days_to_cancel'
]

pipeline_A = ImbPipeline([
    ('te',      InCVTargetEncoder(cols=['Site','MenuBase'], smoothing=0.3)), # FIX 2: This now works as the class is pre-defined
    ('hist',    HistCancelRateTransformer()),
    ('cluster', ClusterTransformer(
                   profile_feats=['hist_cancel_rate','rain_flag','temp_dev','sin_doy', 'cos_doy', 'month',
       'day_of_month', 'is_month_end', 'is_month_start','tavg_C', 'prcp_mm'],
                   n_clusters=5)),
    ('drop',    ColumnDropper(drop_cols_A)),
    ('impute',  MissingFlagImputer()),
    ('clf',     RandomForestClassifier(
                   n_estimators=500, criterion='entropy', max_depth=8,
                   min_samples_split=5, min_samples_leaf=1, max_features='sqrt',
                   class_weight='balanced_subsample', random_state=24, n_jobs=-1
               )),
])

tscv    = TimeSeriesSplit(n_splits=5)
scores  = cross_val_score(pipeline_A, X_A, y_A, cv=tscv, scoring='roc_auc', n_jobs=-1)
print("Stage A ROC-AUC:", np.round(scores.mean(),4))



KeyboardInterrupt: 

In [None]:
# =============================================================================
# 5. STAGE A: LGBM
# =============================================================================
pipeline_A_l = ImbPipeline([
    ('te',      InCVTargetEncoder(cols=['Site','MenuBase'], smoothing=0.3)),
    ('hist',    HistCancelRateTransformer()),
    ('cluster', ClusterTransformer(
                   profile_feats=['hist_cancel_rate','rain_flag','temp_dev','sin_doy', 'cos_doy', 'month',
       'day_of_month', 'is_month_end', 'is_month_start','tavg_C', 'prcp_mm'],
                   n_clusters=5)),
    ('drop',    ColumnDropper(drop_cols_A)),
    ('impute',  MissingFlagImputer()),
    ('clf',       LGBMClassifier(objective='binary', # Changed to binary for 0/1 target
                                 random_state=24,
                                 metric="roc_auc", # Common metric for binary classification
                                 n_jobs=-1))
])

param_dist_lgbm = {
    'clf__n_estimators':      [200,500,800],
    'clf__learning_rate':     [0.01,0.03,0.05],
    'clf__num_leaves':        [10, 15, 31],
    'clf__max_depth':         [6, 10, 20],
    'clf__subsample':         [0.7, 1.0],
    'clf__colsample_bytree':  [0.7, 1.0],
    'clf__min_child_samples': [10, 20]
}

tscv = TimeSeriesSplit(n_splits=5)
# FIX 3: RandomizedSearchCV is now imported
search_A_lgbm = RandomizedSearchCV(
    pipeline_A_l,
    param_distributions=param_dist_lgbm,
    n_iter=30,
    cv=tscv,
    scoring='roc_auc', # roc_auc is for binary, roc_auc_ovo is for multiclass
    n_jobs=-1,
    random_state=24
)

search_A_lgbm.fit(X_A, y_A)

print("Best LGBM ROC_AUC:", np.round(search_A_lgbm.best_score_,4))
print("Best hyper‐parameters:")
for k, v in search_A_lgbm.best_params_.items():
    print(f"  {k}: {v}")



In [None]:
# =============================================================================
# 6. STAGE B2: RANDOM FOREST
# =============================================================================
df_B2 = df_weather2[df_weather2.cancel_timing != 'no_cancel'].copy()
le    = LabelEncoder().fit(df_B2.cancel_timing)
df_B2['timing_code'] = le.transform(df_B2.cancel_timing)

X_B2 = df_B2.drop(columns=[
    'CanceledQty', 'cancel_timing','timing_code', 'DateOfOrder','DateOfService',
    'DateOfCancel', 'OrderId','TransactionId','BookingNr','hist_cancel_rate',
    'GroupName','SchoolID'
])
y_B2 = df_B2['timing_code']

drop_cols_B2 = ['MenuName','GroupName','MenuNorm','MenuCode']
drop_post = ['Site','MenuBase','net_qty','days_to_cancel']

pipeline_B2_smote = ImbPipeline([
    ('te',      InCVTargetEncoder(cols=['Site','MenuBase'], smoothing=0.3)),
    ('drop',    ColumnDropper(drop_cols_B2)),
    ('hist',    HistCancelRateTransformer()),
    ('cluster', ClusterTransformer(
                    profile_feats=['hist_cancel_rate','rain_flag','temp_dev','sin_doy', 'cos_doy', 'month',
       'day_of_month', 'is_month_end', 'is_month_start','tavg_C', 'prcp_mm'],
                    n_clusters=5)),
    ('drop_post', ColumnDropper(drop_post)),
    ('impute',  MissingFlagImputer()),
    ('smote',   SMOTE(random_state=24)),
    ('clf',     RandomForestClassifier(
                   class_weight='balanced', random_state=24, n_jobs=-1
               )),
])

param_dist_rf = {
    'clf__n_estimators':      [600, 1200],
    'clf__max_depth':         [6, 10],
    'clf__min_samples_split': [10, 20],
    'clf__min_samples_leaf':  [15],
    'clf__max_features':      [0.8, 'sqrt'], # max_features > 1.0 is not valid
    'clf__criterion':         ['entropy'],
}

tscv = TimeSeriesSplit(n_splits=5)
search_rf = RandomizedSearchCV(
    pipeline_B2_smote,
    param_distributions=param_dist_rf,
    n_iter=20,
    cv=tscv,
    scoring='roc_auc_ovo_weighted',
    n_jobs=-1,
    random_state=24
)

search_rf.fit(X_B2, y_B2)

print("Best RF roc_auc_ovo_weighted:", np.round(search_rf.best_score_, 4))
print("Best hyper-parameters:")
for k, v in search_rf.best_params_.items():
    print(f"  {k} = {v}")


In [None]:
# =============================================================================
# 7. STAGE B2: LGBM
# =============================================================================
pipeline_B2_lgbm = ImbPipeline([
    ('te',      InCVTargetEncoder(cols=['Site','MenuBase'], smoothing=0.3)),
    ('drop',    ColumnDropper(drop_cols_B2)),
    ('hist',    HistCancelRateTransformer()),
    ('cluster', ClusterTransformer(
                    profile_feats=['hist_cancel_rate','rain_flag','temp_dev','sin_doy', 'cos_doy', 'month',
       'day_of_month', 'is_month_end', 'is_month_start','tavg_C', 'prcp_mm'],
                    n_clusters=5)),
    ('drop_post', ColumnDropper(drop_post)),
    ('impute',  MissingFlagImputer()),
   ('clf',       LGBMClassifier(objective='multiclass',
                                 num_class=len(np.unique(y_B2)),
                                 random_state=24,
                                 metric="multi_logloss",
                                 n_jobs=-1))
])

param_dist_lgbm_b2 = {
    'clf__n_estimators':      [200, 600],
    'clf__learning_rate':     [0.01, 0.07],
    'clf__num_leaves':        [31, 40],
    'clf__max_depth':         [10, 20],
    'clf__subsample':         [0.7, 1.0],
    'clf__colsample_bytree':  [1.0], # colsample_bytree cannot be > 1.0
    'clf__min_child_samples': [20, 30]
}

tscv = TimeSeriesSplit(n_splits=5)
search_lgbm_b2 = RandomizedSearchCV(
    pipeline_B2_lgbm,
    param_distributions=param_dist_lgbm_b2,
    n_iter=30,
    cv=tscv,
    scoring='roc_auc_ovo_weighted',
    n_jobs=-1,
    random_state=24
)

search_lgbm_b2.fit(X_B2, y_B2)

print("Best LGBM ROC_AUC_OVO_weighted:", np.round(search_lgbm_b2.best_score_,4))
print("Best hyper‐parameters:")
for k, v in search_lgbm_b2.best_params_.items():
    print(f"  {k}: {v}")



In [None]:
# =============================================================================
# 8. STAGE B2: CATBOOST
# =============================================================================
pipeline_B2_cat = ImbPipeline([
    ('te',      InCVTargetEncoder(cols=['Site','MenuBase'], smoothing=0.3)),
    ('drop',    ColumnDropper(drop_cols_B2)),
    ('hist',    HistCancelRateTransformer()),
    ('cluster', ClusterTransformer(
                    profile_feats=['hist_cancel_rate','rain_flag','temp_dev','sin_doy', 'cos_doy', 'month',
       'day_of_month', 'is_month_end', 'is_month_start','tavg_C', 'prcp_mm'],
                    n_clusters=5)),
    ('drop_post', ColumnDropper(drop_post)),
    ('impute',  MissingFlagImputer()),
    ('clf',     CatBoostClassifier(
                    iterations=500,
                    auto_class_weights='Balanced',
                    loss_function='MultiClass',
                    learning_rate=0.05,
                    depth=6,
                    early_stopping_rounds=50,
                    l2_leaf_reg=3,
                    verbose=False,
                    random_seed=24,
                    thread_count=-1
               ))
])

cat_param_dist = {
    'clf__iterations':       [500, 800, 1000],
    'clf__learning_rate':    [0.01, 0.03, 0.1],
    'clf__depth':            [6, 8, 10],
    'clf__l2_leaf_reg':      [1, 3, 10],
    'clf__bagging_temperature': [0, 3, 7],
    'clf__rsm':              [0.5, 0.8, 1.0] # same as colsample_bylevel
}
scoring_1 = {
    'pr_auc':       'average_precision',
    'roc_auc_ovo':  'roc_auc_ovo',
    'f1_macro':     'f1_macro',
    'bal_acc':      make_scorer(balanced_accuracy_score),
    'neg_log_loss': 'neg_log_loss'
}

tscv = TimeSeriesSplit(n_splits=5)
search_cat = RandomizedSearchCV(
    pipeline_B2_cat,
    param_distributions=cat_param_dist,
    n_iter=30,
    cv=tscv,
    refit='roc_auc_ovo',
    scoring=scoring_1,
    n_jobs=-1,
    random_state=24
)

search_cat.fit(X_B2, y_B2)

print("Best CatBoost Score (refit='roc_auc_ovo'):", np.round(search_cat.best_score_,4))
print("Best hyper‐parameters:")
for k, v in search_cat.best_params_.items():
    print(f"  {k}: {v}")

In [None]:
# =============================================================================
# 1. IMPORTS & SETUP
# =============================================================================
import numpy as np
import pandas as pd
from IPython.core.display_functions import display

# Sklearn & related imports
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Other libraries
from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import make_scorer, balanced_accuracy_score

# Load data
df_weather2 = pd.read_csv('/Users/shayan/Desktop/IDS2/Stattkueche/df_weather3.csv',
                          parse_dates=['DateOfCancel', 'DateOfService'])


# =============================================================================
# 2. CUSTOM TRANSFORMER DEFINITIONS
# =============================================================================

class InCVTargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols, smoothing=1.0):
        self.cols = cols
        self.smoothing = smoothing

    def fit(self, X, y):
        self.global_mean_ = y.mean()
        self.mapping_ = {}
        for c in self.cols:
            df = pd.DataFrame({c: X[c], 'target': y})
            agg = df.groupby(c)['target'].agg(['mean', 'count'])
            # smoothing formula
            agg['enc'] = (
                    (agg['count'] * agg['mean'] +
                     self.smoothing * self.global_mean_)
                    / (agg['count'] + self.smoothing)
            )
            self.mapping_[c] = agg['enc']
        return self

    def transform(self, X):
        X = X.copy()
        for c in self.cols:
            X[c + '_te'] = X[c] \
                .map(self.mapping_.get(c)) \
                .fillna(self.global_mean_)
        return X


class HistCancelRateTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, group_keys=('Site', 'MenuBase'), value_col='net_qty', out_col='hist_cancel_rate'):
        self.group_keys = group_keys
        self.value_col = value_col
        self.out_col = out_col

    def fit(self, X, y=None):
        keys = list(self.group_keys)
        self.hist_ = X.groupby(keys)[self.value_col].mean()
        self.default_ = self.hist_.median()
        return self

    def transform(self, X):
        keys = list(self.group_keys)
        tuples = [tuple(r) for r in X[keys].values]
        X = X.copy()
        X[self.out_col] = [self.hist_.get(t, self.default_) for t in tuples]
        return X


class ClusterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, profile_feats, group_keys=('Site', 'MenuBase'),
                 n_clusters=5, out_col='cluster_id'):
        self.profile_feats = profile_feats
        self.group_keys = group_keys
        self.n_clusters = n_clusters
        self.out_col = out_col

    def fit(self, X, y=None):
        keys = list(self.group_keys)
        prof = (X.groupby(keys)[self.profile_feats].mean().reset_index())
        prof[self.profile_feats] = prof[self.profile_feats].fillna(prof[self.profile_feats].median())
        self.scaler_ = StandardScaler().fit(prof[self.profile_feats])
        scaled = self.scaler_.transform(prof[self.profile_feats])
        self.kmeans_ = KMeans(n_clusters=self.n_clusters, random_state=42, n_init=10).fit(
            scaled)  # Set n_init explicitly
        tuples = [tuple(r) for r in prof[keys].values]
        self.cluster_map_ = dict(zip(tuples, self.kmeans_.labels_))
        self.default_ = int(np.median(self.kmeans_.labels_))
        return self

    def transform(self, X):
        X = X.copy()
        keys = list(self.group_keys)
        tuples = [tuple(r) for r in X[keys].values]
        X[self.out_col] = [self.cluster_map_.get(t, self.default_) for t in tuples]
        return X


class MissingFlagImputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy='median'):
        self.strategy = strategy

    def fit(self, X, y=None):
        self.num_cols = X.select_dtypes(include=[np.number]).columns
        clean = X[self.num_cols].replace([np.inf, -np.inf], np.nan)
        self.imputer_ = SimpleImputer(strategy=self.strategy).fit(clean)
        return self

    def transform(self, X):
        X = X.copy()
        X[self.num_cols] = X[self.num_cols].replace([np.inf, -np.inf], np.nan)
        for c in self.num_cols:
            X[c + '_missing'] = X[c].isna().astype(int)
        X[self.num_cols] = self.imputer_.transform(X[self.num_cols])
        return X


class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self, cols_to_drop):
        self.cols_to_drop = cols_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(columns=self.cols_to_drop, errors='ignore')


# =============================================================================
# 3. VIF CHECK
# =============================================================================
num_cols = df_weather2.select_dtypes(include=[np.number]).columns.to_list()

drop_col = ['DateOfOrder', 'DateOfService', 'days_to_cancel', 'CanceledQty', 'OrderId', 'TransactionId',
            'cancel_timing', 'BookingNr', 'DateOfCancel', 'net_qty']
vif_col_1 = [c for c in num_cols if c not in drop_col]

# FIX 1: Changed df_encoded to df_weather2
vif_dat_1 = df_weather2[vif_col_1].copy()
vif_dat_1.replace([np.inf, -np.inf], np.nan, inplace=True)
vif_dat_1 = vif_dat_1.dropna()

vif_scores_1 = [variance_inflation_factor(vif_dat_1.values, i) for i in range(vif_dat_1.shape[1])]

vif_table_1 = (
    pd.DataFrame({'feature': vif_col_1, 'VIF': vif_scores_1}).sort_values('VIF', ascending=False).reset_index(
        drop=True))
print('VIF values')
display(vif_table_1)

hg_vif_1 = vif_table_1.loc[vif_table_1['VIF'] > 10, 'feature'].to_list()
print('the high vif columns dropped')
df_without_vif_1 = vif_dat_1.drop(columns=hg_vif_1)
print(df_without_vif_1.columns)

# =============================================================================
# 4. STAGE A: RANDOM FOREST
# =============================================================================
df_A = df_weather2.copy()
y_A = (df_A['CanceledQty'] > 0).astype(int)
X_A = df_A.drop(columns=[
    'CanceledQty', 'cancel_timing', 'DateOfOrder', 'DateOfService', 'DateOfCancel',
    'OrderId', 'TransactionId', 'BookingNr', 'hist_cancel_rate', 'GroupName', 'SchoolID'
])

drop_cols_A = [
    'Site', 'MenuBase', 'MenuName', 'GroupName', 'MenuNorm', 'MenuCode', 'net_qty', 'days_to_cancel'
]

pipeline_A = ImbPipeline([
    ('te', InCVTargetEncoder(cols=['Site', 'MenuBase'], smoothing=0.3)),
    # FIX 2: This now works as the class is pre-defined
    ('hist', HistCancelRateTransformer()),
    ('cluster', ClusterTransformer(
        profile_feats=['hist_cancel_rate', 'rain_flag', 'temp_dev', 'sin_doy', 'cos_doy', 'month',
                       'day_of_month', 'is_month_end', 'is_month_start', 'tavg_C', 'prcp_mm'],
        n_clusters=5)),
    ('drop', ColumnDropper(drop_cols_A)),
    ('impute', MissingFlagImputer()),
    ('clf', RandomForestClassifier(
        n_estimators=500, criterion='entropy', max_depth=8,
        min_samples_split=5, min_samples_leaf=1, max_features='sqrt',
        class_weight='balanced_subsample', random_state=24, n_jobs=-1
    )),
])

tscv = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(pipeline_A, X_A, y_A, cv=tscv, scoring='roc_auc', n_jobs=-1)
print("Stage A ROC-AUC:", np.round(scores.mean(), 4))

# =============================================================================
# 5. STAGE A: LGBM
# =============================================================================
pipeline_A_l = ImbPipeline([
    ('te', InCVTargetEncoder(cols=['Site', 'MenuBase'], smoothing=0.3)),
    ('hist', HistCancelRateTransformer()),
    ('cluster', ClusterTransformer(
        profile_feats=['hist_cancel_rate', 'rain_flag', 'temp_dev', 'sin_doy', 'cos_doy', 'month',
                       'day_of_month', 'is_month_end', 'is_month_start', 'tavg_C', 'prcp_mm'],
        n_clusters=5)),
    ('drop', ColumnDropper(drop_cols_A)),
    ('impute', MissingFlagImputer()),
    ('clf', LGBMClassifier(objective='binary',  # Changed to binary for 0/1 target
                           random_state=24,
                           metric="roc_auc",  # Common metric for binary classification
                           n_jobs=-1))
])

param_dist_lgbm = {
    'clf__n_estimators': [200, 500, 800],
    'clf__learning_rate': [0.01, 0.03, 0.05],
    'clf__num_leaves': [10, 15, 31],
    'clf__max_depth': [6, 10, 20],
    'clf__subsample': [0.7, 1.0],
    'clf__colsample_bytree': [0.7, 1.0],
    'clf__min_child_samples': [10, 20]
}

tscv = TimeSeriesSplit(n_splits=5)
# FIX 3: RandomizedSearchCV is now imported
search_A_lgbm = RandomizedSearchCV(
    pipeline_A_l,
    param_distributions=param_dist_lgbm,
    n_iter=30,
    cv=tscv,
    scoring='roc_auc',  # roc_auc is for binary, roc_auc_ovo is for multiclass
    n_jobs=-1,
    random_state=24
)

search_A_lgbm.fit(X_A, y_A)

print("Best LGBM ROC_AUC:", np.round(search_A_lgbm.best_score_, 4))
print("Best hyper‐parameters:")
for k, v in search_A_lgbm.best_params_.items():
    print(f"  {k}: {v}")

# =============================================================================
# 6. STAGE B2: RANDOM FOREST
# =============================================================================
df_B2 = df_weather2[df_weather2.cancel_timing != 'no_cancel'].copy()
le = LabelEncoder().fit(df_B2.cancel_timing)
df_B2['timing_code'] = le.transform(df_B2.cancel_timing)

X_B2 = df_B2.drop(columns=[
    'CanceledQty', 'cancel_timing', 'timing_code', 'DateOfOrder', 'DateOfService',
    'DateOfCancel', 'OrderId', 'TransactionId', 'BookingNr', 'hist_cancel_rate',
    'GroupName', 'SchoolID'
])
y_B2 = df_B2['timing_code']

drop_cols_B2 = ['MenuName', 'GroupName', 'MenuNorm', 'MenuCode']
drop_post = ['Site', 'MenuBase', 'net_qty', 'days_to_cancel']

pipeline_B2_smote = ImbPipeline([
    ('te', InCVTargetEncoder(cols=['Site', 'MenuBase'], smoothing=0.3)),
    ('drop', ColumnDropper(drop_cols_B2)),
    ('hist', HistCancelRateTransformer()),
    ('cluster', ClusterTransformer(
        profile_feats=['hist_cancel_rate', 'rain_flag', 'temp_dev', 'sin_doy', 'cos_doy', 'month',
                       'day_of_month', 'is_month_end', 'is_month_start', 'tavg_C', 'prcp_mm'],
        n_clusters=5)),
    ('drop_post', ColumnDropper(drop_post)),
    ('impute', MissingFlagImputer()),
    ('smote', SMOTE(random_state=24)),
    ('clf', RandomForestClassifier(
        class_weight='balanced', random_state=24, n_jobs=-1
    )),
])

param_dist_rf = {
    'clf__n_estimators': [600, 1200],
    'clf__max_depth': [6, 10],
    'clf__min_samples_split': [10, 20],
    'clf__min_samples_leaf': [15],
    'clf__max_features': [0.8, 'sqrt'],  # max_features > 1.0 is not valid
    'clf__criterion': ['entropy'],
}

tscv = TimeSeriesSplit(n_splits=5)
search_rf = RandomizedSearchCV(
    pipeline_B2_smote,
    param_distributions=param_dist_rf,
    n_iter=20,
    cv=tscv,
    scoring='roc_auc_ovo_weighted',
    n_jobs=-1,
    random_state=24
)

search_rf.fit(X_B2, y_B2)

print("Best RF roc_auc_ovo_weighted:", np.round(search_rf.best_score_, 4))
print("Best hyper-parameters:")
for k, v in search_rf.best_params_.items():
    print(f"  {k} = {v}")

# =============================================================================
# 7. STAGE B2: LGBM
# =============================================================================
pipeline_B2_lgbm = ImbPipeline([
    ('te', InCVTargetEncoder(cols=['Site', 'MenuBase'], smoothing=0.3)),
    ('drop', ColumnDropper(drop_cols_B2)),
    ('hist', HistCancelRateTransformer()),
    ('cluster', ClusterTransformer(
        profile_feats=['hist_cancel_rate', 'rain_flag', 'temp_dev', 'sin_doy', 'cos_doy', 'month',
                       'day_of_month', 'is_month_end', 'is_month_start', 'tavg_C', 'prcp_mm'],
        n_clusters=5)),
    ('drop_post', ColumnDropper(drop_post)),
    ('impute', MissingFlagImputer()),
    ('clf', LGBMClassifier(objective='multiclass',
                           num_class=len(np.unique(y_B2)),
                           random_state=24,
                           metric="multi_logloss",
                           n_jobs=-1))
])

param_dist_lgbm_b2 = {
    'clf__n_estimators': [200, 600],
    'clf__learning_rate': [0.01, 0.07],
    'clf__num_leaves': [31, 40],
    'clf__max_depth': [10, 20],
    'clf__subsample': [0.7, 1.0],
    'clf__colsample_bytree': [1.0],  # colsample_bytree cannot be > 1.0
    'clf__min_child_samples': [20, 30]
}

tscv = TimeSeriesSplit(n_splits=5)
search_lgbm_b2 = RandomizedSearchCV(
    pipeline_B2_lgbm,
    param_distributions=param_dist_lgbm_b2,
    n_iter=30,
    cv=tscv,
    scoring='roc_auc_ovo_weighted',
    n_jobs=-1,
    random_state=24
)

search_lgbm_b2.fit(X_B2, y_B2)

print("Best LGBM ROC_AUC_OVO_weighted:", np.round(search_lgbm_b2.best_score_, 4))
print("Best hyper‐parameters:")
for k, v in search_lgbm_b2.best_params_.items():
    print(f"  {k}: {v}")

# =============================================================================
# 8. STAGE B2: CATBOOST
# =============================================================================
pipeline_B2_cat = ImbPipeline([
    ('te', InCVTargetEncoder(cols=['Site', 'MenuBase'], smoothing=0.3)),
    ('drop', ColumnDropper(drop_cols_B2)),
    ('hist', HistCancelRateTransformer()),
    ('cluster', ClusterTransformer(
        profile_feats=['hist_cancel_rate', 'rain_flag', 'temp_dev', 'sin_doy', 'cos_doy', 'month',
                       'day_of_month', 'is_month_end', 'is_month_start', 'tavg_C', 'prcp_mm'],
        n_clusters=5)),
    ('drop_post', ColumnDropper(drop_post)),
    ('impute', MissingFlagImputer()),
    ('clf', CatBoostClassifier(
        iterations=500,
        auto_class_weights='Balanced',
        loss_function='MultiClass',
        learning_rate=0.05,
        depth=6,
        early_stopping_rounds=50,
        l2_leaf_reg=3,
        verbose=False,
        random_seed=24,
        thread_count=-1
    ))
])

cat_param_dist = {
    'clf__iterations': [500, 800, 1000],
    'clf__learning_rate': [0.01, 0.03, 0.1],
    'clf__depth': [6, 8, 10],
    'clf__l2_leaf_reg': [1, 3, 10],
    'clf__bagging_temperature': [0, 3, 7],
    'clf__rsm': [0.5, 0.8, 1.0]  # same as colsample_bylevel
}
scoring_1 = {
    'pr_auc': 'average_precision',
    'roc_auc_ovo': 'roc_auc_ovo',
    'f1_macro': 'f1_macro',
    'bal_acc': make_scorer(balanced_accuracy_score),
    'neg_log_loss': 'neg_log_loss'
}

tscv = TimeSeriesSplit(n_splits=5)
search_cat = RandomizedSearchCV(
    pipeline_B2_cat,
    param_distributions=cat_param_dist,
    n_iter=30,
    cv=tscv,
    refit='roc_auc_ovo',
    scoring=scoring_1,
    n_jobs=-1,
    random_state=24
)

search_cat.fit(X_B2, y_B2)

print("Best CatBoost Score (refit='roc_auc_ovo'):", np.round(search_cat.best_score_, 4))
print("Best hyper‐parameters:")
for k, v in search_cat.best_params_.items():
    print(f"  {k}: {v}")