In [2]:
# =============================================================================
# 1. IMPORTS
# =============================================================================
import numpy as np
import pandas as pd
from IPython.display import display
import re
import time
import multiprocessing  # <-- ADDED

# Sklearn & related imports
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Other libraries
from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# =============================================================================
# 2. CONFIGURATION CONTROLS
# =============================================================================
# --- Adjust these variables to control the trade-off between speed and accuracy ---

# ⚙️ Fraction of data to use. (e.g., 1.0 for all data, 0.25 for a quick 25% sample)
DATA_SAMPLE_FRAC = 0.1

# ⚙️ Number of splits for TimeSeriesSplit cross-validation. (e.g., 5 for robust, 3 for fast)
CV_SPLITS = 2

# ⚙️ Number of parameter combinations to try in RandomizedSearchCV. (e.g., 30 for thorough, 10 for fast)
N_ITER_SEARCH = 5

# ⚙️ CPU Usage Limit Control  <-- ADDED SECTION
CPU_USAGE_PERCENT = 0.40  # Set desired usage (e.g., 0.50 for 50%, 0.75 for 75%)
TOTAL_CORES = multiprocessing.cpu_count()
# Calculate the number of cores to use, ensuring it's at least 1
print('your pc has cores',TOTAL_CORES)
N_JOBS_LIMIT = max(1, int(TOTAL_CORES * CPU_USAGE_PERCENT))

print(
    f"--- System has {TOTAL_CORES} cores. Limiting parallel jobs to {N_JOBS_LIMIT} ({CPU_USAGE_PERCENT * 100:.0f}%) ---")

# Load data
# Make sure the path to your CSV file is correct
df_weather2 = pd.read_csv('/Users/shayan/Desktop/IDS2/Stattkueche/df_weather3.csv',
                          parse_dates=['DateOfCancel', 'DateOfService'])


# =============================================================================
# 3. CUSTOM TRANSFORMER DEFINITIONS
# =============================================================================

class InCVTargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols, smoothing=1.0):
        self.cols = cols
        self.smoothing = smoothing

    def fit(self, X, y):
        self.global_mean_ = y.mean()
        self.mapping_ = {}
        for c in self.cols:
            df = pd.DataFrame({c: X[c], 'target': y})
            agg = df.groupby(c)['target'].agg(['mean', 'count'])
            agg['enc'] = ((agg['count'] * agg['mean'] + self.smoothing * self.global_mean_) / (
                        agg['count'] + self.smoothing))
            self.mapping_[c] = agg['enc']
        return self

    def transform(self, X):
        X = X.copy()
        for c in self.cols:
            X[c + '_te'] = X[c].map(self.mapping_.get(c)).fillna(self.global_mean_)
        return X


class HistCancelRateTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, group_keys=('Site', 'MenuBase'), value_col='net_qty', out_col='hist_cancel_rate'):
        self.group_keys = group_keys
        self.value_col = value_col
        self.out_col = out_col

    def fit(self, X, y=None):
        keys = list(self.group_keys)
        self.hist_ = X.groupby(keys)[self.value_col].mean()
        self.default_ = self.hist_.median()
        return self

    def transform(self, X):
        keys = list(self.group_keys)
        tuples = [tuple(r) for r in X[keys].values]
        X = X.copy()
        X[self.out_col] = [self.hist_.get(t, self.default_) for t in tuples]
        return X


class ClusterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, profile_feats, group_keys=('Site', 'MenuBase'), n_clusters=5, out_col='cluster_id'):
        self.profile_feats = profile_feats
        self.group_keys = group_keys
        self.n_clusters = n_clusters
        self.out_col = out_col

    def fit(self, X, y=None):
        keys = list(self.group_keys)
        prof = (X.groupby(keys)[self.profile_feats].mean().reset_index())
        prof[self.profile_feats] = prof[self.profile_feats].fillna(prof[self.profile_feats].median())
        self.scaler_ = StandardScaler().fit(prof[self.profile_feats])
        scaled = self.scaler_.transform(prof[self.profile_feats])
        self.kmeans_ = KMeans(n_clusters=self.n_clusters, random_state=42, n_init=10).fit(scaled)
        tuples = [tuple(r) for r in prof[keys].values]
        self.cluster_map_ = dict(zip(tuples, self.kmeans_.labels_))
        self.default_ = int(np.median(self.kmeans_.labels_))
        return self

    def transform(self, X):
        X = X.copy()
        keys = list(self.group_keys)
        tuples = [tuple(r) for r in X[keys].values]
        X[self.out_col] = [self.cluster_map_.get(t, self.default_) for t in tuples]
        return X


class MissingFlagImputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy='median'):
        self.strategy = strategy

    def fit(self, X, y=None):
        self.num_cols = X.select_dtypes(include=[np.number]).columns
        clean = X[self.num_cols].replace([np.inf, -np.inf], np.nan)
        self.imputer_ = SimpleImputer(strategy=self.strategy).fit(clean)
        return self

    def transform(self, X):
        X = X.copy()
        X[self.num_cols] = X[self.num_cols].replace([np.inf, -np.inf], np.nan)
        for c in self.num_cols:
            X[c + '_missing'] = X[c].isna().astype(int)
        X[self.num_cols] = self.imputer_.transform(X[self.num_cols])
        return X


class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self, cols_to_drop):
        self.cols_to_drop = cols_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(columns=self.cols_to_drop, errors='ignore')


class FeatureNameSanitizer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        df = X.copy()
        df.columns = [re.sub(r'[^A-Za-z0-9_]+', '', str(col)) for col in df.columns]
        return df


# =============================================================================
# 4. VIF CHECK
# =============================================================================
print("--- Starting VIF Check ---")
num_cols = df_weather2.select_dtypes(include=[np.number]).columns.to_list()
drop_col = ['DateOfOrder', 'DateOfService', 'days_to_cancel', 'CanceledQty', 'OrderId', 'TransactionId',
            'cancel_timing', 'BookingNr', 'DateOfCancel', 'net_qty']
vif_col_1 = [c for c in num_cols if c not in drop_col]

vif_dat_1 = df_weather2[vif_col_1].copy()
vif_dat_1.replace([np.inf, -np.inf], np.nan, inplace=True)
vif_dat_1 = vif_dat_1.dropna()

vif_scores_1 = [variance_inflation_factor(vif_dat_1.values, i) for i in range(vif_dat_1.shape[1])]
vif_table_1 = (
    pd.DataFrame({'feature': vif_col_1, 'VIF': vif_scores_1}).sort_values('VIF', ascending=False).reset_index(
        drop=True))

print('VIF values:')
display(vif_table_1)

hg_vif_1 = vif_table_1.loc[vif_table_1['VIF'] > 10, 'feature'].to_list()
print('\nHigh VIF columns to be aware of (or drop):')
print(hg_vif_1)

# =============================================================================
# 5. STAGE A: CANCELLATION PREDICTION (BINARY CLASSIFICATION)
# =============================================================================
print("\n--- Starting Stage A: Binary Cancellation Prediction ---")
df_A = df_weather2.copy()

# Use sample or full data based on config
if DATA_SAMPLE_FRAC < 1.0:
    print(f"Using a {DATA_SAMPLE_FRAC * 100:.0f}% sample of the data for Stage A.")
    df_A = df_A.sample(frac=DATA_SAMPLE_FRAC, random_state=42)

y_A = (df_A['CanceledQty'] > 0).astype(int)
X_A = df_A.drop(columns=[
    'CanceledQty', 'cancel_timing', 'DateOfOrder', 'DateOfService', 'DateOfCancel',
    'OrderId', 'TransactionId', 'BookingNr', 'hist_cancel_rate', 'GroupName', 'SchoolID'
])

# Define base pipeline steps for Stage A
profile_feats = ['hist_cancel_rate', 'rain_flag', 'temp_dev', 'sin_doy', 'cos_doy', 'month',
                 'day_of_month', 'is_month_end', 'is_month_start', 'tavg_C', 'prcp_mm']
drop_cols_A = ['Site', 'MenuBase', 'MenuName', 'GroupName', 'MenuNorm', 'MenuCode', 'net_qty', 'days_to_cancel']

base_steps_A = [
    ('te', InCVTargetEncoder(cols=['Site', 'MenuBase'], smoothing=0.3)),
    ('hist', HistCancelRateTransformer()),
    ('cluster', ClusterTransformer(profile_feats=profile_feats, n_clusters=5)),
    ('drop', ColumnDropper(drop_cols_A)),
    ('impute', MissingFlagImputer()),
    ('sanitize', FeatureNameSanitizer())
]

tscv = TimeSeriesSplit(n_splits=CV_SPLITS)

# --- Model 1: LightGBM ---
print(f"\nRunning Stage A with LightGBM (CV Splits={CV_SPLITS}, Search Iterations={N_ITER_SEARCH})...")
start_time = time.time()
pipeline_lgbm_A = ImbPipeline(
    base_steps_A + [('clf', LGBMClassifier(objective='binary', random_state=24, n_jobs=N_JOBS_LIMIT))])  # MODIFIED

param_dist_lgbm = {
    'clf__n_estimators': [200, 500, 800],
    'clf__learning_rate': [0.01, 0.03, 0.05],
    'clf__num_leaves': [15, 31, 40],
    'clf__max_depth': [6, 10, -1],
    'clf__subsample': [0.7, 0.9, 1.0],
    'clf__colsample_bytree': [0.7, 0.9, 1.0],
}

search_lgbm_A = RandomizedSearchCV(
    pipeline_lgbm_A, param_distributions=param_dist_lgbm, n_iter=N_ITER_SEARCH,
    cv=tscv, scoring='roc_auc', n_jobs=N_JOBS_LIMIT, random_state=24  # MODIFIED
)
search_lgbm_A.fit(X_A, y_A)
print(f"LGBM finished in {time.time() - start_time:.2f} seconds.")
print(f"Best LGBM ROC AUC: {search_lgbm_A.best_score_:.4f}")
print(f"Best LGBM Params: {search_lgbm_A.best_params_}")

# =============================================================================
# 6. STAGE B: TIMING PREDICTION (MULTICLASS CLASSIFICATION)
# =============================================================================
print("\n--- Starting Stage B: Multiclass Timing Prediction ---")

df_B = df_weather2[df_weather2.cancel_timing != 'no_cancel'].copy()

# Use sample or full data based on config
if DATA_SAMPLE_FRAC < 1.0:
    print(f"Using a {DATA_SAMPLE_FRAC * 100:.0f}% sample of the data for Stage B.")
    df_B = df_B.sample(frac=DATA_SAMPLE_FRAC, random_state=42)

le = LabelEncoder().fit(df_B.cancel_timing)
y_B = le.transform(df_B.cancel_timing)
X_B = df_B.drop(columns=['CanceledQty', 'cancel_timing', 'DateOfOrder', 'DateOfService',
                         'DateOfCancel', 'OrderId', 'TransactionId', 'BookingNr',
                         'hist_cancel_rate', 'GroupName', 'SchoolID'])

# Define a single, robust base pipeline for all Stage B models
drop_cols_B = ['MenuName', 'GroupName', 'MenuNorm', 'MenuCode']
drop_post_B = ['Site', 'MenuBase', 'net_qty', 'days_to_cancel']
base_steps_B = [
    ('te', InCVTargetEncoder(cols=['Site', 'MenuBase'], smoothing=0.3)),
    ('drop', ColumnDropper(drop_cols_B)),
    ('hist', HistCancelRateTransformer()),
    ('cluster', ClusterTransformer(profile_feats=profile_feats, n_clusters=5)),
    ('drop_post', ColumnDropper(drop_post_B)),
    ('impute', MissingFlagImputer()),
    ('sanitize', FeatureNameSanitizer()),
    ('smote', SMOTE(random_state=24))  # Apply SMOTE to all models in Stage B
]

print(f"\nRunning Stage B models (CV Splits={CV_SPLITS}, Search Iterations={N_ITER_SEARCH})...")

# --- Model 1: Random Forest ---
start_time = time.time()
print("\nRunning Stage B with Random Forest...")
pipeline_rf_B = ImbPipeline(base_steps_B + [
    ('clf', RandomForestClassifier(class_weight='balanced', random_state=24, n_jobs=N_JOBS_LIMIT))])  # MODIFIED
param_dist_rf_B = {
    'clf__n_estimators': [200, 600, 1000],
    'clf__max_depth': [8, 10, 20],
    'clf__min_samples_split': [5, 10, 20],
    'clf__max_features': ['sqrt', 0.8],
}
search_rf_B = RandomizedSearchCV(pipeline_rf_B, param_distributions=param_dist_rf_B, n_iter=N_ITER_SEARCH, cv=tscv,
                                 scoring='roc_auc_ovo_weighted', n_jobs=N_JOBS_LIMIT, random_state=24)  # MODIFIED
search_rf_B.fit(X_B, y_B)
print(f"RF finished in {time.time() - start_time:.2f} seconds.")
print(f"Best RF ROC AUC (OVO weighted): {search_rf_B.best_score_:.4f}")
print(f"Best RF Params: {search_rf_B.best_params_}")

# --- Model 2: LightGBM ---
start_time = time.time()
print("\nRunning Stage B with LightGBM...")
pipeline_lgbm_B = ImbPipeline(base_steps_B + [('clf', LGBMClassifier(objective='multiclass', num_class=len(le.classes_),
                                                                     random_state=24,
                                                                     n_jobs=N_JOBS_LIMIT))])  # MODIFIED
param_dist_lgbm_B = {
    'clf__n_estimators': [200, 600, 800],
    'clf__learning_rate': [0.01, 0.05, 0.1],
    'clf__num_leaves': [20, 31, 40],
    'clf__subsample': [0.7, 1.0],
    'clf__colsample_bytree': [0.7, 1.0]
}
search_lgbm_B = RandomizedSearchCV(pipeline_lgbm_B, param_distributions=param_dist_lgbm_B, n_iter=N_ITER_SEARCH,
                                   cv=tscv, scoring='roc_auc_ovo_weighted', n_jobs=N_JOBS_LIMIT,
                                   random_state=24)  # MODIFIED
search_lgbm_B.fit(X_B, y_B)
print(f"LGBM finished in {time.time() - start_time:.2f} seconds.")
print(f"Best LGBM ROC AUC (OVO weighted): {search_lgbm_B.best_score_:.4f}")
print(f"Best LGBM Params: {search_lgbm_B.best_params_}")

# --- Model 3: CatBoost ---
start_time = time.time()
print("\nRunning Stage B with CatBoost...")
pipeline_cat_B = ImbPipeline(base_steps_B + [('clf', CatBoostClassifier(auto_class_weights='Balanced',
                                                                        loss_function='MultiClass',
                                                                        early_stopping_rounds=50, verbose=False,
                                                                        random_seed=24,
                                                                        thread_count=N_JOBS_LIMIT))])  # MODIFIED
param_dist_cat_B = {
    'clf__iterations': [500, 800, 1000],
    'clf__learning_rate': [0.03, 0.05, 0.1],
    'clf__depth': [6, 8, 10],
    'clf__l2_leaf_reg': [1, 3, 7],
    'clf__rsm': [0.7, 0.9, 1.0]  # 'colsample_bylevel'
}
search_cat_B = RandomizedSearchCV(pipeline_cat_B, param_distributions=param_dist_cat_B, n_iter=N_ITER_SEARCH, cv=tscv,
                                  scoring='roc_auc_ovo', n_jobs=N_JOBS_LIMIT, random_state=24)  # MODIFIED
search_cat_B.fit(X_B, y_B)
print(f"CatBoost finished in {time.time() - start_time:.2f} seconds.")
print(f"Best CatBoost ROC AUC (OVO): {search_cat_B.best_score_:.4f}")
print(f"Best CatBoost Params: {search_cat_B.best_params_}")

print("\n--- Script Finished ---")

your pc has cores 10
--- System has 10 cores. Limiting parallel jobs to 4 (40%) ---
--- Starting VIF Check ---


  vif = 1. / (1. - r_squared_i)


VIF values:


Unnamed: 0,feature,VIF
0,afternoon,inf
1,morning,inf
2,evening,inf
3,day_of_year,31028.98
4,month,30864.69
5,day_of_month,130.0759
6,quarter,22.09046
7,MenuSubsidy,6.593575
8,MenuPrice,6.495096
9,order_hour,5.359547



High VIF columns to be aware of (or drop):
['afternoon', 'morning', 'evening', 'day_of_year', 'month', 'day_of_month', 'quarter']

--- Starting Stage A: Binary Cancellation Prediction ---
Using a 10% sample of the data for Stage A.

Running Stage A with LightGBM (CV Splits=2, Search Iterations=5)...
[LightGBM] [Info] Number of positive: 3828, number of negative: 327292
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014267 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1795
[LightGBM] [Info] Number of data points in the train set: 331120, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.011561 -> initscore=-4.448510
[LightGBM] [Info] Start training from score -4.448510
[LightGBM] [Info] Number of positive: 3828, number of negative: 327292
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhe




CatBoost finished in 42.40 seconds.
Best CatBoost ROC AUC (OVO): 0.8548
Best CatBoost Params: {'clf__rsm': 0.7, 'clf__learning_rate': 0.03, 'clf__l2_leaf_reg': 3, 'clf__iterations': 1000, 'clf__depth': 8}

--- Script Finished ---
