In [None]:
# =============================================================================
# 1. IMPORTS & SETUP
# =============================================================================
import os
import re
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Sklearn & related imports
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Set plot style
plt.style.use('seaborn-v0_8-whitegrid')
pd.set_option('display.max_columns', 50)


# Load data
# The CSV location can be overridden without editing the notebook by setting the
# STATTKUECHE_DATA_PATH environment variable; when it is unset the original
# location is used.
DATA_PATH = Path(os.environ.get(
    'STATTKUECHE_DATA_PATH',
    '/Users/shayan/Desktop/IDS2/Stattkueche/df_weather3.csv',
))
df = pd.read_csv(DATA_PATH, parse_dates=['DateOfService'])



In [None]:
# =============================================================================
# 2. REUSABLE CUSTOM TRANSFORMERS
# (Copied from the original script)
# =============================================================================

class HistCancelRateTransformer(BaseEstimator, TransformerMixin):
    """Add the per-group mean of the training target as a feature.

    ``fit`` averages ``y`` within every combination of ``group_keys``;
    ``transform`` writes that average into ``out_col`` and falls back to the
    overall mean of ``y`` for combinations that were not seen during ``fit``.

    Parameters
    ----------
    group_keys : sequence of str
        Columns of ``X`` that identify a group.
    value_col : str
        Kept for interface compatibility. It is stored but never read: the
        statistic is always computed from the ``y`` passed to ``fit``.
    out_col : str
        Name of the column created by ``transform``.

    Notes
    -----
    When ``transform`` is applied to the same rows that were used in ``fit``,
    each row's own target value contributes to its group mean, so the feature
    carries in-sample target information for those rows.
    """

    def __init__(self, group_keys=('Site','MenuBase'), value_col='net_qty', out_col='hist_cancel_rate'):
        self.group_keys = group_keys
        self.value_col  = value_col
        self.out_col    = out_col

    def fit(self, X, y=None):
        """Learn the per-group and overall mean of ``y``.

        Raises
        ------
        ValueError
            If ``y`` is missing or its length differs from ``X``.
        """
        if y is None:
            raise ValueError("HistCancelRateTransformer.fit requires y (the regression target).")
        keys = list(self.group_keys)
        # Pair X rows and y values by position so that a y with a different
        # index (or a plain array) cannot be silently misaligned.
        target = pd.Series(np.asarray(y), name='target')
        if len(target) != len(X):
            raise ValueError("X and y must have the same number of rows.")
        key_frame = X[keys].reset_index(drop=True)
        self.hist_    = target.groupby([key_frame[k] for k in keys]).mean()
        self.default_ = target.mean()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``out_col`` filled from the learned means."""
        keys  = list(self.group_keys)
        index = self.hist_.index
        # A single group key yields a flat index of scalars; wrap them so that
        # the lookup keys are tuples in every case.
        group_ids = index if isinstance(index, pd.MultiIndex) else [(v,) for v in index]
        # One plain dict gives O(1) lookups per row instead of an index search.
        lookup = dict(zip(group_ids, self.hist_.to_numpy()))
        X = X.copy()
        X[self.out_col] = [
            lookup.get(row, self.default_)
            for row in X[keys].itertuples(index=False, name=None)
        ]
        return X

class ClusterTransformer(BaseEstimator, TransformerMixin):
    """Assign every group a KMeans cluster id based on its mean feature profile.

    ``fit`` averages the available ``profile_feats`` per combination of
    ``group_keys``, standardises those profiles and clusters them with KMeans.
    ``transform`` writes the cluster id of each row's group into ``out_col`` as
    a pandas categorical; groups not seen during ``fit`` get the most frequent
    cluster.

    Parameters
    ----------
    profile_feats : sequence of str
        Candidate columns for the profile. Names missing from ``X`` are skipped.
    group_keys : sequence of str
        Columns of ``X`` that identify a group.
    n_clusters : int
        Requested number of clusters; capped at the number of groups found.
    out_col : str
        Name of the column created by ``transform``.
    """

    def __init__(self, profile_feats, group_keys=('Site','MenuBase'),
                 n_clusters=5, out_col='cluster_id'):
        self.profile_feats = profile_feats
        self.group_keys    = group_keys
        self.n_clusters    = n_clusters
        self.out_col       = out_col

    def fit(self, X, y=None):
        """Build the group profiles and fit the scaler and KMeans model.

        Raises
        ------
        ValueError
            If none of ``profile_feats`` is a column of ``X``.
        """
        keys = list(self.group_keys)
        # Only profile features that actually exist in X can be aggregated.
        feats = [f for f in self.profile_feats if f in X.columns]
        if not feats:
            raise ValueError("None of profile_feats is present in X; cannot build cluster profiles.")
        prof = X.groupby(keys)[feats].mean().reset_index()
        # Median-fill gaps; a feature that is NaN for every group has no median,
        # so fall back to 0 to keep NaN away from KMeans.
        profile = prof[feats].fillna(prof[feats].median()).fillna(0.0)
        self.scaler_ = StandardScaler().fit(profile)
        scaled       = self.scaler_.transform(profile)
        # KMeans cannot form more clusters than there are profiles.
        k = max(1, min(self.n_clusters, len(prof)))
        self.kmeans_ = KMeans(n_clusters=k, random_state=42, n_init='auto').fit(scaled)
        labels = [int(label) for label in self.kmeans_.labels_]
        self.cluster_map_ = dict(zip(prof[keys].itertuples(index=False, name=None), labels))
        # Cluster ids are nominal, so the fallback is the most frequent id
        # rather than a median of the labels.
        self.default_ = int(np.bincount(labels).argmax())
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the categorical ``out_col`` added."""
        X = X.copy()
        keys = list(self.group_keys)
        ids = [
            self.cluster_map_.get(row, self.default_)
            for row in X[keys].itertuples(index=False, name=None)
        ]
        # Use the full, fixed category set so the encoding does not depend on
        # which clusters happen to occur in this particular batch.
        X[self.out_col] = pd.Categorical(ids, categories=list(range(self.kmeans_.n_clusters)))
        return X

class MissingFlagImputer(BaseEstimator, TransformerMixin):
    """Impute numeric columns and add ``<col>_missing`` indicator columns.

    Infinite values are treated as missing. The set of indicator columns is
    decided once in ``fit`` (numeric columns that contain a missing value in the
    fitting data) and reused by every ``transform`` call, so the output always
    has the same columns regardless of which values are missing in a batch.

    Parameters
    ----------
    strategy : str
        Strategy forwarded to ``SimpleImputer``.
    """

    def __init__(self, strategy='median'):
        self.strategy = strategy

    def fit(self, X, y=None):
        """Record the numeric columns, the flagged columns and fit the imputer."""
        self.num_cols = X.select_dtypes(include=[np.number]).columns
        clean = X[self.num_cols].replace([np.inf,-np.inf], np.nan)
        # Columns with gaps in the fitting data get an indicator in every transform.
        self.flag_cols_ = [c for c in self.num_cols if clean[c].isna().any()]
        # keep_empty_features=True stops the imputer from dropping columns that
        # are entirely missing, which would break the assignment in transform.
        self.imputer_ = SimpleImputer(
            strategy=self.strategy, keep_empty_features=True
        ).fit(clean)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with indicators added and numeric gaps filled."""
        X = X.copy()
        X[self.num_cols] = X[self.num_cols].replace([np.inf,-np.inf], np.nan)
        # Indicators must be computed before imputation overwrites the gaps.
        for c in self.flag_cols_:
            X[c + '_missing'] = X[c].isna().astype(int)
        X[self.num_cols] = self.imputer_.transform(X[self.num_cols])
        return X

class ColumnDropper(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed list of columns."""

    def __init__(self, cols_to_drop):
        self.cols_to_drop = cols_to_drop

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` without ``cols_to_drop``; names absent from ``X`` are skipped."""
        unwanted = self.cols_to_drop
        trimmed = X.drop(columns=unwanted, errors='ignore')
        return trimmed

class FeatureNameSanitizer(BaseEstimator, TransformerMixin):
    """Reduce column names to ``[A-Za-z0-9_]`` while keeping them unique.

    Stripping characters can turn two different names into the same string, or
    a name into an empty string. Empty results become ``'feature'`` and repeated
    results get a numeric suffix (``_1``, ``_2``, ...), assigned in column order
    so the same input columns always map to the same output names.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with sanitized, de-duplicated column names."""
        df = X.copy()
        used = set()
        names = []
        for col in df.columns:
            base = re.sub(r'[^A-Za-z0-9_]+', '', str(col)) or 'feature'
            name, suffix = base, 0
            # Bump the suffix until the name is not taken by an earlier column.
            while name in used:
                suffix += 1
                name = f'{base}_{suffix}'
            used.add(name)
            names.append(name)
        df.columns = names
        return df



In [None]:
# =============================================================================
# 3. BASELINE MODEL: DATA PREPARATION
# =============================================================================
# Regression target: the quantity that was actually canceled.
TARGET = 'CanceledQty'

# Columns removed before modelling: identifiers, high-cardinality fields,
# redundant fields, and fields that would leak the outcome.
COLS_TO_DROP = [
    'OrderId',
    'TransactionId',
    'BookingNr',
    'SchoolID',
    'GroupName',
    'DateOfOrder',
    'DateOfCancel',
    'DateOfService',
    'MenuName',
    'MenuNorm',
    'MenuCode',
    'cancel_timing',  # only known after the event (leakage)
]

# Columns averaged per group to build the profiles that get clustered.
CLUSTER_PROFILE_FEATS = [
    'hist_cancel_rate',
    'rain_flag',
    'temp_dev',
    'sin_doy',
    'cos_doy',
    'month',
    'day_of_month',
    'tavg_C',
    'prcp_mm',
]

# Columns meant to be handled as categorical by LightGBM.
CATEGORICAL_FEATURES = ['Site', 'MenuBase']

# Put the rows in chronological order of service date, as needed for
# time-series splitting, and renumber them from zero.
df = df.sort_values(by='DateOfService').reset_index(drop=True)

# Separate the feature frame (X) from the target vector (y).
X, y = df.drop(columns=[TARGET]), df[TARGET]



In [None]:
# =============================================================================
# 4. BASELINE MODEL: PIPELINE DEFINITION & TRAINING
# =============================================================================
# We build a single, streamlined pipeline.
# Note: For this baseline, we use default LGBM parameters.

class CategoricalCaster(BaseEstimator, TransformerMixin):
    """Cast selected columns to pandas ``category`` dtype with fixed categories.

    The categories of each column are learned in ``fit`` and reused in
    ``transform``, so a value is encoded the same way in every batch; values
    not seen during ``fit`` become missing. Columns absent from ``X`` are skipped.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Remember the categories observed in each selected column."""
        self.categories_ = {
            c: pd.Categorical(X[c]).categories
            for c in self.columns if c in X.columns
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns as categoricals."""
        X = X.copy()
        for c, cats in self.categories_.items():
            if c in X.columns:
                X[c] = pd.Categorical(X[c], categories=cats)
        return X


pipeline = Pipeline([
    # Step 1: Drop irrelevant or leaky columns first.
    ('initial_drop', ColumnDropper(cols_to_drop=COLS_TO_DROP)),

    # Step 2: Create the historical cancellation rate feature.
    # We've adapted this to use the regression target 'CanceledQty'.
    ('hist_rate', HistCancelRateTransformer(value_col=TARGET)),

    # Step 3: Create customer profile clusters.
    ('cluster', ClusterTransformer(profile_feats=CLUSTER_PROFILE_FEATS)),

    # Step 4: Impute missing numerical values and add flags.
    ('impute', MissingFlagImputer(strategy='median')),

    # Step 5: Hand the declared categorical columns to the model as pandas
    # categoricals. This runs after the grouping steps so that they still
    # group on the raw key values.
    ('categorical', CategoricalCaster(columns=CATEGORICAL_FEATURES)),

    # Step 6: Sanitize feature names to remove special characters.
    ('sanitize', FeatureNameSanitizer()),

    # Step 7: The regression model.
    ('regressor', lgb.LGBMRegressor(random_state=42, n_jobs=-1))
])

# Use TimeSeriesSplit for cross-validation to respect the data's temporal order.
tscv = TimeSeriesSplit(n_splits=5)

# --- Cross-Validation ---
# We calculate negative RMSE because scikit-learn convention maximizes scores.
print("🚀 Starting 5-fold time-series cross-validation...")
scores = cross_val_score(
    pipeline, X, y,
    cv=tscv,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1
)
# Flip the sign back to obtain a positive RMSE.
avg_rmse = -scores.mean()
print(f"✅ Cross-validation complete.\nAverage RMSE: {avg_rmse:.4f}")


# --- Final Model Training ---
# Train the pipeline on the full dataset to create the final model.
print("\n⚙️ Training final model on the entire dataset...")
pipeline.fit(X, y)
print("✅ Final model trained.")



In [None]:
# =============================================================================
# 5. BASELINE MODEL: EVALUATION & VISUALIZATION
# =============================================================================
# To evaluate, we need predictions. We'll get them from the last fold of TSCV.
train_indices, val_indices = list(tscv.split(X))[-1]
X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]

# Fit a *clone* of the pipeline on the training part of the split. Calling
# pipeline.fit here would refit the final model in place and discard what it
# learned from the full dataset.
print("\n📊 Generating predictions on a hold-out validation set for evaluation...")
eval_pipeline = clone(pipeline).fit(X_train, y_train)
y_pred = eval_pipeline.predict(X_val)

# Ensure predictions are non-negative integers (as CanceledQty can't be fractional or negative)
y_pred = np.maximum(0, y_pred).round()

# --- Performance Metrics ---
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("\n--- Model Performance on Validation Set ---")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE):     {mae:.4f}")
print(f"R-squared (R²):                {r2:.4f}")
print("-------------------------------------------")


# --- Visualizations ---

# 1. Predicted vs. Actual Plot
plt.figure(figsize=(10, 8))
ax = sns.regplot(x=y_val, y=y_pred,
                 scatter_kws={'alpha': 0.2, 's': 10, 'color': 'skyblue'},
                 line_kws={'color': 'red', 'linewidth': 2, 'linestyle': '--'})
# The fitted regression line is the only Line2D on the axes at this point.
reg_line = ax.lines[0]
lo, hi = y_val.min(), y_val.max()
perfect_line, = ax.plot([lo, hi], [lo, hi], color='black', linestyle='-', linewidth=1)
plt.title('Predicted vs. Actual Canceled Quantities', fontsize=16, pad=20)
plt.xlabel('Actual Values', fontsize=12)
plt.ylabel('Predicted Values', fontsize=12)
# Pass the handles explicitly so each label is attached to the intended line
# instead of whichever artists happen to come first on the axes.
ax.legend([reg_line, perfect_line], ['Regression Line', 'Perfect Fit (y=x)'])
plt.show()

# 2. Residuals Plot
residuals = y_val - y_pred
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_pred, y=residuals, alpha=0.3, s=15, color='green')
plt.axhline(0, color='red', linestyle='--')
plt.title('Residuals vs. Predicted Values', fontsize=16, pad=20)
plt.xlabel('Predicted Values', fontsize=12)
plt.ylabel('Residuals (Actual - Predicted)', fontsize=12)
plt.show()

# 3. Feature Importance Plot
# Extract the trained model from the final pipeline. The feature names are read
# from the model itself, so they match feature_importances_ one-to-one without
# re-running the preprocessing steps on X.
lgbm_model = pipeline.named_steps['regressor']
sanitized_cols = lgbm_model.feature_name_

feature_importance = pd.DataFrame({
    'feature': sanitized_cols,
    'importance': lgbm_model.feature_importances_
}).sort_values('importance', ascending=False).head(20)

plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=feature_importance, palette='viridis')
plt.title('Top 20 Most Important Features', fontsize=16, pad=20)
plt.xlabel('Importance Score', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.tight_layout()
plt.show()