# Imports

In [5]:
import numpy as np 
import pandas as pd
import datetime as dt
import seaborn as sns
from colorama import Style, Fore
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, ElasticNet
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler, SplineTransformer, FunctionTransformer
from category_encoders import OneHotEncoder, TargetEncoder
from datetime import datetime
from lightgbm import LGBMRegressor
from scipy.optimize import differential_evolution, minimize
from xgboost import XGBRegressor
from sklearn.pipeline import make_pipeline, Pipeline
import gc
from scipy.signal import periodogram
from scipy.stats import kurtosis
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
from sklearn.base import clone, BaseEstimator, TransformerMixin
from matplotlib.ticker import MaxNLocator
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import PredictionErrorDisplay, mean_absolute_error
import os
from sklearn.kernel_approximation import Nystroem

# Loading Data

## Data Preprocessing

In [6]:
# Load datasets
def load_data(file_path, parse_dates=None):
    """Load a CSV file into a DataFrame."""
    return pd.read_csv(file_path, parse_dates=parse_dates)

# Merge datasets
def merge_datasets(base_df, merge_df, on_columns, how='left'):
    """Merge two DataFrames."""
    return pd.merge(base_df, merge_df, how=how, on=on_columns)

def preprocess_data(sales_train, sales_test, inventory, calendar):
    """Perform preprocessing steps: merging, indexing, and cleaning."""
    # Merge inventory data first
    sales_train = merge_datasets(
        sales_train, 
        inventory,
        on_columns=['unique_id', 'warehouse']
    )
    sales_test = merge_datasets(
        sales_test, 
        inventory,
        on_columns=['unique_id', 'warehouse']
    )
    
    # Merge calendar data
    sales_train = merge_datasets(
        sales_train,
        calendar,
        on_columns=['date', 'warehouse']
    )
    sales_test = merge_datasets(
        sales_test,
        calendar,
        on_columns=['date', 'warehouse']
    )
    
    # Set index and sort
    for df in [sales_train, sales_test]:
        df.set_index('date', inplace=True)
        df.sort_index(inplace=True)
    
    # Drop unnecessary columns
    sales_train.drop(['availability'], axis=1, inplace=True, errors='ignore')
    
    return sales_train, sales_test

# Handle categorical features
def process_categorical_columns(sales_train, sales_test, cat_cols):
    """Handle categorical columns by ensuring consistent categories."""
    for c in cat_cols:
        train_unique = sales_train[c].fillna('None').astype(str).unique()
        test_unique = sales_test[c].fillna('None').astype(str).unique()
        diff_values = np.setdiff1d(test_unique, train_unique)
        
        if diff_values.size > 0:
            print(f"Unseen categories in test for {c}: {diff_values}")
            sales_train.loc[~sales_train[c].isin(diff_values), c] = 'None'
        
        sales_train[c] = sales_train[c].astype('category')
        sales_test[c] = sales_test[c].astype('category')

    return sales_train, sales_test

# Handle missing values
def handle_missing_values(sales_train):
    """Handle missing values in the sales_train DataFrame."""
    # Analyze missing data
    missing_analysis = (
        sales_train.loc[sales_train['sales'].isnull()]
        .reset_index()
        .groupby(['warehouse'], observed=False)
        .agg(
            size=('warehouse', 'size'),
            min_date=('date', 'min'),
            max_date=('date', 'max'),
            days=('date', lambda x: x.max() - x.min()),
            split_date=('date', lambda x: list(np.unique(x.dt.strftime('%Y-%m-%d'))))
        )
        .dropna()
    )
    print(missing_analysis)
    
    # Fill missing values
    sales_train['sales'] = sales_train['sales'].fillna(0)
    sales_train['total_orders'] = sales_train['total_orders'].fillna(0)
    sales_train['sell_price_main'] = sales_train['sell_price_main'].interpolate()
    
    return sales_train

In [7]:
from category_encoders import CatBoostEncoder, TargetEncoder

def encode_product_id(df, target_col, sigma=0.1):
    """
    Encodes the 'product_unique_id' column using CatBoostEncoder with regularization.
    """
    encoder = CatBoostEncoder(cols=['product_unique_id'], sigma=sigma)
    df['product_unique_id'] = encoder.fit_transform(df['product_unique_id'], df[target_col])
    return df

def encode_warehouse(df, target_col, smoothing=10):
    """
    Encodes the 'warehouse' column using TargetEncoder with smoothing.
    """
    warehouse_encoder = TargetEncoder(cols=['warehouse'], smoothing=smoothing)
    df['warehouse'] = warehouse_encoder.fit_transform(df['warehouse'], df[target_col])
    return df

def encode_hierarchical_categories(df, target_col, levels, smoothing=5):
    """
    Encodes hierarchical category columns (e.g., L1 → L4) using TargetEncoder with smoothing.
    """
    for level in levels:
        col = f"{level}_category_name_en"
        encoder = TargetEncoder(cols=[col], smoothing=smoothing)
        df[f"{col}"] = encoder.fit_transform(df[col], df[target_col])
    return df


def main_encoding_pipeline(df):
    """
    Main encoding pipeline for the dataset.
    Applies all encoding methods: product ID, warehouse, hierarchical categories, and holiday frequency.
    """
    target_col = 'sales'
    hierarchical_levels = ['L1', 'L2', 'L3', 'L4']
    
    # Encode product_unique_id
    df = encode_product_id(df, target_col)
    
    # Encode warehouse
    df = encode_warehouse(df, target_col)
    
    # Encode hierarchical categories
    df = encode_hierarchical_categories(df, target_col, hierarchical_levels)

    df = df.drop('holiday_name', axis=1)
    
    return df

def propagate_encodings(train_df, test_df):
    """
    Propagates encodings from training data to test data.
    """
    # Product ID encoding
    product_encoder = CatBoostEncoder(cols=['product_unique_id'])
    product_encoder.fit(train_df['product_unique_id'], train_df['sales'])
    test_df['product_unique_id'] = product_encoder.transform(test_df['product_unique_id'])

    # Warehouse encoding
    warehouse_encoder = TargetEncoder(cols=['warehouse'])
    warehouse_encoder.fit(train_df['warehouse'], train_df['sales'])
    test_df['warehouse'] = warehouse_encoder.transform(test_df['warehouse'])

    # Hierarchical categories
    for level in ['L1', 'L2', 'L3', 'L4']:
        col = f"{level}_category_name_en"
        encoder = TargetEncoder(cols=[col])
        encoder.fit(train_df[col], train_df['sales'])
        test_df[f"{col}"] = encoder.transform(test_df[col])
    
    return test_df

In [8]:
def preprocessing_pipeline(config):
    """Preprocessing pipeline with encoding integration"""
    # Load raw data
    sales_train = load_data(config['train_path'], parse_dates=['date'])
    sales_test = load_data(config['test_path'], parse_dates=['date'])
    inventory = load_data(config['inventory_path'])
    calendar = load_data(config['calendar_path'], parse_dates=['date'])

    # Core preprocessing
    def _base_preprocessing(df, is_train=True):
        df = merge_datasets(df, inventory, ['unique_id', 'warehouse'])
        df = merge_datasets(df, calendar, ['date', 'warehouse'])
        df = df.set_index('date').sort_index()
        df = df.drop(columns=['availability'], errors='ignore')
        return df

    # Process datasets
    sales_train = _base_preprocessing(sales_train, is_train=True)
    sales_test = _base_preprocessing(sales_test, is_train=False)

    # Handle missing values
    sales_train = handle_missing_values(sales_train)
    
    # Process categorical columns
    cat_cols = ['warehouse', 'L1_category_name_en', 'L2_category_name_en',
                'L3_category_name_en', 'L4_category_name_en']
    sales_train, sales_test = process_categorical_columns(
        sales_train, sales_test, cat_cols
    )

    # Apply encoding pipeline
    sales_train = main_encoding_pipeline(sales_train)
    
    # Propagate encodings to test data
    sales_test = propagate_encodings(sales_train, sales_test)

    # Validate preprocessing
    _validate_preprocessing(sales_train, sales_test)
    
    return sales_train, sales_test

def _validate_preprocessing(train_df, test_df):
    """Validation with encoding checks"""
    # Core columns
    required_columns = [
        'unique_id', 'warehouse', 'sales', 'sell_price_main',
        'type_0_discount', 'L1_category_name_en'
    ]
    
    # Encoded columns
    encoded_columns = [
        'product_unique_id', 'warehouse',
        'L1_category_name_en', 'L2_category_name_en',
        'L3_category_name_en', 'L4_category_name_en'
    ]

    # Check train data
    missing_train = set(required_columns + encoded_columns) - set(train_df.columns)
    if missing_train:
        raise ValueError(f"Missing columns in train: {missing_train}")

    # Check test data (excluding sales)
    missing_test = set(required_columns + encoded_columns) - set(test_df.columns)
    missing_test.discard('sales')  # Sales is expected to be missing in test
    if missing_test:
        raise ValueError(f"Missing columns in test: {missing_test}")

    print("Preprocessing validation passed successfully")

In [9]:
# 1. Run preprocessing
config = {
    'train_path': 'data/sales_train.csv',
    'test_path': 'data/sales_test.csv',
    'inventory_path': 'data/inventory.csv',
    'calendar_path': 'data/calendar.csv'
}

preprocessed_train, preprocessed_test = preprocessing_pipeline(config)

             size   min_date   max_date    days  \
warehouse                                         
Frankfurt_1     6 2021-12-09 2021-12-10  1 days   
Munich_1       46 2021-05-21 2021-07-11 51 days   

                                                    split_date  
warehouse                                                       
Frankfurt_1                           [2021-12-09, 2021-12-10]  
Munich_1     [2021-05-21, 2021-05-22, 2021-05-23, 2021-05-2...  
Preprocessing validation passed successfully


# Feature Engineering


| Feature              | Encoder                  | Reason                                    |
|----------------------|--------------------------|------------------------------------------|
| product_unique_id    | CatBoost/LeaveOneOut    | High cardinality, time-series safety     |
| warehouse            | Target Encoding         | Moderate cardinality, relationship to sales |
| L1/L2/L3/L4_category | Target + Smoothing      | Hierarchical structure, stability        |
| holiday_name         | Frequency Encoding      | Low cardinality, frequency relevance     |
| Binary features      | Keep as 0/1             | No encoding needed                       |


In [10]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class CreateTimeFeatures(BaseEstimator, TransformerMixin):
    """
    Creates time features from either a datetime column or index
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        
        # Handle datetime index
        if isinstance(df.index, pd.DatetimeIndex):
            date_series = df.index.to_series()  # Convert to Series for consistency
        # Handle date column
        elif 'date' in df.columns:
            date_series = pd.to_datetime(df['date'])
        else:
            raise ValueError("Input must have either DatetimeIndex or 'date' column")

        # Create time features
        df['year'] = date_series.dt.year
        df['month'] = date_series.dt.month
        df['weekday'] = date_series.dt.weekday
        df['week'] = date_series.dt.isocalendar().week.astype(int)
        df['weekend'] = (date_series.dt.weekday >= 5).astype(int)
        df['semiweekly'] = np.where(date_series.dt.weekday < 3, 0, 1)

        # Cyclical features
        df['month_sin'] = np.sin(df['month'] * (2 * np.pi / 12))
        df['month_cos'] = np.cos(df['month'] * (2 * np.pi / 12))
        
        return df

def calculate_aggregates(df, group_col):
    """Calculate aggregates using date column"""
    return (
        df.groupby(group_col, observed=False)
        .agg(
            days_in_sale=('date', 'nunique'),
            purchase_interval=('date', lambda x: (x.max() - x.min()).days)
        )
        .reset_index()
    )

def calculate_date_gaps(df, group_col):
    """Calculate date gaps using date column"""
    df = df.sort_values([group_col, 'date'])
    df['date_diff'] = (
        df.groupby(group_col, observed=False)['date']
        .diff()
        .dt.days
        .fillna(0)
    )
    df['gap'] = df['date_diff'] > 1
    df['gap_group'] = df.groupby(group_col, observed=False)['gap'].cumsum()

    gap_agg_df = (
        df.groupby([group_col, 'gap_group'], observed=False)['date_diff']
        .max()
        .reset_index()
        .groupby(group_col, observed=False)['date_diff']
        .max()
        .rename('days_without_sale')
    )
    return gap_agg_df

def feature_engineering_pipeline(train_df, test_df):
    """Feature engineering pipeline with proper date handling"""

    # 1. Make sure the index is named 'date' if it's a DatetimeIndex
    train_df.index.name = 'date'
    test_df.index.name = 'date'
    
    # 2. Create temporary copies with 'date' column
    train_temp = train_df.reset_index()
    test_temp = test_df.reset_index()

    # 3. Time features
    time_featurizer = CreateTimeFeatures()
    train_temp = time_featurizer.fit_transform(train_temp)
    test_temp = time_featurizer.transform(test_temp)

    # 4. Calculate aggregates
    agg_features = calculate_aggregates(train_temp, 'name')
    train_temp = train_temp.merge(agg_features, on='name', how='left')
    test_temp = test_temp.merge(agg_features, on='name', how='left')

    # 5. Calculate date gaps
    gap_features = calculate_date_gaps(train_temp, 'name')
    train_temp = train_temp.merge(gap_features, on='name', how='left')
    test_temp = test_temp.merge(gap_features, on='name', how='left')

    # 6. Restore index
    # Because we renamed the index above, 'date' is now a valid column
    train_temp = train_temp.set_index('date')
    test_temp = test_temp.set_index('date')

    return train_temp, test_temp


def _validate_feature_engineering(train_df, test_df):
    """Validation for feature engineering outputs"""
    expected_features = [
        'year', 'month', 'weekday', 'days_in_sale',
        'purchase_interval', 'days_without_sale'
    ]
    
    missing_train = set(expected_features) - set(train_df.columns)
    missing_test = set(expected_features) - set(test_df.columns)
    
    if missing_train:
        raise ValueError(f"Missing engineered features in train: {missing_train}")
    if missing_test:
        raise ValueError(f"Missing engineered features in test: {missing_test}")

    print("Feature engineering validation passed successfully")


In [11]:
# 2. Run feature engineering 
engineered_train, engineered_test = feature_engineering_pipeline(
    preprocessed_train.copy(),
    preprocessed_test.copy()
)

In [76]:
engineered_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4007419 entries, 2020-08-01 to 2024-06-02
Data columns (total 41 columns):
 #   Column                       Dtype   
---  ------                       -----   
 0   unique_id                    int64   
 1   warehouse                    category
 2   total_orders                 float64 
 3   sales                        float64 
 4   sell_price_main              float64 
 5   type_0_discount              float64 
 6   type_1_discount              float64 
 7   type_2_discount              float64 
 8   type_3_discount              float64 
 9   type_4_discount              float64 
 10  type_5_discount              float64 
 11  type_6_discount              float64 
 12  product_unique_id            int64   
 13  name                         object  
 14  L1_category_name_en          category
 15  L2_category_name_en          category
 16  L3_category_name_en          category
 17  L4_category_name_en          category
 18  holiday

In [12]:
test_weights = pd.read_csv('data/test_weights.csv')
target = 'sales'
weight_map = test_weights.set_index('unique_id')['weight'].to_dict()

oofs = {}
scores = {}
test_preds = {}
COMPUTE_TEST = True

def cross_validate(estimator, features, plot_residuals=False, fit_params={}):
    kf = TimeSeriesSplit(n_splits=5,test_size=dt.timedelta(weeks=2).days)
    X = engineered_train[features].copy()
    y = engineered_train[target]
       
    model = clone(estimator)
    val_preds = np.zeros(len(X))
    list_scores = []
    
    for fold, (trx_idx, val_idx) in enumerate(kf.split(X,y,groups=X['warehouse'])):        
        X_train, y_train = X.iloc[trx_idx], y.iloc[trx_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]      
                
        model.fit(X_train.drop('unique_id',axis=1),y_train, **fit_params)
        y_pred = model.predict(X_val.drop('unique_id',axis=1)).clip(0,None)
        val_preds[val_idx] += y_pred
        wmape = mean_absolute_error(y_val,y_pred,sample_weight=X_val["unique_id"].map(weight_map).values)
        list_scores.append(wmape)
        
        print(f' #{fold} - wmae: {wmape}')
        if plot_residuals:
            display = PredictionErrorDisplay.from_predictions(y_val,y_pred)            
            plt.show()
    if isinstance(model,Pipeline):
        name_model = type(model[-1]).__name__
    else:
        name_model = type(model).__name__                              
    

    oofs[name_model] = val_preds
    scores[name_model] = list_scores
    print(f'wmae mean: {np.mean(list_scores)}')   
    
    if COMPUTE_TEST:
        print('Computing Test prediction....')
        model = clone(estimator)
        model.fit(X,y)
        
        test_pred = model.predict(engineered_test[features]).clip(0,None)
        test_preds[name_model] = test_pred
        print('Computing Test prediction - Ok')

In [13]:
features = engineered_train.drop('sales', axis=1).columns.tolist()

In [15]:
# Define the pipeline
lgb_pipeline = make_pipeline(
    TargetEncoder(cols=['name']),
    LGBMRegressor(verbosity=-1)
)

# Run cross-validation with filtered features
cross_validate(lgb_pipeline, features)



 #0 - wmae: 18.406316640244796
 #1 - wmae: 28.10453226028088
 #2 - wmae: 32.47101624006897
 #3 - wmae: 21.071137500107394
 #4 - wmae: 21.662696956610624
wmae mean: 24.343139919462534
Computing Test prediction....
Computing Test prediction - Ok


In [None]:
# Define the pipeline
xgb_pipeline = make_pipeline(
    TargetEncoder(cols=['name', 'holiday_name']),
    XGBRegressor(verbosity=0, enable_categorical=True)
)

# Run cross-validation with filtered features
cross_validate(xgb_pipeline, filtered_features)

In [21]:
from catboost import CatBoostRegressor

# Define the pipeline with CatBoost
cat_pipeline = make_pipeline(
    CatBoostRegressor(
        cat_features=['name'],  # Explicitly specify categorical columns
        verbose=1,
        task_type='GPU', 
        allow_writing_files=False
    )
)

# Run cross-validation
cross_validate(cat_pipeline, features)



Learning rate set to 0.119753
0:	learn: 325.3174681	total: 145ms	remaining: 2m 25s
1:	learn: 295.2950363	total: 277ms	remaining: 2m 18s
2:	learn: 269.2490765	total: 406ms	remaining: 2m 14s
3:	learn: 247.1958899	total: 551ms	remaining: 2m 17s
4:	learn: 228.0744097	total: 689ms	remaining: 2m 17s
5:	learn: 211.8256781	total: 813ms	remaining: 2m 14s
6:	learn: 197.8755623	total: 941ms	remaining: 2m 13s
7:	learn: 186.1555941	total: 1.08s	remaining: 2m 13s
8:	learn: 175.9763769	total: 1.21s	remaining: 2m 12s
9:	learn: 166.5917363	total: 1.36s	remaining: 2m 14s
10:	learn: 159.2522337	total: 1.49s	remaining: 2m 14s
11:	learn: 152.3825020	total: 1.63s	remaining: 2m 14s
12:	learn: 146.5922973	total: 1.76s	remaining: 2m 13s
13:	learn: 141.4759141	total: 1.91s	remaining: 2m 14s
14:	learn: 136.6579204	total: 2.04s	remaining: 2m 13s
15:	learn: 133.0060424	total: 2.17s	remaining: 2m 13s
16:	learn: 129.7165157	total: 2.31s	remaining: 2m 13s
17:	learn: 126.4466593	total: 2.46s	remaining: 2m 13s
18:	lear

In [18]:
print(sklearn.__version__)

NameError: name 'sklearn' is not defined

# Submission

In [23]:
solution = pd.read_csv('data/solution.csv')

In [26]:
if COMPUTE_TEST:
    solution['sales_hat'] = test_preds['LGBMRegressor'].clip(0,None)
    solution.to_csv('submission_lgbm.csv',index=False)


In [27]:
solution

Unnamed: 0,id,sales_hat
0,1226_2024-06-03,79.841788
1,1226_2024-06-11,72.054763
2,1226_2024-06-13,60.297780
3,1226_2024-06-15,64.426296
4,1226_2024-06-09,127.571231
...,...,...
47016,4572_2024-06-03,147.586792
47017,3735_2024-06-04,63.082169
47018,3735_2024-06-03,61.143945
47019,2129_2024-06-03,94.349845


In [25]:
solution

Unnamed: 0,id,sales_hat
0,1226_2024-06-03,73.790006
1,1226_2024-06-11,58.321924
2,1226_2024-06-13,30.570593
3,1226_2024-06-15,52.389194
4,1226_2024-06-09,146.530344
...,...,...
47016,4572_2024-06-03,129.656772
47017,3735_2024-06-04,33.990089
47018,3735_2024-06-03,39.295015
47019,2129_2024-06-03,108.814379
