In [92]:
# Import necessary libraries
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from tqdm import tqdm

In [108]:
# Load data files
data = pd.read_csv("C:/Data/M5store_2.csv")

# Ensure proper formatting
data['ds'] = pd.to_datetime(data['d'])
data = data.sort_values(by=['store_id', 'ds']).reset_index(drop=True)

data.rename(columns={'revenue': 'y'}, inplace=True)

data['lagged_1_price'] = data['sell_price'].shift(1)
data['lagged_3_price'] = data['sell_price'].shift(3)
data['lagged_7_price'] = data['sell_price'].shift(7)

#data['avg_price'] = data['y'] / data['sales']
#data['lagged_1_avg_price'] = data['avg_price'].shift(1)
#data['lagged_3_avg_price'] = data['avg_price'].shift(3)
#data['lagged_7_avg_price'] = data['avg_price'].shift(7)
#data.drop(columns=['avg_price'], inplace=True)
#data.drop(columns=['sales'], inplace=True)

In [109]:
# Generate lag and rolling features
def create_features(df, lags, rolling_windows):
    for lag in lags:
        df[f'lag_{lag}'] = df.groupby('store_id')['y'].shift(lag)
    for window in rolling_windows:
        df[f'roll_mean_{window}'] = df.groupby('store_id')['y'].shift(1).rolling(window=window).mean()
        df[f'roll_std_{window}'] = df.groupby('store_id')['y'].shift(1).rolling(window=window).std()
    df.dropna(inplace=True)
    return df

# Apply feature engineering
lags = [7, 14, 21, 28, 364]
rolling_windows = [7, 14, 28]
data = create_features(data, lags, rolling_windows)

In [110]:
# Function to calculate rolling slopes
def calculate_rolling_slope(series, window):
    slopes = [np.nan] * (window - 1)  # Fill with NaN for the first (window - 1) rows
    regressor = LinearRegression()

    for i in range(len(series) - window + 1):
        y = series[i:i + window].values  # Sales in the rolling window
        x = np.arange(window).reshape(-1, 1)  # Time indices for the window
        regressor.fit(x, y)
        slopes.append(regressor.coef_[0])  # Extract the slope

    return slopes

# Group by store_id and calculate rolling slopes
window_size = 3  # Specify the rolling window size
data['rolling_sales_slope'] = data.groupby('store_id')['y'].transform(
    lambda x: calculate_rolling_slope(x, window_size)
)

data['lagged_1_rolling_sales_slope'] = data['rolling_sales_slope'].shift(1)
data.drop(columns=['rolling_sales_slope'], inplace=True)

In [111]:
# Extract temporal features
data['month'] = data['ds'].dt.month  # Month of the year (1-12)
data['day_of_month'] = data['ds'].dt.day  # Day of the month (1-31)
data['day_of_week'] = data['ds'].dt.weekday  # Day of the week (0=Monday, 6=Sunday)

In [112]:
# Encode Thanksgiving
# Thanksgiving is the 4th Thursday in November in the US
data['is_thanksgiving'] = data['ds'].apply(lambda x: (x.month == 11) and (x.weekday() == 3) and (15 < x.day <= 28))

# Encode Christmas
# Christmas is always on December 25
data['is_christmas'] = data['ds'].apply(lambda x: (x.month == 12) and (x.day == 25))

In [113]:
# Create pivot table
pivot_data = data.pivot(index='ds', columns='store_id', values='y')
pivot_data.columns = [f'sales_store_{store}' for store in pivot_data.columns]

# Merge the pivoted data with the original dataset
data = pd.merge(
    data, 
    pivot_data.reset_index(), 
    on=['ds'], 
    how='left'
)

# Drop self-sales for each store
for store in data['store_id'].unique():
    feature_to_drop = f'sales_store_{store}'
    data.loc[data['store_id'] == store, feature_to_drop] = None

# Create lagged values for all columns starting with 'sales_store_'
for col in [col for col in data.columns if col.startswith('sales_store_')]:
    # Replace the column with its lagged version
    data[col] = data.groupby('store_id')[col].shift(1)  # Replace '1' with desired lag period


In [114]:
# Calculate total demand across all stores for each date
total_demand = data.groupby('ds')['y'].sum().reset_index()
total_demand.rename(columns={'y': 'total_sales_all_stores'}, inplace=True)

# Merge total demand back into the original data
data = pd.merge(data, total_demand, on='ds', how='left')

# Subtract the current store's sales to get demand from other stores
data['demand_other_stores'] = data['total_sales_all_stores'] - data['y']

# Generate lagged features for the demand from other stores
for lag in [7, 14, 28, 364]:  # Specify the lag periods
    data[f'demand_other_stores_lag_{lag}'] = (
        data.groupby('store_id')['demand_other_stores'].shift(lag)
    )

# Drop the intermediate column if not needed
data.drop(columns=['total_sales_all_stores'], inplace=True)
data.drop(columns=['demand_other_stores'], inplace=True)
#data.drop(columns=['level_0'], inplace=True)
data.drop(columns=['d'], inplace=True)

In [115]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15770 entries, 0 to 15769
Data columns (total 39 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   index                         15770 non-null  int64         
 1   store_id                      15770 non-null  object        
 2   y                             15770 non-null  float64       
 3   sell_price                    15770 non-null  float64       
 4   ds                            15770 non-null  datetime64[ns]
 5   lagged_1_price                15770 non-null  float64       
 6   lagged_3_price                15770 non-null  float64       
 7   lagged_7_price                15770 non-null  float64       
 8   lag_7                         15770 non-null  float64       
 9   lag_14                        15770 non-null  float64       
 10  lag_21                        15770 non-null  float64       
 11  lag_28                      

In [118]:
# Function for leave-one-out cross-validation and store-wise performance with MASE
def loocv_lgb_by_store_with_mase(data, target_col, features, seasonality=7, h=1, m=10):
    store_performance = []
    
    # Initialize progress bar for stores
    store_iterator = tqdm(data['store_id'].unique(), desc="Processing Stores", unit="store")
    
    # Iterate over each store
    for store_id in store_iterator:
        mase_scores = []
        
        # Filter and sort data for the current store
        store_data = data[data['store_id'] == store_id].sort_values('ds').reset_index(drop=True)
        
        # Perform leave-one-out cross-validation for the last m periods
        for i in range(len(store_data) - m, len(store_data)):
            train_data = store_data.iloc[:i]  # Train up to point i
            val_data = store_data.iloc[i:i + h]  # Predict h steps ahead
            
            # Ensure validation data exists
            if len(val_data) < h:
                continue
            
            train_x = train_data[features]
            train_y = train_data[target_col]
            val_x = val_data[features]
            val_y = val_data[target_col]
            
            # LightGBM model
            train_dataset = lgb.Dataset(train_x, label=train_y, categorical_feature=['month', 'day_of_month', 'day_of_week'])
            
            # Define LightGBM parameters
            params = {
                "objective": "regression",
                "metric": "mae",
                "learning_rate": 0.1,
                "max_depth": 6,
                "num_leaves": 31,
                "verbosity": -1
            }
            
            # Train the model
            model = lgb.train(
                params,
                train_dataset,
                num_boost_round=1000
            )
            
            # Validation predictions
            val_preds = model.predict(val_x)
            
            # Calculate MASE
            mae = np.mean(np.abs(val_y - val_preds))
            naive_forecast = train_y.shift(seasonality).iloc[seasonality:]
            naive_errors = np.abs(train_y[seasonality:] - naive_forecast)
            scaling_factor = naive_errors.mean()
            mase = mae / scaling_factor if scaling_factor > 0 else np.nan
            mase_scores.append(mase)
        
        # Store average MASE for the store
        store_performance.append({
            "Store": store_id,
            "Average MASE": np.nanmean(mase_scores)  # Handle NaN values if no valid MASE
        })
    
    # Convert results to a DataFrame
    performance_df = pd.DataFrame(store_performance)
    
    # Calculate overall average MASE
    overall_mase = performance_df['Average MASE'].mean()
    performance_df = pd.concat([
        performance_df,
        pd.DataFrame([{"Store": "Overall Average", "Average MASE": overall_mase}])
    ], ignore_index=True)
    
    return performance_df

# Features and target
target = 'y'
features = [col for col in data.columns if col not in ['ds', 'y', 'store_id']]

# Run leave-one-out cross-validation and get MASE performance per store
performance_df = loocv_lgb_by_store_with_mase(data, target, features, seasonality=7, h=14, m=28)

# Display the performance table
print(performance_df)

Processing Stores: 100%|████████████████████████████████████████████████████████████| 10/10 [01:37<00:00,  9.75s/store]

              Store  Average MASE
0              CA_1      0.743078
1              CA_2      1.413937
2              CA_3      0.446333
3              CA_4      1.077713
4              TX_1      0.912367
5              TX_2      0.609341
6              TX_3      0.832145
7              WI_1      1.224661
8              WI_2      0.585380
9              WI_3      0.537057
10  Overall Average      0.838201



