https://phdinds-aim.github.io/time_series_handbook/08_WinningestMethods/lightgbm_m5_forecasting.html

https://www.kaggle.com/code/ratan123/m5-forecasting-lightgbm-with-timeseries-splits

### Fine-tuning for the 1-step-ahead (one-week) forecast horizon only

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Parse `ds` as datetime while loading so that train_df, test_df and every
# frame derived from them share one dtype for the date key. pandas refuses to
# merge a datetime64 key against a string key, and the train/test split further
# down merges on `ds`.
train_df = pd.read_csv("../data/phase0_train.csv", parse_dates=["ds"])
test_df = pd.read_csv("../data/phase0_test.csv", parse_dates=["ds"])


In [3]:
train_df.head()

Unnamed: 0,Client,Warehouse,Product,ds,Price,y
0,0,1,367,2020-07-06,10.900001,7.0
1,0,1,367,2020-07-13,10.900001,7.0
2,0,1,367,2020-07-20,10.900001,7.0
3,0,1,367,2020-07-27,15.582857,7.0
4,0,1,367,2020-08-03,27.289999,7.0


In [4]:
train_df.shape

(2559010, 6)

In [5]:
test_df.shape

(195689, 6)

In [6]:
test_df

Unnamed: 0,Client,Warehouse,Product,ds,Price,y
0,0,1,367,2023-10-09,51.860000,1.0
1,0,1,367,2023-10-16,51.860000,1.0
2,0,1,367,2023-10-23,51.860000,1.0
3,0,1,367,2023-10-30,51.230000,2.0
4,0,1,367,2023-11-06,51.230000,1.0
...,...,...,...,...,...,...
195684,46,318,14294,2023-12-04,,0.0
195685,46,318,14294,2023-12-11,46.990000,1.0
195686,46,318,14294,2023-12-18,46.990000,1.0
195687,46,318,14294,2023-12-25,39.190000,1.0


In [7]:
# Stack the train rows on top of the test rows so that features can later be
# computed over the full history of every series in one pass.
df = pd.concat([train_df, test_df])

In [8]:
# pd.concat keeps each input's own index, so labels repeat across the two
# parts; replace them with a fresh 0..n-1 index.
df = df.reset_index(drop=True)

In [18]:
# Turn the date column into real timestamps, then order the rows by series
# (Client, Warehouse, Product) and chronologically inside each series.
series_and_time = ['Client', 'Warehouse', 'Product', 'ds']
df = df.assign(ds=pd.to_datetime(df['ds'])).sort_values(by=series_and_time)

In [19]:
df.head()

Unnamed: 0,Client,Warehouse,Product,ds,Price,y
0,0,1,367,2020-07-06,10.900001,7.0
1,0,1,367,2020-07-13,10.900001,7.0
2,0,1,367,2020-07-20,10.900001,7.0
3,0,1,367,2020-07-27,15.582857,7.0
4,0,1,367,2020-08-03,27.289999,7.0


In [20]:
test_df.tail()

Unnamed: 0,Client,Warehouse,Product,ds,Price,y
195684,46,318,14294,2023-12-04,,0.0
195685,46,318,14294,2023-12-11,46.99,1.0
195686,46,318,14294,2023-12-18,46.99,1.0
195687,46,318,14294,2023-12-25,39.19,1.0
195688,46,318,14294,2024-01-01,45.423336,3.0


In [21]:
df.tail()

Unnamed: 0,Client,Warehouse,Product,ds,Price,y
2754694,46,318,14294,2023-12-04,,0.0
2754695,46,318,14294,2023-12-11,46.99,1.0
2754696,46,318,14294,2023-12-18,46.99,1.0
2754697,46,318,14294,2023-12-25,39.19,1.0
2754698,46,318,14294,2024-01-01,45.423336,3.0


In [22]:
# Store the three identifier columns as pandas categoricals instead of numbers.
id_cols = ['Client', 'Warehouse', 'Product']
df = df.astype({name: 'category' for name in id_cols})

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2754699 entries, 0 to 2754698
Data columns (total 6 columns):
 #   Column     Dtype         
---  ------     -----         
 0   Client     category      
 1   Warehouse  category      
 2   Product    category      
 3   ds         datetime64[ns]
 4   Price      float64       
 5   y          float64       
dtypes: category(3), datetime64[ns](1), float64(2)
memory usage: 97.6 MB


In [24]:
def feature_creation(data, prediction_horizon = 13):
    """Add holiday, calendar, lagged-sales and price features to ``data``.

    The new columns are written into ``data`` itself (the frame is modified in
    place) and the same object is returned.

    All lag and rolling features are computed per (Client, Warehouse, Product)
    group and are positional: ``shift(k)`` means "k rows earlier in the group".
    The function does not sort, so the caller must already have ordered each
    group chronologically.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Client``, ``Warehouse``, ``Product``,
        ``ds``, ``Price`` and ``y``. ``ds`` is expected to be a timezone-naive
        datetime column.
    prediction_horizon : int, default 13
        Smallest shift (in rows) applied to ``y``. Sales lags are created for
        ``prediction_horizon`` .. ``prediction_horizon + 9`` and the sales
        rolling statistics are computed on ``y`` shifted by this many rows.

    Returns
    -------
    pandas.DataFrame
        ``data``, with the feature columns added.
    """
    from pandas.tseries.holiday import USFederalHolidayCalendar

    id_cols = ['Client', 'Warehouse', 'Product']

    # Start of the look-back window used for holiday counting (7 days before ds).
    data['prev_day'] = data['ds'] - pd.Timedelta(days=7)

    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start=data['prev_day'].min(), end=data['ds'].max())

    # Count holidays whose calendar date lies in [prev_day, ds], both ends
    # inclusive. With the holiday dates sorted, that count is the difference of
    # two binary searches, which replaces a Python loop over every holiday for
    # every row.
    holiday_days = np.sort(holidays.to_numpy().astype('datetime64[D]'))
    window_start = data['prev_day'].to_numpy().astype('datetime64[D]')
    window_end = data['ds'].to_numpy().astype('datetime64[D]')
    data['holiday_count'] = (
        np.searchsorted(holiday_days, window_end, side='right')
        - np.searchsorted(holiday_days, window_start, side='left')
    )
    data['is_holiday'] = (data['holiday_count'] > 0).astype('int64')

    # Build the grouping once and reuse it; the per-column views are taken
    # right away, before any further columns are added to `data`.
    grouped = data.groupby(id_cols, observed=True)
    holiday_by_series = grouped['holiday_count']
    sales_by_series = grouped['y']
    price_by_series = grouped['Price']

    # Holiday counts of the previous one / two rows of the same series.
    data['is_prev_1week_holidays'] = holiday_by_series.shift(1)
    data['is_prev_2week_holidays'] = holiday_by_series.shift(2)

    # Calendar parts of the timestamp.
    data["year"] = data["ds"].dt.year
    data["month"] = data["ds"].dt.month
    data["week"] = data["ds"].dt.isocalendar().week
    data["day"] = data["ds"].dt.day
    data["day_of_week"] = data["ds"].dt.weekday
    data["quarter"] = data["ds"].dt.quarter

    # Ten consecutive sales lags, starting at the prediction horizon.
    for week in range(prediction_horizon, prediction_horizon + 10):
        data[f'sales_lag_{week}'] = sales_by_series.shift(week)

    # Price lags plus rolling mean/std of Price. The rolling windows are NOT
    # shifted, so they include the current row's Price; with a window of 1 the
    # rolling std is NaN on every row.
    for week in range(1, 14):
        data[f'price_lag_{week}'] = price_by_series.shift(week)
        data[f'price_rolling_mean_{week}'] = price_by_series.transform(lambda x: x.rolling(week).mean())
        data[f'price_rolling_std_{week}'] = price_by_series.transform(lambda x: x.rolling(week).std())

    # Rolling sales statistics over y shifted by the prediction horizon.
    for rolling_window in [13, 20, 30]:
        data[f'sales_rolling_mean_{rolling_window}'] = sales_by_series.transform(
            lambda x: x.shift(prediction_horizon).rolling(rolling_window).mean())
        data[f'sales_rolling_std_{rolling_window}'] = sales_by_series.transform(
            lambda x: x.shift(prediction_horizon).rolling(rolling_window).std())

    return data


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2754699 entries, 0 to 2754698
Data columns (total 6 columns):
 #   Column     Dtype         
---  ------     -----         
 0   Client     category      
 1   Warehouse  category      
 2   Product    category      
 3   ds         datetime64[ns]
 4   Price      float64       
 5   y          float64       
dtypes: category(3), datetime64[ns](1), float64(2)
memory usage: 97.6 MB


In [None]:
# Build the features with a one-row look-back: with prediction_horizon=1 the
# sales lags are sales_lag_1 .. sales_lag_10 and the sales rolling statistics
# are computed on y shifted by one row. feature_creation writes the new columns
# into `df` and returns that same frame, so `data` and `df` are one object.
data = feature_creation(df, prediction_horizon=1)

In [None]:
data.shape

(2754699, 61)

In [None]:
import gc

In [None]:
gc.collect()

0

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits.

    Columns are converted in place and ``df`` is returned. Only columns whose
    dtype is one of int16/int32/int64/float16/float32/float64 are touched;
    every other dtype (category, datetime, object, unsigned or nullable
    integers, ...) is left as is.

    Integer columns go to the first of int8/int16/int32/int64 whose range
    contains both the column minimum and maximum (bounds inclusive). Float
    columns go to float16 if the range fits, else float32, else float64.
    Columns whose min/max are NaN (empty or all-missing) keep a wide dtype.

    WARNING: the float downcast looks only at the value range, not at
    precision. float16 keeps roughly three significant decimal digits, so
    values are rounded when a column is narrowed to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; modified in place.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly -128..127 fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or NaN min/max.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio: a frame that occupies no memory would divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


In [None]:
# data = reduce_mem_usage(data)

In [None]:
data

Unnamed: 0,Client,Warehouse,Product,ds,Price,y,sales_lag_1,sales_lag_2,sales_lag_3,sales_lag_4,...,price_rolling_std_12,price_lag_13,price_rolling_mean_13,price_rolling_std_13,sales_rolling_mean_13,sales_rolling_std_13,sales_rolling_mean_20,sales_rolling_std_20,sales_rolling_mean_30,sales_rolling_std_30
0,0,1,367,2020-07-06,10.900001,7.0,,,,,...,,,,,,,,,,
1,0,1,367,2020-07-13,10.900001,7.0,7.0,,,,...,,,,,,,,,,
2,0,1,367,2020-07-20,10.900001,7.0,7.0,7.0,,,...,,,,,,,,,,
3,0,1,367,2020-07-27,15.582857,7.0,7.0,7.0,7.0,,...,,,,,,,,,,
4,0,1,367,2020-08-03,27.289999,7.0,7.0,7.0,7.0,7.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2754694,46,318,14294,2023-12-04,,0.0,2.0,2.0,4.0,3.0,...,,45.04,,,2.692308,1.797434,2.30,1.750188,3.566667,2.674056
2754695,46,318,14294,2023-12-11,46.990000,1.0,0.0,2.0,2.0,4.0,...,,46.99,,,2.384615,1.894662,2.20,1.823819,3.366667,2.709922
2754696,46,318,14294,2023-12-18,46.990000,1.0,1.0,0.0,2.0,2.0,...,,34.09,,,2.230769,1.921538,2.20,1.823819,3.100000,2.523681
2754697,46,318,14294,2023-12-25,39.190000,1.0,1.0,1.0,0.0,2.0,...,,,,,1.923077,1.754116,2.10,1.832456,2.766667,2.062528


In [None]:
test_df.columns

Index(['Client', 'Warehouse', 'Product', 'ds', 'Price', 'y'], dtype='object')

In [None]:
# Split the engineered frame back into train and test rows with two anti-joins:
# a left merge with indicator=True tags each row of `data` as "both" or
# "left_only", and keeping only "left_only" drops every row whose key also
# appears in the other raw frame.
# NOTE(review): `ds` is part of the merge key, so it must have the same dtype
# in `data`, `train_df` and `test_df` - confirm the raw frames hold datetimes.
key_cols = ['Client', 'Warehouse', 'Product', 'ds']


def _rows_not_in(frame, other):
    """Return the rows of ``frame`` whose key_cols combination is absent from ``other``."""
    tagged = frame.merge(other[key_cols], on=key_cols, how='left', indicator=True)
    unmatched = tagged[tagged['_merge'] == 'left_only']
    return unmatched.drop(columns=['_merge'])


final_train_df = _rows_not_in(data, test_df)
final_test_df = _rows_not_in(data, train_df)

In [None]:
# Sanity check: outer-join each engineered split against its raw counterpart
# and print how many keys are shared ("both") versus present on one side only.
split_keys = ['Client', 'Warehouse', 'Product', 'ds']
for engineered, raw in ((final_test_df, test_df), (final_train_df, train_df)):
    x = engineered.merge(raw, on=split_keys, how='outer', indicator=True)
    print(x['_merge'].value_counts())

_merge
both          195689
left_only          0
right_only         0
Name: count, dtype: int64
_merge
both          2559010
left_only           0
right_only          0
Name: count, dtype: int64


In [None]:
# del(x, test_df, train_df, df, data)

In [None]:
gc.collect()

8

In [None]:
# Give both splits a fresh 0..n-1 index; the previous labels are discarded.
final_train_df = final_train_df.reset_index(drop=True)
final_test_df = final_test_df.reset_index(drop=True)

In [None]:
# Separate the predictors from the target for each split.
# NOTE(review): only `y` is removed, so `ds` (and any other non-feature column
# in the frame) is still among the predictors - confirm the modelling step
# drops or handles it.
target_col = 'y'

x_train = final_train_df.drop(columns=[target_col])
y_train = final_train_df[target_col]

x_test = final_test_df.drop(columns=[target_col])
y_test = final_test_df[target_col]

In [None]:
print("Train data shape", x_train.shape, y_train.shape)
print("Test data shape", x_test.shape, y_test.shape)

Train data shape (2559010, 60) (2559010,)
Test data shape (195689, 60) (195689,)


In [None]:
import lightgbm as lgb

In [None]:
# !pip install lightgbm

In [None]:
from sklearn.model_selection import  TimeSeriesSplit

In [None]:
# Cross-validation splitter for the training data.
# NOTE(review): TimeSeriesSplit cuts folds by row position. The frame was
# sorted by Client/Warehouse/Product before `ds`, so positional folds would
# separate series rather than time periods unless the rows are re-ordered by
# date first - confirm how `folds` is used downstream.
n_fold = 3 # kept at 3 so the notebook runs in a reasonable time
folds = TimeSeriesSplit(n_splits=n_fold)