In [9]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import lightgbm as lgb

In [11]:

lgb_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.1,
                    'metric': 'rmse',
                    'subsample': 0.5,
                    'subsample_freq': 1,
                    'learning_rate': 0.015,
                    'num_leaves': 2**11-1,
                    'min_data_in_leaf': 2**12-1,
                    'feature_fraction': 0.5,
                    'max_bin': 100,
                    'n_estimators': 3000,
                    'boost_from_average': False,
                    'verbose': -1,
                } 

In [12]:
lgb_params=lgb_params = {
    #— core boosting and loss
    'boosting_type':       'gbdt',
    'objective':           'tweedie',            # good for non‐negative, skewed sales
    'tweedie_variance_power': 1.1,                # often ~1.1 in M5‐style contests
    'metric':              'rmse',               # track RMSE on hold‐out

    #— capacity / interaction constraints
    'num_leaves':          2**8 - 1,             # ~255 leaves; controls tree complexity
    'max_depth':           10,                   # cap depth to avoid overfitting
    'min_data_in_leaf':    100,                  # require 100 samples per leaf

    #— regularization
    'feature_fraction':    0.8,                  # randomly select 80% of features each tree
    'bagging_fraction':    0.8,                  # bag 80% of data per iteration
    'bagging_freq':        5,                    # perform bagging every 5 rounds
    'lambda_l1':           0.1,                  # L1 regularization
    'lambda_l2':           0.1,                  # L2 regularization

    #— learning rate & early stopping
    'learning_rate':       0.03,                 # slow and steady
    'n_estimators':        3000,                 # large cap; will stop early
    'early_stopping_rounds': 100,                # if no gain in 100 rounds → stop

    #— data bucketing & performance
    'max_bin':             255,                  # finer splits for numeric
    'subsample':           0.8,                  # equivalent to bagging_fraction
    'subsample_freq':      1,                    # subsample every iteration

    #— reproducibility & verbosity
    'seed':                42,
    'verbose':             -1,
}


In [2]:

#df = pd.read_csv('./data/M4-daily-train.csv') # 91 M
#df=df.drop(['V1'],axis=1)
#df = pd.read_csv('./data/M5-sales_train_evaluation.csv') # 91 M
raw_data_dir= './data/M5-'
train_df = pd.read_csv(raw_data_dir+'sales_train_evaluation.csv')
prices_df = pd.read_csv(raw_data_dir+'sell_prices.csv')
calendar_df = pd.read_csv(raw_data_dir+'calendar.csv')

In [3]:
TARGET = 'sales'         # Our main target
END_TRAIN = 1941         # Last day in train set
MAIN_INDEX = ['d']
index_columns = ['item_id','dept_id','cat_id','store_id','state_id']

grid_df = pd.melt(train_df, 
                  id_vars = index_columns, 
                  var_name = 'd', 
                  value_name = TARGET)

calendar_df=calendar_df.reset_index().rename(columns={'index':'d'})
calendar_df['d']='d_'+calendar_df['d'].astype(str)


# adding calendar features
grid_df=grid_df.merge(calendar_df, on='d', how='left')


prices_df.loc[lambda x:x['item_id']=='HOBBIES_1_001']\
    .loc[lambda x:x['store_id']=='CA_2']\
    .set_index('wm_yr_wk').sort_index()['sell_price'].plot()

In [4]:
grid_df

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-30,11101,Sunday,2,1,2011,,,,,0,0,0
1,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-30,11101,Sunday,2,1,2011,,,,,0,0,0
2,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-30,11101,Sunday,2,1,2011,,,,,0,0,0
3,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-30,11101,Sunday,2,1,2011,,,,,0,0,0
4,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-30,11101,Sunday,2,1,2011,,,,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59181085,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1941,1,2016-05-23,11617,Monday,3,5,2016,,,,,0,0,0
59181086,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-23,11617,Monday,3,5,2016,,,,,0,0,0
59181087,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1941,2,2016-05-23,11617,Monday,3,5,2016,,,,,0,0,0
59181088,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-23,11617,Monday,3,5,2016,,,,,0,0,0


In [None]:
# changing types
icols = ['event_name_1',
         'event_type_1',
         'event_name_2',
         'event_type_2',
         'snap_CA',
         'snap_TX',
         'snap_WI']
for col in icols:
    grid_df[col] = grid_df[col].astype('category')

# Convert to DateTime
grid_df['date'] = pd.to_datetime(grid_df['date'])



In [7]:
SHIFT_DAY=28
for i in [7,14,30,60,180]:
    print('Rolling period:', i)
    grid_df['rolling_mean_'+str(i)] = grid_df.groupby(['item_id'])[TARGET].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).mean())
    grid_df['rolling_std_'+str(i)]  = grid_df.groupby(['item_id'])[TARGET].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).std())


Rolling period: 7
Rolling period: 14
Rolling period: 30
Rolling period: 60
Rolling period: 180


In [8]:

icols =  [
            ['state_id'],
            ['store_id'],
            ['cat_id'],
            ['dept_id'],
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            ['item_id'],
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
            ]

for col in icols:
    print('Encoding', col)
    col_name = '_'+'_'.join(col)+'_'
    grid_df['enc'+col_name+'mean'] = grid_df.groupby(col)['sales'].transform('mean').astype(np.float16)
    grid_df['enc'+col_name+'std'] = grid_df.groupby(col)['sales'].transform('std').astype(np.float16)


Encoding ['state_id']
Encoding ['store_id']
Encoding ['cat_id']
Encoding ['dept_id']
Encoding ['state_id', 'cat_id']
Encoding ['state_id', 'dept_id']
Encoding ['store_id', 'cat_id']
Encoding ['store_id', 'dept_id']
Encoding ['item_id']
Encoding ['item_id', 'state_id']
Encoding ['item_id', 'store_id']


In [None]:
    seed_everything(SEED)
    estimator = lgb.train(lgb_params,
                          train_data,
                          valid_sets = [valid_data],
                          verbose_eval = 100,
                          )
    
    display(pd.DataFrame({'name':estimator.feature_name(),
                          'imp':estimator.feature_importance()}).sort_values('imp',ascending=False).head(25))

    
    model_name = model_dir+'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
    pickle.dump(estimator, open(model_name, 'wb'))


In [15]:
from crptmidfreq.utils.common import to_csv
to_csv(calendar_df.dropna(subset=['event_name_1']),'events')

Saved : /Users/sachadrevet/data_tmp/analysis/events.csv
