In [2]:


import numpy as np
import pandas as pd
from hierarchicalforecast.core import HierarchicalReconciliation
from hierarchicalforecast.methods import  BottomUp, TopDown, MinTrace
from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import LabelEncoder


from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
    

callbacks = [lgb.log_evaluation(period=0)]

pd.options.mode.chained_assignment = None

sales_train_eval = pd.read_csv('sales_train_evaluation.csv')
sell_price = pd.read_csv('sell_prices.csv')
calendar = pd.read_csv('calendar.csv')

foods = pd.read_csv('List_of_foods.csv')





#making the summing matrix





# rows / columns
list1 = ['Total', 'CA','CA_1','CA_2','CA_3','CA_4','TX','TX_1','TX_2','TX_3','WI','WI_1','WI_2','WI_3']
list2 = ['CA_1','CA_2','CA_3','CA_4','TX_1','TX_2','TX_3','WI_1','WI_2','WI_3']
S = np.zeros((len(list1), len(list2)))

S = pd.DataFrame(S); S.index = list1; S.columns = list2


# encode the hierarchical structure
S.loc['Total'] = 1
S.loc['CA'][['CA_1','CA_2','CA_3', 'CA_4']] = 1
S.loc['TX'][['TX_1','TX_2','TX_3']] = 1
S.loc['WI'][['WI_1','WI_2','WI_3']] = 1
for x in S.columns:
    S.loc[x][x]= 1
S = S.astype(int)





tags = {}
tags['Country'] = np.array(['Total'], dtype=object)
tags['Country/State'] = np.array(['CA', 'TX', 'WI'], dtype=object)
tags['Country/State/Store'] = np.array(['CA_1', 'CA_2', 'CA_3', 'CA_4',  
                                        'TX_1', 'TX_2', 'TX_3',
                                        'WI_1', 'WI_2', 'WI_3'], dtype=object)



horizon = 28



def label_encoding(train, feature):

    encoder = LabelEncoder()
    encoder.fit(train[feature].values.astype(str))
    train[feature] = encoder.fit_transform(train[feature].values.astype(str))
    
    return train[feature]







col = ['event_name_1', 'event_type_1',
       'event_name_2', 'event_type_2', 'wday','month', 'year','snap_CA','snap_TX','snap_WI', 'value_lag_1', 'value_lag_2', 
       'value_lag_3', 'value_lag_6', 'value_lag_12', 'value_lag_24', 'value_lag_36', 'rolling_value_mean']


hyperparameters = {
    'boosting_type': ['gbdt'],
    'metric': ['rmse'],
    'objective': ['regression'],
    'n_jobs': [-1],
    #'seed': [236],
    'learning_rate': [0.28],
    'bagging_fraction': [0.75],
    'bagging_freq': [5],
    'colsample_bytree': [0.75],
    'force_row_wise' : [True],
    'verbose':[-1],
    'num_leaves':[31]
}



food_num=1


product_id = foods.loc[food_num].at["Foods"]


product_data = sales_train_eval[sales_train_eval['item_id'].str.contains(product_id)]
product_sell_price = sell_price[sell_price['item_id'].str.contains(product_id)]


df = pd.melt(
product_data,
id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
var_name='d',
value_name='value').dropna()
df = pd.merge(df, calendar, on='d', how='left')

df = df[(df['date'] > '2014-01-01')]

df["event_name_1"] = df["event_name_1"].fillna("no_event")
df["event_name_2"] = df["event_name_2"].fillna("no_event")
df["event_type_1"] = df["event_type_1"].fillna("no_event")
df["event_type_2"] = df["event_type_2"].fillna("no_event")


df_stores = df.groupby(['date', 'store_id','wday','month','year', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2','snap_CA','snap_TX','snap_WI'])[['value']].sum()
df_stores.reset_index(inplace=True)
df_stores = df_stores.T.reset_index(drop=True).T
df_stores.columns = ['d', 'unique_id', 'wday','month','year', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2','snap_CA','snap_TX','snap_WI','sales']


df_state = df.groupby(['date', 'state_id','wday','month','year', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2','snap_CA','snap_TX','snap_WI'])[['value']].sum()
df_state.reset_index(inplace=True)
df_state = df_state.T.reset_index(drop=True).T
df_state.columns = ['d', 'unique_id','wday','month','year', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2','snap_CA','snap_TX','snap_WI', 'sales']



df_total = df.groupby(['date','wday','month','year', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2','snap_CA','snap_TX','snap_WI'])[['value']].sum()
df_total.reset_index(inplace=True)
df_total['unique_id'] = 'Total'
df_total.columns = ['d','wday','month','year', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2','snap_CA','snap_TX','snap_WI','sales', 'unique_id']

df_all = pd.concat([df_stores, df_state, df_total], axis = 0)

df_all.columns = ['ds','unique_id','wday','month','year', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2','snap_CA','snap_TX','snap_WI', 'y']
df_all['ds'] = pd.to_datetime(df_all['ds'])





#Introduce lags
lags = [1,2,3,6,12,24,36]
for lag in lags:
    df_all['value_lag_'+str(lag)] = df_all.groupby(['unique_id'],as_index=False)['y'].shift(lag)

for lag in lags:
    df_all['value_lag_'+str(lag)] = df_all['value_lag_'+str(lag)].fillna(0)

df_all['rolling_value_mean'] = df_all.groupby(['unique_id'])['y'].transform(lambda x: x.rolling(window=7).mean()).astype(np.float16)
df_all['rolling_value_mean'] = df_all['rolling_value_mean'].fillna(0)






df_all['wday']  = label_encoding(df_all,"wday" )
df_all['month']  = label_encoding(df_all,"month" )
df_all['year']  = label_encoding(df_all,"year" )
df_all['snap_CA']  = label_encoding(df_all,"snap_CA" )
df_all['snap_TX']  = label_encoding(df_all,"snap_TX" )
df_all['snap_WI']  = label_encoding(df_all,"snap_WI" )


df_all['event_name_1']  = label_encoding(df_all,"event_name_1" )
df_all['event_name_2']  = label_encoding(df_all,"event_name_2" )
df_all['event_type_1']  = label_encoding(df_all,"event_type_1" )
df_all['event_type_2']  = label_encoding(df_all,"event_type_2" )



x_test = df_all.groupby('unique_id').tail(horizon)
x_train = df_all.drop(x_test.index)
x_val = x_train.groupby('unique_id').tail(horizon)
x_train = x_train.drop(x_val.index)


x_train['y'] = x_train['y'].astype(float)
x_test['y'] = x_test['y'].astype(float)
x_val['y'] = x_val['y'].astype(float)


y_train = x_train['y']
y_test = x_test['y']
y_val = x_val['y']


from sklearn.ensemble import RandomForestRegressor


from sklearn.model_selection import RandomizedSearchCV

# Step 1: Define parameter distribution for randomized search
param_distribution = {
    'max_depth': np.arange(1, 10),
    'max_samples':np.arange(0.1,1.01,0.05),
    'n_estimators': np.arange(100, 200),
    'max_features':np.arange(0.2,1.001,0.05),
    
}

# Step 2: Initialize LGBMRegressor estimator
estimator = RandomForestRegressor()

# Step 3: Initialize Randomized Search with 3-fold cross validation and fit the model
model = RandomizedSearchCV(estimator=estimator,
                           param_distributions=param_distribution,
                           n_iter=100,  # Number of random combinations to try
                           cv=3,
                           n_jobs=4,
                           scoring='neg_root_mean_squared_error')
model.fit(x_train[col], y_train)

# Step 4: Print best parameters
best_params = model.best_estimator_
print(best_params)


18 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Rishi\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Rishi\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\ensemble\_forest.py", line 340, in fit
    self._validate_params()
  File "c:\Users\Rishi\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Rishi\AppData\Local\Programs\Python\Python39\lib\s

RandomForestRegressor(max_depth=9, max_features=0.8999999999999999,
                      max_samples=0.5500000000000002, n_estimators=157)
