In [1]:
# Importing required note

import os
import gc
import warnings

import pandas as pd
from pandas.plotting import register_matplotlib_converters
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import joblib

warnings.filterwarnings("ignore")

# 1. Loading INPUT data

In [11]:
os.chdir('/kaggle/input/m5-forecasting-accuracy')

calendar = pd.read_csv('calendar.csv')
sales_te = pd.read_csv('sales_train_evaluation.csv')
price = pd.read_csv('sell_prices.csv')
submission = pd.read_csv('sample_submission.csv')

In [12]:
store_list = sales_te.store_id.unique()
store_list

array(['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1',
       'WI_2', 'WI_3'], dtype=object)

In [None]:
#Downcast in order to save memory

def downcast(df):
    
    cols = df.dtypes.index.tolist()
    types = df.dtypes.values.tolist()
    
    start_mem = df.memory_usage().sum() / 1024**2  
    
    for i,t in enumerate(types):
        
        if 'int' in str(t):
            if df[cols[i]].min() > np.iinfo(np.int8).min and df[cols[i]].max() < np.iinfo(np.int8).max:
                df[cols[i]] = df[cols[i]].astype(np.int8)
            elif df[cols[i]].min() > np.iinfo(np.int16).min and df[cols[i]].max() < np.iinfo(np.int16).max:
                df[cols[i]] = df[cols[i]].astype(np.int16)
            elif df[cols[i]].min() > np.iinfo(np.int32).min and df[cols[i]].max() < np.iinfo(np.int32).max:
                df[cols[i]] = df[cols[i]].astype(np.int32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.int64)
        elif 'float' in str(t):
            if df[cols[i]].min() > np.finfo(np.float16).min and df[cols[i]].max() < np.finfo(np.float16).max:
                df[cols[i]] = df[cols[i]].astype(np.float16)
            elif df[cols[i]].min() > np.finfo(np.float32).min and df[cols[i]].max() < np.finfo(np.float32).max:
                df[cols[i]] = df[cols[i]].astype(np.float32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.float64)
        elif t == np.object:
            if cols[i] == 'date':
                df[cols[i]] = pd.to_datetime(df[cols[i]], format='%Y-%m-%d')
            else:
                df[cols[i]] = df[cols[i]].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2           
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))

    return df  

In [None]:
calendar = downcast(calendar)
sales_te = downcast(sales_te)
price = downcast(price)

In [None]:
sales_te.head()

In [None]:
# Concating the test rows 

df_sales_te = pd.melt(sales_te, id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name = 'day', value_name = 'demand')

eval_rows = [row for row in submission['id'] if 'evaluation' in row]
df_eval = submission[submission['id'].isin(eval_rows)]

df_eval.columns = ['id', 'd_1942', 'd_1943', 'd_1944', 'd_1945', 'd_1946', 'd_1947', 'd_1948', 'd_1949', 'd_1950', 'd_1951', 'd_1952', 'd_1953', 'd_1954', 'd_1955', 'd_1956', 'd_1957', 'd_1958', 'd_1959', 
                  'd_1960', 'd_1961', 'd_1962', 'd_1963', 'd_1964', 'd_1965', 'd_1966', 'd_1967', 'd_1968', 'd_1969']

product = df_sales_te[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']].drop_duplicates()


# df_eval['id'] = df_eval['id'].str.replace('_evaluation','_validation')
df_eval = df_eval.merge(product, how = 'left', on = 'id')
# df_eval['id'] = df_eval['id'].str.replace('_validation','_evaluation')


df_eval = pd.melt(df_eval, id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name = 'day', value_name = 'demand')
df_sales_te = pd.concat([df_sales_te, df_eval], axis = 0)



In [None]:
df_sales_te.tail()

In [None]:
# 
calendar.drop(['weekday', 'wday', 'month', 'year'], inplace = True, axis = 1)
df_te = pd.merge(df_sales_te, calendar, how = 'left', left_on = ['day'], right_on = ['d'])
df_te = pd.merge(df_te, price, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')
df_te.head()

In [None]:
df_te.tail()

In [None]:
del sales_te, df_sales_te, calendar, price
gc.collect()

# 2. Feature Engineering

In [None]:
def demand_features(df):
    
    df['lag_t28'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28))
    df['lag_t29'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(29))
    df['lag_t30'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(30))
    df['rolling_mean_t7'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(7).mean())
    df['rolling_std_t7'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(7).std())
    df['rolling_mean_t30'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(30).mean())
    df['rolling_mean_t90'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(90).mean())
    df['rolling_mean_t180'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(180).mean())
    df['rolling_std_t30'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(30).std())
    df['rolling_skew_t30'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(30).skew())
    df['rolling_kurt_t30'] = df.groupby(['id'])['demand'].transform(lambda x: x.shift(28).rolling(30).kurt())
    
    return df

def seasonal_features(df):
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['weekofyear'] = df['date'].dt.week
    df['dayofmonth'] = df['date'].dt.day
    df['dayofweek'] = df['date'].dt.dayofweek
    df['weekofmonth'] = df['dayofmonth'].apply(lambda x: ((x - 1)//7 + 1))
    
    return df

def price_features(df):
    df['lag_price_t1'] = (df.groupby(['id'])['sell_price'].transform(lambda x: x.shift(1))).round(2)
    df['price_change_t1'] = (df['lag_price_t1'] - df['sell_price']) / (df['lag_price_t1'])
    df['rolling_price_max_t365'] = df.groupby(['id'])['sell_price'].transform(lambda x: x.shift(1).rolling(365).max())
    df['price_change_t365'] = (df['rolling_price_max_t365'] - df['sell_price']) / (df['rolling_price_max_t365'])
    df['rolling_price_std_t7'] = df.groupby(['id'])['sell_price'].transform(lambda x: x.rolling(7).std())
    df['rolling_price_std_t30'] = df.groupby(['id'])['sell_price'].transform(lambda x: x.rolling(30).std())
    df.drop(['rolling_price_max_t365', 'lag_price_t1'], inplace = True, axis = 1)
    
    return df

def gen_features(df):
    df = demand_features(df)
    df = seasonal_features(df)
    df = price_features(df)
    
    return df

def cat_encode(df):
    
    cat_cols = ['item_id', 'dept_id', 'cat_id', 'state_id', 'event_name_1', 'event_name_2']
    for col in cat_cols:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
    
    return df


In [None]:
df_te.info()

In [46]:
# Generating features by store to limit the usage of memory


# Directory to save the pickle files
os.mkdir('/kaggle/working/DataFeatures')

df_te.drop(columns = ['event_type_1', 'event_type_2', 'day', 'wm_yr_wk'], inplace = True)

store_list = list(df_te.store_id.unique())

for store in store_list:
    globals()['df_' + str(store)] = df_te[df_te.store_id == store]
    globals()['df_' + str(store)] = gen_features(globals()['df_' + str(store)])
    print('Features generated for store - '+str(store))
    flt_features = [col for col in globals()['df_' + str(store)].columns if '_t' in col]
    print(flt_features)
    for ftr in flt_features:
        globals()['df_' + str(store)][ftr] = globals()['df_' + str(store)][ftr].apply(lambda x: round(x, 2))
        
    globals()['df_' + str(store)] = cat_encode(globals()['df_' + str(store)])   
    
    globals()['df_' + str(store)] = downcast(globals()['df_' + str(store)])    
    
    globals()['df_' + str(store)].to_pickle('/kaggle/working/DataFeatures/data_'+str(store)+'.pkl')
#     del globals()['df_' + str(store)]
    


In [None]:
df_CA_1.tail()

In [None]:
# Downloading the data features as a pickle files from output directory

import zipfile
from IPython.display import FileLink

def zip_dir(directory = '/kaggle/working/DataFeatures', file_name = 'DataFeatures.zip'):
    print(directory)
    """
    zip all the files in a directory
       
    Returns
    _____
    Creates a hyperlink, which can be used to download the zip file)
    """
    os.chdir(directory)
    zip_ref = zipfile.ZipFile(file_name, mode='w')
    for folder, _, files in os.walk(directory):
        for file in files:
            if file_name in file:
                pass
            else:
                zip_ref.write(os.path.join(folder, file))

    return FileLink(file_name)


!zip -r DataFeatures.zip /kaggle/working/DataFeatures

zip_dir()


# 3. Hypertuning the LightGBM parameters

In [33]:
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer
from lightgbm import LGBMRegressor as lgbr
from sklearn.metrics import mean_squared_error


def hypertune_lgb(data) :
    
    
    ftrs = data.columns
    ftrs = list(set(ftrs) - set(['d', 'date', 'demand', 'store_id', 'id']))
    
    
    X_train = data[(data.d < 1914)]
    y_train = X_train['demand']
    X_train = X_train[ftrs]
    
    
    param_grid = {'n_estimators':hp.quniform('n_estimators', 900, 1200, 100),
               'learning_rate':hp.quniform('learning_rate', 0.1, 0.4, 0.1),
               'max_depth':hp.quniform('max_depth', 4,8,1),
               'num_leaves':hp.quniform('num_leaves', 25,75,25),
               'colsample_bytree':hp.quniform('colsample_bytree', 0.5, 0.9, 0.1)

              }

    def objective_grid(params):
        
        params = {'n_estimators': int(params['n_estimators']),
                  'learning_rate': params['learning_rate'],
                  'max_depth': int(params['max_depth']),
                  'num_leaves': int(params['num_leaves']),
                  'colsample_bytree': params['colsample_bytree']

                 }

        lgb_model = lgbr(**params)
        score = cross_val_score(lgb_model, X_train, y_train, cv=StratifiedKFold(),
                                scoring=make_scorer(mean_squared_error, greater_is_better=False), n_jobs=-1).mean()


    #Running through 10 iterations to identify the best params
    bestParams = fmin(fn=objective_grid, space= param_grid, max_evals=10 , algo=tpe.suggest)
    
    return bestParams



#Identifying the best parameters for 1 store  (using subset of store data)

for store in store_list :
    print('*****Prediction for Store: {}*****'.format(store))
    df = pd.read_pickle("/kaggle/input/M5-CustomFeatures2/data_"+str(store)+".pkl")
    
    df_filter = pretrain_filter(df, 1600)
    
    best = hypertune_lgb(df_filter)
    

In [None]:
gc.collect()

# 4. Training LightGBM model 

In [101]:
import lightgbm as lgb

def train_lightGBM(df):
    
    # Features to be included for model training
    ftrList = df.columns
    ftrList = list(set(ftrList) - set(['d', 'date', 'demand', 'store_id', 'id']))
    
    print(ftrList)
    
    # Train - Valid split
    x_train = df[(df.d < 1914)]
    y_train = x_train['demand']
    
    x_val = df[(df.d >= 1914) & (df.d < 1942)]
    y_val = x_val['demand']
    
    x_train.shape
    
    
    x_test = df[(df.d >= 1942)]
#     del dte
#     gc.collect()

    # parammeters
    params = {
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'objective': 'huber',
#         'objective': 'tweedie',
#         'tweedie_variance_power': 1.1,
        'n_jobs': -1,
        'seed': 236,
        'bagging_fraction': 0.75,
        'lambda_l2' : 0.1,
        'bagging_freq': 10, 
        'tree_learner':'voting',
        'device' : 'gpu',
        'gpu_platform_id' : 0,
        'gpu_device_id': 0
    }
    
    # Tuned Hyperparametrs
    
    params_2  = {
            'n_estimators': 550,
           'max_depth': 6,
           'num_leaves': 30,
           'colsample_bytree': 0.75,
            'learning_rate': 0.1   
        }
        
    params.update(params_2)

    train_set = lgb.Dataset(x_train[ftrList], y_train)
    val_set = lgb.Dataset(x_val[ftrList], y_val)
    
    print(x_val.shape)
    
#     del x_train, y_train
    
    

    model = lgb.train(params, train_set, num_boost_round = 2500, early_stopping_rounds = 50,
                      valid_sets = [train_set, val_set], verbose_eval = 100)
#     model = lgb.LGBMRegressor.fit(params, x_train, y_train, eval_set=[(x_train,y_train),(x_val,y_val)], early_stopping_rounds = 50, verbose = 20)

    val_pred = model.predict(x_val[ftrList])
    val_score = np.sqrt(mean_squared_error(val_pred, y_val))
    print(f'Our val rmse score is {val_score}')
    y_pred = model.predict(x_test[ftrList])
    x_test['demand'] = y_pred
      
    # Returning both model and test data predictions
    return x_test, model


In [94]:
store_list[-3:-2]

array(['WI_1'], dtype=object)

In [102]:
# Training models - store-wise (10 models)
# Saving the output to df_test_{store_id} and also model to model_{store_id}



# final_sub = pd.DataFrame(columns = submission.columns)

# ftrList = list(df_te.columns)
# ftrList = list(set(ftrList) - set(['d', 'date', 'demand', 'store_id', 'id']))
# len(ftrList)


for store in store_list:
    print('*****Prediction for Store: {}*****'.format(store))
    globals()['df_'+ str(store)] = pd.read_pickle("/kaggle/input/M5-CustomFeatures2/data_"+str(store)+".pkl")
    
    
    
    globals()['df_'+ str(store)] = pretrain_filter(globals()['df_'+ str(store)], 850)
    
    ftrList = list(globals()['df_'+ str(store)].columns)
    ftrList = list(set(ftrList) - set(['d', 'date', 'demand', 'store_id', 'id']))
 


    globals()['df_test_'+ str(store)], globals()['model_'+ str(store)] = train_lightGBM(globals()['df_'+ str(store)])
    
#     joblib.dump(globals()['model_'+ str(store)], '/kaggle/working/model_'+str(store)+'.pkl')
    
    
    

*****Prediction for Store: CA_1*****
['snap_WI', 'rolling_mean_t90', 'lag_t30', 'rolling_price_std_t30', 'snap_TX', 'month', 'weekofmonth', 'rolling_std_t30', 'sell_price', 'rolling_kurt_t30', 'lag_t29', 'year', 'dayofweek', 'price_change_t1', 'rolling_mean_t180', 'lag_t28', 'event_name_1', 'dayofmonth', 'rolling_mean_t30', 'dept_id', 'rolling_std_t7', 'weekofyear', 'item_id', 'rolling_price_std_t7', 'rolling_mean_t7', 'snap_CA', 'rolling_skew_t30', 'cat_id', 'state_id', 'event_name_2', 'price_change_t365']
(85372, 36)
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3220
[LightGBM] [Info] Number of data points in the train set: 3241087, number of used features: 30
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[

In [103]:
del df_test_final
df_test_final = pd.DataFrame(columns = df_test_CA_1.columns)

for store in store_list:
    df_test_final = df_test_final.append(globals()['df_test_'+ str(store)])
    
df_test_final.shape
    

(853720, 36)

In [104]:
df_test_final.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,demand,date,d,event_name_1,...,year,month,weekofyear,dayofmonth,dayofweek,weekofmonth,price_change_t1,price_change_t365,rolling_price_std_t7,rolling_price_std_t30
59181090,HOBBIES_1_001_CA_1_evaluation,1437,3,1,CA_1,0,0.609984,2016-05-23,1942,30,...,2016,5,21,23,0,4,0.0,0.0,0.0,0.0
59181091,HOBBIES_1_002_CA_1_evaluation,1438,3,1,CA_1,0,0.20262,2016-05-23,1942,30,...,2016,5,21,23,0,4,0.0,0.0,0.0,0.0
59181092,HOBBIES_1_003_CA_1_evaluation,1439,3,1,CA_1,0,0.432644,2016-05-23,1942,30,...,2016,5,21,23,0,4,0.0,0.0,0.0,0.0
59181093,HOBBIES_1_004_CA_1_evaluation,1440,3,1,CA_1,0,1.311941,2016-05-23,1942,30,...,2016,5,21,23,0,4,0.0,0.0,0.0,0.0
59181094,HOBBIES_1_005_CA_1_evaluation,1441,3,1,CA_1,0,0.892137,2016-05-23,1942,30,...,2016,5,21,23,0,4,0.0,0.0,0.0,0.0


# 5. Transforming the test data predictions  (d- 1942 to 1969) into submission format

In [105]:

def predict(test, submission):
    predictions = test[['id', 'date', 'demand']]
    predictions = pd.pivot(predictions, index = 'id', columns = 'date', values = 'demand').reset_index()
    predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
    validation_rows = [row for row in submission['id'] if 'evaluation' in row] 
    validation = submission[submission['id'].isin(validation_rows)]
    validation['id'] = validation['id'].str.replace('_evaluation','_validation')
    evaluation = submission[['id']].merge(predictions, on = 'id')
    final = pd.concat([validation, evaluation])
#     final.to_csv('submission.csv', index = False)
    
    return final

sub = predict(df_test_final, submission)


# submission = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sample_submission.csv')
# submission

In [106]:
sub.to_csv('/kaggle/working/submission_hub.csv', index = False)
sub.tail()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
30485,FOODS_3_823_WI_3_evaluation,0.268591,0.255323,0.251298,0.322343,0.414304,0.401227,0.38094,0.301011,0.391339,...,0.454402,0.482919,0.596501,0.408235,0.479233,0.479493,0.415987,0.443299,0.519808,0.474259
30486,FOODS_3_824_WI_3_evaluation,0.159667,0.172015,0.225666,0.237938,0.267842,0.292276,0.288183,0.219277,0.232293,...,0.159918,0.188728,0.188728,0.140711,0.150704,0.150704,0.164283,0.185401,0.242884,0.224948
30487,FOODS_3_825_WI_3_evaluation,0.427591,0.400598,0.403648,0.416184,0.466004,0.601378,0.633713,0.460191,0.452317,...,0.610567,0.710513,0.720211,0.622596,0.613489,0.572602,0.50783,0.593669,0.629409,0.629779
30488,FOODS_3_826_WI_3_evaluation,0.651644,0.691362,0.613448,0.611846,0.739375,0.907657,0.804703,0.6191,0.61909,...,0.825607,0.994348,0.95022,0.801783,1.168471,0.979524,0.808202,0.882771,1.128162,0.934209
30489,FOODS_3_827_WI_3_evaluation,0.488638,0.470574,0.439066,0.424163,0.43877,0.629591,0.657121,0.505377,0.603194,...,0.656164,0.859605,0.734528,0.731424,1.054513,0.925136,0.823367,1.003751,1.339789,1.315974


In [None]:

import shutil
shutil.rmtree('/kaggle/working/DataFeatures.zip')

# # os.remove("fi.zip")

# # os.listdir('/kaggle/working/')
# os.mkdir('/kaggle/working/DataFeatures')

# # calendar.to_pickle('DataFeatures/cal.pkl')


In [None]:
# for store in store_list:
    
#     print('*****Prediction for Store: {}*****'.format(store))
#     globals()['df_'+ str(store)] = pd.read_pickle('/kaggle/input/M5-CustomFeatures2/data_TX_2.pkl')
# #     df_test = pretrain_filter(globals()['df_'+ str(store)], 1941)
#     model = joblib.load('/kaggle/input/Models/model_TX_2.pkl')
    
#     df_test['demand'] = model.predict(df_test[ftrList])
    
    
    
#     break
    
    
    

In [43]:
# compute importances
importance_df = (
    pd.DataFrame({
        'feature_name': model_CA_1.feature_name(),
        'importance_gain': model_CA_1.feature_importance(importance_type='gain'),
        'importance_split': model_CA_1.feature_importance(importance_type='split'),
    })
    .sort_values('importance_gain', ascending=False)
    .reset_index(drop=True)
)
print(importance_df)

             feature_name  importance_gain  importance_split
0        rolling_mean_t30     1.697307e+07               772
1        rolling_mean_t90     5.035025e+06               892
2       rolling_mean_t180     4.690806e+06              1202
3         rolling_mean_t7     1.519796e+06               795
4       price_change_t365     9.932589e+05               617
5                 lag_t28     7.924144e+05               571
6         rolling_std_t30     6.978736e+05               519
7              sell_price     6.778739e+05              1735
8               dayofweek     4.676320e+05               479
9              weekofyear     3.639261e+05              1262
10                item_id     2.707389e+05              1093
11  rolling_price_std_t30     2.280203e+05               413
12        price_change_t1     1.154081e+05                75
13                lag_t29     8.725339e+04               260
14                  month     8.503246e+04               449
15           event_name_