# Baseline model

## Import data and setup

In [57]:
# Import libs
import os
import pandas as pd
import numpy as np

In [54]:
# given error metric to calculate RMSPE
def metric(preds, actuals):
    """Return the root mean squared percentage error (RMSPE), in percent.

    Computes ``100 * sqrt(mean(((actuals - preds) / actuals) ** 2))``.

    Args:
        preds: Predicted values. Any array-like (ndarray, list, pandas
            object) of any shape; it is flattened before use.
        actuals: Ground-truth values with the same number of elements as
            ``preds`` once flattened.

    Returns:
        The RMSPE scaled to percent (e.g. ``10.0`` means 10 %).

    Raises:
        AssertionError: If the flattened inputs differ in length.

    Note:
        Every element of ``actuals`` is used as a divisor, so zeros in
        ``actuals`` produce ``inf``/``nan`` rather than being skipped.
    """
    # Coerce to float arrays: this accepts lists / pandas objects and avoids
    # wrap-around when subtracting unsigned-integer inputs.
    preds = np.asarray(preds, dtype=float).reshape(-1)
    actuals = np.asarray(actuals, dtype=float).reshape(-1)
    # Raised explicitly (instead of a bare `assert`) so the check survives
    # `python -O` while keeping the exception type callers already see.
    if preds.shape != actuals.shape:
        raise AssertionError(
            f"preds and actuals differ in size: {preds.shape} vs {actuals.shape}"
        )
    # ||e||_2 / sqrt(n) equals sqrt(mean(e ** 2)).
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

In [28]:
# set up paths
# Directory holding the X_*/y_* CSV files; the path is relative, so it is
# resolved against the notebook's current working directory.
data_dir = "../data"

In [59]:
# Load the feature (X_*) and target (y_*) tables for the train and test
# splits from data_dir, reading the files in a fixed order.
X_train, X_test, y_train, y_test = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('X_train.csv', 'X_test.csv', 'y_train.csv', 'y_test.csv')
)

# Sanity check: print each table's (rows, columns), one per line.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(397900, 24)
(99476, 24)
(397900, 1)
(99476, 1)


In [60]:
# Show the column labels of the loaded training features.
X_train.columns

Index(['Date', 'Store', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'Month', 'Year',
       'Weekday', 'Holiday', 'StoreType_enc', 'Assortment_enc', 'Store_enc'],
      dtype='object')

## Select training features

In [62]:
# Columns used as model inputs, in the order they are selected from the
# feature tables.
cols = [
    'Open',
    'Promo',
    'Month',
    'Year',
    'Weekday',
    'SchoolHoliday',
    'Holiday',
    'StoreType_enc',
    'Assortment_enc',
    'Store_enc',
]

In [63]:
# Restrict the training features to the columns listed in `cols`; every other
# loaded column (e.g. 'Sales', 'Customers', 'Date') is dropped.
X_train = X_train[cols]
# Bare expression: displays the reduced frame as the cell output.
X_train

Unnamed: 0,Open,Promo,Month,Year,Weekday,SchoolHoliday,Holiday,StoreType_enc,Assortment_enc,Store_enc
0,1.0,0.0,4,2014,0,0.0,0,6840.915870,7129.431344,9500.651917
1,1.0,0.0,2,2014,2,0.0,0,6822.692432,7129.431344,4913.129794
2,1.0,1.0,1,2013,2,0.0,0,6661.654185,7129.431344,6791.359116
3,1.0,1.0,7,2014,4,0.0,0,6661.654185,7129.431344,6078.828804
4,1.0,0.0,6,2014,2,0.0,0,6840.915870,6563.090397,7400.400560
...,...,...,...,...,...,...,...,...,...,...
397895,1.0,1.0,10,2013,3,0.0,0,6840.915870,7129.431344,3492.442786
397896,1.0,0.0,2,2014,1,0.0,0,6661.654185,7129.431344,8606.338936
397897,1.0,1.0,6,2013,1,0.0,0,6661.654185,6563.090397,6776.713018
397898,1.0,1.0,6,2013,3,0.0,0,6840.915870,6563.090397,4851.476190


In [64]:
# Apply the same column selection to the test features so both splits share
# an identical set and order of columns.
X_test = X_test[cols]
# Bare expression: displays the reduced frame as the cell output.
X_test

Unnamed: 0,Open,Promo,Month,Year,Weekday,SchoolHoliday,Holiday,StoreType_enc,Assortment_enc,Store_enc
0,1.0,0.0,10,2013,1,0.0,0,6840.915870,6563.090397,4358.621387
1,1.0,1.0,6,2013,4,0.0,0,6661.654185,6563.090397,5680.800546
2,1.0,1.0,3,2013,4,0.0,0,6840.915870,6563.090397,6879.028490
3,1.0,0.0,4,2013,2,0.0,0,6840.915870,7129.431344,3664.011299
4,1.0,1.0,11,2013,3,0.0,0,6661.654185,7129.431344,6286.585561
...,...,...,...,...,...,...,...,...,...,...
99471,1.0,1.0,6,2013,4,0.0,0,6840.915870,6563.090397,7570.425876
99472,1.0,0.0,1,2014,3,0.0,0,6840.915870,6563.090397,7003.822289
99473,1.0,1.0,5,2014,4,0.0,0,6840.915870,6563.090397,5120.865435
99474,1.0,1.0,3,2014,4,0.0,0,6840.915870,7129.431344,7747.239583


## Baseline predictions using constant values (mean, median, mode)

In [76]:
# Constant baseline predictors derived from the training target.
# .mean() / .median() reduce y_train to one value per column.
mean_sales = y_train.mean()
median_sales = y_train.median()
# DataFrame.mode() returns one row per tied mode. Keep only the first row so
# the result is always a single value per column and can be broadcast into a
# constant prediction array even when the target is multimodal.
mode_sales = y_train.mode().head(1)

In [73]:
# Baseline 1: predict the training mean for every test row and score it.
y_hat1 = np.full(shape=(len(y_test), 1), fill_value=mean_sales)
metric(preds=y_hat1, actuals=y_test.to_numpy())

61.33499739289624

In [74]:
# Baseline 2: predict the training median for every test row and score it.
y_hat2 = np.full(shape=(len(y_test), 1), fill_value=median_sales)
metric(preds=y_hat2, actuals=y_test.to_numpy())

53.79453665223901

In [75]:
# Baseline 3: predict the training mode for every test row and score it.
y_hat3 = np.full(shape=(len(y_test), 1), fill_value=mode_sales)
metric(preds=y_hat3, actuals=y_test.to_numpy())

47.060713436233584