In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

# Get data

In [None]:
DATA_PATH = '../input/walmart-recruiting-store-sales-forecasting/'

# The stores table is loaded as-is; no date conversion is applied to it.
stores = pd.read_csv(DATA_PATH + 'stores.csv')

# Every other table has its Date column converted to datetime right after loading.
features = pd.read_csv(DATA_PATH + 'features.csv.zip').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

train = pd.read_csv(DATA_PATH + 'train.csv.zip').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

test = pd.read_csv(DATA_PATH + 'test.csv.zip').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)


In [None]:
# Peek at the first five rows of the stores table.
stores.head(5)

In [None]:
# Peek at the first five rows of the training table.
train.head(5)

In [None]:
# Peek at the first five rows of the features table.
features.head(5)

# Join all data for train and test set

In [None]:
# Attach the stores table first and the features table second. Both are left
# joins on whatever columns the two frames have in common, so every row of the
# left-hand frame is kept.
all_data = (
    train
    .merge(stores, how='left')
    .merge(features, how='left')
)

all_test_data = (
    test
    .merge(stores, how='left')
    .merge(features, how='left')
)
        

In [None]:
# Peek at the first five rows of the joined training frame.
all_data.head(5)

# EDA

There are 45 distinct stores and 81 distinct departments. Combined, they give 3,331 distinct (store, department) time series, since not every store has every department.

PS: Although statistical time-series modeling requires the autocorrelation and partial autocorrelation functions (ACF and PACF) to set the parameters of a SARIMA-family model, it would not be practical to fit a separate SARIMA model to each of the 3,331 time series at this level of granularity. So, from now on, I will use the mean weekly sales to summarize all the time series as one, so that the EDA is more effective while keeping some statistical rigor.

In [None]:
# Count the distinct stores, departments and (store, department) pairs, in that order.
for key_cols in (['Store'], ['Dept'], ['Store', 'Dept']):
    print(all_data[key_cols].drop_duplicates().count())

In [None]:
# One line: the mean of Weekly_Sales over all rows sharing the same Date.
fig, ax = plt.subplots(figsize=(15, 3))
all_data.groupby('Date')['Weekly_Sales'].mean().plot(ax=ax)
ax.set_title('Average weekly Sales of the company across all stores in train dataset timeframe', fontsize=18)
ax.set_xlabel('Date', fontsize=16)
ax.set_ylabel('Sales', fontsize=16);

In [None]:
# First and last Date present in the training frame.
# The aggregations are given by name: passing the builtins `min`/`max` to
# `agg` is deprecated in recent pandas releases, while the string names
# produce the same 'min' / 'max' labels on every version.
all_data.agg({'Date': ['min', 'max']}).T


Looking at the mean weekly sales across departments and stores (from February 2010 to October 2012) plotted above, the spikes at the end of each year (2010 and 2011) stand out. This could be a yearly seasonality, perhaps due to Black Friday, Christmas and the other commemorative dates. Since we expect different behavior on normal days than on commemorative ones, this should be used as a feature in our modeling.

So let's derive some features from the Date column:

In [None]:
def get_date_features(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Derive calendar features from the Date column.

    Inputs:
        - df: pandas DataFrame; it must have a column named Date and it must be datetime type
    Outputs:
        A new DataFrame (the input frame is left untouched) holding every
        original column plus:
        - Year: year extracted from Date column;
        - Month: month extracted from Date column;
        - Day: day of month extracted from Date column;
        - WeekOfYear: ISO week number extracted from Date column;

    Example of usage:
        > print(df)
            |Date      |
            |2010-02-05|

        > df2 = get_date_features(df)
        > print(df2)
            |Date      |Year|Month|Day|WeekOfYear|
            |2010-02-05|2010|2    |5  |5         |
    '''
    dates = df['Date'].dt

    # `assign` returns a copy, so the caller's frame is not mutated and calling
    # this function repeatedly on the same input is safe.
    return df.assign(
        Year=dates.year,
        Month=dates.month,
        Day=dates.day,
        # ISO weeks can belong to the neighbouring calendar year around New
        # Year (e.g. 2012-12-31 falls in ISO week 1).
        WeekOfYear=dates.isocalendar().week.astype(int),
    )

# Add the same date-derived columns to both the training and the test frame.
all_data, all_test_data = map(get_date_features, (all_data, all_test_data))

# Looking for yearly seasonalities

In [None]:
# Mean weekly sales per week of the year, computed separately for each year.
mean_weekly_sales_2010, mean_weekly_sales_2011, mean_weekly_sales_2012 = (
    all_data[all_data.Year == year].groupby('WeekOfYear')['Weekly_Sales'].mean()
    for year in (2010, 2011, 2012)
)

plt.figure(figsize=(22,8))
# Draw the years in chronological order so they line up with the legend labels.
for yearly_series in (mean_weekly_sales_2010, mean_weekly_sales_2011, mean_weekly_sales_2012):
    plt.plot(yearly_series.index, yearly_series.values)

plt.title("Average Weekly Sales - By Year", fontsize=24)
plt.xlabel('Week of Year', fontsize=20, labelpad=20)
plt.ylabel('Sales', fontsize=20, labelpad=20)

# One tick per week number, 1 through 52.
plt.xticks(np.arange(1, 53, step=1), fontsize=16)
plt.yticks(fontsize=16)

plt.legend(['2010', '2011', '2012'], fontsize=20);

The behavior above can be described as yearly seasonality, because the 2010, 2011 and 2012 time series "walk" together, up and down, through the weeks of the year.

In [None]:
plt.figure(figsize=(15,3))
# Select Weekly_Sales BEFORE aggregating: averaging the whole frame would also
# try to average the string-valued Type column, which raises a TypeError on
# pandas >= 2.0, and would waste work on columns that are discarded anyway.
for year in (2010, 2011, 2012):
    all_data[all_data['Year']==year].groupby('Month')['Weekly_Sales'].mean().plot()
plt.title('Average weekly Sales of the company in each year', fontsize=18)
# The legend labels follow the order in which the lines were drawn above.
plt.legend(['2010', '2011', '2012'], loc='best', fontsize=16)
plt.ylabel('Sales', fontsize=16)
plt.xlabel('Months', fontsize=16);

This is the same view as the previous plot, but with months on the x axis. It shows the same pattern for all three time series.

In [None]:
plt.figure(figsize=(15,3))
# Select Weekly_Sales BEFORE aggregating: averaging the whole frame would also
# try to average the string-valued Type column, which raises a TypeError on
# pandas >= 2.0, and would waste work on columns that are discarded anyway.
for store_type in ('A', 'B', 'C'):
    all_data[all_data['Type']==store_type].groupby('Month')['Weekly_Sales'].mean().plot()
plt.title('Average weekly Sales of the company by type of the store', fontsize=18)
# The legend labels follow the order in which the lines were drawn above.
plt.legend(['Type A', 'Type B', 'Type C'], loc='best', fontsize=16)
plt.ylabel('Sales', fontsize=16)
plt.xlabel('Months', fontsize=16);

Type A and Type B stores follow the same pattern of ups and downs by month, but at different levels; in other words, the three time series sit at clearly different average sales levels.
So we expect the month to affect sales differently depending on the store type, and we should therefore include Type as a feature in our model too.

Below we see that the MarkDown columns have more than 60% missing data, so I decided neither to fill them nor to use them, due to the lack of information.

In [None]:
# Fraction of missing values in each column (mean of the boolean NA mask).
all_data.isna().mean()

In [None]:
# Peek at the joined training frame again, now that the date-derived columns have been added.
all_data.head(5)

# Feature Engineering

Select the numerical columns as raw features and encode the categorical columns so they can be used as features. Also set Weekly_Sales as the target column.

In [None]:
# Columns fed to the model unchanged.
numeric_cols = [
    'Size',
    'Temperature',
    'Fuel_Price',
    'CPI',
    'Unemployment',
    'Year',
    'Month',
    'WeekOfYear',
]

# Columns that are one-hot encoded before modeling.
categorical_cols = [
    'IsHoliday',
    'Type',
]

# Column the model is trained to predict.
target_col = 'Weekly_Sales'

Below we apply a OneHotEncoder to the IsHoliday and Type columns. I chose this encoding because these are nominal categories and each column has only a few categories (IsHoliday has 2 and Type has 3).

PS: we have to apply the same transformation to train and test in order to fit and predict properly.

In [None]:
# Fit ONE encoder, on the training data only, and reuse it for the test set.
# Fitting a second encoder on the test data could yield a different set (or
# order) of one-hot columns whenever the categories seen in the two sets
# differ, and the model would then receive misaligned features at predict time.
# The encoder keeps its default (sparse) output and the result is densified
# with `.toarray()` below, which avoids the `sparse=` constructor argument that
# was renamed in newer scikit-learn releases.
encoder_train = OneHotEncoder(handle_unknown='ignore').fit(all_data[categorical_cols])

# `get_feature_names` was replaced by `get_feature_names_out` in newer
# scikit-learn releases; use whichever one this installation provides.
_get_encoded_names = (
    getattr(encoder_train, 'get_feature_names_out', None)
    or encoder_train.get_feature_names
)
encoded_cols_train = list(_get_encoded_names(categorical_cols))

# Aliases keep the test-side names available for the cells that refer to them.
encoder_test = encoder_train
encoded_cols_test = encoded_cols_train

# Categories unseen during fit are encoded as all zeros (handle_unknown='ignore').
all_data[encoded_cols_train] = encoder_train.transform(all_data[categorical_cols]).toarray()
all_test_data[encoded_cols_test] = encoder_test.transform(all_test_data[categorical_cols]).toarray()

# Modeling

In [None]:
# Feature matrices: the raw numeric columns followed by the one-hot columns.
X = all_data.loc[:, numeric_cols + encoded_cols_train]
X_test = all_test_data.loc[:, numeric_cols + encoded_cols_test]

Set 25 trees, each with a maximum depth of 4. This keeps the baseline model simple and helps avoid overfitting.

In [None]:
# Baseline regressor: a small, shallow ensemble with a fixed seed.
model = XGBRegressor(
    n_estimators=25,
    max_depth=4,
    random_state=42,
)


In [None]:
%%time
# Fit the baseline model on the full training feature matrix; the cell magic above reports how long it takes.
model.fit(X, all_data[target_col])

In [None]:
# Predictions on X, the same feature matrix the model was fitted on (in-sample).
preds = model.predict(X)

In [None]:
# Predictions for the test feature matrix.
preds_test = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

def rmse(y, y_hat) -> float:
    '''
    Root mean squared error between two vectors.

    Inputs:
        - y: array-like of numbers (list, numpy array, pandas Series, ...)
        - y_hat: array-like of numbers with the same number of elements as y
    Outputs:
        - the RMSE as a plain Python float
    Raises:
        - ValueError: if the inputs are empty or differ in length

    Computed directly with numpy so it does not depend on the `squared`
    argument of sklearn's mean_squared_error, which is deprecated and removed
    in newer scikit-learn releases.
    '''
    # Flatten to 1-D arrays so the values are compared by position; any pandas
    # index on the inputs is ignored.
    actual = np.ravel(np.asarray(y, dtype=float))
    predicted = np.ravel(np.asarray(y_hat, dtype=float))

    if actual.size != predicted.size:
        raise ValueError(
            'y and y_hat must have the same number of elements, got {} and {}'.format(
                actual.size, predicted.size
            )
        )
    if actual.size == 0:
        raise ValueError('y and y_hat must not be empty')

    return float(np.sqrt(np.mean((actual - predicted) ** 2)))

In [None]:
# RMSE of the baseline model on the same data it was fitted on (in-sample error).
rmse(all_data[target_col], preds)

The baseline RMSE is 21549. There is still a lot to be done to reduce this error.

Below I show the feature importances to understand what is helping (and what is not) our model reach this performance, and to guide us in collecting and building new features for v2 of this model.

In [None]:
# Pair every feature name with the importance the fitted model assigns to it,
# most important first.
importance_df = (
    pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)

importance_df.head(10)

In [None]:
import seaborn as sns

# Horizontal bars for the ten most important features.
top_features = importance_df.head(10)
plt.figure(figsize=(10, 6))
plt.title('Feature Importance')
sns.barplot(x='importance', y='feature', data=top_features);

Apparently the Size feature is the most important one for the model. The econometric data helps (but not as much as Size), maybe because it is not specific to Walmart sales but rather a "temperature" of the economy, so it may be more of an inferential feature than a predictive one. Collecting, for example, the amount of money spent on each promotional event might help more on the predictive side.

I also always run a small KFold cross-validation to check that the model generalizes, so we can trust it when it is deployed and has to predict on data it has never had **any contact** with.

In [None]:
from sklearn.model_selection import KFold

def train_and_evaluate(X_train, train_targets, X_val, val_targets, **params):
    '''
    Fit an XGBRegressor on the training split and score it on both splits.

    Inputs:
        - X_train, train_targets: features and targets used to fit the model
        - X_val, val_targets: held-out features and targets used for validation
        - **params: keyword arguments forwarded to XGBRegressor; they take
          precedence over the defaults set here (random_state=42, n_jobs=-1)
    Outputs:
        - model: the fitted XGBRegressor
        - train_rmse: RMSE on the training split
        - val_rmse: RMSE on the validation split
    '''
    # Merge the defaults with the caller's params instead of passing both as
    # keywords, so supplying random_state or n_jobs overrides the default
    # rather than raising a duplicate-keyword TypeError.
    model_params = {'random_state': 42, 'n_jobs': -1, **params}
    model = XGBRegressor(**model_params)
    model.fit(X_train, train_targets)

    # Arguments follow rmse's (y, y_hat) order: true values first.
    train_rmse = rmse(train_targets, model.predict(X_train))
    val_rmse = rmse(val_targets, model.predict(X_val))
    return model, train_rmse, val_rmse

In [None]:
# Five folds without shuffling (spelled out here; it is also the default).
# NOTE(review): unshuffled folds are contiguous row ranges of X, so the split
# depends on how the training rows are ordered - confirm that this is intended.
kfold = KFold(n_splits=5, shuffle=False)

# Independent copy of the target column, indexed positionally in the CV loop.
targets = all_data.loc[:, target_col].copy()

In [None]:
# One fitted model per fold, in fold order.
models = []

for train_idxs, val_idxs in kfold.split(X):
    X_train, train_targets = X.iloc[train_idxs], targets.iloc[train_idxs]
    X_val, val_targets = X.iloc[val_idxs], targets.iloc[val_idxs]
    # The result is bound to `fold_model`, not `model`, so the baseline
    # estimator fitted earlier in the notebook is not silently replaced by the
    # last fold's model (which would change what earlier cells do on re-run).
    fold_model, train_rmse, val_rmse = train_and_evaluate(X_train,
                                                          train_targets,
                                                          X_val,
                                                          val_targets,
                                                          max_depth=4,
                                                          n_estimators=20)
    models.append(fold_model)
    print('Train RMSE: {}, Validation RMSE: {}'.format(train_rmse, val_rmse))

As we see above, the train and validation performance are not very different within each fold, and they do not vary much across folds, so the model can be expected to generalize reasonably well.

## Things to try for v2 of this modeling

Something I would do in v2 of this model is use the 1-week lag of sales as a feature. An important characteristic of time series is autoregression: the value at time $t$ depends on the series' own value at time $t-1$ (the classical AR(1) model is $Y_t = \mu + \theta Y_{t-1} + \varepsilon_t$). This could increase our predictive performance compared to this MVP.