# Walmart Recruiting - Store Sales Forecasting
Use historical markdown data to predict store sales


![](https://cw39.com/wp-content/uploads/sites/10/2021/06/AP20323809226583.jpg?w=1752&h=986&crop=1)

# Importing Libraries

In [None]:
# Install the numerical and plotting libraries used below (--quiet hides pip's output).
!pip install numpy pandas matplotlib seaborn --quiet

In [None]:
!pip install jovian opendatasets xgboost graphviz lightgbm scikit-learn xgboost lightgbm --upgrade --quiet

In [None]:
import os
import opendatasets as od
import pandas as pd
# Let pandas display up to 120 columns and 120 rows before truncating output.
pd.set_option("display.max_columns", 120)
pd.set_option("display.max_rows", 120)

## Downloading the Data

We can download the dataset from Kaggle directly within the Jupyter notebook using the `opendatasets` library.

In [None]:
# Download the competition data from Kaggle with opendatasets.
od.download('https://www.kaggle.com/c/walmart-recruiting-store-sales-forecasting')

In [None]:
# List the files in the downloaded competition folder.
os.listdir('walmart-recruiting-store-sales-forecasting')

In [None]:
from zipfile import ZipFile

# Unpack every zipped competition CSV into the data folder, then list the folder contents.
data_dir = 'walmart-recruiting-store-sales-forecasting'
zipped_tables = ('features.csv.zip', 'sampleSubmission.csv.zip', 'test.csv.zip', 'train.csv.zip')

for archive_name in zipped_tables:
    with ZipFile('./' + data_dir + '/' + archive_name) as archive:
        archive.extractall(path=data_dir)

os.listdir(data_dir)

Let's load the data into Pandas dataframes.

In [None]:
# Load each competition CSV into its own DataFrame.
features = pd.read_csv("./walmart-recruiting-store-sales-forecasting/features.csv")
stores = pd.read_csv("./walmart-recruiting-store-sales-forecasting/stores.csv")
# The training table (train.csv) is kept under the name `walmart`.
walmart = pd.read_csv("./walmart-recruiting-store-sales-forecasting/train.csv")
test = pd.read_csv("./walmart-recruiting-store-sales-forecasting/test.csv")
# Template that shows the expected layout of a submission file.
submission = pd.read_csv("./walmart-recruiting-store-sales-forecasting/sampleSubmission.csv")

In [None]:
# Report (rows, columns) for every table that was just loaded.
for table_name, table in (
    ("features", features),
    ("stores", stores),
    ("walmart", walmart),
    ("test", test),
    ("submission", submission),
):
    print(table_name + ".shape", table.shape)

In [None]:
# First five rows of the features table.
features.head(5)

In [None]:
# First five rows of the stores table.
stores.head(5)

In [None]:
# First five rows of the training table.
walmart.head(5)

In [None]:
# First five rows of the test table.
test.head(5)

In [None]:
# First five rows of the sample submission.
submission.head(5)

Let's merge the information from `stores` and `features` into `walmart` and `test`.

In [None]:
# Left-join the stores table and then the features table onto the training rows and the test rows.
# No `on=` is passed, so pandas joins on the column names the two frames have in common.
merged_df = walmart.merge(stores, how='left').merge(features, how='left')
merged_test_df = test.merge(stores, how='left').merge(features, how='left')

In [None]:
# Show the size of both merged frames.
print(f"merged_df.shape {merged_df.shape}")
print(f"merged_test_df.shape {merged_test_df.shape}")

In [None]:
import numpy as np
import seaborn as sns
import os
import plotly.express as px
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.style as style

style.use('seaborn-poster')
style.use("fivethirtyeight")
plt.rcParams['font.family'] = 'serif'

matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (8, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'
%matplotlib inline


### Date

First, let's convert `Date` to a datetime column and extract different parts of the date.

In [None]:
def split_date(df):
    """Parse ``df['Date']`` and add calendar-part columns to ``df`` in place.

    The ``Date`` column is replaced by its datetime version, and the columns
    ``Year``, ``Month``, ``Day`` and ``WeekOfYear`` (ISO calendar week) are added.

    Args:
        df: DataFrame with a ``Date`` column that ``pd.to_datetime`` can parse.

    Returns:
        None. The frame that was passed in is modified.
    """
    df['Date'] = pd.to_datetime(df['Date'])
    date_parts = df['Date'].dt
    for column_name, attribute in (('Year', 'year'), ('Month', 'month'), ('Day', 'day')):
        df[column_name] = getattr(date_parts, attribute)
    df['WeekOfYear'] = date_parts.isocalendar().week

In [None]:
# Add the calendar columns to both merged frames (split_date modifies its argument in place).
split_date(merged_df)
split_date(merged_test_df)

## Preprocessing and Feature Engineering

Let's take a look at the available columns, and figure out if we can create new columns or apply any useful transformations.

In [None]:
# First five rows of the merged training frame.
merged_df.head(5)

In [None]:
# Column dtypes and non-null counts of the merged training frame.
merged_df.info()

### Impute missing numerical data

In [None]:
# Number of missing values in each column of the merged training frame.
merged_df.isna().sum()

### Exploratory Data Analysis

In [None]:
# Mean Weekly_Sales per WeekOfYear, computed separately for each year.
sales_by_year = {
    year: merged_df[merged_df['Year'] == year].groupby('WeekOfYear')['Weekly_Sales'].mean()
    for year in (2010, 2011, 2012)
}
weekly_sales_2010 = sales_by_year[2010]
weekly_sales_2011 = sales_by_year[2011]
weekly_sales_2012 = sales_by_year[2012]

# One line per year on a shared week-of-year axis.
plt.figure(figsize=(22, 8))
for yearly_mean in (weekly_sales_2010, weekly_sales_2011, weekly_sales_2012):
    plt.plot(yearly_mean.index, yearly_mean.values)

plt.xticks(np.arange(1, 53, step=1), fontsize=16)
plt.yticks(fontsize=16)
plt.xlabel('Week of Year', fontsize=20, labelpad=20)
plt.ylabel('Sales', fontsize=20, labelpad=20)

plt.title("Average Weekly Sales - Per Year", fontsize=24)
plt.legend(['2010', '2011', '2012'], fontsize=20);

In [None]:
# First ten rows of the merged training frame, now including the calendar columns.
merged_df.head(10)

In [None]:
# All column names available for modelling.
merged_df.columns

Let's also identify Input and Target columns.

In [None]:
# Columns fed to the model: identifiers, store attributes, external indicators,
# the five markdown columns and the calendar parts derived from Date.
markdown_inputs = ['MarkDown{}'.format(i) for i in range(1, 6)]
input_cols = (
    ['Store', 'Dept', 'IsHoliday', 'Type', 'Size', 'Temperature', 'Fuel_Price']
    + markdown_inputs
    + ['CPI', 'Unemployment', 'Year', 'Month', 'Day', 'WeekOfYear']
)
# Column the model is trained to predict.
target_col = 'Weekly_Sales'

In [None]:
# Work on copies so the preprocessing below does not modify merged_df.
inputs = merged_df[input_cols].copy()
targets = merged_df[target_col].copy()

In [None]:
# Same input columns for the test frame (it has no target column to copy).
test_inputs = merged_test_df[input_cols].copy()

Let's also identify numeric and categorical columns. Note that we can treat binary categorical columns (0/1) as numeric columns.

In [None]:
# Columns handled as numbers.
markdown_numeric = [f'MarkDown{n}' for n in range(1, 6)]
numeric_cols = [
    'Store', 'Dept', 'Size', 'Temperature', 'Fuel_Price',
    *markdown_numeric,
    'CPI', 'Unemployment', 'Year', 'Month', 'Day', 'WeekOfYear',
]
# Columns handled as categories.
categorical_cols = ['IsHoliday', 'Type']

In [None]:
#numeric_cols = ['Store', 'Dept', 'Size', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Month', 'Day', 'WeekOfYear']
#categorical_cols = ['IsHoliday', 'Type']

In [None]:
# Missing values per numeric column in the training inputs, before imputation.
inputs[numeric_cols].isna().sum()

In [None]:
# Missing values per numeric column in the test inputs, before imputation.
test_inputs[numeric_cols].isna().sum()

In [None]:
# Fill value used for missing MarkDown entries in the next cell.
zero = 0
zero

In [None]:
# Replace missing MarkDown values with `zero` in both frames.
# The filled columns are assigned back to the frame rather than calling
# fillna(inplace=True) on a column selection: that is chained assignment, which
# pandas warns about and which leaves the parent frame unchanged under Copy-on-Write.
markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
inputs[markdown_cols] = inputs[markdown_cols].fillna(zero)
test_inputs[markdown_cols] = test_inputs[markdown_cols].fillna(zero)

In [None]:
# Training-set averages, used as the fill values for missing CPI / Unemployment.
mean_CPI = inputs['CPI'].mean()
mean_Unemployment = inputs['Unemployment'].mean()

In [None]:
# Fill missing CPI / Unemployment in both frames with the training-set means.
# Assigning the result back avoids fillna(inplace=True) on a column selection,
# which does not update the parent frame under pandas Copy-on-Write.
inputs['CPI'] = inputs['CPI'].fillna(mean_CPI)
inputs['Unemployment'] = inputs['Unemployment'].fillna(mean_Unemployment)
test_inputs['CPI'] = test_inputs['CPI'].fillna(mean_CPI)
test_inputs['Unemployment'] = test_inputs['Unemployment'].fillna(mean_Unemployment)


In [None]:
# Re-check missing values in the test inputs after imputation.
test_inputs[numeric_cols].isna().sum()

In [None]:
# Re-check missing values in the training inputs after imputation.
inputs[numeric_cols].isna().sum()

### Scale Numeric Values

Let's scale numeric values to the 0 to 1 range.

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Learn each numeric column's minimum and maximum from the training inputs only.
scaler = MinMaxScaler().fit(inputs[numeric_cols])

In [None]:
# Apply the training-fitted scaling to both frames.
# NOTE(review): test values outside the training range will land outside [0, 1].
inputs[numeric_cols] = scaler.transform(inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

### Encode Categorical Columns


Let's one-hot encode categorical columns.

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Fit a single one-hot encoder on the training inputs and reuse it for the test
# inputs, so both frames get the same encoded columns in the same order.
# (A separate encoder fitted on the test set could produce a different column
# layout from the one the model is trained on.)
# `sparse_output` and `get_feature_names_out` replace the `sparse` argument and
# `get_feature_names` method that current scikit-learn releases no longer provide.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(inputs[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
# Aliases kept so the cells that reference encoder1 / encoded_cols1 keep working.
encoder1 = encoder
encoded_cols1 = encoded_cols

In [None]:
# Append the one-hot columns to each frame: training inputs via `encoder`, test inputs via `encoder1`.
inputs[encoded_cols] = encoder.transform(inputs[categorical_cols])
test_inputs[encoded_cols1] = encoder1.transform(test_inputs[categorical_cols])

Finally, let's extract out all the numeric data for training.

In [None]:
# Model features: the scaled numeric columns followed by the one-hot columns.
X = inputs[numeric_cols + encoded_cols]
X_test = test_inputs[numeric_cols + encoded_cols1]

### Training

To train a GBM, we can use the `XGBRegressor` class from the [`XGBoost`](https://xgboost.readthedocs.io/en/latest/) library.

In [None]:
from xgboost import XGBRegressor

In [None]:
# Small baseline model: 20 trees with depth at most 4 and a fixed random seed.
model = XGBRegressor(random_state=42, n_jobs=-1, n_estimators=20, max_depth=4)

Let's train the model using `model.fit`.

In [None]:
%%time
# Fit on the whole training set; the %%time cell magic above reports how long the cell takes.
model.fit(X, targets)

### Prediction

We can now make predictions and evaluate the model using `model.predict`.

In [None]:
# Predictions on the training inputs themselves (not on held-out data).
preds = model.predict(X)

### Prediction

Let's predict the submission set directly.

In [None]:
# Predictions for the competition test rows.
preds1 = model.predict(X_test)

### Evaluation

Let's evaluate the predictions using RMSE error.

In [None]:
from sklearn.metrics import mean_squared_error

def rmse(a, b):
    """Return the root mean squared error between two sets of values.

    The square root of the MSE is taken explicitly rather than passing
    ``squared=False``, an argument that scikit-learn deprecated and later
    removed from ``mean_squared_error``.

    Args:
        a: Array-like of numbers (for example predictions).
        b: Array-like of numbers with the same length as ``a`` (for example targets).

    Returns:
        The RMSE as a float.
    """
    return mean_squared_error(a, b) ** 0.5

In [None]:
# RMSE on the training data; optimistic, because the model was fitted on these same rows.
rmse(preds, targets)

### Visualization

We can visualize individual trees using `plot_tree` (note: this requires the `graphviz` library to be installed).

In [None]:
import matplotlib.pyplot as plt
from xgboost import plot_tree
from matplotlib.pylab import rcParams
%matplotlib inline

# Use a large default figure size (30 x 30) for the plots that follow.
rcParams['figure.figsize'] = 30,30

In [None]:
# Draw a tree from the fitted model with a left-to-right layout (no tree index given, so the default is used).
plot_tree(model, rankdir='LR');

In [None]:
# Draw the tree at index 1 with the same layout.
plot_tree(model, rankdir='LR', num_trees=1);

In [None]:
# Text dump of the fitted booster's trees.
trees = model.get_booster().get_dump()

In [None]:
# Number of dumped trees.
len(trees)

In [None]:
# Text form of the first tree.
print(trees[0])

### Feature importance

Just like decision trees and random forests, XGBoost also provides a feature importance score for each column in the input.

In [None]:
# Pair every input column with the importance score of the fitted model,
# ordered from the highest score to the lowest.
unsorted_importance = pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
importance_df = unsorted_importance.sort_values('importance', ascending=False)

In [None]:
# Ten highest-scoring features.
importance_df.head(10)

In [None]:
import seaborn as sns
# Horizontal bar chart of the ten highest-scoring features.
plt.figure(figsize=(10,6))
plt.title('Feature Importance')
sns.barplot(data=importance_df.head(10), x='importance', y='feature');

## K Fold Cross Validation

Notice that we didn't create a validation set before training our XGBoost model. We'll use a different validation strategy this time, called K-fold cross validation.

In [None]:
from sklearn.model_selection import KFold

Scikit-learn provides utilities for performing K fold cross validation.

Let's define a helper function `train_and_evaluate` which trains a model with the given parameters and returns the trained model, training error and validation error.

In [None]:
def train_and_evaluate(X_train, train_targets, X_val, val_targets, **params):
    """Fit an XGBRegressor on one split and score it on both parts of that split.

    Args:
        X_train: Feature rows used for fitting.
        train_targets: Target values for ``X_train``.
        X_val: Feature rows held out for validation.
        val_targets: Target values for ``X_val``.
        **params: Extra keyword arguments forwarded to ``XGBRegressor``.

    Returns:
        A tuple ``(model, train_rmse, val_rmse)``.
    """
    regressor = XGBRegressor(random_state=42, n_jobs=-1, **params)
    regressor.fit(X_train, train_targets)
    # Score the training part first, then the validation part.
    errors = [
        rmse(regressor.predict(features), expected)
        for features, expected in ((X_train, train_targets), (X_val, val_targets))
    ]
    return regressor, errors[0], errors[1]

Now, we can use the `KFold` utility to create the different training/validations splits and train a separate model for each fold.

In [None]:
# Five-fold splitter; no shuffle argument is passed, so KFold's default ordering applies.
kfold = KFold(n_splits=5)

In [None]:
# Train one model per fold, keep every fitted model and print its two errors.
models = []

for train_idxs, val_idxs in kfold.split(X):
    X_train, X_val = X.iloc[train_idxs], X.iloc[val_idxs]
    train_targets, val_targets = targets.iloc[train_idxs], targets.iloc[val_idxs]
    model, train_rmse, val_rmse = train_and_evaluate(
        X_train, train_targets, X_val, val_targets, max_depth=4, n_estimators=20
    )
    models.append(model)
    print(f'Train RMSE: {train_rmse}, Validation RMSE: {val_rmse}')

Let's also define a function to average predictions from the 5 different models.

In [None]:
import numpy as np

def predict_avg(models, inputs):
    """Average the predictions of several fitted models.

    Args:
        models: Iterable of objects that expose ``predict(inputs)``.
        inputs: Feature rows passed unchanged to each model.

    Returns:
        The element-wise mean of the models' predictions.
    """
    per_model_predictions = []
    for fitted in models:
        per_model_predictions.append(fitted.predict(inputs))
    return np.mean(per_model_predictions, axis=0)

In [None]:
# Averaged fold-model predictions on the full training inputs (reassigns `preds`).
preds = predict_avg(models, X)

## Hyperparameter Tuning and Regularization

Just like other machine learning models, there are several hyperparameters we can use to adjust the capacity of the model and reduce overfitting.


In [None]:
# Display the current `model` object and its hyperparameters.
model

Here's a helper function to test hyperparameters with K-fold cross validation.

In [None]:
def test_params_kfold(n_splits, **params):
    """Evaluate one hyperparameter set with K-fold cross validation on ``X`` / ``targets``.

    Prints the mean training RMSE and mean validation RMSE across the folds.

    Args:
        n_splits: Number of folds.
        **params: Keyword arguments forwarded to ``train_and_evaluate``.

    Returns:
        The list of models fitted on the folds, in fold order.
    """
    fitted_models = []
    fold_train_errors = []
    fold_val_errors = []
    splitter = KFold(n_splits)
    for fit_rows, holdout_rows in splitter.split(X):
        fitted, fit_error, holdout_error = train_and_evaluate(
            X.iloc[fit_rows],
            targets.iloc[fit_rows],
            X.iloc[holdout_rows],
            targets.iloc[holdout_rows],
            **params,
        )
        fitted_models.append(fitted)
        fold_train_errors.append(fit_error)
        fold_val_errors.append(holdout_error)
    mean_train_error = np.mean(fold_train_errors)
    mean_val_error = np.mean(fold_val_errors)
    print('Train RMSE: {}, Validation RMSE: {}'.format(mean_train_error, mean_val_error))
    return fitted_models

Since it may take a long time to perform 5-fold cross validation for each set of parameters we wish to try, we'll just pick a random 10% sample of the dataset as the validation set.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out a random 10% of the rows for validation. The fixed seed keeps the split,
# and therefore every score printed by test_params, the same from run to run.
X_train, X_val, train_targets, val_targets = train_test_split(X, targets, test_size=0.1, random_state=42)

In [None]:
def test_params(**params):
    """Fit an XGBRegressor with ``params`` on the hold-out split and print both errors.

    Uses the module-level ``X_train`` / ``train_targets`` for fitting and
    ``X_val`` / ``val_targets`` for validation.

    Args:
        **params: Keyword arguments forwarded to ``XGBRegressor``.

    Returns:
        None. The training and validation RMSE are printed.
    """
    candidate = XGBRegressor(n_jobs=-1, random_state=42, **params)
    candidate.fit(X_train, train_targets)
    fit_error = rmse(candidate.predict(X_train), train_targets)
    holdout_error = rmse(candidate.predict(X_val), val_targets)
    print(f'Train RMSE: {fit_error}, Validation RMSE: {holdout_error}')

#### `n_estimators`

The number of trees to be created. More trees = greater capacity of the model.


In [None]:
# Few trees.
test_params(n_estimators=10)

In [None]:
# Ten times as many trees.
test_params(n_estimators=100)

In [None]:
# Largest tree count tried in this section.
test_params(n_estimators=500)

#### `max_depth`

As you increase the max depth of each tree, the capacity of the tree increases and it can capture more information about the training set.

In [None]:
# Shallow trees.
test_params(max_depth=2)

In [None]:
# Deep trees.
test_params(max_depth=15)

#### `learning_rate`

The scaling factor to be applied to the prediction of each tree. A very high learning rate (close to 1) will lead to overfitting, and a low learning rate (close to 0) will lead to underfitting.

In [None]:
# Low learning rate with 50 trees.
test_params(n_estimators=50, learning_rate=0.01)

In [None]:
# High learning rate with 500 trees.
test_params(n_estimators=500, learning_rate=0.9)

In [None]:
# High learning rate, 500 trees and deep trees combined.
test_params(n_estimators=500, learning_rate=0.9, max_depth=15)

#### `booster`

Instead of using Decision Trees, XGBoost can also train a linear model for each iteration. This can be configured using `booster`.

In [None]:
# Linear booster instead of trees.
test_params(booster='gblinear')

## Putting it Together and Making Predictions

Let's train a final model on the entire training set with custom hyperparameters. 

In [None]:
# First final candidate: 1000 trees, learning rate 0.2, depth at most 10, with row and column subsampling.
model = XGBRegressor(n_jobs=-1, random_state=42, n_estimators=1000, 
                     learning_rate=0.2, max_depth=10, subsample=0.9, 
                     colsample_bytree=0.7)

In [None]:
# Second final candidate: the same settings except learning rate 0.9 and depth at most 15.
model1 = XGBRegressor(n_jobs=-1, random_state=42, n_estimators=1000, 
                     learning_rate=0.9, max_depth=15, subsample=0.9, 
                     colsample_bytree=0.7)

In [None]:
%%time
# Fit the first candidate on the entire training set.
model.fit(X, targets)

In [None]:
%%time
# Fit the second candidate on the entire training set.
model1.fit(X, targets)

Now that the model is trained, we can make predictions on the test set.

`model` uses these parameters: (n_jobs=-1, random_state=42, n_estimators=1000, 
                     learning_rate=0.2, max_depth=10, subsample=0.9, 
                     colsample_bytree=0.7)

In [None]:
# `model`'s predictions for the competition test rows.
test_preds_sub = model.predict(X_test)

In [None]:
# NOTE: despite the name, these are `model`'s predictions on the training inputs X.
test_preds = model.predict(X)

`model1` uses these tested parameters: (n_jobs=-1, random_state=42, n_estimators=1000, 
                     learning_rate=0.9, max_depth=15, subsample=0.9, 
                     colsample_bytree=0.7)

In [None]:
# NOTE: despite the name, these are `model1`'s predictions on the training inputs X.
test_preds1 = model1.predict(X)

In [None]:
# `model1`'s predictions for the competition test rows.
test_preds_sub1 = model1.predict(X_test)

In [None]:
# Training RMSE of `model`.
rmse(test_preds, targets)

In [None]:
# Training RMSE of `model1`, measured against the true targets so it can be
# compared with `model`'s score (not against `model`'s predictions).
rmse(test_preds1, targets)

In [None]:
# Training RMSE of `preds`, last assigned from the averaged K-fold models.
rmse(preds, targets)

We tested the final result with XGBRegressor both without any tuned parameters and with tuned parameters.

Creating the submission file

In [None]:
# Show the sample submission frame that will be filled in.
submission

In [None]:
# Shape of `model1`'s predictions on the training inputs.
test_preds1.shape

In [None]:
# Fill the submission with `model1`'s test-set predictions and save it as CSV without the index column.
submission['Weekly_Sales'] = test_preds_sub1
submission.to_csv('submission.csv',index=False)