In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install linearmodels

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import dates
from datetime import datetime

from linearmodels import PooledOLS
from linearmodels import PanelOLS
from linearmodels import RandomEffects
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson

from scipy import stats

%matplotlib inline

# Context

- Walmart is an American multinational retail corporation that operates a chain of hypermarkets, discount department stores, and grocery stores from the United States, headquartered in Bentonville, Arkansas (Wikipedia).
 
- In the retail industry, sales are the most important metric of the business model because they drive profit. For this reason, sales analysis is a good way to evaluate business operations.

- In this notebook, we analyse what Walmart sales look like (trend and seasonality) and examine whether the available features (macroeconomic conditions) affect them, using econometric panel regression (Fixed Effect and Random Effect).

# Data Description 

This dataset contain 8 available feature:
1. Store: the store number
2. Date - the week of sales
3. Weekly_Sales: sales for the given store
4. Holiday_Flag: whether the week is a special holiday week 1 – Holiday week 0 – Non-holiday week
5. Temperature: temperature on the day of sale
6. Fuel_Price: cost of fuel in the region
7. CPI: prevailing consumer price index
8. Unemployment: prevailing unemployment rate


Holiday Events
1. Super Bowl: 12-Feb-10, 11-Feb-11, 10-Feb-12, 8-Feb-13
2. Labour Day: 10-Sep-10, 9-Sep-11, 7-Sep-12, 6-Sep-13
3. Thanksgiving: 26-Nov-10, 25-Nov-11, 23-Nov-12, 29-Nov-13
4. Christmas: 31-Dec-10, 30-Dec-11, 28-Dec-12, 27-Dec-13

In [None]:
# Load the Walmart store-sales CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv('/kaggle/input/retail-analysis-with-walmart-data/Walmart_Store_sales.csv')
df.head()

In [None]:
# List the column names of the loaded frame.
df.columns

In [None]:
# Print pandas' summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# True if any row is an exact duplicate of an earlier row.
df.duplicated().any()

Luckily, we have a clean dataset that contains no null values and no duplicate rows.

In [None]:
# Date locators / formatter for the time-series charts.
years = dates.YearLocator()
months = dates.MonthLocator()
years_fmt = dates.DateFormatter('%b %Y')

# The clean-up below mutates `df` in place. Guard it on the dtype of `Date`:
# once the column is datetime the frame has already been prepared, so
# re-running this cell does not divide Weekly_Sales by 1000 a second time.
if not pd.api.types.is_datetime64_any_dtype(df['Date']):
    df['Date'] = pd.to_datetime(df['Date'], format = '%d-%m-%Y')

    #for simplicity, we use 1k unit in sales
    df['Weekly_Sales'] = (df['Weekly_Sales'] / 1000).round(3)
    df['CPI'] = df['CPI'].round(2)

df2 = df.copy()

#build panel data indexed by store and date
# pivot_table aggregates with the mean by default, which leaves the values
# unchanged assuming there is one row per (Store, Date) pair -- TODO confirm.
df2 = df2.sort_values(by = 'Date')
panel = pd.pivot_table(
    df2,
    values = ['Weekly_Sales', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment'],
    index = ['Store', 'Date'],
).reset_index()
panel['Month'] = panel['Date'].dt.month
panel['Year'] = panel['Date'].dt.year

panel.head()

In [None]:
# Summary statistics of the panel; Month, Year and Store are dropped before describing.
panel.drop(['Month', 'Year', 'Store'], axis = 1).describe()

# A. Aggregate Time Series Analysis

# What do Walmart's aggregate sales look like? Do these sales have a seasonal component?

In [None]:
#Aggregate Sales for the whole Walmart Store
# Holiday weeks that are marked with dashed vertical lines on the chart.
holiday_date = pd.to_datetime([
    '2010-02-12', '2010-09-10', '2010-11-26', '2010-12-31',
    '2011-02-11', '2011-09-09', '2011-11-25', '2011-12-30',
    '2012-02-10', '2012-09-07',
])
# x position of the in-plot title.
locate = pd.to_datetime('2011-05-06')

# Total sales over all stores per date, plus its 12-period rolling mean.
agg_sales = panel.groupby('Date', as_index = False)['Weekly_Sales'].sum()
rolling_mean = agg_sales['Weekly_Sales'].rolling(12).mean()

fig, ax = plt.subplots(figsize = (25, 10))
for series, series_label in ((agg_sales['Weekly_Sales'], 'Data Ori Value'),
                             (rolling_mean, 'Mean Trend')):
    sns.lineplot(x = agg_sales['Date'], y = series, ax = ax, label = series_label)

for holiday in holiday_date:
    ax.axvline(holiday, color = 'r', alpha = 0.2, ls = '--')

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

ax.text(locate, 87000, 'Walmart Aggregate Sales Trend\n(In Thousand Dollars)', fontsize = 25,
        fontweight = 'bold', fontfamily = 'serif', color = 'black', ha = 'center')
ax.set(xlabel = '', ylabel = '')
ax.xaxis.set_major_formatter(years_fmt)

plt.show()

The graph tells us that the data contain a seasonal component around the holiday dates (red lines). Walmart sales increase rapidly when a holiday takes place, whereas in non-holiday weeks sales tend to stagnate. This means we have to be careful when interpreting sales volume (sales growth would be a better metric). Before that, we look in a little more depth at which holiday contributes most to Walmart's aggregate sales.

In [None]:
#Detail about sales in holiday, are thanksgiving holiday have highest sales?

#Defining specific holiday
super_bowl = ['2010-02-12', '2011-02-11', '2012-02-10']
labour_day = ['2010-09-10', '2011-09-09', '2012-09-07']
thanksgiving = ['2010-11-26', '2011-11-25', '2012-11-23']
christmas = ['2010-12-31', '2011-12-30', '2012-12-28']

# Only the 2011 holiday weeks (second entry of each list) are compared.
weeks_2011 = {'Super Bowl': super_bowl[1], 'Labour Day': labour_day[1],
              'Thanksgiving': thanksgiving[1], 'Christmas': christmas[1]}

# Total sales over all stores in each of those weeks.
data = {holiday: df.loc[df['Date'] == week, 'Weekly_Sales'].sum()
        for holiday, week in weeks_2011.items()}

holiday_sales = pd.Series(data).round(2)

fig, ax = plt.subplots(figsize = (13, 8))
sns.barplot(x = holiday_sales.index, y = holiday_sales, ax = ax, palette = 'viridis')

for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.set_yticks([])
ax.set_ylabel('')

# Write each total in the middle of its bar.
for position, total in enumerate(holiday_sales):
    ax.text(x = position, y = total / 2, s = f'{str(total)}k', fontsize = 20, fontweight = 'bold',
            ha = 'center', fontfamily = 'monospace', color = 'white')
ax.text(1.5, 75000, 'Are Thanksgiving Holiday Contribute Highest Sales?', fontsize = 20, fontweight = 'bold',
        fontfamily = 'serif', color = 'black', ha = 'center')

plt.show()

In [None]:
# Date-level aggregates over all stores: total sales, mean of every other variable.
# Aggregations are given by name; passing np.sum / np.mean callables triggers
# a FutureWarning on recent pandas versions.
pivot_table = pd.pivot_table(
    panel,
    values = ['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment'],
    index = 'Date',
    aggfunc = {'Weekly_Sales': 'sum', 'Temperature': 'mean', 'Fuel_Price': 'mean',
               'CPI': 'mean', 'Unemployment': 'mean'},
).reset_index()
pivot_table['Month'] = pivot_table['Date'].dt.month
pivot_table['Year'] = pivot_table['Date'].dt.year
pivot_table.head()

In [None]:
# Pairwise plots of the date-level aggregates in pivot_table (Month and Year excluded).
sns.pairplot(pivot_table.drop(['Month', 'Year'], axis = 1))

In [None]:
# Correlation matrix of the date-level aggregates. The datetime `Date` column
# is dropped explicitly: it is not a variable of interest, and newer pandas
# versions no longer silently exclude non-numeric columns from .corr().
pivot_table.drop(['Date', 'Month', 'Year'], axis = 1).corr()

It seems that sales are only weakly correlated with macroeconomic conditions. A possible reason for this relationship is that we use weekly data, while macroeconomic variables such as CPI, unemployment and fuel price change slowly (are rigid) in the short run.

In [None]:
# Monthly total sales over all stores.
growth_df = panel[['Date', 'Weekly_Sales']].resample('M', on = 'Date').sum().reset_index()

# Previous month's sales. shift(1) leaves the first month without a predecessor
# (NaN) instead of wrapping the last month around to the front, so the first
# month gets no growth figure rather than a meaningless one.
growth_df['Rolling_Month'] = growth_df['Weekly_Sales'].shift(1)
growth_df['Growth_MoM'] = ((growth_df['Weekly_Sales'] - growth_df['Rolling_Month'])
                           / growth_df['Rolling_Month'] * 100).round(2)
growth_df['Year'] = growth_df['Date'].dt.year
growth_df['Month'] = growth_df['Date'].dt.month

super_bowl = ['2010-02-28', '2011-02-28', '2012-02-28']
labour_day = ['2010-09-30', '2011-09-30', '2012-09-30']
thanksgiving = ['2010-11-30', '2011-11-30', '2012-11-30']
christmas = ['2010-12-31', '2011-12-31', '2012-12-31']


def _best_growth_month(monthly, year):
    """Return the Month and Growth_MoM of the row with the highest growth in `year`.

    Both values come from the same row, so the month really is the month in
    which the maximum growth occurred (taking the column-wise max of Month and
    Growth_MoM separately would always report the last month of the year).
    """
    in_year = monthly[monthly['Year'] == year]
    # idxmax skips NaN, i.e. the first month of the series.
    return in_year.loc[in_year['Growth_MoM'].idxmax(), ['Month', 'Growth_MoM']]


g2010 = _best_growth_month(growth_df, 2010)
g2011 = _best_growth_month(growth_df, 2011)
g2012 = _best_growth_month(growth_df, 2012)

In [None]:
fig, ax = plt.subplots(1, 1, figsize = (6, 3), dpi = 150)

# Text-only figure: hide all spines and ticks.
for s in ['top','right','left','bottom']:
    ax.spines[s].set_visible(False)
ax.set_xticks([])
ax.set_yticks([])

ax.text(0.7, 0.85, "Highest Sales MoM Growth" , color = 'black', fontsize = 24, fontweight = 'bold',
         fontfamily = 'sans-serif', ha = 'center')

# One column of text per year. Month name and percentage are read from
# g2010 / g2011 / g2012 (computed in an earlier cell) instead of being typed
# in by hand, so the figure cannot drift away from the computed values.
for x_pos, year, best in [(0.2, 2010, g2010), (0.75, 2011, g2011), (1.3, 2012, g2012)]:
    month_name = datetime(year, int(best['Month']), 1).strftime('%B')
    ax.text(x_pos, 0.5, month_name, color = 'blue', fontsize = 25, fontweight = 'bold',
            fontfamily = 'monospace', ha = 'center')
    ax.text(x_pos, 0.3, f"{best['Growth_MoM']:.2f}%", color = 'gray', fontsize = 15, fontweight = 'bold',
            fontfamily = 'monospace', ha = 'center')
    ax.text(x_pos, 0.1, str(year), color = 'gray', fontsize = 15, fontfamily = 'monospace', ha = 'center')

plt.show()

Although the Thanksgiving week in November has higher sales than the other holidays, December has the highest sales growth in 2010 and 2011, occurring before Christmas. In 2012, October is reported as the month with the highest MoM sales growth because we have no sales data for December 2012.

# B. Sales Panel Analysis by Store

In [None]:
# Which store perform better?

# Total sales per store over the whole period, and the stores ordered from lowest to highest.
sales_store = panel.groupby('Store', as_index = False)['Weekly_Sales'].sum()
stores_ascending = sales_store.sort_values('Weekly_Sales')['Store']

# Commentary drawn inside the chart.
insight_note = '''
Walmart Store that have higher sales in the period is Store 20 followed by Store 4 and 14. 
The insight stop here because we dont know exactly where the store location is.
But we will defined how is the trend behind it.
'''

fig, ax = plt.subplots(figsize = (15, 7))
sns.barplot(data = sales_store, x = 'Store', y = 'Weekly_Sales', order = stores_ascending,
            palette = 'viridis', ax = ax)

for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.set_yticks([])
ax.set_ylabel('')

ax.text(7, 310000, 'Walmart Aggregate Sales by Store \n(In Thousand Dollars)', fontsize = 18,
        fontweight = 'bold', fontfamily = 'serif', color = 'black', ha = 'center')
ax.text(-2.5, 230000, insight_note, fontsize = 14, fontweight = 'light',
        fontfamily = 'serif', color = 'black')

In [None]:
locate1 = pd.to_datetime(['2011-02'])


def _store_weekly_growth(panel_df, store):
    """Return the weekly sales of one store with week-over-week growth.

    Parameters
    ----------
    panel_df : DataFrame with 'Store', 'Date' and 'Weekly_Sales' columns.
    store : value of 'Store' to select.

    Returns
    -------
    DataFrame with columns 'Date', 'Weekly_Sales', 'roll' (previous week's
    sales) and 'weekly_growth' (percentage change, rounded to 2 decimals).
    The first week has no predecessor, so its 'roll' and 'weekly_growth' are NaN.
    """
    sales = panel_df.loc[panel_df['Store'] == store, ['Date', 'Weekly_Sales']].sort_values('Date')
    sales['roll'] = sales['Weekly_Sales'].shift(1)
    sales['weekly_growth'] = ((sales['Weekly_Sales'] - sales['roll']) / sales['roll'] * 100).round(2)
    return sales


#Highest Store
store14 = _store_weekly_growth(panel, 14)
store4 = _store_weekly_growth(panel, 4)
store20 = _store_weekly_growth(panel, 20)

#Lowest Store
store33 = _store_weekly_growth(panel, 33)
store44 = _store_weekly_growth(panel, 44)
store5 = _store_weekly_growth(panel, 5)

fig, ax = plt.subplots(2, 1, figsize = (25, 15))

# Upper panel: the three highest-selling stores; lower panel: the three lowest.
top_stores = [(store14, 'Store 14'), (store4, 'Store 4'), (store20, 'Store 20')]
bottom_stores = [(store33, 'Store 33'), (store44, 'Store 44'), (store5, 'Store 5')]

for axis, group in zip(ax, (top_stores, bottom_stores)):
    for frame, store_label in group:
        # No `palette` here: it only applies together with `hue`.
        sns.lineplot(x = frame['Date'], y = frame['weekly_growth'], ax = axis, label = store_label)
    for s in ['top', 'right']:
        axis.spines[s].set_visible(False)
    axis.set_ylabel('')
    axis.set_xlabel('')
    axis.xaxis.set_major_formatter(years_fmt)

# locate1 is a one-element DatetimeIndex; pass its single timestamp as the x position.
ax[0].text(locate1[0], 70, 'Top 3 Sales Stores Growth Trend', color = 'black', fontsize = 20, fontweight = 'bold',
         fontfamily = 'serif')
ax[1].text(locate1[0], 70, 'Bottom 3 Sales Stores Growth Trend', color = 'black', fontsize = 20, fontweight = 'bold',
         fontfamily = 'serif')

plt.show()

In [None]:
# The four stores with the highest total sales.
sales_store = sales_store.sort_values('Weekly_Sales', ascending = False)
store_list = list(sales_store['Store'][:4])

# Yearly total sales and yearly mean temperature per store.
sales = panel.groupby(['Store', 'Year'])['Weekly_Sales'].sum()
temp = panel.groupby(['Store', 'Year'])['Temperature'].mean()

fig, ax = plt.subplots(1, 1, figsize=(9, 6))

# One regression per store, drawn on the same axes. The legend label is built
# from the store number itself so it always matches the stores in store_list.
for sto in store_list:
    sns.regplot(x = sales[sto].values, y = temp[sto].values, label = f'Store {sto}', ax = ax)

for s in ['top', 'right']:
    ax.spines[s].set_visible(False)

ax.legend(ncol = 4, bbox_to_anchor = (0.75, 0.9), loc = 'lower center')

ax.text(90000, 150, "Sales vs Temperature Relationship" , color = 'black', fontsize = 20, fontweight = 'bold',
         fontfamily = 'sans-serif')

plt.show()

Here we test the relationship between temperature and sales, since this variable has the highest correlation among all features. As we can see, the slope (the change in the dependent variable when the independent variable changes) is small for all of the top 4 stores by sales. Next, we use econometrics to clarify the relationship.



Note: This kind of econometric analysis is usually done in SPSS/Stata, but for simplicity we use the statsmodels and linearmodels libraries.

# C. Econometrics Analysis: Does Macroeconomic Variable Affect Weekly Sales?

Econometrics is a tool that economist used for knowing how the causal relationship between dependent (target) variable and independent (feature matrix) variable. There are three types of data: 1.) Cross-section, 2.) Time Series, 3.) Pooled Data (Cross-section and Time Series). 

Basically, there are three types of regression for panel data:
1. **PooledOLS** can be described as simple OLS (Ordinary Least Squared) model that is performed on panel data. It ignores time and individual characteristics and focuses only on dependencies between the individuums
2. **Fixed-Effects (FE) Model**: The FE-model determines individual effects of unobserved, independent variables as constant (“fix“) over time
3. **Random-Effects (RE) Model**: RE-models determine individual effects of unobserved, independent variables as random variables over time. They are able to “switch” between OLS and FE and hence, can focus on both, dependencies between and within individuals

Here we will run the data into all the method.

In [None]:
#building panel dataset format
# Index by (Store, Date), the entity/time MultiIndex used by the panel models.
panel_data = panel.set_index(['Store', 'Date'])

# Keep the dates as a categorical column as well. The list is NOT named
# `dates`: that name is the imported matplotlib.dates module, and rebinding it
# would break every earlier cell that calls dates.YearLocator() etc. on re-run.
date_values = panel_data.index.get_level_values('Date').to_list()
panel_data['Date'] = pd.Categorical(date_values)
panel_data = panel_data.drop(['Month', 'Year', 'Holiday_Flag'], axis = 1)
panel_data.head()

In [None]:
#building dependent and independent variable
regressors = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
dependent = panel_data['Weekly_Sales']
# add_constant adds a constant column to the regressors for the intercept.
independent = sm.add_constant(panel_data[regressors])

#fitting variable into panel OLS method
model = PooledOLS(dependent, independent)
pooledOLS_res = model.fit(cov_type = 'clustered', cluster_entity = True)

# Residuals of the pooled fit, used by the diagnostic tests.
residuals = pooledOLS_res.resids

First, we fit a pooled OLS regression. There are 3 classical assumptions that have to be fulfilled. Two of them, the absence of *heteroskedasticity* and the absence of *autocorrelation*, help us choose between PooledOLS and the Fixed Effect / Random Effect models. If these two assumptions are not fulfilled by PooledOLS, then Fixed Effect or Random Effect might be more suitable.

In [None]:
# Panel variables plus the pooled-OLS residuals, without the categorical Date column; gaps are filled with 0.
pooled_data = (
    pd.concat([panel_data, residuals], axis = 1)
    .drop(['Date'], axis = 1)
    .fillna(0)
)

#check regression assumption (heteroskedasticity)
label_aut = ['LM-Stat', 'LM p-val', 'F-Stat', 'F p-val']
breusch_pagan_test = list(het_breuschpagan(pooled_data['residual'], independent))
# Pair every returned value with its label for printing.
result_auto = pd.Series(dict(zip(label_aut, breusch_pagan_test)))
print(result_auto)

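The null hypothesis of the Breusch-Pagan test is homoskedasticity: if the p-value is less than alpha (0.05), we reject it and conclude that heteroskedasticity is present; otherwise there is no evidence of heteroskedasticity. Here the test indicates that the homoskedasticity assumption is violated.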

In [None]:
#check regression assumption (autocorrelation)
# Durbin-Watson statistic of the pooled-OLS residuals. This is a test statistic
# (not a p-value); values near 2 indicate no first-order autocorrelation.
durbin_watson_test = durbin_watson(pooled_data['residual']) 
print(durbin_watson_test)

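The Durbin-Watson test returns a statistic between 0 and 4 rather than a p-value: a value close to 2 means there is no autocorrelation, values toward 0 indicate positive autocorrelation and values toward 4 negative autocorrelation. Here the statistic deviates from 2, so the no-autocorrelation assumption is violated.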

Well here we will use Fixed Effect and Random Effect method.

In [None]:
#Fixed Effect
# entity_effects = True includes entity effects (one per Store, the first index level) in the model.
model_fe = PanelOLS(dependent, independent, entity_effects = True) 
fe_reg = model_fe.fit() 

#Random Effect
# Same dependent and independent variables, fitted with the random-effects estimator.
model_re = RandomEffects(dependent, independent)
re_reg = model_re.fit()

To choose between Fixed Effect and Random Effect as the more appropriate model, we use the Hausman test. If the p-value is less than 0.05, Fixed Effect is the better model; otherwise Random Effect is.

In [None]:
def hausman(fe, re):
    """Hausman test statistic comparing a fixed-effects and a random-effects fit.

    Parameters
    ----------
    fe, re : fitted results exposing `.params` (coefficients) and `.cov`
        (coefficient covariance matrix).

    Returns
    -------
    (chi2, df, pval) : the test statistic, its degrees of freedom and the
        chi-square survival-function p-value.
    """
    coef_fe = fe.params
    # Differences between the two coefficient vectors and between their covariances.
    coef_gap = coef_fe - re.params
    cov_gap = fe.cov - re.cov
    # Degrees of freedom: number of FE coefficients with absolute value below 1e8.
    df = coef_fe[np.abs(coef_fe) < 1e8].size
    # Quadratic form gap' * inv(cov_gap) * gap.
    chi2 = np.dot(coef_gap.T, np.linalg.inv(cov_gap).dot(coef_gap))
    pval = stats.chi2.sf(chi2, df)
    return chi2, df, pval

# Run the Hausman test on the fitted FE and RE results; element 2 of the returned tuple is the p-value.
hausman_results = hausman(fe_reg, re_reg)
print('p-Value: ' + str(hausman_results[2]))

In [None]:
#Use Random Effect for interpretation purpose.

# Print the fitted random-effects result.
print(re_reg)

An independent variable is said to have a significant effect on the dependent variable when the p value is less than 0.05.

We see here that all independent variables (Temperature, CPI, Fuel Price and Unemployment) are significant for Weekly Sales. Although these relationships are statistically significant, the slopes (coefficients) are small, and so is R2.

For example, temperature has a coefficient of -1.0308: if temperature increases by 1 degree, weekly sales are expected to decrease by about 1k dollars, holding the other variables constant.

# Conclusion

1. There is a seasonal component in sales around the holiday dates, so management has to take seasonality into account in stocking, merchandising, marketing and similar decisions.
2. Although the Thanksgiving holiday in November contributed more sales than the other holidays, December has the higher MoM sales growth, occurring before Christmas.
3. Store 20 indeed has a high sales volume over the 3 years. However, its growth has been dominated by the holiday dates.
4. Using econometrics (panel regression), we found that macroeconomic conditions have a significant relationship with weekly sales, even though the slopes and R2 are small. This means that factors outside the model greatly affect Walmart sales, possibly customer buying behaviour, social factors, technology and other issues.

# References

1. https://towardsdatascience.com/a-guide-to-panel-data-regression-theoretics-and-implementation-with-python-4c84c5055cf8