# Retail Analytics
Visualization and forecasting

## This notebook is divided into three parts
- Exploratory data analysis
- Categorical data analysis
- Time series forecasting with Fbprophet (In-Sample and Out-Sample Forecast, forecast with and without regressors)

### Source of data:
 https://www.kaggle.com/manjeetsingh/retaildataset

In [None]:
# Importing all the necessary libraries

import datetime as dt

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split

import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

import fbprophet
from fbprophet import Prophet
from fbprophet.diagnostics import cross_validation
from fbprophet.diagnostics import performance_metrics  # was imported twice
from fbprophet.plot import add_changepoints_to_plot

In [None]:
# Import data and parse date columns
#
# NOTE(review): the Date columns of this dataset appear to be written as
# DD/MM/YYYY (presumably e.g. 19/02/2010) -- confirm against the raw CSVs.
# Without dayfirst=True pandas reads ambiguous values such as 05/02/2010
# month-first, which scrambles the weekly timeline. Hard-coded dates further
# down the notebook (e.g. '2012-12-10', "20121216") look like they were chosen
# from the month-first parse and may need revisiting.

stores=pd.read_csv('/kaggle/input/retaildataset/stores data-set.csv')
features=pd.read_csv('/kaggle/input/retaildataset/Features data set.csv', parse_dates=['Date'], dayfirst=True)
sales=pd.read_csv('/kaggle/input/retaildataset/sales data-set.csv', parse_dates=['Date'], dayfirst=True)

## 1 - Exploratory Data Analysis (EDA)

### Table features
Additional data related to the store, department and regional activity for the given dates.

In [None]:
features.head(2)

In [None]:
features.shape

In [None]:
features.info()

In [None]:
features.describe()

### Table stores
Anonymized information about the 45 stores, indicating the type and size of store

In [None]:
stores.head()

In [None]:
stores.shape

In [None]:
#Number of stores
stores.Store.nunique()

In [None]:
# Types of stores
stores['Type'].unique()

### Table sales
Historical sales data

In [None]:
sales.tail(2)

In [None]:
sales.shape

In [None]:
# Date unique values
sales['Date'].nunique()

In [None]:
# Holiday weeks: total sales for every (date, holiday flag) pair

sales_weekly = (
    sales
    .groupby(['Date', 'IsHoliday'], as_index=False)['Weekly_Sales']
    .sum()
)

In [None]:
sales_weekly['IsHoliday'].value_counts()

In [None]:
# Total sales for 3 years
sales['Weekly_Sales'].sum()


### Check for missing values (NaN)

In [None]:
#check for nan values in features
features.isna().sum()

In [None]:
#Visualize missing values (NaN) with Missingno Library in features
msno.matrix(features)

In [None]:
#Check for NaN in sales
sales.isna().sum()

In [None]:
#Check for NaN in stores
stores.isna().sum()

# Data Pre-Processing
### Merge the information of 3 tables (sales, features, stores) into one dataframe df

In [None]:
# Left joins keep every sales row: first attach the per-store/per-date
# features, then the static store attributes.
df = (
    sales
    .merge(features, how='left', on=['Store', 'Date', 'IsHoliday'])
    .merge(stores, how='left', on=['Store'])
)

In [None]:
df.head()

In [None]:
# df structure
df.shape

In [None]:
# check total sales after joining
df['Weekly_Sales'].sum()

In [None]:
#check for missing values

df.isna().sum()

In [None]:
#check for duplicates

df.duplicated().sum()

### Drop MarkDown and Size columns from analysis

In [None]:
# Columns excluded from the rest of the analysis (all MarkDown columns plus Size)
unused_columns = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'Size']
df = df.drop(columns=unused_columns)

In [None]:
df.head()

In [None]:
# Temperature unit conversion: (F - 32) * 5/9, i.e. Fahrenheit -> Celsius.
# NOTE: the column is overwritten in place, so re-running this cell applies the
# conversion a second time -- run it once per fresh kernel.
df['Temperature'] = (df['Temperature'] - 32) * 5. / 9.

# Holiday flag -> 1/0.
# factorize() numbers values in order of first appearance, so the meaning of
# 0/1 depended on which value happened to be in the first row. astype(int)
# always maps True -> 1 and False -> 0.
# Assumes IsHoliday was read from the CSV as a boolean column -- TODO confirm.
df['IsHoliday'] = df['IsHoliday'].astype(int)

df.head()

In [None]:
df.info()

### Generate descriptive statistics on df 

In [None]:
df.describe()

For further analysis, let's add columns for the year, month, day of year, and week number

In [None]:
df['Y'] = df['Date'].dt.year

In [None]:
df['M'] = df['Date'].dt.month

In [None]:
df['D'] = df['Date'].dt.dayofyear

In [None]:
# ISO week number. Series.dt.weekofyear is deprecated (removed in pandas 2.0);
# dt.isocalendar().week is the replacement. Cast to a plain int so the column
# has the same integer dtype family as before.
df['W'] = df['Date'].dt.isocalendar().week.astype(int)

In [None]:
df.head()

In [None]:
# There are 52 Weeks in a year 
df['W'].nunique()

In [None]:
# There are 12 Months in a year 
df['M'].nunique()

# 2 - Categorical data analysis

Sales analysis by dates, different types of stores and conditions

Plot weekly sales sums

In [None]:
# Total sales across all stores and departments for each date
# (despite the "average" in the name, this is a sum).
df_average_sales_weekly = (
    df.groupby('Date', as_index=False)['Weekly_Sales'].sum()
)

# The same totals ordered from the best week downwards
df_average_sales_sorted = df_average_sales_weekly.sort_values(by='Weekly_Sales', ascending=False)

plt.figure(figsize=(20, 5))
plt.plot(df_average_sales_weekly['Date'], df_average_sales_weekly['Weekly_Sales'], color='b')
plt.show()

Plot weekly sales mean by months

In [None]:
# Mean weekly sales per calendar month (all years pooled together)
df_average_sales_monthly = (
    df.groupby('M', as_index=False)['Weekly_Sales'].mean()
)

plt.figure(figsize=(20, 5))
plt.plot(df_average_sales_monthly['M'], df_average_sales_monthly['Weekly_Sales'], color='r')
plt.show()

In [None]:
# Most profitable weeks
df_average_sales_sorted.head()

In [None]:
# Compare weekly sales by years

In [None]:
def _sales_by_day_of_year(frame, year):
    """Return total Weekly_Sales per day-of-year ('D') for the rows of one year ('Y')."""
    rows_of_year = frame.loc[frame['Y'] == year]
    return rows_of_year.groupby('D', as_index=False)['Weekly_Sales'].sum()


df10 = _sales_by_day_of_year(df, 2010)
df11 = _sales_by_day_of_year(df, 2011)
df12 = _sales_by_day_of_year(df, 2012)

# One line per year on shared axes so the seasonal shapes can be compared
fig, ax = plt.subplots(figsize=(25, 8))
for year_label, yearly_sales in (("2010", df10), ("2011", df11), ("2012", df12)):
    ax.plot(yearly_sales['D'], yearly_sales['Weekly_Sales'], label=year_label)
ax.legend()
plt.show()

In [None]:
# Sales sums distribution

fig, ax = plt.subplots(figsize=(25,8))

# Draw every year on the explicitly created axes. The histograms overlap, so
# alpha keeps the ones drawn first visible instead of being painted over.
for year_label, yearly_sales in (("2010", df10), ("2011", df11), ("2012", df12)):
    yearly_sales['Weekly_Sales'].plot(kind='hist', ax=ax, alpha=0.5, label=year_label)

ax.set_title('Sales distribution')
ax.set_xlabel('Weekly sales sum')
ax.legend()
plt.show()

2012 sales do not show large positive outliers like 2010 and 2011 because the data does not cover the December 2012 holiday weeks

In [None]:
df_average_sales_weekly['Date'].tail(5)

In [None]:
# Rolling mean (window = 4)

In [None]:
df_average_sales_weekly['rol_month'] = df_average_sales_weekly['Weekly_Sales'].rolling(4).mean()

In [None]:
fig = plt.figure(figsize=(25, 8))

# Dashed red: 4-week rolling mean; solid green: the weekly totals themselves
rolling_line, = plt.plot(
    df_average_sales_weekly['Date'],
    df_average_sales_weekly['rol_month'],
    linestyle='--',
    color='red',
)
total_line, = plt.plot(
    df_average_sales_weekly['Date'],
    df_average_sales_weekly['Weekly_Sales'],
    color='green',
)

fig.suptitle('Rolling mean - window = 4', fontsize=24)
plt.xlabel('Date', fontsize=18)
plt.ylabel('Sales', fontsize=16)
plt.legend([total_line, rolling_line], ['sum', 'rolling mean'])

plt.show()

The most profitable weeks and months coincide with the holidays of Christmas and Thanksgiving

In [None]:
# holiday weeks are marked with red lines
df2=df.groupby(by=['Date'], as_index=False)['Weekly_Sales'].sum()
f_1 = plt.figure(figsize=(12,6), dpi=100)
ax_1 = f_1.add_axes([0.0, 0.0, 0.9, 0.9])
ax_1.set_ylabel('Weekly_Sales')
ax_1.plot(df2['Date'], df2['Weekly_Sales'])

# df has one row per store/department/week, so the holiday rows repeat the same
# dates many times. De-duplicate first: one vertical line per holiday week
# instead of one (identical, overdrawn) line per row.
holiday_dates = df.loc[df['IsHoliday'] == 1, 'Date'].drop_duplicates()
for x in holiday_dates:
    ax_1.axvline(x=x, color='red', linewidth=0.5)

Difference of Weekly Sales sum by day of year compared with a previous day of year

In [None]:
# Total sales per day-of-year, plus the change versus the previous day-of-year
df_SW = (
    df.groupby('D', as_index=False)['Weekly_Sales'].sum()
    .assign(differ=lambda totals: totals['Weekly_Sales'].diff())
)

In [None]:
plt.figure(figsize=(20,5))
plt.plot(df_SW.D, df_SW.differ, '--', color = 'b')
plt.show()

Difference of Weekly Sales sum by date compared with a previous date

In [None]:
# Total sales per date, plus the change versus the previous date
df_SW2 = (
    df.groupby('Date', as_index=False)['Weekly_Sales'].sum()
    .assign(differ=lambda totals: totals['Weekly_Sales'].diff())
)

In [None]:
plt.figure(figsize=(20,5))
plt.plot(df_SW2.Date, df_SW2.differ, '--', color = 'b')
plt.show()

### Sales analysis by store types

In [None]:
df_type = df.groupby('Type', as_index=False).agg(Mean=('Weekly_Sales', 'mean'), Sum=('Weekly_Sales', 'sum'))
df_type

In [None]:
plt.figure(figsize = (16,5))
ax = sns.barplot(x="Type", y="Weekly_Sales", hue="Y", data=df, palette= "Paired")
ax.set_title('Mean weekly sales by years',fontsize=10)

In [None]:
# Mean weekly sales per store type, highest first
df_gr_type = (
    df.groupby('Type', as_index=False)['Weekly_Sales']
    .mean()
    .sort_values(by='Weekly_Sales', ascending=False)
)

In [None]:
df_gr_type

Top-5 stores by sales

In [None]:
# Total sales per store, highest first
df_gr = (
    df.groupby('Store', as_index=False)['Weekly_Sales']
    .sum()
    .sort_values(by='Weekly_Sales', ascending=False)
)

In [None]:
df_gr.head(5)

### Unemployment analysis

In [None]:
# Mean unemployment per year, highest first
df_gr_u = (
    df.groupby('Y', as_index=False)['Unemployment']
    .mean()
    .sort_values(by='Unemployment', ascending=False)
)

In [None]:
df_gr_u

### The consumer price index


In [None]:
fig = plt.figure(figsize=(25,8))

# CPI is an index, not an additive quantity: summing it over every
# store/department row of a date mostly measures how many rows that date has.
# Use the mean per date instead.
df_average_sales_weekly2 = df.groupby('Date', as_index=False)\
    .agg({'CPI': 'mean'})

# 20-observation rolling mean (the column name 'rol_month' is historical)
df_average_sales_weekly2['rol_month'] = df_average_sales_weekly2['CPI'].rolling(20).mean()

plt.plot(df_average_sales_weekly2.Date, df_average_sales_weekly2.rol_month, '--', color='b',
         label='rolling mean (window = 20)')
plt.plot(df_average_sales_weekly2.Date, df_average_sales_weekly2.CPI, color = 'r', label='mean CPI')
plt.legend()
plt.show()

In [None]:
# Mean CPI per year, highest first
df_gr_cpi = (
    df.groupby('Y', as_index=False)['CPI']
    .mean()
    .sort_values(by='CPI', ascending=False)
)

In [None]:
df_gr_cpi

### Fuel Price tendency

In [None]:
# Fuel price is a per-unit price, not an additive quantity: summing it over
# every store/department row of a date mostly measures the row count.
# Use the mean per date instead.
df_average_sales_weekly2 = df.groupby('Date', as_index=False)\
    .agg({'Fuel_Price': 'mean'})

plt.figure(figsize=(20,5))
plt.plot(df_average_sales_weekly2.Date, df_average_sales_weekly2.Fuel_Price, color = 'orange')
plt.ylabel('Mean fuel price')
plt.show()

### Temperature analysis

In [None]:
df.shape

In [None]:
# Half-open temperature bands [low, high) (right=False); the two outer bands are unbounded
temperature_edges = [-np.inf, -10, 0, 10, 15, 20, 25, np.inf]
temperature_labels = ["less than -10 deg", "-10-0", "0-10", "10-15", "15-20", "20-25", "more than 25"]
df['temperature_category'] = pd.cut(
    df['Temperature'],
    bins=temperature_edges,
    labels=temperature_labels,
    right=False,
)

In [None]:
# Total sales per temperature band, highest first
sales_by_band = df.groupby('temperature_category', as_index=False).agg({'Weekly_Sales': 'sum'})
df_gr_t = sales_by_band.sort_values(by='Weekly_Sales', ascending=False)

In [None]:
df_gr_t

In [None]:
data_tab = df['temperature_category'].value_counts()

In [None]:
data_tab

In [None]:
# Number of df rows (store/department/week records) falling into each
# temperature band. countplot counts rows, it does not sum sales, so the
# labels say "records" rather than "Sales".
plt.figure(figsize = (12,5))
ax = sns.countplot(x="temperature_category", data=df, palette="coolwarm")
ax.set(xlabel="Temperature category", ylabel="Number of records", title="Records per temperature category");

# The correlation of features

In [None]:
#Feature correlation analysis for store 20

In [None]:
# Keep only the rows of store 20. A boolean filter drops the other rows
# outright; df.where() would keep every row as NaN and upcast the dtypes just
# to have groupby discard those rows again.
store20_rows = df[df['Store'] == 20]

# Per-date mean of the numeric features and sales, indexed by date
df20 = store20_rows.groupby(by=['Date'], as_index=False)[['Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
                                                          'IsHoliday', 'Weekly_Sales']].mean()
df20 = df20.set_index('Date')
df20.head()

# correlation matrix

In [None]:
corrMatrix20 = df20.corr()

In [None]:
corrMatrix20

In [None]:
plt.figure(figsize=(12,10))
sns.heatmap(corrMatrix20, annot=True)
plt.show()

# 3 - Forecasting

Fbprophet without regressors

#### Out of sample overall Sales forecast with Fbprophet model

In [None]:
# Data preparation for Fbprophet: total sales per date, with the date column
# renamed to 'ds' and the target to 'y'
df1 = (
    df.groupby(by=['Date'], as_index=False)['Weekly_Sales']
    .sum()
    .rename(columns={'Date': 'ds', 'Weekly_Sales': 'y'})
)

In [None]:
df1.tail()

In [None]:
# define the model
model = Prophet()
# fit the model
model.fit(df1)

In [None]:
# Weekly date range to forecast over
future_dates = pd.date_range(start="20121216", end="20141230", freq='W')

In [None]:
# Materialise the range as a plain list of timestamps
future_dates = list(map(pd.to_datetime, future_dates))

In [None]:
# Single-column frame of 'YYYY-MM-DD' strings under 'ds', passed to model.predict
future = pd.DataFrame({'ds': [day.strftime('%Y-%m-%d') for day in future_dates]})

In [None]:
forecast = model.predict(future)

print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head())
# plot forecast
model.plot(forecast)
plt.show()

#### In sample overall Sales forecast with Fbprophet model

In [None]:
# Weekly dates for the in-sample forecast, as a 'ds' column of 'YYYY-MM-DD' strings
dates = list(pd.date_range(start="20111126", end="20121110", freq='W'))
dates_for_forecast = pd.DataFrame({'ds': [day.strftime('%Y-%m-%d') for day in dates]})

In [None]:
forecast_dates = model.predict(dates_for_forecast)

# Summarize the in-sample forecast. This previously printed `forecast`, the
# out-of-sample frame from the earlier cell, instead of `forecast_dates`.
print(forecast_dates[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head())

# Plot the forecast with the observed totals overlaid in orange
model.plot(forecast_dates)
plt.plot(df1.ds, df1.y, color = 'orange')
plt.show()

Fbprophet with regressors

#### In sample store 20 Sales forecast with Fbprophet model

Take CPI, Temperature, IsHoliday as regressors for our model

In [None]:
# Store 20 only: total sales per date, carrying the regressor columns along as group keys
regressor_keys = ['Date', 'IsHoliday', 'CPI', 'Temperature']
df_grouped_20 = (
    df.loc[df['Store'] == 20]
    .groupby(regressor_keys, as_index=False)['Weekly_Sales']
    .sum()
)

In [None]:
df_grouped_20.head()

In [None]:
#Data Preparation for Fbprophet

df_grouped_20 = df_grouped_20.rename(columns = {'Date':'ds','Weekly_Sales':'y'})

In [None]:
# Index the store-20 frame by its own 'ds' dates (the 'ds' column itself is kept)
Store20_data = df_grouped_20.set_index(
    pd.DatetimeIndex(pd.to_datetime(df_grouped_20['ds']).values)
)

In [None]:
Store20_data.head()

In [None]:
Store20_data.shape

In [None]:
# Hold out the last 40 rows for testing; everything before them is training data
split_position = len(Store20_data) - 40
train_data_pr1, test_data_pr1 = (
    Store20_data.iloc[:split_position],
    Store20_data.iloc[split_position:],
)

In [None]:
test_data_pr2 = test_data_pr1[['ds', 'IsHoliday', 'CPI', 'Temperature']]

In [None]:
# define the model
# NOTE(review): weekly_seasonality=True and the add_seasonality('weekly', ...)
# call below both configure a component named 'weekly'. Which of the two
# settings Prophet ends up using should be confirmed against the installed
# fbprophet version -- TODO verify, then keep only one of them.
# NOTE(review): the input looks like one observation per week (sales grouped by
# Date); a 7-day seasonality may not be identifiable on such data -- confirm.
m1 = Prophet(changepoint_prior_scale=0.05, interval_width=0.95,growth = 'linear',seasonality_mode = 'multiplicative', \
               yearly_seasonality=20, weekly_seasonality=True, changepoint_range=0.9)
m1.add_seasonality('weekly', period=7, fourier_order=15)

# Extra regressors; test_data_pr2 and Store20_data_2 carry these same columns
# for the predict() calls.
m1.add_regressor('IsHoliday')
m1.add_regressor('CPI')
m1.add_regressor('Temperature')
# Fit on the training slice only (the last 40 rows are held out)
m1.fit(train_data_pr1)




In [None]:
# Forecast the held-out rows from their regressor values
prophet_pred2 = m1.predict(test_data_pr2)

forecast_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
print(prophet_pred2[forecast_columns].tail())

# Forecast plot with the full observed store-20 series overlaid in orange
m1.plot(prophet_pred2)
plt.plot(Store20_data['ds'], Store20_data['y'], color='orange')

plt.show()

In [None]:
m1.plot_components(prophet_pred2);

In [None]:
Store20_data_2 = Store20_data[['ds', 'IsHoliday', 'CPI', 'Temperature']]

In [None]:
# Predict over every store-20 date (training and held-out rows alike).
# This overwrites prophet_pred2 from the test-only prediction cell.
prophet_pred2 = m1.predict(Store20_data_2)


print(prophet_pred2[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail())
# plot forecast
m1.plot(prophet_pred2)


# Observed store-20 sales overlaid in orange
plt.plot(Store20_data.ds, Store20_data.y, color = 'orange')

# Dashed red marker at a hard-coded date; the loop draws one line per row whose
# 'ds' equals it (nothing is drawn if no row matches).
# NOTE(review): the meaning of 2012-12-10 is not evident from the code, and it
# depends on how the Date column was parsed -- confirm the intended date.
for x in Store20_data[Store20_data['ds']=='2012-12-10']['ds']:
    plt.axvline(x=x, color='red', linewidth=0.5, ls='--', label='2012-12-10')
    

plt.legend()

plt.show()

In [None]:
prophet_pred3 = prophet_pred2[['ds','yhat']]

In [None]:
df_ds=pd.merge(Store20_data, prophet_pred3, on=['ds'], how='left')

In [None]:
df_ds['diff'] = (df_ds['y']-df_ds['yhat'])/df_ds['y']

In [None]:
x1 = df_ds['ds']
y1 = df_ds['diff']
fig, ax = plt.subplots(figsize=(20,5))
ax.plot(x1.values, y1.values, c='r')

In [None]:
df_ds[['diff']].describe()