In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from matplotlib.gridspec import GridSpec
from fbprophet import Prophet
from fbprophet.plot import plot_plotly, plot_components_plotly, add_changepoints_to_plot
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.api as sm
import calendar


In [None]:
import plotly.io as pio

# Register a "draft" template holding one centred, tilted, almost transparent annotation
# (positioned in "paper" coordinates) and make it the default for every plotly figure.
pio.templates["draft"] = go.layout.Template(
    layout_annotations=[
        {
            "x": 0.5,
            "y": 0.5,
            "xref": "paper",
            "yref": "paper",
            "textangle": -30,
            "opacity": 0.1,
            "font": {"color": "black", "size": 100},
            "showarrow": False,
        }
    ]
)
pio.templates.default = "draft"

In [None]:
# import plotly.offline as py
# py.init_notebook_mode(connected=True)

In [None]:
pd.set_option('display.max_columns', 2000)

In [None]:
import os

# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

- `calendar.csv` - Contains information about the dates on which the products are sold.
- `sales_train_validation.csv` - Contains the historical daily unit sales data per product and store [d_1 - d_1913]
- `sample_submission.csv` - The correct format for submissions. Reference the Evaluation tab for more info.
- `sell_prices.csv` - Contains information about the price of the products sold per store and date.
- `sales_train_evaluation.csv` - Includes sales [d_1 - d_1941] (labels used for the Public leaderboard)

# 1. Read Data

In [None]:
calendar_data = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/calendar.csv')
calendar_data.head()

In [None]:
sales_train_validation = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv')
sales_train_validation.head()

In [None]:
sales_train_evaluation = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv')
sales_train_evaluation.head()

# 2. Exploratory Data Analysis

<i>`sales_train_validation` Dataset is our train data set: [D1 - D1913].</i> <br>
<i>`sales_train_evaluation` Dataset is the data used to evaluate our models: it contains [D1 - D1941], of which the last 28 days [D1914 - D1941] are the evaluation period.</i>

In [None]:
train_data = sales_train_validation.copy()

## 2.1 General Information

In [None]:
states = train_data.state_id.unique().tolist()
print(f"States Present in The Dataset: {states} ({len(states)})")

In [None]:
stores = train_data.store_id.unique().tolist()
print(f"Stores Present in The Dataset: {stores} ({len(stores)})")

In [None]:
categories = train_data.cat_id.unique().tolist()
print(f"Categories Present in The Dataset: {categories} ({len(categories)})")

In [None]:
# dept_id holds departments (not individual items), so label the output accordingly.
departments = train_data.dept_id.unique().tolist()
items = departments  # original variable name, kept for backward compatibility
print(f"Departments Present in The Dataset: {departments} ({len(departments)})")

In [None]:
print(f"There are {len(train_data.item_id.unique())} Items in The Dataset")

In [None]:
# One row id per time series: the number of distinct ids is the number of series.
print(f"Total Number of Time Series: {train_data.id.nunique()} !")

In [None]:
train_data[:10]

## 2.2 Example of Time Series

In [None]:
# Positional split of the columns: everything from position 6 onwards is treated as a daily sales column.
d_cols = train_data.columns.tolist()[6:] # Sales columns
# The first six columns, in reverse order; used later as the index when reshaping to long format.
non_d_cols = list(reversed(train_data.columns.tolist()[:6])) 

In [None]:
train_data.loc[train_data.d_100 == train_data.d_100.max()]

Let's take the example of item <b>FOODS_3_586</b> sales in Texas store <b>TX_3</b>

In [None]:
def merge_with_calendar(data, calendar_data):
    """Attach calendar attributes to a frame keyed by day.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a column ``"d"`` holding the day key shared with ``calendar_data``.
    calendar_data : pandas.DataFrame
        Calendar table providing ``d``, ``date``, ``wm_yr_wk``, ``weekday``, ``wday``,
        ``month``, ``year`` and the four ``event_*`` columns.

    Returns
    -------
    pandas.DataFrame
        Inner join of the selected calendar columns with ``data`` on ``"d"`` (calendar
        columns first); missing event names/types are replaced by the string ``'None'``.

    Raises
    ------
    AssertionError
        If ``data`` has no ``"d"`` column.
    """
    # data should have a date column "d"
    assert 'd' in data.columns, 'DataFrame should have a column "d" !'
    event_cols = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    # Merge With Calendar
    cal = calendar_data[['d', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year'] + event_cols]
    merged = pd.merge(cal, data, on="d")
    # Fill Missing Event Values with None.
    # Assign the filled columns back instead of calling fillna(inplace=True) on a column
    # selection: that form is deprecated and has no effect under pandas copy-on-write.
    merged[event_cols] = merged[event_cols].fillna('None')
    return merged

In [None]:
def get_ts_example(data, d_cols, calendar_data, item_id, store_id, idx=None):
    """Extract one sales series as a long frame merged with the calendar.

    When ``idx`` is given, the row with that index label is used; otherwise the row(s)
    matching ``item_id`` and ``store_id`` are selected. The ``d_cols`` values become a
    two-column frame (``d``, ``sales``) with integer sales, which is then passed to
    ``merge_with_calendar``.

    Any exception is caught and reported with ``print``; in that case the function
    returns ``None``.
    """
    try:
        if idx is not None:
            series = data.loc[idx][d_cols].reset_index()
        else:
            wanted = (data['item_id'] == item_id) & (data['store_id'] == store_id)
            series = data.loc[wanted][d_cols].T.reset_index()
        series.columns = ['d', 'sales']
        # Make sure that sales column's type is int
        series["sales"] = series["sales"].astype("int")
        return merge_with_calendar(series, calendar_data)
    except Exception as err:
        print(f'Can not extract time series: {err}')

In [None]:
ts = get_ts_example(train_data, d_cols, calendar_data, item_id='FOODS_3_586', store_id='TX_3')

In [None]:
ts

In [None]:
print(f"Length of the time series: {len(ts)} days.")

In [None]:
print(f"Years in the TS: {ts.year.unique()}")

In [None]:
print(ts['event_type_2'].value_counts())

<i>From 1913 days there are only 4 days with type 2 event! As a result we'll ignore that event type.</i>

<b> Plots </b>

In [None]:
# Count type-1 events, dropping the 'None' placeholder explicitly instead of assuming it is
# the most frequent value (and therefore the first row of value_counts()).
events_1_data = ts.loc[ts['event_type_1'] != 'None', 'event_type_1'].value_counts()
fig = plt.figure(figsize=(11, 5))
ax = fig.add_subplot()
ax.pie(x=events_1_data.values,
       labels=events_1_data.index,
       shadow=True,
       radius=1,
       autopct='%1.1f%%')
ax.set_title('Distribution of Type 1 Events')
plt.show()

In [None]:
# Full history of the example series, with one x tick every 90 days.
fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(ts.date, ts.sales)
ax.set_xticks(ts.date.values[::90])
ax.set(xlabel='Date', ylabel='Sales', title='Sales of FOODS_3_586 in TX_3 Store')
ax.grid()
plt.show()

In [None]:
# Area plot of the example series, one facet row per year.
fig = px.area(data_frame=ts, x='date', y='sales',
              facet_row='year', facet_row_spacing=0.05,
              title='Time Series Yearly Area Plot')
fig.update_layout(width=900, height=900)
fig.show()

In [None]:
# One figure per year: the sales line plus red markers on days carrying a type-1 event.
for year in ts.year.unique():
    t = ts[ts.year == year]
    t_event_1 = t.loc[t.event_type_1 != 'None']
    fig, ax = plt.subplots(figsize=(20, 6))
    ax.plot(t.date, t.sales)
    ax.scatter(t_event_1.date, t_event_1.sales, color='red', label='Type 1 Event')
    ax.set_xticks(t.date.values[::30])
    ax.set(xlabel='Date',
           ylabel='Sales',
           title=f'Sales of FOODS_3_586 in TX_3 store for {year}')
    ax.grid()
    ax.legend()
    plt.show()

In [None]:
# Three stacked box plots of sales: by month, by weekday and by type-1 event.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(20, 15))
for column, axis in zip(['month', 'weekday', 'event_type_1'], (ax1, ax2, ax3)):
    sns.boxplot(data=ts, x=column, y='sales', ax=axis)
plt.show()

- A simple observation: from the first plot we can see that the biggest part of sales of this item (<i>FOODS_3_586</i>) in this store (<i>TX_3</i>) is during Month 8 (August).<br>
- This very detailed level (<i>Item level</i>) won't generate many insights, aggregated levels will do such as <i>State</i>, <i>Store</i>, <i>Category</i> and <i>Department</i> levels.

Plotly offers some interesting plots and visualizations ! Let's try some of them at our time series.

In [None]:
# Distribution of daily sales with a box plot in the margin.
fig = px.histogram(data_frame=ts, x='sales', marginal='box',
                   title='Sales Distribution for FOODS_3_586 at TX_3 store')
fig.show()

In [None]:
fig = px.line(data_frame=ts, x='date', y='sales', color='year',
              title='Sales of FOODS_3_586 at TX_3 Store')
# Boxed legend anchored at (1, 1).
fig.update_layout(
    font={'size': 11},
    legend={'x': 1,
            'y': 1,
            'title_font_family': "Times New Roman",
            'bgcolor': "snow",
            'bordercolor': "Black",
            'borderwidth': 1},
)
# plot and legend are Interactive !
fig.show()

An interesting pattern to observe is that, for this item example, sales decrease to the lowest level at the end of each year !

In [None]:
for feature in ['wday', 'month', 'year', 'event_name_1']:  # wday = weekday
    # Select 'sales' before aggregating: averaging the whole frame would also try to
    # average the text columns (date, weekday, event names), which raises on pandas >= 2.0.
    data_feature = (ts.groupby(feature)['sales']
                      .mean()
                      .reset_index()
                      .sort_values(by='sales'))
    fig = px.bar(data_frame=data_feature,
                 x=feature,
                 y='sales',
                 title=f'Average Sales by {feature}')
    fig.show()

We can easily notice that the item's Average sale is at its maximum at Father's Day.

## 2.3 Aggregated Level Analysis

- Now we'll go through more aggregated analysis.

 - The Data hierarchy is:<br>
 <b>State</b> ==>  <b>Store</b> ==>  <b>Category</b> ==>  <b>Department</b> ==>  <b>Item</b>

In [None]:
# Only the two key columns are needed to count distinct pairs; grouping the full frame
# and calling count() would count every daily sales column for each group.
d = train_data[['state_id', 'store_id']].drop_duplicates()
d['state_id'].value_counts().plot(kind="bar", grid=True, title="(A) Number of Stores by State", yticks=[0,1,2,3,4])
plt.show()
d = train_data[['state_id', 'id']].drop_duplicates()
d['state_id'].value_counts().plot(kind="bar", grid=True, title="(B) Number of Items by State")
plt.show()
d = train_data[['store_id', 'item_id']].drop_duplicates()
d['store_id'].value_counts().plot(kind="bar", grid=True, title=" (C) Number of Items by Store")
plt.show()

- We will use <b>pandas.DataFrame.stack</b> function to transform our data.
- The result data should have as columns <b>['d', 'state_id','store_id','cat_id','dept_id','item_id', 'id', 'sales']</b> where "d" column has values in <br>[d_1,..., d_1913].
- Initially we have data with 10 stores and 3049 items per store so as a result we have 30490 time series !
- Each time series contains sales data of 1913 days. Final output data will have then 8 columns and 30490*1913 = 58327370 rows !

In [None]:
train_data.head()

In [None]:
# Transform Data Structure (wide -> long).
# The identifier columns become the index, stack() turns the daily columns into rows,
# and the flattened result gets one "d" column (d_1 ... d_1913) and one "sales" column.
data = train_data.set_index(non_d_cols).stack().to_frame().reset_index()
data.columns = non_d_cols + ["d", "sales"]

In [None]:
data

<i>The following are functions that will be used for different analyses.</i>

In [None]:
def plot_daily_data(df,level):
    """Show an interactive line chart of daily sales, one line per member of ``level``.

    ``df`` needs the columns ``date``, ``sales`` and ``f'{level}_id'``; ``level`` is one
    of ``'cat'``, ``'dept'``, ``'store'`` or ``'state'`` (any other value raises ``KeyError``).
    """
    level_title = {'cat':'Category', 'dept':'Department', 'store':'Store', 'state':'State'}[level]
    legend_box = {'x': 1,
                  'y': 1,
                  'title_font_family': "Times New Roman",
                  'bgcolor': "snow",
                  'bordercolor': "Black",
                  'borderwidth': 1}
    fig = px.line(data_frame=df, x='date', y='sales', color=f'{level}_id',
                  title=f'Sales by {level_title}')
    fig.update_layout(legend=legend_box, font={'size': 11})
    fig.show()

In [None]:
def plot_yearly_data(df,level):
    """Draw a grid of small line plots: one column per member of ``level``, one row per year.

    ``df`` needs the columns ``year``, ``date``, ``sales`` and ``f'{level}_id'``;
    ``level`` is one of ``'cat'``, ``'dept'``, ``'store'`` or ``'state'``.
    Each column uses a single colour. The palette is cycled, so a level with more
    members than palette entries is still plotted completely (members beyond the
    tenth were previously dropped without notice).
    """
    levels_dict = {'cat':'Category', 'dept':'Department', 'store':'Store', 'state':'State'}
    years = df.year.unique()
    level_elements = df[f'{level}_id'].unique().tolist()
    all_colors= ['red','green','blue','purple','cyan','orange','pink','yellow','black','magenta']
    fig = plt.figure(figsize=(15,15))
    # Grid rows follow the years actually iterated below.
    gs = GridSpec(len(years), len(level_elements))
    for c_idx, l_e in enumerate(level_elements):
        color = all_colors[c_idx % len(all_colors)]
        for r_idx, year in enumerate(years):
            ax = fig.add_subplot(gs[r_idx,c_idx], xticks=[], yticks=[])
            df1 = df.loc[(df.year == year) & (df[f'{level}_id'] == l_e)]
            ax.plot(df1.date, df1['sales'], color=color, linewidth=0.9)
            ax.set_title(f'{l_e}: {year}')
    fig.suptitle(f'Yearly Sales by {levels_dict[level]}')
    plt.show()

In [None]:
def plot_average_sales(df,level):
    """Show one bar chart of average sales per calendar feature, coloured by ``level``.

    For each of ``weekday``, ``month``, ``year`` and ``event_name_1`` the mean of
    ``sales`` is computed per (``f'{level}_id'``, feature) pair and plotted with plotly.
    """
    for feature in ['weekday', 'month', 'year', 'event_name_1']:
        # Pick 'sales' before averaging: a frame-wide mean() also tries to average the
        # text columns (date, event names, ...), which raises on pandas >= 2.0.
        data_feature = df.groupby([f'{level}_id', feature])['sales'].mean().reset_index()
        fig = px.bar(data_frame=data_feature,
                     x=feature,
                     y='sales',
                     color=f'{level}_id',
                     title=f'Average Sales by {feature}')
        fig.update_layout(legend=dict(x=1,
                                      y=1,
                                      title_font_family="Times New Roman",
                                      bgcolor="mintcream",
                                      bordercolor="black",
                                      borderwidth=1))
        fig.show()

<b>For plotly plots, you can double click on legend to visualize data parts separately. You can also zoom in, zoom out and autoscale plots.</b>

### 2.3.1 State Level Analysis

In [None]:
# Preparing state-level data
# Select 'sales' before summing: summing the whole grouped frame also aggregates every
# identifier (string) column of the long table before the result is thrown away.
state_data = data.groupby(["state_id","d"])["sales"].sum()
state_data = state_data.reset_index()
state_data = merge_with_calendar(state_data, calendar_data)

In [None]:
state_data.head()

In [None]:
plot_daily_data(state_data,'state')

In [None]:
# Per-state distribution of daily sales, with a box plot in the margin.
fig = px.histogram(data_frame=state_data, x='sales', color='state_id',
                   marginal='box', title='Sales Distribution By State')
fig.show()

- California has the highest number of sales.
- We observe again the sales decrease to their lowest level (previously observed with a single item time series) at the end of every year, let's try to get more details about this pattern !

In [None]:
# December rows where a state sold fewer than 20 units; the event column is highlighted.
dlow = state_data.query("sales < 20 and month == 12")
dlow.style.applymap(lambda _cell: "background-color:yellow", subset=['event_name_1'])

<b> It was Christmas effect !</b>

In [None]:
plot_yearly_data(state_data,'state')

In [None]:
plot_average_sales(state_data,'state')

- Sales average is increasing over years.
- Weekends correspond to the highest sales average.
- Sales average is almost the same over different months.

### 2.3.2 Store Level Analysis

In [None]:
# Preparing store-level data
# Select 'sales' before summing: summing the whole grouped frame also aggregates every
# identifier (string) column of the long table before the result is thrown away.
store_data = data.groupby(["store_id","d"])["sales"].sum()
store_data = store_data.reset_index()
store_data = merge_with_calendar(store_data, calendar_data)

In [None]:
store_data.head()

In [None]:
plot_daily_data(store_data,'store')

In [None]:
# Per-store distribution of daily sales, with a box plot in the margin.
fig = px.histogram(data_frame=store_data, x='sales', color='store_id',
                   marginal='box', title='Sales Distribution By Store')
fig.show()

- CA_3 is the store having the highest number of sales.

In [None]:
plot_yearly_data(store_data,'store')

In [None]:
plot_average_sales(store_data,'store')

- Let's see Sales' correlations between different Stores.

In [None]:
# One column of daily sales per store, indexed by date, then pairwise correlations.
corr_data = store_data.pivot_table(index='date', columns='store_id', values='sales')
corr_data = corr_data.sort_index()
plt.figure(figsize=(12,5))
heatmap = sns.heatmap(corr_data.corr(), annot=True, fmt='.2f')
heatmap.set_yticklabels(heatmap.get_yticklabels(), rotation=0)
heatmap.set_title('Correlation Between Stores Sales')
plt.show()

- The highest correlation is between CA_1 and CA_3 stores, in the same state.
- The lowest correlation is between WI_1 and WI_3 stores, in the same state!

### 2.3.3 Category Level Analysis

In [None]:
# Count ids per category; selecting 'id' first avoids counting every daily column.
train_data.groupby('cat_id')['id'].count().reset_index().plot(x='cat_id',
                                                              kind='bar',
                                                              figsize=(15,5),
                                                              grid=True,
                                                              title='Number of Items by Category')
plt.show()

In [None]:
# Preparing category-level data
# Select 'sales' before summing: summing the whole grouped frame also aggregates every
# identifier (string) column of the long table before the result is thrown away.
cat_data = data.groupby(["cat_id","d"])["sales"].sum()
cat_data = cat_data.reset_index()
cat_data = merge_with_calendar(cat_data, calendar_data)

In [None]:
cat_data.head()

In [None]:
plot_daily_data(cat_data,'cat')

- FOODS is the category having the highest number of sales, HOBBIES having the lowest one.

In [None]:
# Per-category distribution of daily sales, with a box plot in the margin.
fig = px.histogram(data_frame=cat_data, x='sales', color='cat_id',
                   marginal='box', title='Sales Distribution By Category')
fig.show()

In [None]:
plot_yearly_data(cat_data,'cat')

In [None]:
plot_average_sales(cat_data,'cat')

Let's plot a Sales Calendar Heatmap of the first and last years (by Category).

In [None]:
# One single-row heatmap per (year, category) for the first and the last year:
# x = date, y = category id, colour = sales.
years = cat_data.year.unique()
for year in (years[0], years[-1]):
    for cat in cat_data.cat_id.unique():
        dyear = cat_data.loc[(cat_data.year == year) & (cat_data.cat_id == cat)]
        heat = go.Heatmap(z=dyear.sales,
                          x=dyear.date,
                          y=dyear.cat_id,
                          colorscale=px.colors.sequential.Plasma_r)
        fig = go.Figure(data=heat)
        fig.update_layout(title=f'{cat} Sales {year}', xaxis_nticks=36)
        fig.show()

### 2.3.4 Department Level Analysis

In [None]:
# Count ids per department; selecting 'id' first avoids counting every daily column.
train_data.groupby('dept_id')['id'].count().reset_index().plot(x='dept_id',
                                                               kind='bar',
                                                               figsize=(15,5),
                                                               grid=True,
                                                               title='Number of Items by Department')
plt.show()

In [None]:
# Preparing department-level data
# Select 'sales' before summing: summing the whole grouped frame also aggregates every
# identifier (string) column of the long table before the result is thrown away.
dept_data = data.groupby(["dept_id","d"])["sales"].sum()
dept_data = dept_data.reset_index()
dept_data = merge_with_calendar(dept_data, calendar_data)

In [None]:
dept_data.head()

In [None]:
plot_daily_data(dept_data,'dept')

- FOODS_3 and HOBBIES_2 have respectively the highest and lowest number of sales among all departments.

In [None]:
# Per-department distribution of daily sales, with a box plot in the margin.
fig = px.histogram(data_frame=dept_data, x='sales', color='dept_id',
                   marginal='box', title='Sales Distribution By Department')
fig.show()

In [None]:
plot_yearly_data(dept_data,'dept')

In [None]:
plot_average_sales(dept_data,'dept')

In [None]:
# One column of daily sales per department, indexed by date, then pairwise correlations.
corr_data = dept_data.pivot_table(index='date', columns='dept_id', values='sales')
corr_data = corr_data.sort_index()
plt.figure(figsize=(12,5))
heatmap = sns.heatmap(corr_data.corr(), annot=True, fmt='.2f')
heatmap.set_yticklabels(heatmap.get_yticklabels(), rotation=0)
heatmap.set_title('Correlation Between Department Sales')
plt.show()

After the analyzing and visualizing part comes the forecast part  !

# 3. Forecast

- In this part, we'll choose 6 Time Series from the 30490 ones we have and use them to test different forecasting approaches and evaluate them.

In [None]:
forecast_horizon = 28 # from d_1914 to d_1941
# Names of the last `forecast_horizon` columns of the evaluation frame: the days to forecast.
d_fcst_columns = sales_train_evaluation.columns[-forecast_horizon:].tolist()

In [None]:
def get_ground_truth(idx, df, d_fcst_columns):
    """Return, as an array, the values of row ``idx`` of ``df`` in the ``d_fcst_columns`` columns."""
    held_out = df.loc[idx, d_fcst_columns]
    return held_out.values

In [None]:
def plot_results(fcst, y_eval, rmse, algo, item):
    """Plot a forecast (red) against the ground truth (blue); the title shows algorithm, item and RMSE."""
    fig, ax = plt.subplots(figsize=(11, 5))
    curves = ((fcst, 'red', 'Forecast'), (y_eval, 'blue', 'Ground Truth'))
    for values, colour, name in curves:
        ax.plot(values, color=colour, label=name)
    ax.set_title(f' {algo} for {item}, RMSE: {rmse}')
    ax.grid()
    ax.legend()
    plt.show()

## 3.1 Time Series Selection

We will choose:
- 3 Time Series with enough data and few zeros.
- 3 Time Series with many zeros.

In [None]:
# Keep the non-sales columns and summarise every series by its mean daily sales ("d_val").
choice_data = train_data.drop(columns=d_cols)
choice_data['d_val'] = train_data[d_cols].mean(axis=1)

In [None]:
print(f'Time Series Averages: Min {choice_data.d_val.min()} Max:{choice_data.d_val.max()} Median: {choice_data.d_val.median()}')

In [None]:
# Three high-volume series (mean daily sales >= 50) and three low-volume ones (1 < mean <= 5).
idx1 = choice_data.loc[choice_data['d_val'] >= 50].sample(n=3, random_state=1).index
idx2 = choice_data.loc[(choice_data['d_val'] <= 5) & (choice_data['d_val'] > 1)].sample(n=3, random_state=1).index
idx = idx1.tolist() + idx2.tolist()
# `idx` holds index *labels*, so select with .loc (as the later per-series lookups do);
# .iloc only happened to agree while the frame kept its default 0..n-1 index.
ts_test = train_data.loc[idx]
test_items = ts_test.id.unique().tolist()
# Drop the trailing 11-character "_validation" suffix from each id.
test_items = [x[:-len('_validation')] for x in test_items]

In [None]:
test_items

In [None]:
# Dataframe for RMSE
rmse_summary = pd.DataFrame({"items":test_items}, index=idx)

## 3.2 Statistical Method: ARIMA

<b>ARIMA</b> stands for <b>A</b>uto<b>R</b>egressive <b>I</b>ntegrated <b>M</b>oving <b>A</b>verage.

Types:
- <b>ARIMA</b>: Non-Seasonal. 
- <b>SARIMA</b>: Seasonal ARIMA.
- <b>SARIMAX</b>: Seasonal ARIMA with eXogenous variables.

In [None]:
def choose_sarimax_order_and_forecast(ps,ds,qs, y_train, y_eval):
    """Grid-search the (p, d, q) order of a SARIMAX model and return the best forecast.

    Every combination of ``ps`` x ``ds`` x ``qs`` is fitted on ``y_train`` (constant
    trend, stationarity/invertibility not enforced) and used to forecast
    ``len(y_eval)`` steps. Forecasts are rounded to integers and scored by RMSE
    against ``y_eval``.

    Parameters
    ----------
    ps, ds, qs : iterable of int
        Candidate AR, differencing and MA orders.
    y_train : array-like
        Training series.
    y_eval : array-like
        Ground truth for the forecast window; also used to select the order,
        so the reported RMSE is an in-sample selection score.

    Returns
    -------
    (best_rmse, best_fcst)
        Lowest RMSE (rounded to 3 decimals) and the matching list of rounded
        forecasts; ``(None, None)`` if no order produced a valid forecast.
    """
    best_rmse, best_order, best_fcst = None, None, None
    for p in ps:
        for d in ds:
            for q in qs:
                order = (p,d,q)
                try:
                    # Fit and predict inside the try as well, so that one order that
                    # fails to fit is reported and skipped instead of aborting the grid.
                    model = sm.tsa.SARIMAX(y_train,
                                           order=order,
                                           trend='c',
                                           enforce_invertibility=False,
                                           enforce_stationarity=False).fit(disp=False, warn_convergence=False)
                    fcst = model.predict(start=len(y_train), end=len(y_train) - 1 + len(y_eval))
                    fcst = [round(x) for x in fcst]
                    rmse = round(np.sqrt(mean_squared_error(fcst, y_eval)), 3)
                except Exception as e:
                    print(f'For order={order}, model results are invalid: {e}')
                    continue
                if (best_rmse is None) or (rmse < best_rmse):
                    # The same names are initialised, updated and returned; assigning to
                    # a differently named variable left the result unbound on total failure.
                    best_rmse, best_order, best_fcst = rmse, order, fcst
    print(f"Best Order: {best_order}")
    return best_rmse, best_fcst

In [None]:
# Training series of the first selected item: parsed dates plus sales.
df0 = get_ts_example(ts_test, d_cols, calendar_data, item_id=None, store_id=None, idx=idx[0])
df0['date'] = df0['date'].apply(pd.to_datetime)
df0 = df0.loc[:, ['date', 'sales']]
y_train = df0["sales"].values
df0

In [None]:
result = seasonal_decompose(y_train, model='additive', period=365)
fig = result.plot()

In [None]:
# ACF (top) and PACF (bottom) of the training series for the first 30 lags.
fig, ax = plt.subplots(2,1,figsize=(20,7))
sm.tsa.graphics.plot_acf(y_train, lags=30, ax=ax[0])
ax[0].set_title('Autocorrelation Function: lags=30')
sm.tsa.graphics.plot_pacf(y_train, lags=30, ax=ax[1])
ax[1].set_title('Partial Autocorrelation Function: lags=30')
plt.show()

order = (p,d,q)
- p: AutoRegression (AR) order.
- d: Trend Differencing order.
- q: Moving Average (MA) order. <br>

In [None]:
# Plotting Autocorrelation with pandas
fig, ax = plt.subplots(1,1,figsize=(20,7))
pd.plotting.autocorrelation_plot(y_train, ax=ax)
plt.show()

In [None]:
# Fit one SARIMAX model with a fixed order (p, d, q) = (7, 1, 7) and a constant trend;
# stationarity and invertibility are not enforced. fit() runs silently (disp=False).
sarimax_model = sm.tsa.SARIMAX(y_train, 
                               order=(7,1,7), 
                               trend='c',
                               enforce_invertibility=False,
                               enforce_stationarity=False).fit(disp=False, warn_convergence=False)
sarimax_model.summary()

In [None]:
y_eval = get_ground_truth(idx[0], sales_train_evaluation, d_fcst_columns)

In [None]:
# Forecast the len(y_eval) steps that follow the training window, then score with RMSE.
fcst = sarimax_model.predict(start=len(y_train),
                             end=len(y_train) + len(y_eval) - 1)
rmse = round(np.sqrt(mean_squared_error(fcst, y_eval)), 3)

In [None]:
plot_results(fcst, y_eval, rmse, "SARIMAX", test_items[0])

In [None]:
# SARIMAX Parameters Grid
# 7 x 2 x 8 = 112 candidate (p, d, q) orders.
ps = range(1,8)  # AR orders 1..7
ds = range(0,2)  # differencing orders 0..1
qs = range(0,8)  # MA orders 0..7

In [None]:
# Grid-search SARIMAX on every selected series and collect the best RMSE of each.
rmse_sarimax = []
for i, ix in enumerate(idx):
    item_name = test_items[i]
    print(f"Processing {item_name}...")
    # Training series
    df0 = get_ts_example(ts_test, d_cols, calendar_data, item_id=None, store_id=None, idx=ix)
    df0['date'] = df0['date'].apply(pd.to_datetime)
    df0 = df0.loc[:, ['date', 'sales']]
    y_train = df0["sales"].values
    # Held-out days
    y_eval = get_ground_truth(ix, sales_train_evaluation, d_fcst_columns)
    # Best order and its forecast
    rmse, fcst = choose_sarimax_order_and_forecast(ps, ds, qs, y_train, y_eval)
    plot_results(fcst, y_eval, rmse, "SARIMAX", item_name)
    rmse_sarimax.append(rmse)

In [None]:
# Dataframe for RMSE
rmse_summary = pd.DataFrame({"items":test_items}, index=idx)

In [None]:
rmse_summary["RMSE_SARIMAX"] = rmse_sarimax
rmse_summary

## 3.3 FB Prophet

FB Prophet requires a column 'ds' (for dates) and a column 'y' (target variable).

In [None]:
def generate_holidays(calendar_data, dates):
    """Build a holidays frame from the first event column of the calendar.

    Rows of ``calendar_data`` whose ``d`` value is in ``dates`` and that have a
    non-missing ``event_name_1`` are kept. The result has the columns ``holiday``
    (event name), ``ds`` (parsed date), ``upper_window`` and ``lower_window``
    (both 0), with a fresh 0..n-1 index.
    """
    selected = calendar_data['d'].isin(dates)
    events = calendar_data.loc[selected, ['date', 'event_name_1']].dropna()
    return (
        events
        .assign(ds=events['date'].apply(pd.to_datetime),
                upper_window=0,
                lower_window=0)
        .rename(columns={"event_name_1": "holiday"})
        .drop(columns='date')
        .reset_index(drop=True)
    )

In [None]:
# holidays for FB prophet model
holidays = generate_holidays(calendar_data, d_cols+d_fcst_columns)
holidays

In [None]:
# Prophet input for the first selected series: 'ds' (parsed date) and 'y' (sales).
df = get_ts_example(ts_test, d_cols, calendar_data, item_id=None, store_id=None, idx=idx[0])
df['ds'] = df['date'].apply(pd.to_datetime)
df = df[['ds', 'sales']].rename(columns={'sales': 'y'})
df

In [None]:
df['cap'] = df['y'].max()
df['floor'] = df['y'].min()

In [None]:
# Creating Model
# Logistic growth needs 'cap' (and optionally 'floor') columns in the frame passed to fit().
model = Prophet(daily_seasonality=True, 
                holidays=holidays,
                holidays_prior_scale=0.2,
                growth='logistic', # possible value: 'logistic', 'linear' or 'flat'
                changepoint_range=0.8, # default value
                changepoint_prior_scale=0.2) # NOTE(review): Prophet's documented default is 0.05, so 0.2 is not the default
# Custom seasonalities registered under the names 'weekly' and 'yearly'.
model.add_seasonality(name='weekly', period=7, fourier_order=3)
# NOTE(review): period=364 is exactly 52 weeks; a calendar year is ~365.25 days - confirm this is intended.
model.add_seasonality(name='yearly', period=364, fourier_order=10)
model.fit(df)

In [None]:
# Forecasting: extend the history by `forecast_horizon` days and give the future frame
# the same constant cap/floor as the training frame.
future = model.make_future_dataframe(periods=forecast_horizon)
future = future.assign(cap=df['y'].max(), floor=df['y'].min())
forecast = model.predict(future)

In [None]:
fig1 = model.plot(forecast)
a = add_changepoints_to_plot(fig1.gca(), model, forecast)

In [None]:
# Interactive plots
plot_plotly(model, forecast)

In [None]:
# Model Components
plot_components_plotly(model, forecast)

In [None]:
# Keep only the forecast window (the last `forecast_horizon` predictions).
fcst = forecast['yhat'].values[-forecast_horizon:]
# Round to the nearest unit, as the SARIMAX evaluation does; int() alone truncates toward
# zero and biases every forecast downward, which makes the two RMSEs not comparable.
fcst = [int(round(x)) for x in fcst]
ground_truth = get_ground_truth(idx[0], sales_train_evaluation, d_fcst_columns)
rmse = round(np.sqrt(mean_squared_error(ground_truth, fcst)), 3)

In [None]:
# This forecast belongs to the first selected series (idx[0]), so label it with
# test_items[0]. A leftover loop index must not be used here: it names the wrong item
# and is undefined on a fresh kernel. The shared plotting helper replaces the inline copy.
plot_results(fcst, ground_truth, rmse, "FB Prophet", test_items[0])

In [None]:
# FB Prophet Parameters Grid
# 4 x 4 x 4 = 64 parameter combinations.
changepoint_prior_scale = [0.0001, 0.001, 0.1, 0.5]
seasonality_prior_scale = [0.01, 0.1, 1, 10]
holiday_prior_scale = [0.1, 0.2, 0.5, 1]

In [None]:
def tune_fbprophet_and_forecast(y_eval, df, forecast_horizon, chngp, sps, hps):
    """Grid-search three Prophet prior scales and return the best forecast.

    One logistic-growth Prophet model is fitted for every combination of
    ``chngp`` (changepoint prior scale) x ``sps`` (seasonality prior scale) x
    ``hps`` (holidays prior scale), with custom 'weekly' and 'yearly' seasonalities.
    The last ``forecast_horizon`` predictions are rounded to integers and scored
    by RMSE against ``y_eval``.

    Parameters
    ----------
    y_eval : array-like
        Ground truth for the forecast window; also used to pick the parameters.
    df : pandas.DataFrame
        Training frame passed to ``Prophet.fit``; its ``y`` column provides the
        constant cap (max) and floor (min) applied to the future frame.
    forecast_horizon : int
        Number of future periods to predict.
    chngp, sps, hps : iterable of float
        Candidate values for the three prior scales.

    Returns
    -------
    (best_rmse, best_fcst)
        Lowest RMSE (rounded to 3 decimals) and the matching list of integer
        forecasts; ``(None, None)`` when the grid is empty.

    Notes
    -----
    Reads the module-level ``holidays`` frame; it must be defined before the call.
    """
    best_rmse, best_fcst = None, None
    # Cap and floor do not depend on the hyper-parameters: compute them once.
    cap, floor = df['y'].max(), df['y'].min()
    # Choosing the Best Parameters
    for p1 in chngp:
        for p2 in sps:
            for p3 in hps:
                # Training Model
                model = Prophet(daily_seasonality=True,
                                holidays=holidays,
                                holidays_prior_scale=p3,
                                seasonality_prior_scale=p2,
                                growth='logistic',
                                changepoint_range=0.8,
                                changepoint_prior_scale=p1)
                model.add_seasonality(name='weekly', period=7, fourier_order=3)
                model.add_seasonality(name='yearly', period=364, fourier_order=10)
                model.fit(df)
                # Forecasting
                future = model.make_future_dataframe(periods=forecast_horizon)
                future['cap'] = cap
                future['floor'] = floor
                forecast = model.predict(future)
                # Evaluating: round to the nearest unit (as the SARIMAX search does);
                # int() alone truncates toward zero and biases forecasts downward.
                fcst = [int(round(x)) for x in forecast['yhat'].values[-forecast_horizon:]]
                rmse = round(np.sqrt(mean_squared_error(y_eval, fcst)), 3)
                if (best_rmse is None) or (rmse < best_rmse):
                    best_rmse, best_fcst = rmse, fcst
    return best_rmse, best_fcst

In [None]:
# Tune Prophet on every selected series and collect the best RMSE of each.
rmse_fbprophet = []
for i, ix in enumerate(idx):
    item_name = test_items[i]
    print(f"Processing {item_name}...")
    # Training frame in Prophet's ds / y layout
    df = get_ts_example(ts_test, d_cols, calendar_data, item_id=None, store_id=None, idx=ix)
    df['ds'] = df['date'].apply(pd.to_datetime)
    df = df[['ds', 'sales']].rename(columns={'sales': 'y'})
    # Constant cap and floor taken from the observed range
    df['cap'] = df['y'].max()
    df['floor'] = df['y'].min()
    # Held-out days
    y_eval = get_ground_truth(ix, sales_train_evaluation, d_fcst_columns)
    # Best parameters and their forecast
    rmse, fcst = tune_fbprophet_and_forecast(y_eval,
                                             df,
                                             forecast_horizon,
                                             changepoint_prior_scale,
                                             seasonality_prior_scale,
                                             holiday_prior_scale)
    plot_results(fcst, y_eval, rmse, "FB Prophet", item_name)
    rmse_fbprophet.append(rmse)

In [None]:
rmse_summary["RMSE_FBPROPHET"] = rmse_fbprophet
rmse_summary

In [None]:
# Average RMSE of each method over the selected series.
r1 = round(rmse_summary["RMSE_FBPROPHET"].mean(), 3)
r2 = round(rmse_summary["RMSE_SARIMAX"].mean(), 3)
print(f"MEAN RMSE SARIMAX: {r2}, FBProphet: {r1}")