In [42]:
import pandas as pd
import numpy as np

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from skforecast.datasets import fetch_dataset
# from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom
# from skforecast.ForecasterAutoregMultiSeriesCustom import ForecasterAutoregMultiSeriesCustom
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregMultiSeries import ForecasterAutoregMultiSeries

from sklearn.metrics import mean_squared_error


from matplotlib import pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Get Data


In [7]:
# Load the holidays/events table, parsing the `date` column as datetimes.
df_hols = pd.read_csv('../data_store_sales/holidays_events.csv', parse_dates=['date'])

# Summarise columns, dtypes and non-null counts.
df_hols.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         350 non-null    datetime64[ns]
 1   type         350 non-null    object        
 2   locale       350 non-null    object        
 3   locale_name  350 non-null    object        
 4   description  350 non-null    object        
 5   transferred  350 non-null    bool          
dtypes: bool(1), datetime64[ns](1), object(4)
memory usage: 14.1+ KB


In [11]:
# Load the training table: the `id` column becomes the index and `date`
# is parsed as datetimes.
df_train = pd.read_csv(
    '../data_store_sales/train.csv',
    index_col=['id'],
    parse_dates=['date'],
)

# Summarise columns, dtypes and memory usage.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3000888 entries, 0 to 3000887
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   date         datetime64[ns]
 1   store_nbr    int64         
 2   family       object        
 3   sales        float64       
 4   onpromotion  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 137.4+ MB


In [12]:
# Display the training frame (pandas truncates the rendered rows).
df_train

Unnamed: 0_level_0,date,store_nbr,family,sales,onpromotion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,2013-01-01,1,BABY CARE,0.000,0
2,2013-01-01,1,BEAUTY,0.000,0
3,2013-01-01,1,BEVERAGES,0.000,0
4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...
3000883,2017-08-15,9,POULTRY,438.133,0
3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [26]:
# Plot the sales series of the first few product families at a single store.
store = 1
n_preview = 3  # how many families to plot

# Filter the store once, outside the loop, rather than re-masking the full
# training frame for every family.
store_sales = df_train[df_train.store_nbr == store]

# `unique()` returns values in order of first appearance, so slicing it selects
# the first `n_preview` families of the full training frame.
for family in df_train.family.unique()[:n_preview]:
    fig = px.line(
        store_sales[store_sales.family == family],
        x='date',
        y='sales',
        title=f"{family} sales at store {store}",
    )
    fig.show()

# Testing with just one family and one store

In [35]:
from sklearn.model_selection import TimeSeriesSplit

In [37]:
# Single series used for prototyping: AUTOMOTIVE sales at store 1.
df_store1_automotive = df_train[
    (df_train.store_nbr == 1) & (df_train.family == 'AUTOMOTIVE')
]

# Three folds, each with a 90-row test window, no gap between train and test,
# and no cap on the training window size.
ts_split = TimeSeriesSplit(n_splits=3, test_size=90, gap=0, max_train_size=None)

### Cross-validation split

In [77]:
# Visualise every fold produced by `ts_split`: one subplot row per fold, with
# the training window in blue and the test window in crimson.

# Take the fold count from the splitter itself so the subplot grid and the
# title stay correct if `n_splits` is changed where `ts_split` is built.
n_splits = ts_split.get_n_splits()
fig = make_subplots(rows=n_splits, cols=1, shared_xaxes=True)

for i, (train_index, test_index) in enumerate(ts_split.split(df_store1_automotive)):
    # Train is added before test so trace order within each row is train, test.
    for name, index, color in (
        ('train', train_index, 'blue'),
        ('test', test_index, 'crimson'),
    ):
        # Select the fold's rows once and reuse them for both axes.
        fold = df_store1_automotive.iloc[index]
        fig.add_trace(
            go.Scatter(
                x=fold.date,
                y=fold.sales,
                mode='lines',
                name=name,
                line=dict(color=color),
            ),
            row=i + 1,  # subplot rows are 1-based
            col=1,
        )

fig.update_layout(
    showlegend=False,
    title_text=f'Time series split with {n_splits} splits',
)
fig.show()