In [None]:
import os
import warnings
from pathlib import Path
import ipywidgets as widgets

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor


# Silence every warning category so library chatter does not clutter cell output.
warnings.simplefilter("ignore")
# Ask pandas to treat +/-inf as missing values.
# NOTE(review): this option is deprecated in recent pandas releases -- confirm
# against the pandas version this notebook is pinned to.
pd.set_option('use_inf_as_na', True)

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and the
# bare names were dropped afterwards. Prefer the new name when it is available
# and fall back to the legacy name so older environments keep working.
_WHITEGRID_STYLE = next(
    (
        style_name
        for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if style_name in plt.style.available
    ),
    "seaborn-whitegrid",
)
plt.style.use(_WHITEGRID_STYLE)

# Figure-level defaults: automatic tight layout, default size, bold suptitle.
plt.rc(
    "figure",
    autolayout=True,
    figsize=(11, 5),
    titlesize=18,
    titleweight='bold',
)
# Axes-level defaults: bold labels and titles with a little title padding.
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)

# Shared keyword arguments for line plots (grey line with dark point markers).
# Presumably unpacked into `.plot(**plot_params)` calls in later cells -- not
# used in the cells visible here.
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)

In [None]:
import pandas as pd

data_dir = Path("../input/favorita-small/")


def _read_daily(filename, **read_csv_kwargs):
    """Load a CSV from ``data_dir`` and index it by calendar day.

    The ``date`` column is parsed as datetimes, moved into the index, and the
    index is converted to a daily ``PeriodIndex``.

    Parameters
    ----------
    filename : str
        File name relative to ``data_dir``.
    **read_csv_kwargs
        Extra keyword arguments forwarded to ``pd.read_csv`` (e.g. ``dtype``).

    Returns
    -------
    pandas.DataFrame
        The file's contents indexed by daily periods.
    """
    # `infer_datetime_format` is intentionally not passed: it is deprecated in
    # pandas 2.x, where date-format inference is already the default.
    frame = pd.read_csv(
        data_dir / filename,
        parse_dates=["date"],
        **read_csv_kwargs,
    )
    return frame.set_index("date").to_period("D")


# Store metadata: every column categorical, keyed by store number (no date column).
stores = pd.read_csv(
    data_dir / "stores.csv",
    dtype='category',
    index_col='store_nbr',
)

transactions = _read_daily(
    "transactions.csv",
    dtype={
        'store_nbr': 'category',
        'transactions': 'uint64',
    },
)

oil = _read_daily("oil.csv", dtype='float32')

holidays_events = _read_daily(
    "holidays_events.csv",
    dtype={
        'type': 'category',
        'locale': 'category',
        'locale_name': 'category',
        'description': 'category',
        'transferred': 'bool',
    },
)

# Column dtypes for the training data; kept as a module-level name as before.
dtype = {
    'store_nbr': 'category',
    'family': 'category',
    'unit_sales': 'float32',
    'onpromotion': 'uint64',
}

# Training data: index by (date, store_nbr, family) so each store/family pair
# forms its own daily series.
sales = _read_daily('train.csv', dtype=dtype)
sales = sales.set_index(['store_nbr', 'family'], append=True)

The training data holds one daily sales time series for each combination of store and product family, 1782 series in all.

In [None]:
# Render the training frame, which is indexed by (date, store_nbr, family).
display(sales)

Some supplemental datasets are provided as well: holidays and events, oil prices, store metadata, and transactions.

In [None]:
# Map each tab label to its frame explicitly instead of looking names up in
# globals(): a missing or misspelled frame then fails right here, when the
# mapping is built, rather than inside a widget's Output context.
supplemental_frames = {
    'holidays_events': holidays_events,
    'oil': oil,
    'stores': stores,
    'transactions': transactions,
}
df_names = list(supplemental_frames)

# One Output widget per frame; each becomes the body of one tab.
data_tabs = widgets.Tab()
data_tabs.children = [widgets.Output() for _ in df_names]

for i, (name, frame) in enumerate(supplemental_frames.items()):
    data_tabs.set_title(i, name)
    # Anything displayed inside the context is captured by that tab's Output.
    with data_tabs.children[i]:
        display(frame)

display(data_tabs)

Here is a sample of a few of these time series, all from Store 1.

In [None]:
# Store and product families to preview. `store_nbr` is loaded with dtype
# 'category', and the store is selected here by its string label.
STORE = '1'
FAMILY = ['AUTOMOTIVE', 'BEAUTY', 'BOOKS', 'DELI', 'PERSONAL CARE']

# Drop the promotion column, pivot store_nbr/family from the row index into the
# columns, then keep only the chosen store's selected families.
ts_1 = (
    sales
    .drop(columns='onpromotion')
    .unstack(['store_nbr', 'family'])
    .loc(axis=1)[:, STORE, FAMILY]
)

# One subplot per selected series on a shared x-axis; assigning to `_` keeps
# the returned axes from being echoed as cell output.
_ = ts_1.plot(subplots=True, sharex=True, figsize=(14, 8))