# A simpl(er) Introduction to Hierarchical Models 
### Naive Bayesians, 2021


### Agenda

1. EDA Walmart M5 Dataset


In [2]:
%load_ext nb_black

from IPython.core.display import HTML

display(HTML("<style>.container { width:85% !important; }</style>"))

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [10]:
import os
import pandas as pd
import numpy as np
from typing import Dict

<IPython.core.display.Javascript object>

In [11]:
"""
Download the raw M5 Forecasting - Accuracy data from:
https://www.kaggle.com/c/m5-forecasting-accuracy/data

"""

def read_raw_data(
    root: str = "../../../hts-forecast/volume/m5-forecasting-accuracy/",
) -> Dict[str, pd.DataFrame]:
    """Read the raw Walmart M5 sales and calendar CSV files.

    Args:
        root: Directory containing ``sales_train_validation.csv`` and
            ``calendar.csv``. Defaults to the path this notebook previously
            hard-coded, so calling ``read_raw_data()`` behaves as before.

    Returns:
        A dict with two entries: ``"sales"`` holds the sales CSV as read by
        ``pd.read_csv``; ``"calendar"`` holds the calendar CSV with its
        ``date`` column parsed as datetimes.
    """
    sales_path = os.path.join(root, "sales_train_validation.csv")
    calendar_path = os.path.join(root, "calendar.csv")

    df_sales = pd.read_csv(sales_path)
    df_calendar = pd.read_csv(calendar_path, parse_dates=["date"])
    return {"sales": df_sales, "calendar": df_calendar}


def pivot_and_join_dates(data: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    """Reshape the wide sales table to long format and attach calendar dates.

    Args:
        data: Dict with a ``"sales"`` frame (the six id columns listed in
            ``id_cols`` plus one ``d_*`` column per day) and a ``"calendar"``
            frame that has at least the columns ``d`` and ``date``.

    Returns:
        A frame with one row per sales row and day column, containing the id
        columns, ``date_key`` (the original day-column name), ``qty`` (the
        value from that column), and the calendar's ``d`` and ``date``
        columns. The join is a left join, so a ``date_key`` without a
        calendar match keeps its row with missing ``d``/``date``.

    Raises:
        pandas.errors.MergeError: If ``d`` is not unique in the calendar
            frame (enforced by ``validate="m:1"``).
    """
    df_sales = data["sales"]
    df_calendar = data["calendar"]

    id_cols = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]

    # Day columns are *prefixed* with "d_". A substring test would also pick
    # up unrelated columns (e.g. "sold_flag") and melt them into `qty`.
    date_cols = [col for col in df_sales.columns if col.startswith("d_")]

    # Unpivot the dataframe so that every day is a row
    df_date_keys = df_sales.melt(
        id_vars=id_cols, value_vars=date_cols, var_name="date_key", value_name="qty"
    )

    # Look up the calendar date for each day key
    df_with_dates = df_date_keys.merge(
        df_calendar[["d", "date"]],
        left_on=["date_key"],
        right_on=["d"],
        how="left",
        validate="m:1",
    )
    return df_with_dates


<IPython.core.display.Javascript object>

In [12]:
# Load the raw sales and calendar tables from disk.
data = read_raw_data()

# Reshape sales to long format (one row per series and day) and attach dates.
df_with_dates = pivot_and_join_dates(data)

<IPython.core.display.Javascript object>

In [16]:
# Same directory string that read_raw_data() uses by default; the two are kept
# in sync by hand. A later cell also reads P_ROOT.
P_ROOT = "../../../hts-forecast/volume/m5-forecasting-accuracy/"
# Write the long-format sales table to that directory as parquet.
df_with_dates.to_parquet(os.path.join(P_ROOT, "walmart_m5_sales.parquet"))

<IPython.core.display.Javascript object>

In [22]:
# Total quantity per category, department, state and date.
df_grp = (
    df_with_dates.groupby(["cat_id", "dept_id", "state_id", "date"])["qty"]
    .sum()
    .reset_index()
)

<IPython.core.display.Javascript object>

In [23]:
# Write the aggregated table as parquet; P_ROOT must already be defined by an
# earlier cell.
df_grp.to_parquet(os.path.join(P_ROOT, "walmart_m5_sales_dep_state.parquet"))

<IPython.core.display.Javascript object>

In [26]:
# Preview the first rows of the aggregated table.
df_grp.head()

Unnamed: 0,cat_id,dept_id,state_id,date,qty
0,FOODS,FOODS_1,CA,2011-01-29,1157
1,FOODS,FOODS_1,CA,2011-01-30,1142
2,FOODS,FOODS_1,CA,2011-01-31,787
3,FOODS,FOODS_1,CA,2011-02-01,743
4,FOODS,FOODS_1,CA,2011-02-02,729


<IPython.core.display.Javascript object>