In [None]:
import plotly.graph_objects as go
import plotly.express as px

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from itertools import cycle

# Show up to 50 columns when displaying wide frames. The fully qualified key is
# required: the bare 'max_columns' pattern is ambiguous on newer pandas (it also
# matches 'styler.render.max_columns') and raises OptionError.
pd.set_option('display.max_columns', 50)
plt.style.use('bmh')

# Colours of the active style: an indexable list plus an endless iterator over it.
color_pal = plt.rcParams['axes.prop_cycle'].by_key()['color']
color_cycle = cycle(color_pal)

import os
# Print the path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
# Directory holding the competition CSV files.
INPUT_DIR = '../input/m5-forecasting-accuracy'

# Read the four input tables in one pass; the order of the paths below
# matches the order of the names being assigned.
Cal, SalesValid, SampleSub, SellPrices = (
    pd.read_csv(csv_path)
    for csv_path in (
        f'{INPUT_DIR}/calendar.csv',
        f'{INPUT_DIR}/sales_train_validation.csv',
        f'{INPUT_DIR}/sample_submission.csv',
        f'{INPUT_DIR}/sell_prices.csv',
    )
)

# What exactly are we trying to predict?
We are trying to forecast sales for 28 forecast days. The sample submission has the following format:
- The columns represent 28 forecast days. We will fill these forecast days with our predictions.
- The rows each represent a specific item. This id tells us the item type, state, and store. We don't know what these items are exactly.

In [None]:
# Preview the first rows of the sample submission to see its layout.
SampleSub.head()

We are given historic sales data in the sales_train_validation dataset.

* Columns exist in this dataset for days d_1 to d_1913. We are given the department, category, state, and store id of the item.
* d_1914 - d_1941 represents the validation rows which we will predict in stage 1
* d_1942 - d_1969 represents the evaluation rows which we will predict for the final competition standings.

In [None]:
# Preview the first rows of the historic sales table.
SalesValid.head()

In [None]:
# Distinct values of the 'dept_id' column in the sales table.
SalesValid['dept_id'].unique()

In [None]:
# Preview the first rows of the calendar table.
Cal.head()

In [None]:
# Distinct values of the 'event_name_1' column in the calendar table.
Cal['event_name_1'].unique()

In [None]:
# (rows, columns) of the calendar table.
Cal.shape

In [None]:
# (rows, columns) of the historic sales table.
SalesValid.shape

> # Visualizing the data for a single item

- Let's take a random item that sells a lot and see how its sales look across the training data.
- FOODS_3_090_CA_3_validation sells a lot
- Note there are days where it appears the item is unavailable and sales flatline

In [None]:
# Daily sales columns are named d_1, d_2, ...; match on the prefix so that a
# column which merely contains 'd_' somewhere can never be picked up by accident.
d_cols = [c for c in SalesValid.columns if c.startswith('d_')]  # sales data columns

# Below we are chaining the following steps in pandas:
# 1. Select the item.
# 2. Set the id as the index, keep only sales data columns
# 3. Transpose so the days become rows and the item becomes a column
# 4. Plot the data; legend=False hides the legend directly, replacing the
#    previous plt.legend('') workaround.
(
    SalesValid.loc[SalesValid['id'] == 'FOODS_3_090_CA_3_validation']
    .set_index('id')[d_cols]
    .T
    .plot(figsize=(15, 5),
          title='FOODS_3_090_CA_3 sales by "d" number',
          color=next(color_cycle),
          legend=False)
)
plt.show()

## Merging the data with real dates
- We are given a calendar with additional information about past and future dates.
- The calendar data can be merged with our days data
- From this we can find weekly and annual trends

In [None]:
# First rows of the calendar table, limited to the columns of interest for now.
Cal.loc[:, [
    'd',
    'date',
    'event_name_1',
    'event_name_2',
    'event_type_1',
    'event_type_2',
    'snap_CA',
]].head()

In [None]:
def build_item_example(sales, calendar, item_id, day_cols):
    """Return one item's daily sales in long form, joined with the calendar.

    The item is located by its id rather than by a hard-coded row label, so
    the result does not depend on the row order of ``sales``.

    Parameters
    ----------
    sales : pd.DataFrame
        Wide sales table with an 'id' column and one column per day.
    calendar : pd.DataFrame
        Calendar table; it is joined on its 'd' column.
    item_id : str
        Identifier without the '_validation' suffix. It is also used as the
        name of the sales column in the result.
    day_cols : list of str
        Names of the daily sales columns to keep.

    Returns
    -------
    pd.DataFrame
        One row per entry of ``day_cols`` with the columns 'd', ``item_id``
        and every other column of ``calendar``.

    Raises
    ------
    ValueError
        If ``sales`` does not contain exactly one row for the item.
    pandas.errors.MergeError
        If 'd' is not unique on either side of the merge (validate='1:1').
    """
    row_id = f'{item_id}_validation'
    item_rows = sales.loc[sales['id'] == row_id, day_cols]
    if len(item_rows) != 1:
        raise ValueError(
            f'expected exactly one row with id {row_id!r}, found {len(item_rows)}')
    # Single row -> Series indexed by day label; name it after the item and
    # turn the day labels into a regular 'd' column.
    daily_sales = (
        item_rows.iloc[0]
        .rename(item_id)
        .rename_axis('d')
        .reset_index()
    )
    # Explicit key instead of relying on "all common columns".
    return daily_sales.merge(calendar, how='left', on='d', validate='1:1')


# Merge calendar on our items' data
example = build_item_example(SalesValid, Cal, 'FOODS_3_090_CA_3', d_cols)
example.set_index('date')['FOODS_3_090_CA_3'] \
    .plot(figsize=(15, 5),
          color=next(color_cycle),
          title='FOODS_3_090_CA_3 sales by actual sale dates')
plt.show()

# Select more top selling examples
example2 = build_item_example(SalesValid, Cal, 'HOBBIES_1_234_CA_3', d_cols)
example3 = build_item_example(SalesValid, Cal, 'HOUSEHOLD_1_118_CA_3', d_cols)

In [None]:
examples = ['FOODS_3_090_CA_3','HOBBIES_1_234_CA_3','HOUSEHOLD_1_118_CA_3']
example_df = [example, example2, example3]

# (calendar column to group by, panel title, line colour) for the three panels.
trend_panels = [
    ('wday', 'average sale: day of week', color_pal[0]),
    ('month', 'average sale: month', color_pal[4]),
    ('year', 'average sale: year', color_pal[2]),
]

# One figure per item, with one panel per calendar granularity.
for item_name, item_df in zip(examples, example_df):
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 3))
    for ax, (period, panel_title, line_color) in zip((ax1, ax2, ax3), trend_panels):
        # Select the sales column BEFORE aggregating: averaging the whole frame
        # would also try to average string columns (date, weekday, event names),
        # which raises TypeError on pandas >= 2.0 and is wasted work otherwise.
        item_df.groupby(period)[item_name].mean() \
            .plot(kind='line',
                  title=panel_title,
                  lw=5,
                  color=line_color,
                  ax=ax)
    fig.suptitle(f'Trends for item: {item_name}',
                 size=20,
                 y=1.1)
    plt.tight_layout()
    plt.show()