In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pylab as plt
#import seaborn as sns
from itertools import cycle
# Show up to 50 columns when displaying wide frames. The fully-qualified key is
# required: the bare 'max_columns' pattern also matches
# 'styler.render.max_columns' on pandas >= 1.4 and raises OptionError.
pd.set_option('display.max_columns', 50)
# Switch to the 'bmh' stylesheet, then keep its colour sequence in two forms:
# a list for picking colours by position, and an endless iterator for
# "give me the next colour" use.
plt.style.use('bmh')
color_pal = plt.rcParams['axes.prop_cycle'].by_key()['color']
color_cycle = cycle(color_pal)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training sales table (its 'd_*' columns hold the sales series used
# throughout this notebook) and preview the first rows.
stv_df = pd.read_csv(
    filepath_or_buffer='../input/m5-forecasting-accuracy/sales_train_validation.csv',
)
stv_df.head(n=5)

In [None]:
# Load the sell-price table and preview the first rows.
price_df = pd.read_csv(
    filepath_or_buffer="../input/m5-forecasting-accuracy/sell_prices.csv",
)
price_df.head(n=5)

In [None]:
# Load the calendar table (maps each 'd' label to a real date, see the merge
# cells below) and preview the first rows.
cal_df = pd.read_csv(
    filepath_or_buffer="../input/m5-forecasting-accuracy/calendar.csv",
)
cal_df.head(n=5)

We are trying to forecast sales for 28 forecast days. The sample submission has the following format:

The columns represent 28 forecast days. We will fill these forecast days with our predictions.
The rows each represent a specific item. This id tells us the item type, state, and store. We don't know what these items are exactly.

In [None]:
# Load the sample submission (an 'id' column plus the forecast columns) and
# show its first three rows.
ss = pd.read_csv(
    filepath_or_buffer='../input/m5-forecasting-accuracy/sample_submission.csv',
)
ss.head(n=3)

Visualizing the data for a single item

In [None]:
# Sales data columns. Match on the 'd_' prefix rather than a substring so that
# a column which merely contains 'd_' somewhere in its name is never picked up
# by accident.
d_cols = [col for col in stv_df.columns if col.startswith('d_')]

# Plot the full sales history of one item against its "d" number.
# Chained pandas steps:
# 1. Select the item's row.
# 2. Set the id as the index and keep only the sales data columns.
# 3. Transpose so the days run down the index and the item is one column.
# 4. Plot. legend=False suppresses the legend directly, instead of drawing an
#    empty one afterwards with plt.legend('').
(
    stv_df.loc[stv_df['id'] == 'HOBBIES_1_234_CA_3_validation']
    .set_index('id')[d_cols]
    .T
    .plot(figsize=(15, 5),
          title='HOBBIES_1_234_CA_3 sales by "d" number',
          color=next(color_cycle),
          legend=False)
)
plt.show()

Merging the data with real dates

In [None]:
# Preview the calendar, restricted to the columns used for now: the day key,
# the real date, the event name/type fields and the snap_CA column.
cal_df.loc[
    :,
    ['d', 'date',
     'event_name_1', 'event_name_2',
     'event_type_1', 'event_type_2',
     'snap_CA'],
].head()

In [None]:
# Attach real calendar dates to the example item's sales and plot them.
example_item_id = 'HOBBIES_1_234_CA_3'
example = stv_df.loc[stv_df['id'] == f'{example_item_id}_validation', d_cols].T
# Name the single sales column after the item. Assigning the column list
# (instead of renaming a hard-coded row label such as 6324) works wherever the
# row sits in stv_df, and fails loudly unless exactly one row matched.
example.columns = [example_item_id]
example = example.rename_axis('d').reset_index()  # day labels become column "d"
# Join on the day key explicitly; validate='1:1' guards against duplicate days
# on either side.
example = example.merge(cal_df, how='left', on='d', validate='1:1')
example.set_index('date')[example_item_id] \
    .plot(figsize=(15, 5),
          color=next(color_cycle),
          title=f'{example_item_id} sales by actual sale dates')
plt.show()

Sales broken down by time variables.
Now that we have our example item, let's see how it sells by:
- Day of the week
- Month
- Year

In [None]:
# For each example item, draw three side-by-side panels with its average sales
# by day of week, by month and by year.
examples = ['HOBBIES_1_234_CA_3']
example_df = [example]
# Iterate over the items that actually exist. The previous `for i in [0, 1]`
# indexed past the end of these one-element lists and raised IndexError.
for item_name, item_df in zip(examples, example_df):
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 3))
    panels = [
        ('wday', 'average sale: day of week', color_pal[0], ax1),
        ('month', 'average sale: month', color_pal[4], ax2),
        ('year', 'average sale: year', color_pal[2], ax3),
    ]
    for group_key, panel_title, panel_color, panel_ax in panels:
        # Select the sales column before averaging: averaging the whole frame
        # would also try to average the calendar's text columns, which raises
        # TypeError on pandas >= 2.0.
        item_df.groupby(group_key)[item_name].mean() \
            .plot(kind='line',
                  title=panel_title,
                  lw=5,
                  color=panel_color,
                  ax=panel_ax)
    fig.suptitle(f'Trends for item: {item_name}',
                 size=20,
                 y=1.1)
    plt.tight_layout()
    plt.show()

Combined Sales over Time by Type
* We have several item types:
  * Hobbies
  * Household
  * Foods
* Let's plot the total demand over time for each type.

In [None]:
# Distinct category ids present in the sales table.
stv_df.loc[:, 'cat_id'].unique()

In [None]:
# Horizontal bar chart of how many ids fall in each category, smallest first.
(
    stv_df.groupby('cat_id')['id']
    .count()
    .sort_values()
    .plot(kind='barh', figsize=(15, 5), title='Count of Items by Category')
)
plt.show()

In [None]:
# Date-indexed sales table: transpose the sales so each row is a day and each
# column an id, attach the calendar date for every day label, then index by it.
past_sales = pd.merge(
    stv_df.set_index('id')[d_cols].T,
    cal_df.set_index('d')['date'],
    left_index=True,
    right_index=True,
    validate='1:1',
).set_index('date')


# Overlay one line per category: the summed daily sales of every id whose name
# contains that category id. The legend lists the categories in the same order.
categories = stv_df['cat_id'].unique()
for category in categories:
    category_cols = [col for col in past_sales.columns if category in col]
    daily_total = past_sales[category_cols].sum(axis=1)
    daily_total.plot(figsize=(15, 5),
                     alpha=0.8,
                     title='Total Sales by Item Type')
plt.legend(categories)
plt.show()

Sales by Store
We are provided data for 10 unique stores. What are the total sales by store?


In [None]:
# Overlay one line per store: the 90-day rolling mean of the summed daily sales
# of every id whose name contains that store id. The legend follows store_list.
store_list = price_df['store_id'].unique()
for store_id in store_list:
    cols_for_store = [col for col in past_sales.columns if store_id in col]
    store_total = past_sales[cols_for_store].sum(axis=1)
    smoothed_total = store_total.rolling(90).mean()
    smoothed_total.plot(figsize=(15, 5),
                        alpha=0.8,
                        title='Rolling 90 Day Average Total Sales (10 stores)')
plt.legend(store_list)
plt.show()

Submit the average value from the past 30 days

In [None]:
# Baseline forecast: each id's mean over the last 30 sales columns, repeated
# in every forecast column of the submission.
thirty_day_avg_map = stv_df.set_index('id')[d_cols[-30:]].mean(axis=1).to_dict()
# Forecast columns are the ones whose name starts with 'F'.
fcols = [f for f in ss.columns if f.startswith('F')]
# The prediction is the same for every forecast column, so look the averages up
# once rather than once per column. Ids missing from the map are filled with 0.
baseline_forecast = ss['id'].map(thirty_day_avg_map).fillna(0)
for f in fcols:
    ss[f] = baseline_forecast

# Write the submission file without the row index.
ss.to_csv('submission.csv', index=False)


In [None]:
# Show the first ten rows of the filled-in submission.
ss.iloc[:10]