# Basic visualization and data exploration 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Directory that holds the input CSV files.
root = '../input/m5-forecasting-accuracy'

# Read every input table from its CSV file under `root`.
calendar = pd.read_csv(f'{root}/calendar.csv')
sell_prices = pd.read_csv(f'{root}/sell_prices.csv')
sales_train_valid = pd.read_csv(f'{root}/sales_train_validation.csv')
submission = pd.read_csv(f'{root}/sample_submission.csv')

# Report the (rows, columns) shape of the three data tables.
print('Size of calendar', calendar.shape)
print('Size of sales_train_valid', sales_train_valid.shape)
print('Size of sell_prices', sell_prices.shape)

In [None]:
# Preview the first rows of the calendar table.
calendar.head()

In [None]:
# Preview the first rows of the sales table.
sales_train_valid.head()

In [None]:
# Preview the first rows of the sell-price table.
sell_prices.head()

In [None]:
# Preview the first rows of the sample submission.
submission.head()

The submission file should contain forecasts for the next 28 days.

In [None]:
# Missing data
def missing_data(data):
    """Summarize missing values and dtypes for each column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the columns:

        * ``Total``   - number of null entries in that column,
        * ``Percent`` - null entries as a percentage of the row count,
        * ``Types``   - the column's dtype rendered as a string.
    """
    total = data.isnull().sum()
    # isnull() never produces NaN, so count() equals the row count per column.
    percent = total / data.isnull().count() * 100
    tp = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'], sort=True)

    # Attach dtypes as a Series so pandas aligns them on the column-name index.
    # A positional list would follow data.columns order, which stops matching
    # the rows of `tp` whenever concat(sort=True) reorders them.
    tp['Types'] = data.dtypes.map(str)

    return tp

In [None]:
# Missing-value summary for calendar; show the first 8 summary rows (one row per calendar column).
missing_data(calendar).head(8)

In [None]:
# Distinct state_id values in the sales table.
sales_train_valid['state_id'].unique()

In [None]:
# Distinct cat_id values in the sales table.
sales_train_valid['cat_id'].unique()

In [None]:
# Distinct item_id values in the sales table.
sales_train_valid['item_id'].unique()

In [None]:
def most_frequent_values(col):
    """Describe the most frequent non-null value of a Series.

    Parameters
    ----------
    col : pandas.Series
        Values to inspect. Null entries are ignored by both the total and
        the frequency count.

    Returns
    -------
    dict
        ``'Total'``: number of non-null entries;
        ``'Most Frequent Item'``: the value that occurs most often;
        ``'Value'``: how many times that value occurs;
        ``'Percent'``: ``Value / Total * 100`` rounded to 3 decimals.
        When ``col`` has no non-null entries the item is ``None``, the
        count is 0 and the percentage is NaN (the share is undefined).
    """
    total = col.count()
    # value_counts() is ordered by descending frequency; compute it only once.
    counts = col.value_counts()
    if counts.empty:
        # Nothing to rank: avoid the IndexError that index[0] would raise.
        return {'Total': total, 'Most Frequent Item': None, 'Value': 0, 'Percent': np.nan}

    itm = counts.index[0]
    val = counts.values[0]
    percent = np.round(val / total * 100, 3)
    return {'Total': total, 'Most Frequent Item': itm, 'Value': val, 'Percent': percent}

In [None]:
# Most frequent cat_id value in the sales table and its share of the rows.
col = sales_train_valid['cat_id']
most_frequent_values(col)

In [None]:
# Most frequent state_id value in the sales table and its share of the rows.
col = sales_train_valid['state_id']
most_frequent_values(col)

In [None]:
# Distinct store_id values in the sell-price table.
sell_prices['store_id'].unique()

In [None]:
# Distinct item_id values in the sell-price table.
sell_prices['item_id'].unique()

In [None]:
# All sell-price rows for the item FOODS_3_827.
sell_prices.loc[sell_prices['item_id'] == 'FOODS_3_827']

Sell prices of FOODS_3_827 across 10 stores in 3 states.

In [None]:
# Keep only the sales rows that belong to the single item FOODS_3_827.
sales_region = sales_train_valid[sales_train_valid['item_id'].eq('FOODS_3_827')]

In [None]:
# Count the selected rows per (state_id, store_id) combination.
pd.crosstab(sales_region['state_id'], sales_region['store_id'])

So FOODS_3_827 is sold in 10 stores across 3 states.

In [None]:
# Preview the first rows of the single-item selection.
sales_region.head()

Now plot the daily sales columns (`d_1`, `d_2`, ...) of each of these rows, using `state_id` as the trace label.

In [None]:
# Draw one line per row of sales_region.
fig = go.Figure()
# Iterate over the rows actually present instead of a hard-coded 10, so the
# cell neither raises IndexError nor drops rows when the selection changes.
for i in range(len(sales_region)):
    # Columns from position 6 onward are the plotted values; the column at
    # position 5 supplies the trace name (presumably d_* days and state_id).
    fig.add_trace(go.Scatter(x=None, y=sales_region.iloc[i, 6:].values,
                             mode='lines',
                             name=sales_region.iloc[i, 5]))
fig.update_layout(title="FOODS_3_827 sales")
fig.show()

In [None]:
# Draw one smoothed line (30-value rolling mean) per row of sales_region.
fig = go.Figure()
# Iterate over the rows actually present instead of a hard-coded 10, so the
# cell neither raises IndexError nor drops rows when the selection changes.
for i in range(len(sales_region)):
    # A row slice of a mixed-type frame is object dtype; cast to float so the
    # rolling mean runs on explicitly numeric data.
    row_values = sales_region.iloc[i, 6:].astype(float)
    fig.add_trace(go.Scatter(x=None, y=row_values.rolling(30).mean().values,
                             mode='lines',
                             name=sales_region.iloc[i, 5]))
fig.update_layout(title="FOODS_3_827 sales, rolling mean 30 days")
fig.show()

In [None]:
# Draw one smoothed line (100-value rolling mean) per row of sales_region.
fig = go.Figure()
# Iterate over the rows actually present instead of a hard-coded 10, so the
# cell neither raises IndexError nor drops rows when the selection changes.
for i in range(len(sales_region)):
    # A row slice of a mixed-type frame is object dtype; cast to float so the
    # rolling mean runs on explicitly numeric data.
    row_values = sales_region.iloc[i, 6:].astype(float)
    fig.add_trace(go.Scatter(x=None, y=row_values.rolling(100).mean().values,
                             mode='lines',
                             name=sales_region.iloc[i, 5]))
fig.update_layout(title="FOODS_3_827 sales, rolling mean 100 days")
fig.show()

In [None]:
# All sell-price rows for store CA_1.
sell_prices.loc[sell_prices['store_id'] == 'CA_1']

In [None]:
# All sales rows for store CA_1.
sales_train_valid.loc[sales_train_valid['store_id'] == 'CA_1']

Now look at the sales of store `CA_1`.

In [None]:
# Restrict the sales table to store CA_1, then count its rows per
# (cat_id, dept_id) combination.
ca_1_sales = sales_train_valid[sales_train_valid['store_id'].eq('CA_1')]
pd.crosstab(index=ca_1_sales['cat_id'], columns=ca_1_sales['dept_id'])

In [None]:
# Distinct dept_id values among the CA_1 rows.
ca_1_sales['dept_id'].unique()

In [None]:
# One line per department: total CA_1 sales per day, smoothed over 30 days.
fig = go.Figure()
for dep in ca_1_sales['dept_id'].unique():
    # Sum the day columns (position 6 onward, presumably d_1, d_2, ...) over
    # all items of the department to get a single daily series. Rolling over
    # the unaggregated frame would slide the window down the item rows rather
    # than across days, and would hand plotly a 2-D array.
    dep_daily = ca_1_sales.loc[ca_1_sales['dept_id'] == dep].iloc[:, 6:].sum(axis=0)
    fig.add_trace(go.Scatter(x=None, y=dep_daily.rolling(30).mean().values,
                             mode='lines',
                             name=dep))
fig.update_layout(title="CA_1 sales of dep, rolling mean 30 days")
fig.show()

In [None]:
# Distinct cat_id values among the CA_1 rows.
ca_1_sales['cat_id'].unique()

In [None]:
# One line per category: total CA_1 sales per day, smoothed over 30 days.
fig = go.Figure()
for cat in ca_1_sales['cat_id'].unique():
    # Sum the day columns (position 6 onward, presumably d_1, d_2, ...) over
    # all items of the category to get a single daily series. Rolling over
    # the unaggregated frame would slide the window down the item rows rather
    # than across days, and would hand plotly a 2-D array.
    cat_daily = ca_1_sales.loc[ca_1_sales['cat_id'] == cat].iloc[:, 6:].sum(axis=0)
    fig.add_trace(go.Scatter(x=None, y=cat_daily.rolling(30).mean().values,
                             mode='lines',
                             name=cat))
fig.update_layout(title="CA_1 sales of cat, rolling mean 30 days")
fig.show()