In [1]:
import numpy as np 
import pandas as pd 


In [2]:
import os

# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [3]:
%matplotlib inline
# Notebook-wide pandas display options.
# The fully qualified key is required: the bare 'precision' pattern also matches
# 'styler.format.precision' on pandas >= 1.3 and raises OptionError there.
pd.set_option('display.precision', 2)
# Render every float with two decimals.
pd.set_option('display.float_format', '{:.2f}'.format)
# Show up to 200 columns before pandas truncates the display.
pd.set_option('display.max_columns', 200)

In [4]:
# helper functions

def unique_num(df, c_list):
    """Print the number of distinct values in each listed column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    c_list : iterable
        Column labels; each one is converted with ``str()`` before the lookup.

    Returns
    -------
    None
        One line per column is written to standard output: a label
        left-aligned in 30 characters followed by the ``nunique()`` count
        right-aligned in 10 characters.
    """
    label_width, count_width = 30, 10
    for name in c_list:
        distinct = df[str(name)].nunique()
        label = f'# of unique {name} :'
        print(label.ljust(label_width) + format(distinct, f'>{count_width}'))
        

def c_proportions(df, c_list):
    """Print the percentage share of every distinct value in the given columns.

    For each column label in ``c_list`` a ``---<label>---`` header is printed,
    followed by one line per distinct value in the order returned by
    ``Series.value_counts`` (most frequent first; missing values are excluded
    by its default ``dropna=True``), followed by a few blank lines.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    c_list : iterable
        Column labels; each one is converted with ``str()`` before the lookup.

    Returns
    -------
    None
        The report is written to standard output.
    """
    label_width = 30
    # Wide enough for "100.000" so the percentage column stays aligned.
    pct_width = 7
    pct_decimals = 3

    for c in c_list:
        counts = df[str(c)].value_counts()
        # The total is the same for every row, so divide once for the whole column.
        percentages = counts / counts.sum() * 100

        print('---' + str(c) + '---')
        for label, pct in percentages.items():
            # str(label): not every index value type accepts an alignment format spec.
            # The explicit 'f' type is essential: without it ".3" means three
            # significant digits in general format, so 100.0 renders as "1e+02".
            print(f"{str(label):>{label_width}}: {pct:{pct_width}.{pct_decimals}f}%")
        print("\n" * 3)

In [5]:
# All four competition files are read from the same directory; keeping the
# path in one constant means a relocated dataset needs a single edit.
M5_INPUT_DIR = "../input/m5-forecasting-accuracy"

df_calendar = pd.read_csv(f"{M5_INPUT_DIR}/calendar.csv")
df_sell_prices = pd.read_csv(f"{M5_INPUT_DIR}/sell_prices.csv")
df_sample_submission = pd.read_csv(f"{M5_INPUT_DIR}/sample_submission.csv")
df_sales_train_validation = pd.read_csv(f"{M5_INPUT_DIR}/sales_train_validation.csv")

## **Quick look at data**

In [6]:
# Print (rows, columns) of the calendar table, then display its first rows.
print(df_calendar.shape)
df_calendar.head()

In [7]:
# Print (rows, columns) of the sell-price table, then display its first rows.
print(df_sell_prices.shape)
df_sell_prices.head()
    

In [8]:
# Number of missing values in each column of the sell-price table.
df_sell_prices.isna().sum()

In [9]:
# Print (rows, columns) of the sales table, then display its first rows.
print(df_sales_train_validation.shape)
df_sales_train_validation.head()

In [10]:
# Number of missing values in each column of the sales table.
df_sales_train_validation.isna().sum()

In [11]:
# Total number of missing values across the whole sales table (sum of the per-column counts).
df_sales_train_validation.isna().sum().sum()

## **A closer look at the data**


 ### 1. df_sell_prices

In [12]:
# Print (rows, columns), then display ten randomly chosen rows.
# A fixed random_state makes the displayed sample identical on every run,
# so any prose written about these rows stays valid after Restart & Run All.
print(df_sell_prices.shape)
df_sell_prices.sample(10, random_state=0)

In [13]:
# Print the distinct-value count of every column in the sell-price table.
unique_num(df_sell_prices, df_sell_prices.columns)

In [14]:
# Distinct store_id values present in the sell-price table.
df_sell_prices.store_id.unique()

In [15]:
# df_sell_prices.item_id.unique().tolist()

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) of sell_price.
df_sell_prices.sell_price.describe()

Price range of the products sold: \$0.01 to \$107.32. 
Mean: \$4.41.
Median: \$3.47.

In [17]:
# Histogram of sell_price. A title and an x-axis label let the figure stand on
# its own when the notebook is skimmed; the trailing ';' hides the text repr
# that the last expression would otherwise print above the plot.
ax = df_sell_prices.sell_price.plot(kind='hist', title='Distribution of sell_price')
ax.set_xlabel('sell_price');

In [18]:
# c_proportions(df_sell_prices, ['store_id', 'wm_yr_wk','item_id'])

### 2. df_sales_train_validation

In [19]:
# Print (rows, columns), then display ten randomly chosen rows.
# A fixed random_state makes the displayed sample identical on every run.
print(df_sales_train_validation.shape)
df_sales_train_validation.sample(10, random_state=0)

1919 columns:
`id`, `item_id`, `dept_id`, `cat_id`, `store_id`, `state_id`,
`d_1`, …, `d_1913`


In [20]:
# Independent copy of every column from the seventh onward (positional slice).
# NOTE(review): presumably this leaves only the d_* columns, i.e. the first six
# columns are the identifier columns — confirm against the column order.
df_d = df_sales_train_validation.iloc[:,6:].copy()

In [21]:
# Per-column summary statistics of the sliced frame.
df_d.describe()

In [22]:
# Print the distinct-value count of each identifier column of the sales table.
unique_num(df_sales_train_validation, ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])

In [23]:
# Distinct dept_id values in the sales table.
df_sales_train_validation.dept_id.unique()

In [24]:
# Distinct cat_id values in the sales table.
df_sales_train_validation.cat_id.unique()

In [25]:
# Distinct store_id values in the sales table.
df_sales_train_validation.store_id.unique()

## Baseline Model using Moving Average

Ideas from: https://www.kaggle.com/rdizzl3/eda-and-baseline-model

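The baseline uses the moving average of the last `num_days` days of sales to predict sales for the next 28 days. Each predicted day is appended to the window, so later forecasts average over earlier forecasts as well as observed sales.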

In [26]:
# Length of the trailing window that is averaged for each forecast.
num_days = 15
# Names of the forecast columns: F1 .. F28.
forecast_days = ['F' + str(day) for day in range(1, 29)]
# Placeholder; reassigned on every iteration of the forecast loop.
last_cols = list()
# Empty frame that will receive one column per forecast day.
df_forecast = pd.DataFrame()
# Independent copy of the last `num_days` columns of the sales table.
df_d_last = df_sales_train_validation.iloc[:, -num_days:].copy()

In [27]:
# Recursive moving-average forecast: every F-column is the row-wise mean of the
# `num_days` most recent columns, and "most recent" includes the forecasts
# produced by earlier iterations.
#
# The window is rebuilt from the sales table at the top of the cell so that the
# cell is idempotent: re-running it must not keep extending an already-extended
# window (which would also make the overlapping F-columns collide).
df_d_last = df_sales_train_validation.iloc[:, -num_days:].copy()
forecast_by_day = {}
for f in forecast_days:
    last_cols = df_d_last.columns.tolist()[-num_days:]
    forecast = df_d_last[last_cols].to_numpy().mean(axis=1)
    forecast_by_day[f] = forecast
    # Feed the forecast back in so it takes part in the next window.
    df_d_last[f] = forecast

# Build the result in one step instead of growing a frame column by column;
# dict insertion order keeps the columns as F1, F2, ...
df_forecast = pd.DataFrame(forecast_by_day)
    
    
    

In [28]:
# Display the forecast frame (one column per forecast day).
df_forecast

In [29]:
# Attach the forecasts to the submission ids by key instead of by row position,
# so a different row order in the sample submission cannot silently assign a
# forecast to the wrong id. Ids without a forecast are kept with NaN values.
# `df_forecast` was computed row by row from `df_sales_train_validation`, so its
# rows line up positionally with that frame's `id` column.
forecast_with_id = df_forecast.copy()
forecast_with_id.insert(0, 'id', df_sales_train_validation['id'].to_numpy())
# validate='m:1' fails loudly if an id occurs more than once among the forecasts.
predictions = df_sample_submission[['id']].merge(
    forecast_with_id, on='id', how='left', validate='m:1'
)

In [30]:
# Rows that received no forecast are submitted as 0.
# Reassignment instead of inplace=True: same result, but the statement reads as
# a plain transformation.
predictions = predictions.fillna(0)
predictions

In [31]:
# Write the predictions to submission.csv in the working directory, without the index column.
predictions.to_csv('submission.csv', index=False)