In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the three M5 input tables from the Kaggle competition directory.
m5_dir = '/kaggle/input/m5-forecasting-accuracy/'
calendar = pd.read_csv(m5_dir + 'calendar.csv')
sell_prices = pd.read_csv(m5_dir + 'sell_prices.csv')
sales = pd.read_csv(m5_dir + 'sales_train_validation.csv')

Description of the variables:

**calendar**
* date: The date in a “y-m-d” format.<br>
* wm_yr_wk: The id of the week the date belongs to.<br>
* weekday: The type of the day (Saturday, Sunday, …, Friday).<br>
* wday: The id of the weekday, starting from Saturday.<br>
* month: The month of the date.<br>
* year: The year of the date.<br>
* event_name_1: If the date includes an event, the name of this event.<br>
* event_type_1: If the date includes an event, the type of this event.<br>
* event_name_2: If the date includes a second event, the name of this event.<br>
* event_type_2: If the date includes a second event, the type of this event.<br>
* snap_CA, snap_TX, and snap_WI: A binary variable (0 or 1) indicating whether the stores of CA,
TX or WI allow SNAP purchases on the examined date. 1 indicates that SNAP purchases are
allowed.<br>

**sell_prices**
* store_id: The id of the store where the product is sold.<br>
* item_id: The id of the product.<br>
* wm_yr_wk: The id of the week.<br>
* sell_price: The price of the product for the given week/store. The price is provided per week
(average across seven days). If not available, this means that the product was not sold during the
examined week. Note that although prices are constant at weekly basis, they may change through
time (both training and test set). <br>

**sales_train_validation** 
* item_id: The id of the product.<br>
* dept_id: The id of the department the product belongs to.<br>
* cat_id: The id of the category the product belongs to.<br>
* store_id: The id of the store where the product is sold.<br>
* state_id: The State where the store is located.<br>
* d_1, d_2, …, d_i, … d_1913: The number of units sold at day i, starting from 2011-01-29.

In [None]:
# Preview the first rows of the sales table.
sales.head()

The number of unique item_ids multiplied by the number of unique stores should equal the number of rows in the dataset.

In [None]:
# Sanity check: (#distinct items) x (#distinct stores) should match the row count of `sales`.
# The item count must come from the 'item_id' column of `sales`; counting the
# literal list ['item_id'] would always give 1.
unique_items = sales['item_id'].nunique()
num_stores = sales['store_id'].nunique()
unique_items * num_stores, sales.shape

# Persistence forecast
This is a very bad forecast that simply persists the last value forward in time. (lb = 1.16344)

In [None]:
# Persistence forecast: hold each series' value on d_1913 constant across 28 columns.
final_day_sales = sales['d_1913'].to_numpy()
persist = np.tile(final_day_sales[:, np.newaxis], (1, 28))

In [None]:
# History of the first series followed by its 28-step persistence forecast,
# zoomed in on the end of the history.
history_axis = np.arange(0,1913)
forecast_axis = np.arange(1913,1913+28)
first_series = sales.iloc[0,6:]
plt.plot(history_axis,first_series)
plt.plot(forecast_axis,persist[0])
plt.xlim(1850,1940)

# Moving average forecast
This one takes the average over a window of past days (in the code below, the entire sales history) and predicts that single value for every future day. (lb = 1.01225)

In [None]:
# Average forecast: each series' mean over all day columns, held constant across 28 columns.
series_mean = sales.iloc[:,6:].mean(axis=1).to_numpy()
ma = np.tile(series_mean[:, np.newaxis], (1, 28))

In [None]:
# History of the first series followed by its 28-step average forecast,
# zoomed in on the end of the history.
history_axis = np.arange(0,1913)
forecast_axis = np.arange(1913,1913+28)
first_series = sales.iloc[0,6:]
plt.plot(history_axis,first_series)
plt.plot(forecast_axis,ma[0])
plt.xlim(1850,1940)

# Deeper look at the data

In [None]:
# Show the first rows of `sales` again as a reminder of the column layout.
sales.head()

In [None]:
# Daily mean unit sales per category: one row per cat_id group (in the groupby's
# sorted key order), one column per day d_1..d_1913.
# The day columns are selected explicitly and averaged in a single groupby;
# `.agg` is reserved for aggregation function names, not column names.
day_cols = ['d_{}'.format(i) for i in np.arange(1,1914)]
mean_by_cat = sales.groupby('cat_id')[day_cols].mean().to_numpy()

In [None]:
%matplotlib inline
plt.figure(figsize=(10,7))
plt.plot(mean_by_cat[0,:],label='foods')
plt.plot(mean_by_cat[1,:],label='hobbies')
plt.plot(mean_by_cat[2,:],label='household')
plt.legend()
plt.show()

Every year there is a day when sales drop to zero, that is Christmas. There is also a smaller dip about a month before that representing Thanksgiving.

In [None]:
# Look up the calendar row for day d_1427.
calendar.loc[calendar['d'] == 'd_1427']

Let's look at sales over these categories each year.

In [None]:
# Daily category means arranged as (category, day-within-window, window) for five
# consecutive 365-day windows starting at d_1.
# The second axis has exactly 365 entries, so no unfilled padding slot ends up
# as a spurious zero in the line plots and boxplots that read this array.
days_per_window = 365
n_windows = 5
window_cols = ['d_{}'.format(i) for i in range(1, days_per_window*n_windows + 1)]
# Explicit column selection + a single groupby mean gives shape (n_categories, 1825).
daily_means = sales.groupby('cat_id')[window_cols].mean().to_numpy()
# Column w*365 + d holds day d of window w: split that axis, then swap it to (day, window).
mean_by_cat = (
    daily_means
    .reshape(daily_means.shape[0], n_windows, days_per_window)
    .transpose(0, 2, 1)
)

In [None]:
%matplotlib inline
plt.figure(figsize=(15,7))
plt.subplot(1,2,1)
plt.plot(mean_by_cat[0,:,0],label='year1')
plt.plot(mean_by_cat[0,:,1],label='year2')
plt.plot(mean_by_cat[0,:,2],label='year3')
plt.plot(mean_by_cat[0,:,3],label='year4')
plt.plot(mean_by_cat[0,:,4],label='year5')
plt.legend()
plt.xlabel('day')
plt.ylabel('food sales')
plt.subplot(1,2,2)
plt.boxplot((mean_by_cat[0,:,0],mean_by_cat[0,:,1],mean_by_cat[0,:,2],mean_by_cat[0,:,3],mean_by_cat[0,:,4]))
plt.xlabel('year')
plt.show()

In [None]:
%matplotlib inline
plt.figure(figsize=(15,7))
plt.subplot(1,2,1)
plt.plot(mean_by_cat[1,:,0],label='year1')
plt.plot(mean_by_cat[1,:,1],label='year2')
plt.plot(mean_by_cat[1,:,2],label='year3')
plt.plot(mean_by_cat[1,:,3],label='year4')
plt.plot(mean_by_cat[1,:,4],label='year5')
plt.legend()
plt.xlabel('day')
plt.ylabel('hobby sales')
plt.subplot(1,2,2)
plt.boxplot((mean_by_cat[1,:,0],mean_by_cat[1,:,1],mean_by_cat[1,:,2],mean_by_cat[1,:,3],mean_by_cat[1,:,4]))
plt.xlabel('year')
plt.show()

In [None]:
%matplotlib inline
plt.figure(figsize=(15,7))
plt.subplot(1,2,1)
plt.plot(mean_by_cat[2,:,0],label='year1')
plt.plot(mean_by_cat[2,:,1],label='year2')
plt.plot(mean_by_cat[2,:,2],label='year3')
plt.plot(mean_by_cat[2,:,3],label='year4')
plt.plot(mean_by_cat[2,:,4],label='year5')
plt.legend()
plt.xlabel('day')
plt.ylabel('household sales')
plt.subplot(1,2,2)
plt.boxplot((mean_by_cat[2,:,0],mean_by_cat[2,:,1],mean_by_cat[2,:,2],mean_by_cat[2,:,3],mean_by_cat[2,:,4]))
plt.xlabel('year')
plt.show()

Looks like household and hobby sales have both gone up in recent years, while food sales have stayed flat.

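Let's look at the last year for which we have complete data, roughly Christmas 2014 to Christmas 2015 (days 1427 to 1791). The code below actually uses days 1424 to 1787: exactly 364 days (52 whole weeks), so the window can be reshaped into complete weeks further down.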

In [None]:
# First and last day numbers (both inclusive) of the window analysed below.
year = [1424,1787]
window_start, window_end = year
lastyearcols = ['d_%d' % day for day in range(window_start, window_end + 1)]
id_cols = ['id','item_id','dept_id','cat_id','store_id','state_id']
# Identifier columns followed by the window's day columns.
lastyearsales = sales[id_cols + lastyearcols]

In [None]:
# Daily mean unit sales per category over the selected window: one row per cat_id
# group (sorted key order), one column per day in lastyearcols.
# A single groupby with explicit column selection; `.agg` is reserved for
# aggregation function names, not column names.
mean_by_cat = lastyearsales.groupby('cat_id')[lastyearcols].mean().to_numpy()

In [None]:
%matplotlib inline
plt.figure(figsize=(10,7))
plt.plot(mean_by_cat[0,:],label='foods')
plt.plot(mean_by_cat[1,:],label='hobbies')
plt.plot(mean_by_cat[2,:],label='household')
plt.legend()
plt.show()

There seems to be a weekly oscillation, let's check that out.

In [None]:
# Fold each category's daily series into rows of 7 consecutive days (one row per week).
n_weeks = mean_by_cat.shape[1]//7
foodweekly, hobbweekly, houseweekly = (
    mean_by_cat[row,:].reshape(n_weeks,7) for row in range(3)
)

In [None]:
%matplotlib inline
days = ['Mon.','Tues.','Wed.','Thurs.','Fri.','Sat.','Sun.']
plt.figure(figsize=(15,7))
plt.subplot(1,3,1)
plt.plot(days,foodweekly[10,:],'k')
plt.plot(days,foodweekly[30,:],'k')
plt.plot(days,foodweekly[50,:],'k')
plt.plot(days,np.mean(foodweekly,axis=0),'b')
plt.ylabel('food')
plt.subplot(1,3,2)
plt.plot(days,hobbweekly[10,:],'k')
plt.plot(days,hobbweekly[30,:],'k')
plt.plot(days,hobbweekly[50,:],'k')
plt.plot(days,np.mean(hobbweekly,axis=0),'orange')
plt.ylabel('hobbies')
plt.subplot(1,3,3)
plt.plot(days,houseweekly[10,:],'k')
plt.plot(days,houseweekly[30,:],'k')
plt.plot(days,houseweekly[50,:],'k')
plt.plot(days,np.mean(houseweekly,axis=0),'green')
plt.ylabel('household')
plt.show()

Looks like Thursday is the slowest day in every category. Food shopping peaks on Sundays, while hobbies and household peak on Saturdays.

In [None]:
# Weekly profile per category: the average week, shifted so its own mean is zero.
food_avg_week = foodweekly.mean(axis=0)
foodweeklytrend = food_avg_week - food_avg_week.mean()

hobb_avg_week = hobbweekly.mean(axis=0)
hobbweeklytrend = hobb_avg_week - hobb_avg_week.mean()

house_avg_week = houseweekly.mean(axis=0)
houseweeklytrend = house_avg_week - house_avg_week.mean()

In [None]:
# Per-series mean over the window's day columns.
# Selecting lastyearcols (rather than everything from column 6 onward) keeps the result
# unchanged if this cell is re-run after 'mean' already exists, and .assign builds a
# new frame instead of writing into a frame that was sliced out of `sales`.
lastyearsales = lastyearsales.assign(mean=lastyearsales[lastyearcols].mean(axis=1))

In [None]:
# Independent two-column lookup table: series id -> mean over the window.
lastyearmean = lastyearsales.loc[:, ['id','mean']].copy()

# Products with lots of zeros

In [None]:
%matplotlib inline
plt.figure()
plt.plot(sales[sales.id=='HOBBIES_1_002_CA_1_validation'].iloc[0,6:].values)

plt.figure()
y,x = np.histogram(sales[sales.id=='HOBBIES_1_002_CA_1_validation'].iloc[0,6:].values,5)
plt.bar(x[0:-1],y)
plt.show()

In [None]:
# Single item's daily sales as a one-column frame, plus the calendar weekday id per row.
item_mask = sales.id=='HOBBIES_1_002_CA_1_validation'
item_values = sales[item_mask].iloc[0,6:].values.astype('int64')
df_oneitem = pd.DataFrame({'sales': item_values})
# Rows of `calendar` and of df_oneitem are matched by position via their default indices.
df_oneitem['wday'] = calendar['wday'].iloc[0:df_oneitem.shape[0]]

In [None]:
# Sorted distinct event names in event_name_1, ignoring days without an event.
has_event = calendar['event_name_1'].notna()
np.unique(calendar.loc[has_event, 'event_name_1'])

In [None]:
# First 1913 calendar rows. Taken as an explicit copy because new columns are
# added to days_df afterwards, and those writes should go to an independent frame
# rather than to a slice of `calendar`.
days_df = calendar.iloc[0:1913].copy()

In [None]:
def makeholidayvars(holiday_name,event_name,interval,df):
    """Add a 0/1 column flagging the rows around every occurrence of an event.

    Parameters
    ----------
    holiday_name : str
        Name of the indicator column to create (or overwrite) on ``df``.
    event_name : str
        Event to look for in ``df.event_name_1`` and ``df.event_name_2``.
    interval : sequence of two ints
        ``[backdays, forwarddays]``: how many rows before and after each
        matching row are flagged. The matching row itself is always flagged.
    df : pandas.DataFrame
        Frame with ``event_name_1`` and ``event_name_2`` columns. The window is
        counted in rows, so rows are assumed to be consecutive days in order.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``holiday_name`` added in place.

    Notes
    -----
    Windows that would extend before the first row or past the last row are
    truncated to the rows that exist.
    """
    backdays, forwarddays = interval

    # Boolean mask of rows where either event column names the requested event.
    is_event = ((df.event_name_1 == event_name) |
                (df.event_name_2 == event_name)).fillna(False).to_numpy(dtype=bool)
    event_rows = np.flatnonzero(is_event)

    # For each event row, the positions -backdays .. +forwarddays around it.
    # Integer arithmetic only, so no float rounding can shift a position.
    offsets = np.arange(-backdays, forwarddays + 1)
    window_rows = (event_rows[:, np.newaxis] + offsets).ravel()
    window_rows = window_rows[(window_rows >= 0) & (window_rows < df.shape[0])]

    flags = np.zeros(df.shape[0], dtype=int)
    flags[window_rows] = 1

    # One whole-column assignment; chained indexing (df[col][rows] = 1) is not
    # guaranteed by pandas to write back into df.
    df[holiday_name] = flags
    return df

In [None]:
# (indicator column, calendar event name, [rows before, rows after]) per holiday window.
holiday_windows = [
    ('Chanukah','Chanukah End',[10,1]),
    ('ChristmasSeason','Christmas',[30,7]),
]
for column_name, calendar_event, window in holiday_windows:
    days_df = makeholidayvars(column_name,calendar_event,window,days_df)

In [None]:
# Show every row flagged as part of the Christmas season.
days_df.loc[days_df['ChristmasSeason'] == 1]

# Submit

In [None]:
# Load the submission template; later cells use its 'id' column and fill its F1..F28 columns.
sample_submission = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sample_submission.csv')

In [None]:
# Split the template by the category substring in each id and attach the
# per-series 'mean'. The inner merge keeps only ids that exist in lastyearmean.
submission_ids = sample_submission['id']

foodrows = sample_submission[submission_ids.str.contains('FOOD')].merge(
    lastyearmean, on=['id'], how='inner')
hobbrows = sample_submission[submission_ids.str.contains('HOBB')].merge(
    lastyearmean, on=['id'], how='inner')
houserows = sample_submission[submission_ids.str.contains('HOUSE')].merge(
    lastyearmean, on=['id'], how='inner')

Since both the household and hobby means went up the last couple of years, let's boost them a bit.

In [None]:
# Raise the hobby and household baselines by a factor of 1.1; food is left unchanged.
for boosted_rows in (hobbrows, houserows):
    boosted_rows['mean'] = boosted_rows['mean']*1.1

In [None]:
# Forecast columns F1..F28.
cols = ['F%d' % step for step in range(1, 29)]
# Fill every row with its category's 7-value weekly profile repeated 4 times (28 values).
for cat_rows, weekly_profile in (
    (foodrows, foodweeklytrend),
    (hobbrows, hobbweeklytrend),
    (houserows, houseweeklytrend),
):
    cat_rows[cols] = np.tile(weekly_profile,(cat_rows.shape[0],4))

In [None]:
# Stack the three per-category frames back into a single frame (row-wise).
allrows = pd.concat((foodrows,hobbrows,houserows), axis=0)

In [None]:
# Swap the template's own F-columns for the ones in allrows (which also carries 'mean').
# The left merge keeps every template id; ids missing from allrows get NaN.
sample_submission = (
    sample_submission
    .drop(columns=cols)
    .merge(allrows,how='left',on='id')
)

In [None]:
# Inspect the merged frame: the F-columns hold the weekly profile, 'mean' the per-series baseline.
sample_submission.head()

In [None]:
# Final forecast = per-series baseline ('mean') + weekly profile, added row by row.
# The profile is an additive deviation around zero, so series with a small baseline
# can end up below zero; unit sales cannot be negative, so clip at 0.
# NOTE: this cell is not idempotent - re-running it adds 'mean' a second time.
sample_submission[cols] = (
    sample_submission[cols]
    .add(list(sample_submission['mean'].values),axis='rows')
    .clip(lower=0)
)

In [None]:
# Inspect the F-columns after the per-series mean has been added.
sample_submission.head()

In [None]:
# Copy every non-id column of the first 30490 rows onto the remaining rows.
# presumably rows 30490+ are the evaluation ids in the same series order - verify against the template
n_first_block = 30490
first_block_values = sample_submission.iloc[:n_first_block,1:].values
sample_submission.iloc[n_first_block:,1:] = first_block_values

In [None]:
# 'mean' was only a helper column; remove it before writing the file.
sample_submission = sample_submission.drop(columns='mean')

In [None]:
# Write the submission to the working directory, without the row index.
sample_submission.to_csv('submit_Latimer.csv',index=False)