In [None]:
%matplotlib inline

# Experiment with ipython notebooks on Kaggle (work in progress)

I have not (yet) succeeded in getting my notebook onto Kaggle. Have a look here:
[https://gist.github.com/cast42/bcfd70b919e6648c2b58](https://gist.github.com/cast42/bcfd70b919e6648c2b58)

In [None]:
import pandas as pd
import numpy as np
import datetime
import random

In [None]:
# Show the pandas version reported by pip (shell escape; needs pip and grep on the PATH).
!pip freeze | grep pandas

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def build_features(features, data):
    """Add engineered columns to ``data`` in place and record their names.

    Parameters
    ----------
    features : list
        Extended in place with the names of the columns meant as model inputs.
    data : pandas.DataFrame
        Sales rows already merged with the store table. ``Date`` must be a
        datetime column. The frame itself is modified.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    # A missing Open flag means "assume open". This must run before the blanket
    # fillna(0) below, which would otherwise mark those rows as closed.
    if 'Open' in data.columns:
        data.loc[data['Open'].isnull(), 'Open'] = 1
    # Every remaining missing value becomes 0.
    data.fillna(0, inplace=True)

    # Use some properties directly
    features.extend(['Store', 'CompetitionDistance', 'Promo', 'Promo2', 'SchoolHoliday'])

    # Label encode some features. Values that are not keys of the mapping
    # (e.g. the integer 0 written by fillna) pass through unchanged.
    categorical = ['StoreType', 'Assortment', 'StateHoliday']
    features.extend(categorical)
    mappings = {'0': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4}
    for column in categorical:
        # Assign the result back instead of calling replace(inplace=True) on a
        # column looked up from the frame, which may act on a temporary copy.
        data[column] = data[column].replace(mappings)

    # Calendar parts of the sale date.
    features.extend(['DayOfWeek', 'Month', 'Day', 'Year', 'WeekOfYear'])
    dates = data['Date'].dt
    data['Year'] = dates.year
    data['Month'] = dates.month
    data['Day'] = dates.day
    # Monday is 0 and Sunday is 6; this overwrites any existing DayOfWeek column.
    data['DayOfWeek'] = dates.dayofweek
    try:
        # Current pandas: ISO week number; cast so later arithmetic is signed.
        data['WeekOfYear'] = dates.isocalendar().week.astype('int64')
    except AttributeError:
        # Older pandas without isocalendar().
        data['WeekOfYear'] = dates.weekofyear

    # CompetitionOpen and PromoOpen from https://www.kaggle.com/ananya77041/rossmann-store-sales/randomforestpython/code
    # Months since the competitor opened.
    # NOTE(review): rows whose "since" year was filled with 0 get a very large
    # value here and in PromoOpen - confirm whether they should be zeroed.
    features.append('CompetitionOpen')
    data['CompetitionOpen'] = (12 * (data['Year'] - data['CompetitionOpenSinceYear'])
                               + (data['Month'] - data['CompetitionOpenSinceMonth']))
    # Months since Promo2 started (weeks / 4), floored at zero.
    features.append('PromoOpen')
    promo_open = (12 * (data['Year'] - data['Promo2SinceYear'])
                  + (data['WeekOfYear'] - data['Promo2SinceWeek']) / 4.0)
    data['PromoOpen'] = promo_open.clip(lower=0)

    # Flag rows whose month is listed in the store's PromoInterval.
    features.append('IsPromoMonth')
    # All abbreviations are English, including 'Oct' and the four-letter 'Sept'.
    month2str = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                 7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
    data['monthStr'] = data['Month'].map(month2str)
    # fillna(0) turned a missing interval into 0; normalise it to an empty string.
    data.loc[data['PromoInterval'] == 0, 'PromoInterval'] = ''
    data['IsPromoMonth'] = 0
    for interval in data['PromoInterval'].unique():
        if not isinstance(interval, str) or interval == '':
            continue
        in_interval = data['PromoInterval'] == interval
        promo_months = interval.split(',')
        data.loc[in_interval & data['monthStr'].isin(promo_months), 'IsPromoMonth'] = 1

    return data

In [None]:
print("Load the training, test and store data using pandas")
# dtypes forced while parsing the train and test files
types = dict(
    CompetitionOpenSinceYear=np.dtype(int),
    CompetitionOpenSinceMonth=np.dtype(int),
    StateHoliday=np.dtype(str),
    Promo2SinceWeek=np.dtype(int),
    SchoolHoliday=np.dtype(int),
    PromoInterval=np.dtype(str),
)
# parse_dates is positional: column 2 of train.csv and column 3 of test.csv
train = pd.read_csv("../input/train.csv", dtype=types, parse_dates=[2])
test = pd.read_csv("../input/test.csv", dtype=types, parse_dates=[3])
store = pd.read_csv("../input/store.csv")

In [None]:
print("Assume store open, if not provided")
# Only the Open flag gets the "assume open" default. A frame-wide fillna(1)
# would also write 1 into any other test column that has gaps.
test['Open'] = test['Open'].fillna(1)

# print("Consider only open stores for training. Closed stores wont count into the score.")
# train = train[train["Open"] != 0]
# print("Use only Sales bigger then zero")
# train = train[train["Sales"] > 0]

print("Join with store")
# Attach the per-store attributes to every sales row.
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')

# build_features appends the model input column names to this list.
features = []

print("augment features")
train = build_features(features, train)
# A throwaway list: the names were already collected from the train call.
test = build_features([], test)
print(features)

print('training data processed')

What must be forecast? The sales per store. For what period?

In [None]:
period_start, period_end = test.Date.min(), test.Date.max()
print('From', period_start, 'to', period_end)
# Subtracting two timestamps gives a Timedelta, which already prints with its
# own "days" suffix. Report the number of calendar days, both ends included,
# as a plain integer instead.
print('That is', (period_end - period_start).days + 1, 'days')

For how many stores ?

In [None]:
# Count the distinct store ids present in the test set.
test.Store.nunique()

Let's take a random store from the training data and plot what its Sales data looks like.

In [None]:
# The store id is fixed; the commented expression shows how a random store
# could be drawn from the training data instead.
rS = 979 # rS =  random.choice(train.Store.unique())
print ('Random store number =', rS)

How many years of data do we have in the training set?

In [None]:
# Distinct calendar years in the training rows (Year is derived from Date in build_features).
train.Year.unique()

Let's look at the sales of store 979 in 2013.

In [None]:
rS = 979
# Sales rows of the chosen store for 2013, plotted against the row index.
in_store_2013 = (train.Store == rS) & (train.Year == 2013)
train.loc[in_store_2013, 'Sales'].plot(label='2013', figsize=(16, 4))
plt.title('Store {}'.format(rS))
plt.show()

We see some patterns emerge. Let's make Date the index so that we have dates on the x-axis.

In [None]:
# Guarded so the cell can be re-run: once Date is the index, the column is gone
# and a second set_index('Date') would raise a KeyError.
if train.index.name != 'Date':
    train = train.set_index('Date')
# Keep the rows chronological. Later cells pair Sales values positionally with
# ascending date ranges. The stable sort keeps the existing order within a day.
train = train.sort_index(kind='mergesort')

In [None]:
st = train[train.Store == rS]  # Select store rS
# Row selection by a year string must go through .loc; st['2013'] is read as a
# column lookup by current pandas.
st.loc['2013', 'Sales'].plot(label='2013', figsize=(17, 4), title='Store {}'.format(rS))
plt.show()

The sharp needles in the Sales that touch the zero axis are the Sundays. The reason is that on Sunday most stores are closed and have no sales. Let's check that by summing all sales on Sundays:

In [None]:
# DayOfWeek was recomputed from Date with dt.dayofweek in build_features, so 6 is Sunday (Monday is 0).
train[train.DayOfWeek==6].Sales.sum()

This should be zero. How come it's not? The reason is that some stores are occasionally open on Sunday:

In [None]:
# Explicit comparisons give a boolean mask whatever numeric dtype Open has;
# combining the raw 0/1 column with `&` depends on it being an integer column.
open_on_sunday = (train.Open == 1) & (train.DayOfWeek == 6)
salesOnSundayPerStore = train[open_on_sunday].groupby('Store')['Sales']
# One bar per store: number of rows where the store was open on a Sunday.
salesOnSundayPerStore.count().sort_values().plot(kind='barh')
plt.title('Number of sunday open per store')
plt.show()

Indeed, store number 85 was open on many Sundays:

In [None]:
# Sunday rows (DayOfWeek 6) of store 85 only.
store85_on_sunday = (train.Store == 85) & (train.DayOfWeek == 6)
train.loc[store85_on_sunday, 'Sales'].plot(figsize=(17, 4))
plt.title('Sales of store 85 on sundays')
plt.show()

Let's take a look at the sales of store 979 and search for patterns.

In [None]:
def plotStore(rS):
    """Plot the 2013, 2014 and 2015 sales of store ``rS`` as three stacked panels.

    Each year is aligned on the calendar day (its month and day mapped onto
    2015), so a store with missing days shows a gap instead of shifting every
    later value to the wrong date.

    Reads the notebook-level ``train`` frame and uses its Store, Year, Month,
    Day and Sales columns.
    """
    st = train[train.Store == rS]
    calendar = pd.date_range('1/1/2015', periods=365, freq='D')
    per_year = {}
    for year in (2013, 2014, 2015):
        rows = st[st.Year == year]
        # Re-date every row to the same day in 2015 so the years share one axis.
        same_day = pd.to_datetime(
            ['2015-%02d-%02d' % (m, d) for m, d in zip(rows.Month, rows.Day)])
        # Days without a row for this store and year become NaN.
        per_year[str(year)] = pd.Series(rows.Sales.values, index=same_day).reindex(calendar)

    df_plot = pd.DataFrame(per_year, index=calendar, columns=['2013', '2014', '2015'])
    df_plot.plot(subplots=True, figsize=(18, 6), title='Sales at store {}'.format(rS))
    plt.show()
plotStore(979)

From the chart above, our task is clear. We have to predict how the red curve continues for 48 days, starting from the first of August up to and including 17 September. We can also spot some patterns. Peaks occur at the beginning of every month. The second week has rather constant sales. At the beginning of the third week we again see a peak, although a bit smaller than the one at the beginning of the month. The reason for these patterns is probably paycheck days, which typically fall at the beginning or in the middle of the month. Also, in 2014 and 2015 we see a big peak at the beginning of July, but not in 2013. Maybe a lot of Germans got extra holiday money on their paycheck in July 2014 and 2015? Let's check another store.

In [None]:
# Same per-year view for a second store.
rS = 1013  # rS =  random.choice(train.Store.unique())
plotStore(rS)

Store 1013 has no extra big peak at the beginning of July. Let's check another store.

In [None]:
# Store 85 is the store singled out earlier for its many Sunday openings.
rS =  85 #random.choice(train.Store.unique())
plotStore(rS)

Store 85 looks different. Remember store 85? It's the store that is open on Sundays a lot. Let's check another store that is often open on Sunday: store 769.

In [None]:
# Per-year sales panels for store 769.
plotStore(769)

We are lucky because neither store 85 nor store 769 has to be predicted, so we can ignore them.
The other stores still have to be checked later.


# First Prediction
Sales look like a rather constant repeating pattern. Let's exploit that pattern to make a prediction. The most basic assumption could be that the sales of the store in the same period one or two years ago are a good prediction for this year.

Let's take the mean of August and the first two weeks of September in 2013 and 2014 as prediction:
[http://i.imgur.com/3ii8Y0I.png](http://i.imgur.com/3ii8Y0I.png)

In [None]:
rS = 1013
# Chronological order is needed twice: for label slicing on the Date index, and
# because the values are paired positionally with ascending forecast dates.
trainStore = train[train.Store == rS].sort_index()
# .ix was removed from pandas; .loc performs the same inclusive label slice.
prevy1 = trainStore.loc['2014-08-02':'2014-09-18', 'Sales'].reset_index(drop=True)
prevy2 = trainStore.loc['2013-08-03':'2013-09-19', 'Sales'].reset_index(drop=True)
# Day-by-day mean of the two reference periods.
meanSales = np.mean(np.vstack((prevy1, prevy2)), axis=0)
df_plot = pd.DataFrame({'Prediction': meanSales},
                       index=pd.date_range('8/1/2015', periods=len(meanSales), freq='D'))
df_plot.plot(title='Prediction for store {}'.format(rS));

In [None]:
# Adapt above code so it runs on Kaggle
rS = 1013
periodym1 = train.ix['2014-08-02':'2014-09-18']
periodym2 = train.ix['2013-08-03':'2013-09-19']
prevy1 = periodym1[periodym1.Store == rS]['Sales'].reset_index(drop=True)
prevy2 = periodym2[periodym2.Store == rS]['Sales'].reset_index(drop=True)
meanSales = np.mean(np.vstack((prevy1, prevy2)), axis=0)
df_plot = pd.DataFrame(meanSales, index = pd.date_range('8/1/2015', periods=48, freq='D'))
df_plot.columns = ['Prediction']
df_plot.plot(title='Prediction for store {}'.format(rS));

Hooray, we got our first prediction! Let's make a plot to find out how our prediction looks with respect to the training data.

In [None]:
rS = 1013  # rS =  random.choice(train.Store.unique())
# Each year's sales are laid over the same 2015 calendar by position, so they
# are put in chronological order first.
storerS13 = train[(train.Store == rS) & (train.Year == 2013)].Sales.sort_index().reset_index(drop=True)
storerS14 = train[(train.Store == rS) & (train.Year == 2014)].Sales.sort_index().reset_index(drop=True)
storerS15 = train[(train.Store == rS) & (train.Year == 2015)].Sales.sort_index().reset_index(drop=True)

df_plot = pd.concat([storerS13, storerS14, storerS15], axis=1)
df_plot.columns = ['2013', '2014', '2015']
df_plot.index = pd.date_range('1/1/2015', periods=365, freq='D')
# The prediction starts on 1 August 2015 and is aligned on the date index.
df_plot['pred'] = pd.Series(meanSales,
                            index=pd.date_range('8/1/2015', periods=len(meanSales), freq='D'))
df_plot.plot(subplots=True, figsize=(18, 6), title='Sales at store {}'.format(rS))
plt.show()

Let's look at our prediction in 2015 alone:

In [None]:
def plotTrainPred(rS, pred, title=None):
    """Plot the 2015 training sales of store ``rS`` together with a prediction.

    Parameters
    ----------
    rS : int
        Store id to select from the notebook-level ``train`` frame, which must
        be indexed by date.
    pred : array-like
        Predicted daily sales; the first value is placed on 1 August 2015.
    title : str, optional
        Plot title. Falls back to a title naming the store when empty or None.
    """
    trainStore = train[train.Store == rS]
    # Fixed plotting window of 270 days starting on 1 January 2015.
    plotIndex = pd.date_range('1/1/2015', periods=270, freq='D')
    # Select 2015 through the index itself: trainStore['2015'] is read as a
    # column lookup by current pandas.
    sales2015 = trainStore.loc[trainStore.index.year == 2015, 'Sales']
    df_plot = pd.DataFrame({'2015': sales2015.reindex(plotIndex)})
    # One date per predicted value; values are taken by position.
    predIndex = pd.date_range('8/1/2015', periods=len(pred), freq='D')
    df_plot['pred'] = pd.Series(np.asarray(pred), index=predIndex)
    df_plot['2015'].plot(label='train')
    df_plot['pred'].plot(label='pred', figsize=(17, 5),
                         title=title or 'Sales at store {} in 2015'.format(rS))
    plt.legend();

plotTrainPred(1013, meanSales)

We spot two problems with our prediction. The first problem has to do with the size of our patterns. Beginning-of-month sales in 2015 peak between 6000 and 7000. Our prediction has peaks between 12000 and 14000. That looks like a scaling problem. The second problem is that the sales of store 1013 are anticyclical in 2013 with respect to 2014. The result is that the two-week pattern in our prediction is gone! Before we tackle those problems, let's check another store.

In [None]:
rS = 344
# .ix was removed from pandas; .loc performs the same inclusive label slice on
# the Date index.
periodym1 = train.loc['2014-08-02':'2014-09-18']
periodym2 = train.loc['2013-08-03':'2013-09-19']
# sort_index() puts the store's rows in chronological order before they are
# averaged position by position.
prevy1 = periodym1.loc[periodym1.Store == rS, 'Sales'].sort_index().reset_index(drop=True)
prevy2 = periodym2.loc[periodym2.Store == rS, 'Sales'].sort_index().reset_index(drop=True)
meanSales = np.mean(np.vstack((prevy1, prevy2)), axis=0)

# Plot for the same store the prediction was computed for.
plotTrainPred(rS, meanSales)

Same problems in store 344. But here we have to scale up. Let's check another store.

In [None]:
# The variable must be spelled rS: with a lower-case "rs" the filters below
# keep using the previous store while the plot is labelled 876.
rS = 876 # rS =  random.choice(train.Store.unique())
# .ix was removed from pandas; .loc performs the same inclusive label slice on
# the Date index.
periodym1 = train.loc['2014-08-02':'2014-09-18']
periodym2 = train.loc['2013-08-03':'2013-09-19']
# sort_index() puts the store's rows in chronological order before they are
# averaged position by position.
prevy1 = periodym1.loc[periodym1.Store == rS, 'Sales'].sort_index().reset_index(drop=True)
prevy2 = periodym2.loc[periodym2.Store == rS, 'Sales'].sort_index().reset_index(drop=True)
meanSales = np.mean(np.vstack((prevy1, prevy2)), axis=0)

# Plot for the same store the prediction was computed for.
plotTrainPred(rS, meanSales)

Store 876 is missing data in the last two weeks of July 2015. Luckily, our simple prediction model only uses data from 2013 and 2014. Iterating the code above several times with a random choice (see the comment) shows that all sales peaks differ between 2013 and 2014. Moreover, the pattern changes around 1 August 2014. The last week of July 2014 is a peak, but the first week of August 2014 is too! Let's check some other stores:

In [None]:
rS = 265 # random.choice(train.Store.unique())
# .ix was removed from pandas; .loc performs the same inclusive label slice on
# the Date index.
periodym1 = train.loc['2014-08-02':'2014-09-18']
periodym2 = train.loc['2013-08-03':'2013-09-19']
# sort_index() puts the store's rows in chronological order before they are
# averaged position by position.
prevy1 = periodym1.loc[periodym1.Store == rS, 'Sales'].sort_index().reset_index(drop=True)
prevy2 = periodym2.loc[periodym2.Store == rS, 'Sales'].sort_index().reset_index(drop=True)
meanSales = np.mean(np.vstack((prevy1, prevy2)), axis=0)

# Each year's sales are laid over the same 2015 calendar by position, so they
# are put in chronological order first.
storerS13 = train[(train.Store == rS) & (train.Year == 2013)].Sales.sort_index().reset_index(drop=True)
storerS14 = train[(train.Store == rS) & (train.Year == 2014)].Sales.sort_index().reset_index(drop=True)
storerS15 = train[(train.Store == rS) & (train.Year == 2015)].Sales.sort_index().reset_index(drop=True)

df_plot = pd.concat([storerS13, storerS14, storerS15], axis=1)
df_plot.columns = ['2013', '2014', '2015']
df_plot.index = pd.date_range('1/1/2015', periods=365, freq='D')
# The prediction starts on 1 August 2015 and is aligned on the date index.
df_plot['pred'] = pd.Series(meanSales,
                            index=pd.date_range('8/1/2015', periods=len(meanSales), freq='D'))
df_plot.plot(subplots=True, figsize=(18, 6), title='Sales at store {}'.format(rS))
plt.show()

Let's see if there's a monthly pattern:

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(16, 6), sharey=True)
# One panel per year: mean sales over all stores by day of the month.
for axis, year in zip(ax, (2013, 2014, 2015)):
    # Select the year through the Date index; train['2013'] is read as a column
    # lookup by current pandas.
    train_year = train[train.index.year == year]
    train_year.groupby(train_year.index.day)['Sales'].mean().plot(
        label=str(year), ax=axis,
        title='Monthly pattern of sales in {}'.format(year))
    axis.legend(loc='upper center')

There is some pattern in 2015, with clear peaks around the beginning and the end of the month. In 2013 and 2014 the pattern does not show up in the average. That is probably because the phase of the pattern changed during those years. Here's the two-week pattern in 2013, 2014 and 2015:

In [None]:
# Mean sales over all stores by position in a 14-day cycle, one line per year.
for year in (2013, 2014, 2015):
    # Select the year through the Date index; train['2013'] is read as a column
    # lookup by current pandas.
    train_year = train[train.index.year == year]
    train_year.groupby(train_year.index.dayofyear % 14)['Sales'].mean().plot(label=str(year))
plt.legend(loc='lower left');
plt.title('14 days pattern of sales in 2013/14/15');

Let's shift the green 2014 pattern 1 day to the left and the blue 2013 pattern 2 days to the left:

In [None]:
# Offset added to the day of the year before taking it modulo 14:
# +12 moves the 2013 curve two days to the left, +13 moves 2014 one day.
cycle_offsets = {2013: 12, 2014: 13, 2015: 0}
for year in (2013, 2014, 2015):
    # Select the year through the Date index; train['2013'] is read as a column
    # lookup by current pandas.
    train_year = train[train.index.year == year]
    cycle_day = (train_year.index.dayofyear + cycle_offsets[year]) % 14
    train_year.groupby(cycle_day)['Sales'].mean().plot(label=str(year))
plt.legend(loc='lower left');
plt.title('14 days pattern of sales in 2013/14/15');

There you have it. In 2013 sales tend to peak on the second Monday. In 2014 the peak is on the first Monday.
2015 is in between, probably because the pattern switched during the year and the peaks are averaging out.

Also, store 265 (and a lot of other stores that I checked) switches pattern around the end of
July 2014 and the beginning of August 2014. Let's use that for our prediction.
Let's assume that the change in August happens only in 2014 and not in 2015.
So we must start our prediction with a low week.
We can do that by taking the 2014 data starting 7 days after the first Saturday of August:
[http://i.imgur.com/GrERfoZ.png](http://i.imgur.com/GrERfoZ.png)

In [None]:
rS = 660 # random.choice(train.Store.unique())
# The 2014 window starts 7 days later than in the earlier cells.
# .ix was removed from pandas; .loc performs the same inclusive label slice on
# the Date index.
periodym1 = train.loc['2014-08-09':'2014-09-25']
periodym2 = train.loc['2013-08-03':'2013-09-19']
# sort_index() puts the store's rows in chronological order before they are
# averaged position by position.
prevy1 = periodym1.loc[periodym1.Store == rS, 'Sales'].sort_index().reset_index(drop=True)
prevy2 = periodym2.loc[periodym2.Store == rS, 'Sales'].sort_index().reset_index(drop=True)
meanSales = np.mean(np.vstack((prevy1, prevy2)), axis=0)

# Each year's sales are laid over the same 2015 calendar by position, so they
# are put in chronological order first.
storerS13 = train[(train.Store == rS) & (train.Year == 2013)].Sales.sort_index().reset_index(drop=True)
storerS14 = train[(train.Store == rS) & (train.Year == 2014)].Sales.sort_index().reset_index(drop=True)
storerS15 = train[(train.Store == rS) & (train.Year == 2015)].Sales.sort_index().reset_index(drop=True)

df_plot = pd.concat([storerS13, storerS14, storerS15], axis=1)
df_plot.columns = ['2013', '2014', '2015']
df_plot.index = pd.date_range('1/1/2015', periods=365, freq='D')
# The prediction starts on 1 August 2015 and is aligned on the date index.
df_plot['pred'] = pd.Series(meanSales,
                            index=pd.date_range('8/1/2015', periods=len(meanSales), freq='D'))
df_plot.plot(subplots=True, figsize=(18, 6), title='Sales at store {}'.format(rS))
plt.show()

In [None]:
# .ix was removed from pandas; .loc performs the same inclusive label slice on
# the Date index.
periodym1 = train.loc['2014-08-09':'2014-09-25']
# Rows of store 660 inside that window.
periodym1[periodym1.Store == 660]

In [None]:
# Earliest index value (Date) among the training rows of store 600.
# NOTE(review): the neighbouring cells inspect store 660 - confirm that 600 is intended here.
train[train.Store==600].index.min()

In [None]:
# Latest index value (Date) among the training rows of store 600.
# NOTE(review): the neighbouring cells inspect store 660 - confirm that 600 is intended here.
train[train.Store==600].index.max()

In [None]:
from IPython.display import Image
# With url= the output is an <img> link that the reader's browser fetches.
# A bare address as first argument makes the kernel download and embed the
# file, which needs network access from the kernel.
Image(url='http://i.imgur.com/GrERfoZ.png')

[http://i.imgur.com/GrERfoZ.png](http://i.imgur.com/GrERfoZ.png)

can't display this image


In [None]:
# Uses whichever store id rS was last set to in an earlier cell.
# .ix was removed from pandas; .loc performs the same inclusive label slice on
# the Date index.
periodym1 = train.loc['2014-08-02':'2014-09-18']
periodym2 = train.loc['2013-08-03':'2013-09-19']
# sort_index() puts the store's rows in chronological order before they are
# paired positionally with the ascending forecast dates.
prevy1 = periodym1.loc[periodym1.Store == rS, 'Sales'].sort_index().reset_index(drop=True)
prevy2 = periodym2.loc[periodym2.Store == rS, 'Sales'].sort_index().reset_index(drop=True)
# Day-by-day mean of the two reference periods.
meanSales = np.mean(np.vstack((prevy1, prevy2)), axis=0)
df_plot = pd.DataFrame({'Prediction': meanSales},
                       index=pd.date_range('8/1/2015', periods=len(meanSales), freq='D'))
df_plot.plot(title='Prediction for store {}'.format(rS));