In [None]:
import numpy as np
import pandas as pd
import pylab as pl

%matplotlib inline

# Locations of the three input CSV files.
filename_train = 'train.csv'
filename_test = 'test.csv'
filename_store = 'store.csv'

# Read every file the same way: first row is the header, and the whole file
# is parsed in one pass (low_memory=False).
train, test, store = [
    pd.read_csv(csv_path, header=0, low_memory=False)
    for csv_path in (filename_train, filename_test, filename_store)
]



In [None]:
# Turn the 'Date' column of both sales tables into datetime values.
for sales_frame in (train, test):
    sales_frame['Date'] = pd.to_datetime(sales_frame['Date'])


In [None]:
def print_missing_stats(datasets=None):
    """Print how many values are missing in every column of every dataset.

    Parameters
    ----------
    datasets : dict of {str: pandas.DataFrame}, optional
        Mapping from a display name to the frame to inspect. When omitted,
        the notebook-level ``train``, ``test`` and ``store`` frames are used
        (they are looked up at call time).

    For each frame a header line with its row count is printed, followed by
    one line per column giving the number and the whole-number percentage of
    null entries, followed by a blank line.
    """
    if datasets is None:
        datasets = {'TRAIN': train, 'TEST': test, 'STORE': store}
    for data_name, data in datasets.items():
        total = len(data)
        print(data_name, ' (overall = %d)' % total)
        for attribute in data.columns:
            # Summing the boolean null mask counts the missing entries
            # without building an intermediate list.
            k = int(data[attribute].isnull().sum())
            # An empty frame has nothing missing; avoid dividing by zero.
            percent = 100 * k / total if total else 0
            print('%5d (%2d%%)' % (k, percent), 'missing values in ', attribute)
        print()
# Show the missing-value report for the data as currently loaded.
print_missing_stats()

In [None]:
# Fill the gaps reported by print_missing_stats().
#
# `.loc` must be indexed with square brackets as `.loc[row_mask, column]`.
# Calling it like a function (`.loc('Open')`) treats the argument as an axis
# name and raises ValueError instead of assigning anything.

# Test rows without an 'Open' flag are set to 0.
test.loc[test['Open'].isnull(), 'Open'] = 0

distance_series = store['CompetitionDistance']
since_month = store['CompetitionOpenSinceMonth']
since_year = store['CompetitionOpenSinceYear']

# Compute the fill values before any assignment so they are derived from the
# observed (non-null) entries only.
distance_fill = distance_series.mean()
# `mode()` returns a Series (there may be ties); take its first entry rather
# than converting the whole Series with int().
month_fill = int(since_month.mode().iloc[0])
year_fill = int(since_year.mode().iloc[0])

store.loc[distance_series.isnull(), 'CompetitionDistance'] = distance_fill
store.loc[since_month.isnull(), 'CompetitionOpenSinceMonth'] = month_fill
store.loc[since_year.isnull(), 'CompetitionOpenSinceYear'] = year_fill

# Report again to confirm the filled columns no longer contain nulls.
print_missing_stats()

In [None]:
from matplotlib.dates import MonthLocator
# One panel per day of the week: the sales series of every open store, with
# the across-store mean and median per date drawn on top.
fig, axes = pl.subplots(nrows=7, ncols=1, sharey=True, figsize=(20,100))

for day_of_week in range(1, 8):
    ax = axes[day_of_week - 1]
    custom_df = train[(train['Open'] == 1) & (train['DayOfWeek'] == day_of_week)]
    gp_store = custom_df.groupby('Store')

    # The loop variable is deliberately NOT called `store`: that name holds
    # the store-metadata DataFrame and must not be overwritten by a group key.
    for store_id, group in gp_store:
        ax.plot(group['Date'], group['Sales'], 'v--')

    gp_date = custom_df.groupby('Date')

    ts_mean = gp_date['Sales'].mean()
    ts_median = gp_date['Sales'].median()
    ts_mean.plot(style='r-', linewidth=5, ax=ax, label='mean')
    ts_median.plot(style='b-', linewidth=5, ax=ax, label='median')

    ax.set_title('Day ' + str(day_of_week) + '. number of stores = ' + str(len(gp_store)))
    ax.legend()
    ax.xaxis.set_major_locator(MonthLocator())
    ax.grid(True)

In [None]:
def construct_label_name(school_holiday, state_holiday, promo_flag, n_stores):
    """Build the legend label for one holiday/promo combination.

    school_holiday : 1 when the school-holiday flag is set, otherwise 0.
    state_holiday  : '0' for none, or one of 'a', 'b', 'c'
                     (any other value raises KeyError).
    promo_flag     : truthy when the promo flag is set.
    n_stores       : number of stores in the subset, shown in parentheses.
    """
    string_school = 'SchoolHoliday. ' if school_holiday == 1 else 'NO SchoolHoliday. '
    string_promo = 'Promo. ' if promo_flag else 'NO Promo. '
    if state_holiday != '0':
        string_state = {'a': 'PublicHoliday. ', 'b': 'EasterHoliday. ', 'c':'Christmas. '}[state_holiday]
    else:
        string_state = 'NO StateHoliday. '
    return string_school + string_state + string_promo + '(' + str(n_stores) + ')'


# One panel per day of the week: mean sales per date for every combination of
# school holiday, state holiday and promo, plus the overall mean for that day.
fig, axes = pl.subplots(nrows=7, ncols=1, sharey=True, figsize=(20,100))
for day_of_week in range(1, 8):
    ax = axes[day_of_week - 1]
    # The open/weekday filter is the same for every combination below, so it
    # is applied once per day instead of once per combination.
    day_df = train[(train['Open'] == 1) & (train['DayOfWeek'] == day_of_week)]

    for school_holiday in [0, 1]:
        for state_holiday in ['0', 'a', 'b', 'c']:
            for promo_flag in [0, 1]:
                custom_df = day_df[(day_df['Promo'] == promo_flag) &
                                   (day_df['SchoolHoliday'] == school_holiday) &
                                   (day_df['StateHoliday'] == state_holiday)]

                gp_date = custom_df.groupby('Date')
                gp_store = custom_df.groupby('Store')

                # Combinations with no matching rows are skipped silently.
                # `ngroups` answers "is there anything to plot?" without
                # materialising every group.
                if gp_date.ngroups:
                    ts_mean = gp_date['Sales'].mean()
                    ts_mean.plot(style='v--', ax=ax,
                                 label=construct_label_name(school_holiday, state_holiday, promo_flag, len(gp_store)))

    # Overall mean for the day, drawn after the per-combination curves.
    custom_df = day_df
    gp_date = custom_df.groupby('Date')
    gp_store = custom_df.groupby('Store')
    ts_mean = gp_date['Sales'].mean()
    ts_mean.plot(style='r-', linewidth=1.5, ax=ax, label='mean (' + str(len(gp_store)) + ')')
    ax.set_title('Day ' + str(day_of_week))
    ax.legend()
    ax.xaxis.set_major_locator(MonthLocator())
    ax.grid(True)
                