In [1]:
import gc
import pandas as pd
import numpy as np
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import warnings
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin' # model visualisation -> 
# add the binaries to path! 
warnings.filterwarnings('ignore')
from tensorflow.keras.callbacks import TensorBoard
tensorboard_callback = TensorBoard(log_dir="./logs")
import keras
from keras.layers import Dense, Flatten
from keras.layers.embeddings import Embedding
from keras.layers import concatenate, Input
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Model
from keras.utils.vis_utils import plot_model
from sklearn.preprocessing import LabelEncoder
import timeit

In [2]:
# set / double check for cuDNN enabled device 

from tensorflow.python.client import device_lib

device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 8772992878236577955
 xla_global_id: -1,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 7804551168
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 6686604937781567478
 physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3080, pci bus id: 0000:01:00.0, compute capability: 8.6"
 xla_global_id: 416903419]

In [3]:
# util functions


"""
1- load_to_memory : : input filepaths to return data. Main source is CSVs will use pd.DataFrame types.
                    : this method also executes a log(x) + 1 transform to the unit_sales.
                    : predictions are done on this basis later to be transformed by the inverse of
                    : this function. 
                    : type conversions are also executed to data


2- sales_and_promotion_table : : restructuring and setting up the raw data to extract features from Returns 3 dfs: 
                               
                               : Y feature vector set up schema: Populated by log(sales) + 1 figures
                               : multilvl index of store_nbr|item_nbr
                               : all dates are the columns
                               
                                                     DATE-1 | DATE-1 | DATE-1 | ...
                               -------------------------------------------------------
                               |store_nbr|item_nbr||   
                                    1        1     |
                                             2     |
                                             3     |
                                             .     |
                                             .     |
                                             .     |
                                    2        1     |
                                    
                                    
                                    
                               : X feature vector set up:
                               : Promotion Look up: used to mine promotion features. Features that look ahead, and behind.
                            
    
3- sales_meta : : put in store meta_data ... main output is the first position output. This will be the dataframe that the
                  get_features() function will mainly extract features from:
                  
                  store_nbr| item_nbr| date | unit_sales | day | day_of_week | is_no_sale |family | class | city | state |type | cluster
                  
                  
                  
                  
4- get_features : :

              #############################:  basic stats of sales figures store_nbr & item_nbr pair :
              last                         : "sales value of day prior given store_nbr & item_nbr pair" 
              mean_3                       : "mean sales value of the last 3 days given store_nbr & item_nbr pair"
              mean_7                       : "mean sales value of the last 7 days given store_nbr & item_nbr pair"
              mean_14                      : "mean sales value of the last 14 days given store_nbr & item_nbr pair"
              mean_28                      : "mean sales value of the last 28 days given store_nbr & item_nbr pair"
              mean_60                      : "mean sales value of the last 60 days given store_nbr & item_nbr pair"
              mean_90                      : "mean sales value of the last 90 days given store_nbr & item_nbr pair"
              mean_365                     : "mean sales value of the last 365 days given store_nbr & item_nbr pair"
              mean_diff_7_28               : "variance of mean 7 - 28 given store_nbr & item_nbr pair"
              mean_diff_14_60              : "variance of mean 14 - 60 given store_nbr & item_nbr pair"
              mean_diff_28_90              : "variance of mean 28 - 90 given store_nbr & item_nbr pair"
              mean_no_sale_in_7            : "no sale mean count - 7 days given store_nbr & item_nbr pair"
              mean_no_sale_in_28           : "no sale mean count - 28 days given store_nbr & item_nbr pair"
              mean_no_sale_in_90           : "no sale mean count - 90 days given store_nbr & item_nbr pair"
              mean_no_sale_diff_7_28       : "variance of no sale mean count - 7 - 28 days given store_nbr & item_nbr pair"
              mean_no_sale_diff_28_90      : "variance of no sale mean count - 28 - 90 days given store_nbr & item_nbr pair"
              
              #############################:  promotion stats & day specific sales features given store_nbr & item_nbr pair :
              avg_promo_7                  : "avg number of days an item was on promo in the last 7 days given store_nbr & item_nbr pair"      
              avg_promo_14                 : "avg number of days an item was on promo in the last 14 days given store_nbr & item_nbr pair"
              avg_promo_28                 : "avg number of days an item was on promo in the last 28 days given store_nbr & item_nbr pair"
              avg_promo_90                 : "avg number of days an item was on promo in the last 90 days given store_nbr & item_nbr pair"
              avg_promo_365                : "avg number of days an item was on promo in the last 365 days given store_nbr & item_nbr pair"
              mean_day_of_week_7           : "mean sales for day of the wk in last 7 days given store_nbr & item_nbr pair"
              mean_day_of_week_14          : "mean sales for day of the wk in last 14 days given store_nbr & item_nbr pair"
              mean_day_of_week_21          : "mean sales for day of the wk in last 21 days given store_nbr & item_nbr pair"
              mean_day_of_week_28          : "mean sales for day of the wk in last 28 days given store_nbr & item_nbr pair"
              mean_day_in_90               : "mean sales for day of the month in last 90 days given store_nbr & item_nbr pair"
              mean_day_in_365              : "mean sales for day of the month in last 365 days given store_nbr & item_nbr pair"
              promo                        : "looking forward -> promo states of next 16 given store_nbr & item_nbr pair"
              
              #############################:  metadata given store_nbr & item_nbr pair :
              family                       
              class
              perishable
              city
              state
              type
              cluster
              day_of_week
              day


5- get_y : : returns target values, i.e. sales

6- nwrmsle : : given 2 vectors returns Normalized Weighted Root Mean Squared Logarithmic Error

7- _features : : returns set of features to feed into model. This function also returns features that are 'day' sensitive.
                 eg: given we are predicting for the 17th of a Monday, features are retrieved for Mondays and 17ths of the month.

8- get_model : : returns the keras implementation of the graph  

"""

def load_to_memory(filepath:str,is_test = False):
    
    if not is_test:
        
        data = pd.read_csv(filepath,
                       dtype={ 
                              'date': str,
                              'store_nbr': int,
                              'item':int,
                              'unit_sales':'float32'
                       })
        
        data.onpromotion = data.onpromotion.fillna(0).astype(int)
        
        # transform unit_sales if sales>0 else clip to 0 (this operation turns all return activity to 0 units sold)
        data.unit_sales = data.unit_sales.apply(lambda x: np.log1p(float(x)) if float(x) > 0 else 0)
        data.unit_sales = data.unit_sales.astype('float32')
        data = data.drop(['id'], axis=1)
    if is_test:
        
        data = pd.read_csv(filepath, usecols=[0, 1, 2, 3, 4],
                       dtype={ 
                              'date': str,
                              'store_nbr': int,
                              'item':int 
                       })
        data.onpromotion = data.onpromotion.fillna(0).astype(int)
        

    # date column dtype str -> datetime64
    data.date = pd.to_datetime(data.date, format='%Y-%m-%d')
    # convert bool to int
    data.onpromotion = data.onpromotion.astype('int')
    # the promotions are recorded starting from April - 2014 ... we ignore training samples prior to period
    return data[data.date >= pd.to_datetime('2014-04-01', format='%Y-%m-%d')]
    

def sales_and_promotion_table(_train_stack, _test_stack, key_array: list) -> tuple:
    
    # sales restructure dataframe to host multindex & time series nature from 2014-04 till end of data & fixing missing values if any
    sales = _train_stack.set_index(key_array).unit_sales.unstack().fillna(0)
    # create a feature set
    sales_features = sales.stack().rename('unit_sales').reset_index()
    # check and fill empty dates, will be used for labels
    sales = sales.reindex(pd.date_range('2014-04-01', '2017-08-15'), axis=1).fillna(0)
    # promotions 
    training_p = _train_stack.set_index(key_array)[["onpromotion"]].unstack().fillna(0)
    test_p = _test_stack.set_index(key_array)[["onpromotion"]].unstack().fillna(0)
    # restructure the level of DF to have columns of DF in top level
    training_p.columns = training_p.columns.get_level_values(1)
    test_p.columns = test_p.columns.get_level_values(1)
    # reindex
    test_p = test_p.reindex(training_p.index).fillna(0)
    # putting promotional data of train and test together to look ahead for poromotions while predicting & fix missing values if any
    promotions = pd.concat([training_p, test_p], axis=1).reindex(pd.date_range('2014-04-01', '2017-08-31'), axis=1).fillna(0)
    
    return sales, sales_features, promotions

def sales_meta(df, item_doc_dir : str, store_doc_dir : str) -> tuple :
    
    # load memory
    
    items = pd.read_csv(item_doc_dir,
                    dtype={'item_nbr': 'int32', 
                           'class': int, 
                           'perishable': int})


    stores = pd.read_csv(store_doc_dir,
                    dtype={'store_nbr': int,
                           'cluster': int})
    
    df['day'] = df.date.dt.day
    df['day_of_week'] = df.date.dt.dayofweek
    df['is_no_sale'] = (df.unit_sales==0).astype(int) # turn bool to int
    df = pd.merge(pd.merge(df, items.drop('perishable', axis=1)), stores)
    
    return df, pd.merge(pd.merge(df[['store_nbr', 'item_nbr']].drop_duplicates(), items), stores)    


def get_features(feature_date):
    
    # feature sets
    #####################################################################################


    # get last sales (key: store_nbr & item_nbr) --- look behind a day from prediction day
    temp_last_sales = sales_set_x[(sales_set_x.date >= (pd.to_datetime(feature_date) - pd.Timedelta(days=1))) &
                       (sales_set_x.date <= (pd.to_datetime(feature_date) - pd.Timedelta(days=1)))]
    temp_last_sales = temp_last_sales.groupby(['store_nbr', 'item_nbr'])['unit_sales'].agg(['last']).reset_index()


    # get avg last sales (key: store_nbr & item_nbr) --- look 3 | 7 | 14 | 28 | 90 | 365 days behind prediction day

    for i in [3,7,14,28,60,90,365]:

        temp_avg_run = sales_set_x[(sales_set_x.date >= (pd.to_datetime(feature_date) - pd.Timedelta(days=i))) &
                       (sales_set_x.date <= (pd.to_datetime(feature_date) - pd.Timedelta(days=1)))].groupby(['store_nbr', 'item_nbr'])['unit_sales'].agg(['mean']).reset_index()
        temp_avg_run.columns = ['store_nbr', 'item_nbr', 'mean_{}'.format(i)]
        temp_last_sales = pd.merge(temp_last_sales, temp_avg_run)

    print('DONE: last sales and avg sales calculated for {}'.format(feature_date))

    # difference in means for different wks 

    temp_last_sales['mean_diff_7_28'] = temp_last_sales.mean_7 - temp_last_sales.mean_28
    temp_last_sales['mean_diff_14_60'] = temp_last_sales.mean_14 - temp_last_sales.mean_60
    temp_last_sales['mean_diff_28_90'] = temp_last_sales.mean_28 - temp_last_sales.mean_90

    print('DONE: avg difference in sales calculated for {}'.format(feature_date))


    # mean sales by day_of_week

    for i in [7,14,21,28]:


        temp_avg_run = sales_set_x[(sales_set_x.date >= (pd.to_datetime(feature_date) - pd.Timedelta(days=i))) &
                       (sales_set_x.date <= (pd.to_datetime(feature_date) - pd.Timedelta(days=1)))].groupby(['store_nbr', 'item_nbr', 'day_of_week'])['unit_sales'].agg(['mean']).reset_index()
        temp_avg_run.columns = ['store_nbr', 'item_nbr', 'day_of_week', 'mean_day_of_week_{}_'.format(i)]
        # stack by day of week to avoid the categorical in the fields
        temp_avg_run = temp_avg_run.set_index(['store_nbr', 'item_nbr', 'day_of_week']).unstack()
        temp_avg_run.columns = temp_avg_run.columns.get_level_values(0) + temp_avg_run.columns.get_level_values(1).astype(str)
        temp_avg_run = temp_avg_run.reset_index()
        temp_last_sales = pd.merge(temp_last_sales, temp_avg_run)


    print('DONE: avg sales for each day of week calculated for {}'.format(feature_date))


    # mean sales by day of the wk -  per quarter (90) and year (365) 

    for i in [90,365]:


        temp_avg_run = sales_set_x[(sales_set_x.date >= (pd.to_datetime(feature_date) - pd.Timedelta(days=i))) &
                       (sales_set_x.date <= (pd.to_datetime(feature_date) - pd.Timedelta(days=1)))].groupby(['store_nbr', 'item_nbr', 'day'])['unit_sales'].agg(['mean']).reset_index()
        temp_avg_run.columns = ['store_nbr', 'item_nbr', 'day', 'mean_day_in_{}_'.format(i)]
        # stack by day of week to avoid the categorical in the fields
        temp_avg_run = temp_avg_run.set_index(['store_nbr', 'item_nbr', 'day']).unstack()
        temp_avg_run.columns = temp_avg_run.columns.get_level_values(0) + temp_avg_run.columns.get_level_values(1).astype(str)
        temp_avg_run = temp_avg_run.reset_index()
        temp_last_sales = pd.merge(temp_last_sales, temp_avg_run)

    print('DONE: avg sales for each day calculated for a quarter and year for {}'.format(feature_date))


    # mean of zero sales 

    for i in [7, 28, 90]:


        temp_avg_run = sales_set_x[(sales_set_x.date >= (pd.to_datetime(feature_date) - pd.Timedelta(days=i))) &
                       (sales_set_x.date <= (pd.to_datetime(feature_date) - pd.Timedelta(days=1)))].groupby(['store_nbr', 'item_nbr'])['is_no_sale'].agg(['mean']).reset_index()
        temp_avg_run.columns = ['store_nbr', 'item_nbr', 'mean_no_sale_in_{}'.format(i)]
        temp_last_sales = pd.merge(temp_last_sales, temp_avg_run)

    print('DONE: avg count of item not sold in a store for {}'.format(feature_date))

    # difference in means for different wks no sales

    temp_last_sales['mean_no_sale_diff_7_28'] = temp_last_sales.mean_no_sale_in_7 - temp_last_sales.mean_no_sale_in_28
    temp_last_sales['mean_no_sale_diff_28_90'] = temp_last_sales.mean_no_sale_in_28 - temp_last_sales.mean_no_sale_in_90

    print('DONE: no sales diff in for {}'.format(feature_date))

    #####################################################################################


    # promotions

    # LOOK behind! for mean promotion data

    for i in [7, 14, 28,90, 365]:


        temp_avg_run = promotions_set[pd.date_range(pd.to_datetime(feature_date) - pd.Timedelta(days=i), periods=i, freq='D')].mean(axis=1).values
        temp_avg_run = pd.DataFrame(temp_avg_run, index=promotions_set.index).reset_index()
        temp_avg_run.columns = ['store_nbr', 'item_nbr', 'avg_promo_{}'.format(i)]
        temp_last_sales = pd.merge(temp_last_sales,temp_avg_run)

    print('DONE: past promo avgs for {}'.format(feature_date))


    # LOOK ahead! we are predicting for 16 time steps forward

    for i in range (16):

        temp_last_sales["promo_{}".format(i)] = promotions_set[pd.to_datetime(feature_date) + pd.Timedelta(days=i)].values 

    print('DONE: future promo data for {}'.format(feature_date))
    print('##############################')
    print('\n')
    print('##############################')
    return temp_last_sales


def get_y(df, _date):
    return df[pd.date_range(_date, periods=16)]   



def nwrmsle(yval, ypred, weights=None):
    
    # custom loss ... will train on custom MSE ... 
    return np.sqrt(mean_squared_error(np.log(1+yval), np.log(1+ypred), sample_weight=weights))


def _features(df0, lag, y0=None):
    
    # range we want to predict for #
    test_range = pd.date_range('2017-08-16', '2017-08-31')

    if y0 is not None:
        date = pd.to_datetime(y0.columns[lag])
    else:
        date = test_range[lag]

    # main detection features #
    df = df0[['last', 'mean_3', 'mean_7', 'mean_14', 'mean_28', 'mean_60', 'mean_90', 'mean_365',
              'mean_diff_7_28', 'mean_diff_14_60', 'mean_diff_28_90','mean_no_sale_in_7', 'mean_no_sale_in_28', 'mean_no_sale_in_90',
              'mean_no_sale_diff_7_28', 'mean_no_sale_diff_28_90','avg_promo_7', 'avg_promo_14', 'avg_promo_28', 'avg_promo_90','avg_promo_365']]
    
    # for whatever day - dayofweek we are on ... we take take that day's features #
    day = date.day
    dow = date.dayofweek    
    ###############################################################################
    df['mean_day_of_week_7']  = df0['mean_day_of_week_7_%d' % dow]
    df['mean_day_of_week_14'] = df0['mean_day_of_week_14_%d' % dow]
    df['mean_day_of_week_21'] = df0['mean_day_of_week_21_%d' % dow]
    df['mean_day_of_week_28'] = df0['mean_day_of_week_28_%d' % dow]
    df['mean_day_in_90'] = df0['mean_day_in_90_%d' % day]
    df['mean_day_in_365'] = df0['mean_day_in_365_%d' % day]
    df['promo'] = df0['promo_%d' % lag]
    df['promo_mean'] = df0[['promo_0', 'promo_1', 'promo_2', 'promo_3', 'promo_4', 'promo_5', 
                            'promo_6', 'promo_7', 'promo_8', 'promo_9', 'promo_10', 'promo_11', 
                            'promo_12', 'promo_13', 'promo_14', 'promo_15']].mean(axis=1)
    ###############################################################################
    # embedding layers #
    df['family'] = items2['family'].values
    df['class'] = items2['class'].values
    df['perishable'] = items2['perishable'].values
    df['city'] = stores2['city'].values
    df['state'] = stores2['state'].values
    df['type'] = stores2['type'].values
    df['cluster'] = stores2['cluster'].values
    df['day_of_week'] = dow
    df['day'] = day
    ###############################################################################
    df = df.reset_index()

    if y0 is not None:
        y_i = y0.iloc[:,lag].rename('y').to_frame()
        y_i['date'] = date
        y_i = y_i.reset_index().set_index(['store_nbr', 'item_nbr', 'date']).squeeze()
        return df, y_i
    else:
        return df
    
    
def get_model(input_num_shape):

    input_num = Input(shape=(input_num_shape,), dtype='float32', name='input_num')
    input_store = Input(shape=(1,), dtype='int32', name='input_store')
    input_item = Input(shape=(1,), dtype='int32', name='input_item')
    input_family = Input(shape=(1,), dtype='int32', name='input_family')
    input_city = Input(shape=(1,), dtype='int32', name='input_city')
    input_state = Input(shape=(1,), dtype='int32', name='input_state')
    input_type = Input(shape=(1,), dtype='int32', name='input_type')
    input_cluster = Input(shape=(1,), dtype='int32', name='input_cluster')
    input_dow = Input(shape=(1,), dtype='int32', name='input_dow')
    input_day = Input(shape=(1,), dtype='int32', name='input_day')

    embedding_store = Embedding(input_dim=val.reset_index()['store_nbr'].nunique(), output_dim=5, input_length=1)(input_store)
    embedding_store = Flatten()(embedding_store)
    embedding_item = Embedding(input_dim=val.reset_index()['item_nbr'].nunique(), output_dim=10, input_length=1)(input_item)
    embedding_item = Flatten()(embedding_item)
    embedding_family = Embedding(input_dim=_items['family'].nunique(), output_dim=5, input_length=1)(input_family)
    embedding_family = Flatten()(embedding_family)
    embedding_city = Embedding(input_dim=_stores['city'].nunique(), output_dim=5, input_length=1)(input_city)
    embedding_city = Flatten()(embedding_city)
    embedding_state = Embedding(input_dim=_stores['state'].nunique(), output_dim=5, input_length=1)(input_state)
    embedding_state = Flatten()(embedding_state)
    embedding_type = Embedding(input_dim=_stores['type'].nunique(), output_dim=3, input_length=1)(input_type)
    embedding_type = Flatten()(embedding_type)
    embedding_cluster = Embedding(input_dim=_stores['cluster'].nunique(), output_dim=5, input_length=1)(input_cluster)
    embedding_cluster = Flatten()(embedding_cluster)
    embedding_dow = Embedding(input_dim=7, output_dim=5, input_length=1)(input_dow)
    embedding_dow = Flatten()(embedding_dow)
    embedding_day = Embedding(input_dim=31, output_dim=5, input_length=1)(input_day)
    embedding_day = Flatten()(embedding_day)
    
    features = [input_num, embedding_store, embedding_item, embedding_family, embedding_city,
               embedding_state, embedding_type, embedding_cluster, embedding_dow, embedding_day]
    net = concatenate(features)
    net = Dense(1000, kernel_initializer = 'he_normal', activation='relu')(net)
    net = Dense(500, kernel_initializer = 'he_normal', activation='relu')(net)
    net = Dense(1, kernel_initializer = 'he_normal', activation='linear')(net)
    inputs = [input_num, input_store, input_item, input_family, input_city,
             input_state, input_type, input_cluster, input_dow, input_day]
    model = Model(inputs=inputs, outputs=[net])
    model.compile(loss='mse', optimizer='adam')

    return(model)

In [4]:
training_set = load_to_memory("../kaggle_favorita-grocery-sales/data/train.csv")
print('loded training to memory, memory use: {} GB'.format(round(sum(list(training_set.memory_usage(deep=True)))) / 1e9, 2))
test_set = load_to_memory("../kaggle_favorita-grocery-sales/data/test.csv", is_test=True)
print('loded test to memory, memory use: {} GB'.format(round(sum(list(test_set.memory_usage(deep=True)))) / 1e9, 2))


# prepare sales & promotion data to extract features from. Fixing missing values. Agree on multilvl idx.
# looking in between 2014-04-01 : 2017-08-31
idx_keys = [training_set.columns[1], training_set.columns[2], training_set.columns[0]]
sales_set_y, sales_set_x, promotions_set = sales_and_promotion_table(training_set, test_set, idx_keys)

# populate sales set with meta_data for groupby operations
# day_of_week | day | days_of_no_sale | family | class | city | state | type | cluster
# meta data lookup table for key: store_nbr | item_nbr -> 
sales_set_x, lookup = sales_meta(sales_set_x,
                                 "../kaggle_favorita-grocery-sales/data/items.csv", 
                                 "../kaggle_favorita-grocery-sales/data/stores.csv")

train_date = [pd.to_datetime('2017-07-05') - pd.Timedelta(days=7 * i) for i in range(25)]
validation_date = '2017-07-26'
test_date = '2017-08-16'
# get data ready for features
_training_x = [get_features(i) for i in train_date]
_training_lables = [get_y(sales_set_y,i) for i in train_date]
_training_x = [i.set_index(list(i.columns[[0,1]])) for i in _training_x]
_validation_x = get_features(validation_date)
_test_x = get_features(test_date)
_validation_lables = get_y(sales_set_y, validation_date)
_validation_x = _validation_x.set_index(list(_validation_x.columns[[0,1]]) )
_test_x = _test_x.set_index (list(_test_x.columns[[0,1]]) )

loded training to memory, memory use: 3.738218004 GB
loded test to memory, memory use: 0.13481856 GB
DONE: last sales and avg sales calculated for 2017-07-05 00:00:00
DONE: avg difference in sales calculated for 2017-07-05 00:00:00
DONE: avg sales for each day of week calculated for 2017-07-05 00:00:00
DONE: avg sales for each day calculated for a quarter and year for 2017-07-05 00:00:00
DONE: avg count of item not sold in a store for 2017-07-05 00:00:00
DONE: no sales diff in for 2017-07-05 00:00:00
DONE: past promo avgs for 2017-07-05 00:00:00
DONE: future promo data for 2017-07-05 00:00:00
##############################


##############################
DONE: last sales and avg sales calculated for 2017-06-28 00:00:00
DONE: avg difference in sales calculated for 2017-06-28 00:00:00
DONE: avg sales for each day of week calculated for 2017-06-28 00:00:00
DONE: avg sales for each day calculated for a quarter and year for 2017-06-28 00:00:00
DONE: avg count of item not sold in a store fo

DONE: avg count of item not sold in a store for 2017-03-29 00:00:00
DONE: no sales diff in for 2017-03-29 00:00:00
DONE: past promo avgs for 2017-03-29 00:00:00
DONE: future promo data for 2017-03-29 00:00:00
##############################


##############################
DONE: last sales and avg sales calculated for 2017-03-22 00:00:00
DONE: avg difference in sales calculated for 2017-03-22 00:00:00
DONE: avg sales for each day of week calculated for 2017-03-22 00:00:00
DONE: avg sales for each day calculated for a quarter and year for 2017-03-22 00:00:00
DONE: avg count of item not sold in a store for 2017-03-22 00:00:00
DONE: no sales diff in for 2017-03-22 00:00:00
DONE: past promo avgs for 2017-03-22 00:00:00
DONE: future promo data for 2017-03-22 00:00:00
##############################


##############################
DONE: last sales and avg sales calculated for 2017-03-15 00:00:00
DONE: avg difference in sales calculated for 2017-03-15 00:00:00
DONE: avg sales for each day of w

In [5]:
# import sup. data
_items = pd.read_csv("../kaggle_favorita-grocery-sales/data/items.csv")
_stores = pd.read_csv("../kaggle_favorita-grocery-sales/data/stores.csv")

# encode metadata .. This is important as these are the 'keys'
# these keys will later be used to find values that corresponding values that will point to the trainable embbedings  
le = LabelEncoder()
_items.family = le.fit_transform(_items.family)
_stores.city = le.fit_transform(_stores.city)
_stores.state = le.fit_transform(_stores.state)
_stores.type = le.fit_transform(_stores.type)

# look up table for meta
items2 = _items.set_index('item_nbr').reindex(_validation_x.index.get_level_values(1))
stores2 = _stores.set_index('store_nbr').reindex(_validation_x.index.get_level_values(0))

In [6]:
# training / evaluation block
pred = []
test_pred = []

# for the 16 days to be predicted
for i in range(16):

    print("/" * 50)
    print("Step %d" % (i))
    print("/" * 50)
    start_timer = timeit.default_timer()

    # get features for validation
    val, yval = _features(_validation_x, i, _validation_lables)
    # get features for testing
    test = _features(_test_x, i)

    train = []
    ytrain = []
    for j in range(len(_training_x)):
        tr, ytr = _features(_training_x[j], i,_training_lables[j])
        train.append(tr)
        ytrain.append(ytr)
    train = pd.concat(train)
    ytrain = pd.concat(ytrain)
    ########################################################################################################
    train_set = [train.drop(['store_nbr', 'item_nbr', 'family', 'class', 'city', 'state', 'type', 'cluster',
                             'day_of_week', 'day'], axis=1).values,
                train.store_nbr.values, train.item_nbr.values,
                train.family.values,
                train.city.values, train.state.values, train.type.values, train.cluster.values,
                train.day_of_week.values, train.day.values]
    val_set = [val.drop(['store_nbr', 'item_nbr', 'family', 'class', 'city', 'state', 'type', 'cluster',
                             'day_of_week', 'day'], axis=1).values,
                val.store_nbr.values, val.item_nbr.values,
                val.family.values,
                val.city.values, val.state.values, val.type.values, val.cluster.values,
                val.day_of_week.values, val.day.values]
    test_set = [test.drop(['store_nbr', 'item_nbr', 'family', 'class', 'city', 'state', 'type', 'cluster',
                             'day_of_week', 'day'], axis=1).values,
                test.store_nbr.values, test.item_nbr.values,
                test.family.values,
                test.city.values, test.state.values, test.type.values, test.cluster.values,
                test.day_of_week.values, test.day.values]

    gc.collect()
    ########################################################################################################

    model = get_model(train.shape[1]-10) # -10 due to 10 embedding features / shape is 30 for Coles interview

    earlyStopping=EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='auto')
    checkpointer = ModelCheckpoint(filepath="tmp/weights.h5", verbose=0, save_best_only=True,
                               save_weights_only=True)

    model.fit(train_set, np.log(1+ytrain.values),
            validation_data=(val_set, np.log(1+yval.values)),
            epochs=16, batch_size=512, verbose=1, callbacks=[tensorboard_callback,earlyStopping, checkpointer])

    model.load_weights('tmp/weights.h5')
    pred.append(np.exp(model.predict(val_set))-1)
    test_pred.append(np.exp(model.predict(test_set))-1)

    print('nwrmsle %.5f' % nwrmsle(yval.values, pred[-1], weights=val.perishable.values*0.25+1))
    print('time elapsed', timeit.default_timer()-start_timer)

//////////////////////////////////////////////////
Step 0
//////////////////////////////////////////////////
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 00011: early stopping
nwrmsle 0.27576
time elapsed 750.9870422999998
//////////////////////////////////////////////////
Step 1
//////////////////////////////////////////////////
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 00006: early stopping
nwrmsle 0.29772
time elapsed 390.28740149999976
//////////////////////////////////////////////////
Step 2
//////////////////////////////////////////////////
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 00006: early stopping
nwrmsle 0.29702
time elapsed 411.9650907
//////////////////////////////////////////////////
Step 3
//////////////////////////////////////////////////
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
E

Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 00008: early stopping
nwrmsle 0.30967
time elapsed 572.4697794000003
//////////////////////////////////////////////////
Step 9
//////////////////////////////////////////////////
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 00005: early stopping
nwrmsle 0.30309
time elapsed 360.6782394999991
//////////////////////////////////////////////////
Step 10
//////////////////////////////////////////////////
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 00007: early stopping
nwrmsle 0.30474
time elapsed 502.9193885000004
//////////////////////////////////////////////////
Step 11
//////////////////////////////////////////////////
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 00009: early stopping
nwrmsle 0.30615
time elapsed 639.8959723999997
//////////////////////////////////////////////////
Step 12
///////////////

In [7]:
# validation
nwrmsle(np.expm1(np.array(pred).transpose().squeeze()), np.expm1(_validation_lables.values), weights=items2.perishable.values*0.25+1)

0.5939973266858016

In [8]:
np.expm1(np.array(test_pred).transpose().squeeze())

array([[ 0.22505914,  0.16737652,  0.29552162, ...,  0.13525485,
         0.19792454,  0.26601425],
       [ 0.29353425,  0.2980149 ,  0.39999387, ...,  0.28511173,
         0.38257676,  0.30394757],
       [ 1.1943672 ,  0.9658501 ,  1.0325513 , ...,  0.9617865 ,
         1.0536716 ,  0.89230216],
       ...,
       [11.330066  , 11.726071  ,  9.059416  , ...,  8.43863   ,
         6.0986266 ,  4.4603643 ],
       [ 0.5755954 ,  0.33560085,  0.5506027 , ...,  0.36428055,
         0.41199687,  0.49086285],
       [ 2.427124  ,  0.01861484,  0.06373626, ...,  0.07948102,
         2.189936  ,  0.09063169]], dtype=float32)