A few utility functions for preparing the training data of deep neural network models for the retail sales forecasting problem

In [1]:
import os
import sys
import math
import itertools
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Append TSPerf path to sys.path
# nb_dir is the parent of the current working directory; tsperf_dir is two
# levels above that, i.e. three levels above the working directory.
# NOTE(review): this only resolves to the TSPerf root when the kernel is started
# from the expected folder depth -- confirm against the repository layout.
nb_dir = os.path.split(os.getcwd())[0]
tsperf_dir = os.path.dirname(os.path.dirname(nb_dir))
# Guard against adding the same entry again when the cell is re-run
if tsperf_dir not in sys.path:
    sys.path.append(tsperf_dir)

# Benchmark settings; this notebook reads TRAIN_START_WEEK, TRAIN_END_WEEK_LIST
# and TEST_END_WEEK_LIST from it.
import retail_sales.OrangeJuice_Pt_3Weeks_Weekly.common.benchmark_settings as bs

In [3]:
# Data paths (relative to the notebook's working directory)
DATA_DIR = '../data'
TRAIN_DIR = os.path.join(DATA_DIR, 'train')

# Parameters of the model
# PRED_HORIZON is subtracted from the training range when the last usable
# time step is computed below (presumably expressed in weeks -- TODO confirm).
PRED_HORIZON = 3
# NOTE(review): PRED_STEPS is not referenced by any cell visible in this notebook.
PRED_STEPS = 2
# SEQ_LEN is passed as `seq_len`: the number of consecutive rows in each input sequence.
SEQ_LEN = 8

## DNN Utility Functions

In [4]:
def df_from_cartesian_product(dict_in):
    """Build a dataframe whose rows are the Cartesian product of several lists.

    Args:
        dict_in (Dictionary): Maps each column name to the list of values it can take

    Returns:
        df (Dataframe): One row per combination of values; the columns are the
            dictionary keys in sorted order
    """
    # Sort the keys so that the column order does not depend on insertion order
    col_names = sorted(dict_in)
    value_lists = [dict_in[name] for name in col_names]
    combos = list(itertools.product(*value_lists))
    return pd.DataFrame(combos, columns=col_names)

def gen_sequence(df, seq_len, seq_cols, start_timestep=0, end_timestep=None):
    """Yield sliding windows of feature values, each of dimension (time steps, features).

    Consecutive windows are shifted by one row. The first window starts at row
    `start_timestep` and the last window ends at row `end_timestep` (inclusive).

    Args:
        df (Dataframe): Time series data of a specific (store, brand) combination
        seq_len (Integer): The number of previous time series values to use as input features
        seq_cols (List): A list of names of the feature columns
        start_timestep (Integer): First time step you can use to create feature sequences
        end_timestep (Integer): Last time step (inclusive row index) you can use to create
            feature sequences. Defaults to the last row of `df`.

    Returns:
        A generator object for iterating all the feature sequences
    """
    data_array = df[seq_cols].values
    if end_timestep is None:
        # end_timestep is an inclusive row index, so the default has to be the
        # index of the last row; using the row count would make the final
        # window run past the data and come out one row short.
        end_timestep = df.shape[0] - 1
    # Window i covers rows [start, start + seq_len); the last admissible start is
    # end_timestep - seq_len + 1, hence the "+ 2" in the exclusive range bound.
    for start in range(start_timestep, end_timestep - seq_len + 2):
        yield data_array[start:start + seq_len, :]
        
def gen_sequence_array(df_all, seq_len, seq_cols, start_timestep=0, end_timestep=None):
    """Combine feature sequences for all the combinations of (store, brand) into a 3d array.

    Combinations are visited in the order of itertools.product over the unique
    store values and the unique brand values (stores vary slowest).

    Args:
        df_all (Dataframe): Time series data of all stores and brands
        seq_len (Integer): The number of previous time series values to use as input features
        seq_cols (List): A list of names of the feature columns
        start_timestep (Integer): First time step you can use to create feature sequences
        end_timestep (Integer): Last time step you can use to create feature sequences

    Returns:
        seq_array (Numpy Array): A float32 array of the feature sequences of all stores and brands

    Raises:
        ValueError: Raised by np.concatenate if no combination produces any sequence
    """
    store_ids = df_all['store'].unique()
    brand_ids = df_all['brand'].unique()
    seq_lists = []
    for cur_store, cur_brand in itertools.product(store_ids, brand_ids):
        df_cur = df_all[(df_all['store'] == cur_store) & (df_all['brand'] == cur_brand)]
        cur_seqs = list(gen_sequence(df_cur, seq_len, seq_cols, start_timestep, end_timestep))
        # A (store, brand) pair that never occurs in df_all, or has too few rows,
        # yields no sequences; an empty list would break np.concatenate with a
        # dimension mismatch, so such pairs are skipped.
        if cur_seqs:
            seq_lists.append(cur_seqs)
    seq_array = np.concatenate(seq_lists).astype(np.float32)
    return seq_array

def static_feature_array(df_all, total_timesteps, seq_cols):
    """Generate an array which encodes all the static features.

    For every (store, brand) group, in sorted key order, the first
    `total_timesteps` rows are kept and the requested columns are extracted.

    Args:
        df_all (Dataframe): Time series data of all stores and brands
        total_timesteps (Integer): Total number of training samples for each store and brand
        seq_cols (List): A list of names of the static feature columns (e.g., store index)

    Returns:
        fea_array (Numpy Array): An array of static features of all stores and brands
    """
    # Use the dataframe passed in by the caller rather than a notebook-level global.
    # Iterating over the groups (instead of groupby.apply) keeps the grouping
    # columns in each piece regardless of the pandas version.
    pieces = [grp.iloc[:total_timesteps, :] for _, grp in df_all.groupby(['store', 'brand'])]
    if not pieces:
        # No groups at all (empty input): return an empty array with the right columns
        return df_all[seq_cols].iloc[:0].values
    fea_df = pd.concat(pieces).reset_index(drop=True)
    fea_array = fea_df[seq_cols].values
    return fea_array

def normalize_dataframe(df, seq_cols, scaler=None):
    """Normalize a subset of columns of a dataframe.

    Args:
        df (Dataframe): Input dataframe
        seq_cols (List): A list of names of columns to be normalized
        scaler (Scaler): A scikit learn scaler object exposing fit_transform.
            If None, a new MinMaxScaler is created for this call.

    Returns:
        df_scaled (Dataframe): Normalized dataframe; the untouched columns come first
            (in the sorted order produced by Index.difference), followed by the scaled columns
        scaler (Scaler): The scaler after fitting on df[seq_cols]
    """
    if scaler is None:
        # Create the scaler per call: a scaler instance used as a default argument
        # would be shared by every call, so a later call would refit (and silently
        # change) the scaler object returned by an earlier one.
        scaler = MinMaxScaler()
    cols_fixed = df.columns.difference(seq_cols)
    df_scaled = pd.DataFrame(scaler.fit_transform(df[seq_cols]),
                             columns=seq_cols, index=df.index)
    df_scaled = pd.concat([df[cols_fixed], df_scaled], axis=1)
    return df_scaled, scaler

## Test the functions

In [5]:
r = 0
print('---- Round ' + str(r+1) + ' ----')
# Load training data
train_df = pd.read_csv(os.path.join(TRAIN_DIR, 'train_round_'+str(r+1)+'.csv'))
# Exponentiate 'logmove' and round to the nearest integer to obtain 'move'
train_df['move'] = train_df['logmove'].apply(lambda x: round(math.exp(x)))
# Reassign instead of dropping in place so that re-running the cell stays well-defined
train_df = train_df.drop('logmove', axis=1)
print(train_df.head(3))
# Fill missing values
store_list = train_df['store'].unique()
brand_list = train_df['brand'].unique()
# The week grid runs from the first training week through the test end week of this round
week_list = range(bs.TRAIN_START_WEEK, bs.TEST_END_WEEK_LIST[r]+1)
d = {'store': store_list,
     'brand': brand_list,
     'week': week_list}        
data_grid = df_from_cartesian_product(d)
# Left-join onto the full (store, brand, week) grid; absent weeks show up as NaN rows
data_filled = pd.merge(data_grid, train_df, how='left', 
                        on=['store', 'brand', 'week'])
print('Number of missing rows is {}'.format(data_filled[data_filled.isnull().any(axis=1)].shape[0]))
# Forward-fill, then backward-fill, within each (store, brand) series.
# ffill()/bfill() replace the deprecated fillna(method=...), and transform on the
# non-key columns keeps the original index and the key columns intact across
# pandas versions.
key_cols = ['store', 'brand']
value_cols = [col for col in data_filled.columns if col not in key_cols]
data_filled[value_cols] = data_filled.groupby(key_cols)[value_cols]. \
                                      transform(lambda x: x.ffill().bfill())
print(data_filled.head(3))
# Select subset of the data
data_sub = data_filled[(data_filled.brand==1) & (data_filled.store==2)]
print(data_sub[['store', 'brand', 'week', 'deal', 'feat', 'move']])

---- Round 1 ----
   store  brand  week  constant    price1    price2    price3    price4  \
0      2      1    40         1  0.060469  0.060497  0.042031  0.029531   
1      2      1    46         1  0.060469  0.060312  0.045156  0.046719   
2      2      1    47         1  0.060469  0.060312  0.045156  0.046719   

     price5    price6    price7    price8    price9   price10   price11  deal  \
0  0.049531  0.053021  0.038906  0.041406  0.028906  0.024844  0.038984     1   
1  0.049531  0.047813  0.045781  0.027969  0.042969  0.042031  0.038984     0   
2  0.037344  0.053021  0.045781  0.041406  0.048125  0.032656  0.038984     0   

   feat     profit  move  
0   0.0  37.992326  8256  
1   0.0  30.126667  6144  
2   0.0  30.000000  3840  
Number of missing rows is 6204
   brand  store  week  constant    price1    price2    price3    price4  \
0      1      2    40       1.0  0.060469  0.060497  0.042031  0.029531   
1      1      2    41       1.0  0.060469  0.060497  0.042031  0.02

In [6]:
# Create feature sequence array based on data_sub
start_timestep = 0
# Last usable row index: length of the training range minus the prediction horizon
end_timestep = bs.TRAIN_END_WEEK_LIST[r]-bs.TRAIN_START_WEEK-PRED_HORIZON
train_input1 = gen_sequence_array(data_sub, SEQ_LEN, ['move'], start_timestep, end_timestep)
# Shape is (number of sequences, SEQ_LEN, number of feature columns)
train_input1.shape

(86, 8, 1)

In [7]:
# Inspect the sequences generated for the single (store, brand) series.
# NOTE(review): this displays the whole array; consider showing only a slice.
train_input1

array([[[ 8256.],
        [ 8256.],
        [ 8256.],
        [ 8256.],
        [ 8256.],
        [ 8256.],
        [ 6144.],
        [ 3840.]],

       [[ 8256.],
        [ 8256.],
        [ 8256.],
        [ 8256.],
        [ 8256.],
        [ 6144.],
        [ 3840.],
        [ 8000.]],

       [[ 8256.],
        [ 8256.],
        [ 8256.],
        [ 8256.],
        [ 6144.],
        [ 3840.],
        [ 8000.],
        [ 8000.]],

       [[ 8256.],
        [ 8256.],
        [ 8256.],
        [ 6144.],
        [ 3840.],
        [ 8000.],
        [ 8000.],
        [ 8896.]],

       [[ 8256.],
        [ 8256.],
        [ 6144.],
        [ 3840.],
        [ 8000.],
        [ 8000.],
        [ 8896.],
        [ 7168.]],

       [[ 8256.],
        [ 6144.],
        [ 3840.],
        [ 8000.],
        [ 8000.],
        [ 8896.],
        [ 7168.],
        [10880.]],

       [[ 6144.],
        [ 3840.],
        [ 8000.],
        [ 8000.],
        [ 8896.],
        [ 7168.],
        [10880.]

In [8]:
# Create feature sequence array based on data_filled
%time
train_input1 = gen_sequence_array(data_filled, SEQ_LEN, ['move'], start_timestep, end_timestep)
train_input1.shape

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 6.91 µs


(78518, 8, 1)

In [9]:
# Create array of static features 
# Rows kept per (store, brand): this equals end_timestep - SEQ_LEN + 2, i.e. the
# number of sequences gen_sequence yields per series when start_timestep is 0.
total_timesteps = bs.TRAIN_END_WEEK_LIST[r]-bs.TRAIN_START_WEEK-SEQ_LEN-PRED_HORIZON+2
train_input2 = static_feature_array(data_filled, total_timesteps, ['store', 'brand'])
# Shape is (number of rows kept over all groups, number of static feature columns)
train_input2.shape

(78518, 2)

In [10]:
# Inspect the static feature array; columns follow the order ['store', 'brand']
train_input2

array([[  2,   1],
       [  2,   1],
       [  2,   1],
       ...,
       [137,  11],
       [137,  11],
       [137,  11]])

In [11]:
# Test scaler
# Only 'profit' and 'move' are scaled; 'store', 'brand' and 'week' are passed through unchanged
df_scaled, scaler = normalize_dataframe(data_sub[['store', 'brand', 'week', 'profit', 'move']], ['profit', 'move'])
df_scaled

Unnamed: 0,brand,store,week,profit,move
0,1,2,40,0.730845,0.115086
1,1,2,41,0.730845,0.115086
2,1,2,42,0.730845,0.115086
3,1,2,43,0.730845,0.115086
4,1,2,44,0.730845,0.115086
5,1,2,45,0.730845,0.115086
6,1,2,46,0.572949,0.063764
7,1,2,47,0.570406,0.007776
8,1,2,48,0.569402,0.108865
9,1,2,49,0.569402,0.108865


In [12]:
# Report the smallest and largest scaled 'move' values as a sanity check
print('Minimum move after scaling is {}'.format(min(df_scaled['move'])))
print('Maximum move after scaling is {}'.format(max(df_scaled['move'])))

Minimum move after scaling is 0.0
Maximum move after scaling is 1.0


In [13]:
# Per-column data range stored on the fitted scaler, in the order ['profit', 'move']
scaler.data_range_

array([   49.81538462, 41152.        ])