In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline 

from hts.hierarchy import HierarchyTree
from hts import HTSRegressor
from statsmodels.tsa.exponential_smoothing.ets import ETSModel

In [2]:
sales = pd.read_csv('/Users/user/hts-forecast/volume/data/raw/sales_train_evaluation.csv')
cal = pd.read_csv('/Users/user/hts-forecast/volume/data/raw/calendar.csv')

In [3]:
sales.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0


In [4]:
#take the dataframe and only present the relevant category
def filter_aggregate_sales(frame, column_to_filter, column_value_to_filter, column_to_aggregate):
    frame = frame[frame[column_to_filter] == column_value_to_filter]
    frame = frame.groupby(column_to_aggregate).sum().T
    return frame

In [110]:
filter_aggregate_sales(frame=sales, column_to_filter='state_id', column_value_to_filter='CA', column_to_aggregate='state_id')

state_id,CA
d_1,14195
d_2,13805
d_3,10108
d_4,11047
d_5,9925
...,...
d_1937,15678
d_1938,16297
d_1939,17430
d_1940,23103


In [5]:
#merge sales with calendar 
def merge_pd(frame1, frame2, merge_index):
    mergedframe = pd.merge(frame1, frame2, on = merge_index, how = 'inner')
    mergedframe.date = pd.to_datetime(mergedframe.date)
    mergedframe = mergedframe.drop(['wm_yr_wk', 'weekday', 'year', 'd', 'event_name_1', 'event_name_2', 
                                      'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'wday', 'month',
                                   'event_type_1'], axis='columns')
    return mergedframe

In [121]:
#Create hierarchical dataframe by summing for each category or sub-category
def summedlist(frame, col_to_filter):
    for state in frame[col_to_filter].unique():
        states[state] = filter_aggregate_sales(frame=frame, column_to_filter=col_to_filter, column_value_to_filter=state,
                                           column_to_aggregate=col_to_filter)[state]
    return states

In [122]:
list_of_cols = ['state_id', 'store_id', 'cat_id', 'dept_id'] 
type(list_of_cols)

list

In [125]:
test = summedlist(sales, list_of_cols)
test

AttributeError: 'DataFrame' object has no attribute 'unique'

In [114]:
states = filter_aggregate_sales(frame=sales, column_to_filter='state_id', column_value_to_filter='CA',
                                   column_to_aggregate='state_id')
states['CA']

d_1       14195
d_2       13805
d_3       10108
d_4       11047
d_5        9925
          ...  
d_1937    15678
d_1938    16297
d_1939    17430
d_1940    23103
d_1941    24644
Name: CA, Length: 1941, dtype: int64

In [20]:
sales.state_id.unique()

array(['CA', 'TX', 'WI'], dtype=object)

In [147]:
#Create hierarchical dataframe
summedlistframe = pd.DataFrame()
for state in sales.state_id.unique():
    summedlistframe[state] = filter_aggregate_sales(frame=sales, column_to_filter='state_id', column_value_to_filter=state,
                                                  column_to_aggregate='state_id')[state]

for store in sales.store_id.unique():
    summedlistframe[store] = filter_aggregate_sales(frame=sales, column_to_filter='store_id', column_value_to_filter=store,
                                                    column_to_aggregate='store_id')[store]  

for category in sales.cat_id.unique():
    summedlistframe[category] = filter_aggregate_sales(frame=sales, column_to_filter='cat_id', column_value_to_filter=category,
                                                       column_to_aggregate='cat_id')[category]
for dept in sales.dept_id.unique():
    summedlistframe[dept] = filter_aggregate_sales(frame=sales, column_to_filter='dept_id', column_value_to_filter=dept,
                                                       column_to_aggregate='dept_id')[dept]
summedlistframe['total'] = summedlistframe.CA + summedlistframe.TX + summedlistframe.WI
#states2 = states.reset_index()
#states2 = states2.rename(columns={'index':'d'})
summedlistframe.head()

Unnamed: 0,CA,TX,WI,CA_1,CA_2,CA_3,CA_4,TX_1,TX_2,TX_3,...,HOUSEHOLD,FOODS,HOBBIES_1,HOBBIES_2,HOUSEHOLD_1,HOUSEHOLD_2,FOODS_1,FOODS_2,FOODS_3,total
d_1,14195,9438,8998,4337,3494,4739,1625,2556,3852,3030,...,5689,23178,3610,154,4105,1584,2343,4094,16741,32631
d_2,13805,9630,8314,4155,3046,4827,1777,2687,3937,3006,...,5634,22758,3172,185,3858,1776,2216,4209,16333,31749
d_3,10108,6778,6897,2816,2121,3785,1386,1822,2731,2225,...,3927,17174,2497,185,2827,1100,1657,3174,12343,23783
d_4,11047,7381,6984,3051,2324,4232,1440,2258,2954,2169,...,3865,18878,2531,138,2732,1133,1508,3606,13764,25412
d_5,9925,5912,3309,2630,1942,3817,1536,1694,2492,1726,...,2729,14603,1714,100,1802,927,1209,2869,10525,19146


In [148]:
summedlistframe = summedlistframe.reset_index()
summedlistframe = summedlistframe.rename({'index':'d'}, axis='columns')

In [149]:
summedlistframe.head()

Unnamed: 0,d,CA,TX,WI,CA_1,CA_2,CA_3,CA_4,TX_1,TX_2,...,HOUSEHOLD,FOODS,HOBBIES_1,HOBBIES_2,HOUSEHOLD_1,HOUSEHOLD_2,FOODS_1,FOODS_2,FOODS_3,total
0,d_1,14195,9438,8998,4337,3494,4739,1625,2556,3852,...,5689,23178,3610,154,4105,1584,2343,4094,16741,32631
1,d_2,13805,9630,8314,4155,3046,4827,1777,2687,3937,...,5634,22758,3172,185,3858,1776,2216,4209,16333,31749
2,d_3,10108,6778,6897,2816,2121,3785,1386,1822,2731,...,3927,17174,2497,185,2827,1100,1657,3174,12343,23783
3,d_4,11047,7381,6984,3051,2324,4232,1440,2258,2954,...,3865,18878,2531,138,2732,1133,1508,3606,13764,25412
4,d_5,9925,5912,3309,2630,1942,3817,1536,1694,2492,...,2729,14603,1714,100,1802,927,1209,2869,10525,19146


In [150]:
#add date as the index 
combinedframe = merge_pd(frame1=summedlistframe, frame2=cal, merge_index='d')
combinedframe = combinedframe.set_index('date')
combinedframe.head()

Unnamed: 0_level_0,CA,TX,WI,CA_1,CA_2,CA_3,CA_4,TX_1,TX_2,TX_3,...,HOUSEHOLD,FOODS,HOBBIES_1,HOBBIES_2,HOUSEHOLD_1,HOUSEHOLD_2,FOODS_1,FOODS_2,FOODS_3,total
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-29,14195,9438,8998,4337,3494,4739,1625,2556,3852,3030,...,5689,23178,3610,154,4105,1584,2343,4094,16741,32631
2011-01-30,13805,9630,8314,4155,3046,4827,1777,2687,3937,3006,...,5634,22758,3172,185,3858,1776,2216,4209,16333,31749
2011-01-31,10108,6778,6897,2816,2121,3785,1386,1822,2731,2225,...,3927,17174,2497,185,2827,1100,1657,3174,12343,23783
2011-02-01,11047,7381,6984,3051,2324,4232,1440,2258,2954,2169,...,3865,18878,2531,138,2732,1133,1508,3606,13764,25412
2011-02-02,9925,5912,3309,2630,1942,3817,1536,1694,2492,1726,...,2729,14603,1714,100,1802,927,1209,2869,10525,19146


In [151]:
combinedframe.index

DatetimeIndex(['2011-01-29', '2011-01-30', '2011-01-31', '2011-02-01',
               '2011-02-02', '2011-02-03', '2011-02-04', '2011-02-05',
               '2011-02-06', '2011-02-07',
               ...
               '2016-05-13', '2016-05-14', '2016-05-15', '2016-05-16',
               '2016-05-17', '2016-05-18', '2016-05-19', '2016-05-20',
               '2016-05-21', '2016-05-22'],
              dtype='datetime64[ns]', name='date', length=1941, freq=None)

In [152]:
#Categorize the names in each col
state = sales.state_id.unique()
store = sales.store_id.unique()
dept = sales.dept_id.unique()
cat = sales.cat_id.unique()
items = sales.id.unique()

In [153]:
#create a dictionary for each category to form a tree 
total = {'total': list(state)}
state_h = {k: [v for v in store if v.startswith(k)] for k in state}
store_h = {k: [v for v in cat if v.startswith(k)] for k in store}
dept_h = {k: [v for v in dept if v.startswith(k)] for k in cat}
item_h = {k: [v for v in items if v.startswith(k)] for k in dept}

In [154]:
#create tree 
hierarchy = {**total, **state_h, **store_h, **dept_h, **item_h}
hierarchy

{'total': ['CA', 'TX', 'WI'],
 'CA': ['CA_1', 'CA_2', 'CA_3', 'CA_4'],
 'TX': ['TX_1', 'TX_2', 'TX_3'],
 'WI': ['WI_1', 'WI_2', 'WI_3'],
 'CA_1': [],
 'CA_2': [],
 'CA_3': [],
 'CA_4': [],
 'TX_1': [],
 'TX_2': [],
 'TX_3': [],
 'WI_1': [],
 'WI_2': [],
 'WI_3': [],
 'HOBBIES': ['HOBBIES_1', 'HOBBIES_2'],
 'HOUSEHOLD': ['HOUSEHOLD_1', 'HOUSEHOLD_2'],
 'FOODS': ['FOODS_1', 'FOODS_2', 'FOODS_3'],
 'HOBBIES_1': ['HOBBIES_1_001_CA_1_evaluation',
  'HOBBIES_1_002_CA_1_evaluation',
  'HOBBIES_1_003_CA_1_evaluation',
  'HOBBIES_1_004_CA_1_evaluation',
  'HOBBIES_1_005_CA_1_evaluation',
  'HOBBIES_1_006_CA_1_evaluation',
  'HOBBIES_1_007_CA_1_evaluation',
  'HOBBIES_1_008_CA_1_evaluation',
  'HOBBIES_1_009_CA_1_evaluation',
  'HOBBIES_1_010_CA_1_evaluation',
  'HOBBIES_1_011_CA_1_evaluation',
  'HOBBIES_1_012_CA_1_evaluation',
  'HOBBIES_1_013_CA_1_evaluation',
  'HOBBIES_1_014_CA_1_evaluation',
  'HOBBIES_1_015_CA_1_evaluation',
  'HOBBIES_1_016_CA_1_evaluation',
  'HOBBIES_1_017_CA_1_evaluat

In [83]:
#create hierarchical tree
ht = HierarchyTree.from_nodes(nodes=hierarchy, df=combinedframe)

In [102]:
ht.children

[- CA
    |- CA_1
    |- CA_2
    |- CA_3
    - CA_4,
 - TX
    |- TX_1
    |- TX_2
    - TX_3,
 - WI
    |- WI_1
    |- WI_2
    - WI_3]

In [85]:
ht.children[0].children[2]

- CA_3

In [86]:
#fit to HTSregressor 
regressor = HTSRegressor(model='holt_winters', revision_method='OLS', n_jobs=1)
model = regressor.fit(combinedframe, hierarchy)

Fitting models: 100%|██████████| 5/5 [00:02<00:00,  2.12it/s]


In [87]:
prediction = model.predict()

Fitting models: 100%|██████████| 5/5 [00:01<00:00,  3.80it/s]


TypeError: unsupported operand type(s) for +: 'Timestamp' and 'NoneType'