In [3]:
from datetime import datetime
from hts import HTSRegressor
from hts.utilities.load_data import load_hierarchical_sine_data
from pathlib import Path
import warnings
import numpy as np
import pandas as pd
# NOTE(review): this silences every warning for the whole notebook, including
# deprecation warnings from pandas/numpy that are relevant to later cells.
warnings.filterwarnings('ignore')

In [6]:
# Parent of the current working directory; the CSV paths below are built from it.
pwd_parent_path = Path.cwd().parent

In [9]:
# Load the raw sales table and discard the identifier columns that the
# store/category level analysis below does not use.
train = (
    pd.read_csv(str(pwd_parent_path) + "/data/sales_train_validation.csv")
    .drop(columns=['item_id', 'id', 'dept_id'])
)

In [41]:
# Preview the freshly loaded table.
train.head()

Unnamed: 0,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [42]:
# Prefix each category with its store so that category keys are unique across
# the hierarchy (e.g. HOBBIES sold in store CA_1 becomes CA_1_HOBBIES).
# Rows that already carry their store prefix are left alone, so re-running this
# cell does not stack the prefix a second time (CA_1_CA_1_HOBBIES).
store_prefix = train['store_id'] + '_'
already_prefixed = np.array(
    [cat.startswith(prefix) for cat, prefix in zip(train['cat_id'], store_prefix)],
    dtype=bool,
)
train['cat_id'] = np.where(already_prefixed, train['cat_id'], store_prefix + train['cat_id'])

In [43]:
# Check that cat_id now carries the store prefix.
train.head()

Unnamed: 0,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,CA_1_HOBBIES,CA_1,CA,0,0,0,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,CA_1_HOBBIES,CA_1,CA,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,CA_1_HOBBIES,CA_1,CA,0,0,0,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,CA_1_HOBBIES,CA_1,CA,0,0,0,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,CA_1_HOBBIES,CA_1,CA,0,0,0,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [44]:
# Collapse the item-level rows into one row per store/category key.
# numeric_only=True restricts the sum to the day columns; without it the result
# for the string id columns depends on the pandas version (dropped on older
# releases, concatenated into long strings on newer ones).
train = train.groupby('cat_id').sum(numeric_only=True).reset_index()

# Rebuild the id columns from the `<state>_<store number>_<category>` key.
key_parts = train['cat_id'].str.split('_')
train['state_id'] = key_parts.str[0]
train['store_id'] = train['state_id'] + '_' + key_parts.str[1]
train.head()

Unnamed: 0,cat_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,...,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,state_id,store_id
0,CA_1_FOODS,3239,3137,2008,2258,2032,2407,2693,3962,3418,...,4004,2886,2488,2490,2533,2960,3655,4053,CA,CA_1
1,CA_1_HOBBIES,556,498,415,392,268,467,346,769,432,...,655,542,495,363,410,527,670,714,CA,CA_1
2,CA_1_HOUSEHOLD,542,520,393,401,330,402,411,706,490,...,1295,917,810,869,766,900,1252,1346,CA,CA_1
3,CA_2_FOODS,2193,1921,1289,1540,1278,1494,1785,2385,1827,...,3729,2625,2410,2506,2261,2871,3760,4012,CA,CA_2
4,CA_2_HOBBIES,538,397,368,350,296,391,316,413,452,...,483,320,368,344,271,485,542,507,CA,CA_2


In [45]:
# Bottom-level series names (one per store/category key), in order of appearance.
base_variables = train['cat_id'].unique().tolist()

In [46]:
calendar = pd.read_csv(str(pwd_parent_path)+"/data/calendar.csv")

# Day columns of the sales table (`d_<n>`) and their numeric day ids.
day_cols = [name for name in train.columns if name.startswith('d_')]
idx = [int(name.split('d_')[1]) for name in day_cols]

# Calendar dates of the days present in the sales table; used further down as
# the time index of the transposed series.
train_date_id = pd.to_datetime(
    calendar.loc[
        calendar['d'].map(lambda label: int(label.split('d_')[1])).isin(idx),
        'date',
    ]
)

In [47]:
def transpose(column, index, day_col, data=None):
    """
    Turn the row oriented time series into a column oriented one.

    For every distinct value found in ``data[column]`` the matching rows are
    summed over the day columns, giving one time series per value.

    Args:
        column: name of the grouping column (e.g. 'state_id').
        index: time index assigned to every resulting series; it must have
            one entry per name in ``day_col``.
        day_col: list of day column names to sum.
        data: frame holding ``column`` and the day columns. Defaults to the
            notebook-level ``train`` frame, which keeps existing three-argument
            calls working.

    Returns:
        pd.DataFrame with one column per distinct value of ``column`` (in order
        of first appearance) and ``index`` as its row index.
    """
    if data is None:
        # Backward-compatible default: fall back to the notebook's global frame.
        data = train

    series_by_value = {}
    for value in data[column].unique():
        matching_rows = data[data[column] == value]
        vertical = matching_rows[day_col].sum()
        vertical.index = index
        series_by_value[value] = vertical
    return pd.DataFrame(series_by_value)
    

In [48]:
# One column per state and one column per store, indexed by calendar date.
state_ts = transpose('state_id', train_date_id, day_cols)
store_ts = transpose('store_id', train_date_id, day_cols)
# The category and department levels are not built in this notebook:
# cat_ts = transpose('cat_id', train_date_id, day_cols)
# dept_ts = transpose('dept_id', train_date_id, day_cols)

In [49]:
# Bottom level of the hierarchy. Despite the name, these are the store/category
# series keyed by the prefixed `cat_id`, not individual items.
item_ts = transpose('cat_id', train_date_id, day_cols)

In [50]:
# Put all hierarchy levels side by side: states, stores, then store/category.
# `axis` is passed by keyword; the positional form is deprecated in pandas 1.x
# and rejected by pandas 2.x.
df = pd.concat([state_ts, store_ts, item_ts], axis=1)

# Total column is the root node -- the sum of all demand across all states we
# have data on. Summing `state_ts` avoids hard-coding the state names.
df['total'] = state_ts.sum(axis=1)

df.head()

Unnamed: 0_level_0,CA,TX,WI,CA_1,CA_2,CA_3,CA_4,TX_1,TX_2,TX_3,...,WI_1_FOODS,WI_1_HOBBIES,WI_1_HOUSEHOLD,WI_2_FOODS,WI_2_HOBBIES,WI_2_HOUSEHOLD,WI_3_FOODS,WI_3_HOBBIES,WI_3_HOUSEHOLD,total
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-29,14195,9438,8998,4337,3494,4739,1625,2556,3852,3030,...,1581,615,508,1615,190,451,3028,278,732,32631
2011-01-30,13805,9630,8314,4155,3046,4827,1777,2687,3937,3006,...,1327,443,424,1433,127,362,3106,356,736,31749
2011-01-31,10108,6778,6897,2816,2121,3785,1386,1822,2731,2225,...,977,323,262,1586,113,319,2543,248,526,23783
2011-02-01,11047,7381,6984,3051,2324,4232,1440,2258,2954,2169,...,935,137,179,2013,124,385,2596,194,421,25412
2011-02-02,9925,5912,3309,2630,1942,3817,1536,1694,2492,1726,...,2,0,0,967,58,150,1854,74,204,19146


In [51]:
# Persist the wide hierarchy frame next to the notebook (current working directory).
df.to_csv('./M5_hierarchy.csv')

In [52]:
states = train.state_id.unique()
stores = train.store_id.unique()
items = train.cat_id.unique()

# Here we build the tree as a dictionary. Each node (key in the dict) maps to
# the list of its children; a child may itself be a key with children of its own.
# Children are matched on "<parent>_" rather than on the bare parent key, so a
# parent such as CA_1 can never capture the children of a longer key such as
# CA_10.
total = {'total': list(states)}
state_h = {k: [v for v in stores if v.startswith(k + '_')] for k in states}
item_h = {k: [v for v in items if v.startswith(k + '_')] for k in stores}

In [53]:
import json  # NOTE(review): not referenced by any cell visible in this notebook

# Merge the per-level child maps into a single node -> children dictionary:
# total -> states, state -> stores, store -> store/category keys.
hierarchy = {**total, **state_h, **item_h}

In [54]:
from hts.hierarchy import HierarchyTree

# Build the tree object from the node -> children mapping and the wide frame.
ht = HierarchyTree.from_nodes(nodes=hierarchy, df=df)

# Sanity check: print the key of the first child of `ht`, then that child itself.
print(ht.children[0].key)

print(ht.children[0])

CA
- CA
   |- CA_1
   |  |- CA_1_FOODS
   |  |- CA_1_HOBBIES
   |  - CA_1_HOUSEHOLD
   |- CA_2
   |  |- CA_2_FOODS
   |  |- CA_2_HOBBIES
   |  - CA_2_HOUSEHOLD
   |- CA_3
   |  |- CA_3_FOODS
   |  |- CA_3_HOBBIES
   |  - CA_3_HOUSEHOLD
   - CA_4
      |- CA_4_FOODS
      |- CA_4_HOBBIES
      - CA_4_HOUSEHOLD



In [55]:
# Preview the wide frame that is fed to the forecasters.
df.head()

Unnamed: 0_level_0,CA,TX,WI,CA_1,CA_2,CA_3,CA_4,TX_1,TX_2,TX_3,...,WI_1_FOODS,WI_1_HOBBIES,WI_1_HOUSEHOLD,WI_2_FOODS,WI_2_HOBBIES,WI_2_HOUSEHOLD,WI_3_FOODS,WI_3_HOBBIES,WI_3_HOUSEHOLD,total
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-29,14195,9438,8998,4337,3494,4739,1625,2556,3852,3030,...,1581,615,508,1615,190,451,3028,278,732,32631
2011-01-30,13805,9630,8314,4155,3046,4827,1777,2687,3937,3006,...,1327,443,424,1433,127,362,3106,356,736,31749
2011-01-31,10108,6778,6897,2816,2121,3785,1386,1822,2731,2225,...,977,323,262,1586,113,319,2543,248,526,23783
2011-02-01,11047,7381,6984,3051,2324,4232,1440,2258,2954,2169,...,935,137,179,2013,124,385,2596,194,421,25412
2011-02-02,9925,5912,3309,2630,1942,3817,1536,1694,2492,1726,...,2,0,0,967,58,150,1854,74,204,19146


In [56]:
# Uncomment to work on the first 100 rows only:
# df = df.iloc[:100,:]
df.shape

(1913, 44)

# Comparing the HTS regressor and the Auto HTS regressor

In [57]:
def train_test_data(df: pd.DataFrame):
    """Split a time-series frame chronologically into train and test parts (80:20).

    The first 80 percent of the rows (rounded down) form the training set and
    the remaining rows form the test set; row order is preserved.

    Args:
        df (pd.DataFrame): time series of all the nodes of the hierarchical
            time series, one column per node.

    Returns:
        train_data (pd.DataFrame): the first 80 percent of the rows of ``df``.
        test_data (pd.DataFrame): the remaining 20 percent of the rows of ``df``.

    Raises:
        ValueError: if ``df`` has fewer than 12 rows.
    """
    n_rows = df.shape[0]
    if n_rows < 12:
        # Previously this case fell through and returned None, which surfaced
        # as an unpacking error at the call site instead of the documented error.
        raise ValueError(
            f"The input DataFrame should have at least 12 rows, got {n_rows}"
        )

    # Built-in int(): the `np.int` alias no longer exists in current NumPy.
    train_sample_size = int(np.floor(n_rows * 0.8))
    train_data = df.iloc[:train_sample_size, :]
    test_data = df.iloc[train_sample_size:, :]
    return train_data, test_data

In [58]:
# Chronological 80:20 split of the wide hierarchy frame.
train_data , test_data = train_test_data(df)
train_data.head()

Unnamed: 0_level_0,CA,TX,WI,CA_1,CA_2,CA_3,CA_4,TX_1,TX_2,TX_3,...,WI_1_FOODS,WI_1_HOBBIES,WI_1_HOUSEHOLD,WI_2_FOODS,WI_2_HOBBIES,WI_2_HOUSEHOLD,WI_3_FOODS,WI_3_HOBBIES,WI_3_HOUSEHOLD,total
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-29,14195,9438,8998,4337,3494,4739,1625,2556,3852,3030,...,1581,615,508,1615,190,451,3028,278,732,32631
2011-01-30,13805,9630,8314,4155,3046,4827,1777,2687,3937,3006,...,1327,443,424,1433,127,362,3106,356,736,31749
2011-01-31,10108,6778,6897,2816,2121,3785,1386,1822,2731,2225,...,977,323,262,1586,113,319,2543,248,526,23783
2011-02-01,11047,7381,6984,3051,2324,4232,1440,2258,2954,2169,...,935,137,179,2013,124,385,2596,194,421,25412
2011-02-02,9925,5912,3309,2630,1942,3817,1536,1694,2492,1726,...,2,0,0,967,58,150,1854,74,204,19146


In [59]:
# Frame holding only the held-out dates, both as the `date` column and as the
# index; it is passed to the auto regressor's predict() further down.
fcst_input_data = pd.DataFrame(test_data.index).set_index('date', drop=False)
fcst_input_data

Unnamed: 0_level_0,date
date,Unnamed: 1_level_1
2015-04-08,2015-04-08
2015-04-09,2015-04-09
2015-04-10,2015-04-10
2015-04-11,2015-04-11
2015-04-12,2015-04-12
...,...
2016-04-20,2016-04-20
2016-04-21,2016-04-21
2016-04-22,2016-04-22
2016-04-23,2016-04-23


In [60]:
# Number of held-out dates, i.e. the forecast horizon used below.
fcst_input_data.shape[0]

383

In [61]:
from hts import HTSRegressor

# Baseline: HTSRegressor configured with model='prophet' and revision_method='OLS'.
# NOTE(review): n_jobs=12 is hard-coded — confirm it suits the machine running this.
clf = HTSRegressor(model='prophet', revision_method='OLS', n_jobs=12)
model = clf.fit(train_data, hierarchy)
# Forecast as many steps ahead as there are rows in the held-out period.
final_output = model.predict(steps_ahead=fcst_input_data.shape[0])

Fitting models:   0%|          | 0/44 [00:00<?, ?it/s]Importing plotly failed. Interactive plots will not work.
Importing plotly failed. Interactive plots will not work.
Importing plotly failed. Interactive plots will not work.
Importing plotly failed. Interactive plots will not work.
Importing plotly failed. Interactive plots will not work.
Importing plotly failed. Interactive plots will not work.
Importing plotly failed. Interactive plots will not work.
Importing plotly failed. Interactive plots will not work.
Importing plotly failed. Interactive plots will not work.
Importing plotly failed. Interactive plots will not work.
Importing plotly failed. Interactive plots will not work.
Importing plotly failed. Interactive plots will not work.
Fitting models: 100%|██████████| 44/44 [00:15<00:00,  2.80it/s]
Fitting models:   0%|          | 0/44 [00:00<?, ?it/s]Importing plotly failed. Interactive plots will not work.
Importing plotly failed. Interactive plots will not work.
Importing plotly

In [62]:
# Keep only the last rows, one per held-out date.
# NOTE(review): presumably predict() also returns rows for the training period,
# which is why the tail is sliced off — confirm against the hts documentation.
hts_regressor_output = final_output[-fcst_input_data.shape[0]:]
hts_regressor_output.shape

(383, 44)

In [63]:
cd /Users/karanamramachaitanya/Downloads/cltv_space/forecasting/auto_hts_package/

/Users/karanamramachaitanya/Downloads/cltv_space/forecasting/auto_hts_package


In [64]:
# Local package; the import relies on the working directory set by the previous cell.
from auto_hts import Auto_HTS_Regressor
# The regressor receives the same node -> children dictionary as the baseline.
auto_hts_instance = Auto_HTS_Regressor(hier=hierarchy)

In [65]:
# Bottom-level series names; the error comparison below iterates over these.
base_variables

['CA_1_FOODS',
 'CA_1_HOBBIES',
 'CA_1_HOUSEHOLD',
 'CA_2_FOODS',
 'CA_2_HOBBIES',
 'CA_2_HOUSEHOLD',
 'CA_3_FOODS',
 'CA_3_HOBBIES',
 'CA_3_HOUSEHOLD',
 'CA_4_FOODS',
 'CA_4_HOBBIES',
 'CA_4_HOUSEHOLD',
 'TX_1_FOODS',
 'TX_1_HOBBIES',
 'TX_1_HOUSEHOLD',
 'TX_2_FOODS',
 'TX_2_HOBBIES',
 'TX_2_HOUSEHOLD',
 'TX_3_FOODS',
 'TX_3_HOBBIES',
 'TX_3_HOUSEHOLD',
 'WI_1_FOODS',
 'WI_1_HOBBIES',
 'WI_1_HOUSEHOLD',
 'WI_2_FOODS',
 'WI_2_HOBBIES',
 'WI_2_HOUSEHOLD',
 'WI_3_FOODS',
 'WI_3_HOBBIES',
 'WI_3_HOUSEHOLD']

In [66]:
# Fit on the training split with every column as a target and no exogenous variables.
# NOTE(review): m=12 is presumably the seasonal period; the data is daily, so
# confirm that 12 is the intended value.
auto_hts_instance.fit(train_data,exogenus_variables=[],predictable_variables=list(train_data.columns),m=12)

starting ARIM Model
ending ARIMA: CA
ending ARIMA: TX
ending ARIMA: WI
ending ARIMA: CA_1
ending ARIMA: CA_2
ending ARIMA: CA_3
ending ARIMA: CA_4
ending ARIMA: TX_1
ending ARIMA: TX_2
ending ARIMA: TX_3
ending ARIMA: WI_1
ending ARIMA: WI_2
ending ARIMA: WI_3
ending ARIMA: CA_1_FOODS
ending ARIMA: CA_1_HOBBIES
ending ARIMA: CA_1_HOUSEHOLD
ending ARIMA: CA_2_FOODS
ending ARIMA: CA_2_HOBBIES
ending ARIMA: CA_2_HOUSEHOLD
ending ARIMA: CA_3_FOODS
ending ARIMA: CA_3_HOBBIES
ending ARIMA: CA_3_HOUSEHOLD
ending ARIMA: CA_4_FOODS
ending ARIMA: CA_4_HOBBIES
ending ARIMA: CA_4_HOUSEHOLD
ending ARIMA: TX_1_FOODS
ending ARIMA: TX_1_HOBBIES
ending ARIMA: TX_1_HOUSEHOLD
ending ARIMA: TX_2_FOODS
ending ARIMA: TX_2_HOBBIES
ending ARIMA: TX_2_HOUSEHOLD
ending ARIMA: TX_3_FOODS
ending ARIMA: TX_3_HOBBIES
ending ARIMA: TX_3_HOUSEHOLD
ending ARIMA: WI_1_FOODS
ending ARIMA: WI_1_HOBBIES
ending ARIMA: WI_1_HOUSEHOLD
ending ARIMA: WI_2_FOODS
ending ARIMA: WI_2_HOBBIES
ending ARIMA: WI_2_HOUSEHOLD
ending ARI

In [67]:
# Forecast for the held-out dates carried by fcst_input_data.
Auto_HTS_Regressor_output = auto_hts_instance.predict(fcst_input_data)
Auto_HTS_Regressor_output.shape

starting forecast refitting 
fcst_start_date 2015-04-08 00:00:00
fcst_end_date 2016-04-24 00:00:00
starting the refit for arima 
CA_2 without exogeneous-m-12
CA_2_FOODS without exogeneous-m-12
CA_3_HOBBIES without exogeneous-m-12
CA_4_HOBBIES without exogeneous-m-12
CA_4_HOUSEHOLD without exogeneous-m-12
TX_1_FOODS without exogeneous-m-12
WI_1_FOODS without exogeneous-m-12
WI_2_HOUSEHOLD without exogeneous-m-12
starting the refit for HWSE
starting the refit for PROP 
Initial log joint probability = -13.9334
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99       2591.39    0.00340681       149.918      0.3374           1      127   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     195       2594.93    7.4863e-08       60.6532      0.1836           1      252   
Optimization terminated normally: 
  Convergence detected: relative gradient magnitude is below tolerance
prop successful
Initial

(383, 44)

In [68]:
# Mean absolute error of the HTSRegressor forecast, per bottom-level series.
# Values are compared positionally (row order), not by index label.
for col in base_variables:
    predicted = np.array(hts_regressor_output[col])
    actual = np.array(test_data[col])
    mae = np.mean(np.abs(predicted - actual))
    print(f"column name is {col}-->", mae)

column name is CA_1_FOODS--> 4213.481757213138
column name is CA_1_HOBBIES--> 584.8664377236165
column name is CA_1_HOUSEHOLD--> 888.0291020185489
column name is CA_2_FOODS--> 1553.6371135715833
column name is CA_2_HOBBIES--> 825.9210664578013
column name is CA_2_HOUSEHOLD--> 687.6191066463202
column name is CA_3_FOODS--> 3797.029165979403
column name is CA_3_HOBBIES--> 861.4277968316572
column name is CA_3_HOUSEHOLD--> 832.3848089443744
column name is CA_4_FOODS--> 1012.7016551198396
column name is CA_4_HOBBIES--> 1144.5844008931501
column name is CA_4_HOUSEHOLD--> 874.8869208691069
column name is TX_1_FOODS--> 1167.307022377667
column name is TX_1_HOBBIES--> 571.4610373442948
column name is TX_1_HOUSEHOLD--> 184.68266546564058
column name is TX_2_FOODS--> 2451.943670286193
column name is TX_2_HOBBIES--> 202.58916665642946
column name is TX_2_HOUSEHOLD--> 353.1493680403454
column name is TX_3_FOODS--> 325.80074916713954
column name is TX_3_HOBBIES--> 3284.452158153391
column name is T

In [69]:
# Mean absolute error of the Auto_HTS_Regressor forecast, per bottom-level series.
# Values are compared positionally (row order), not by index label.
for col in base_variables:
    predicted = np.array(Auto_HTS_Regressor_output[col])
    actual = np.array(test_data[col])
    mae = np.mean(np.abs(predicted - actual))
    print(f"column name is {col}-->", mae)

column name is CA_1_FOODS--> 2137.7780678851177
column name is CA_1_HOBBIES--> 451.15404699738906
column name is CA_1_HOUSEHOLD--> 640.9086161879895
column name is CA_2_FOODS--> 1568.2793733681463
column name is CA_2_HOBBIES--> 235.6266318537859
column name is CA_2_HOUSEHOLD--> 546.1801566579635
column name is CA_3_FOODS--> 2497.164490861619
column name is CA_3_HOBBIES--> 297.2924281984334
column name is CA_3_HOUSEHOLD--> 957.3054830287206
column name is CA_4_FOODS--> 940.2819843342037
column name is CA_4_HOBBIES--> 133.4621409921671
column name is CA_4_HOUSEHOLD--> 358.4255874673629
column name is TX_1_FOODS--> 593.0678851174935
column name is TX_1_HOBBIES--> 301.9817232375979
column name is TX_1_HOUSEHOLD--> 684.6057441253264
column name is TX_2_FOODS--> 1152.6083550913838
column name is TX_2_HOBBIES--> 235.28981723237598
column name is TX_2_HOUSEHOLD--> 484.49086161879893
column name is TX_3_FOODS--> 348.88250652741516
column name is TX_3_HOBBIES--> 247.01305483028722
column name is

In [70]:
# Sum of the per-series mean absolute errors for the HTSRegressor forecast.
total_mae = sum(
    np.mean(np.abs(np.array(hts_regressor_output[col]) - np.array(test_data[col])))
    for col in base_variables
)
print(total_mae)

38806.46737893373


In [71]:
# Sum of the per-series mean absolute errors for the Auto_HTS_Regressor forecast.
total_mae = sum(
    np.mean(np.abs(np.array(Auto_HTS_Regressor_output[col]) - np.array(test_data[col])))
    for col in base_variables
)
print(total_mae)

29260.744125326375
