In [1]:
# Add mooncake to sys

import sys

mooncake_path = '/Users/ramonamezquita/Projects/mooncake'
sys.path.append(mooncake_path)

In [2]:
import pandas as pd
import numpy as np

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
def select_top_n_groups(X, n, group_ids, wrt, aggfunc=sum):
    top_group_ids =  X.groupby(
        group_ids).agg({wrt: aggfunc}).nlargest(n, wrt).index.tolist()
    
    return X.set_index(group_ids).loc[top_group_ids].reset_index()

In [5]:
# Data and description

X = pd.read_csv('sales_example/sales_forecasting_dataset.csv')
date = 'Date'
GROUP_IDS = ['Store', 'Dept']
target = 'Weekly_Sales'

    
# Select subset
n_groups_to_use = 1
X = select_top_n_groups(X, n_groups_to_use, GROUP_IDS, target)

In [6]:
# Rename

columns={
    'Store': 'group_id__0',
    'Dept': 'group_id__1',
    'Date': 'timestamp',
    'Weekly_Sales': 'target'
}
X.rename(columns=columns, inplace=True)
del X['IsHoliday']

GROUP_IDS = ['group_id__0', 'group_id__1']
date = 'timestamp'
target = 'target'

In [7]:
# Correct dtypes

X['timestamp'] = pd.to_datetime(X['timestamp'])
for g in GROUP_IDS:
    X[g] = X[g].astype(str)

In [8]:
X = X.sort_values(GROUP_IDS + [date]).reset_index(drop=True)

In [9]:
X.head()

Unnamed: 0,group_id__0,group_id__1,timestamp,target
0,14,92,2010-01-10,174219.51
1,14,92,2010-02-04,203457.42
2,14,92,2010-02-07,198349.17
3,14,92,2010-02-19,192282.19
4,14,92,2010-02-26,205250.96


# Preprocess

In [61]:
from mooncake.preprocessing import (
    TimeIndex, 
    GroupColumnTransformer, 
    CyclicalDates, 
    ColumnTransformer,
    IdentityTransformer
)
from mooncake.utils.data import column_selector
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

In [62]:
def create_column_selector(dtype_include, to_exclude):
    pattern_exclude = GROUP_IDS + to_exclude
    return column_selector(dtype_include=dtype_include, pattern_exclude=pattern_exclude)

In [63]:
# Target transformer
# ------------------

scaler = MinMaxScaler()
target_transformer_triplet = [('cont', scaler, [target])]
target_transformer = GroupColumnTransformer(target_transformer_triplet, GROUP_IDS)

# Revisiting time index transformer

In [85]:
# Time index

X_train = X.loc[X.timestamp.between('2010-01-01', '2012-06-01')]
X_test = X.loc[X.timestamp.between('2012-06-02', '2013-01-01')]

TIME_INDEX = 'time_index'
time_index_triplet = ('time_index', TimeIndex(extra_timestamps=365), 'timestamp')
transformers = [time_index_triplet]
time_index_transformer = GroupColumnTransformer(transformers, GROUP_IDS)


In [86]:
time_index_transformer.fit_transform(X_train)

Unnamed: 0,timestamp,group_id__0,group_id__1,target
0,0,14,92,174219.51
1,1,14,92,203457.42
2,2,14,92,198349.17
3,3,14,92,192282.19
4,4,14,92,205250.96
...,...,...,...,...
115,115,14,92,169755.19
116,116,14,92,169519.23
117,117,14,92,163028.98
118,118,14,92,180251.98


In [87]:
timeIndex = time_index_transformer.mapping_[('14', '92')].transformers_[0][1]

In [None]:
timeId

In [88]:
time_index_transformer.transform(X_test)

Unnamed: 0,timestamp,group_id__0,group_id__1,target
0,121,14,92,224917.94
1,124,14,92,166956.91
2,132,14,92,160550.27
3,139,14,92,134236.16
4,146,14,92,135790.38
5,156,14,92,190579.72
6,160,14,92,137087.77
7,167,14,92,141112.33
8,174,14,92,129580.31
9,184,14,92,206575.9


In [32]:
# GroupColumnTransformer
# ----------------------

transformers = []
TIME_INDEX = 'time_index'
to_exclude = GROUP_IDS

# Time index
time_index_triplet = ('time_index', TimeIndex(), TIME_INDEX)
transformers.append(time_index_triplet)

# Scaler
scaler = MinMaxScaler()
selector = create_column_selector(['float'], to_exclude=[target])
scaler_triplet = ('cont', scaler, selector)
transformers.append(scaler_triplet)

gct = GroupColumnTransformer(transformers, GROUP_IDS)

In [33]:
# ColumnTransformer
# -----------------

to_exclude = GROUP_IDS
transformers = []
timestamp = 'timestamp'

# Encoder
encoder = OneHotEncoder()
selector = create_column_selector(['object'], to_exclude)
transformers = [('cat', encoder, selector)]

# Time index
identity_transformer = IdentityTransformer(TIME_INDEX, cast_to_object=True, dtype=np.dtype('<M8[ns]'))
identity_triplet = ('identity', identity_transformer, [timestamp])
transformers.append(identity_triplet)

# Cyclical dates
transformers.append(('cyclical_dates', CyclicalDates(), timestamp))

ct = ColumnTransformer(transformers)

In [34]:
steps = [ 
    ('target', target_transformer),
    ('non_group', ct), 
    ('group', gct),
]
preprocessor = Pipeline(steps)

# Estimator

In [35]:
from mooncake.nn import SeqToSeq

In [36]:
target = 'target'
time_varying_unknown_reals = ['target']
time_varying_known_reals = []
static_categoricals = []

In [37]:
estimator = SeqToSeq(
    group_ids=GROUP_IDS, time_idx='time_index', 
    target=target, max_prediction_length=10, 
    max_encoder_length=10, 
    time_varying_known_reals=time_varying_known_reals,
    time_varying_unknown_reals=time_varying_unknown_reals,
    static_categoricals=static_categoricals,
    max_epochs=10
)

# Pipeline

In [38]:
pipeline = Pipeline([('preprocessor', preprocessor), ('estimator', estimator)])

In [39]:
pipeline.fit(X)

  epoch    train_loss     dur
-------  ------------  ------
      1        [36m0.2311[0m  0.0238
      2        [36m0.2309[0m  0.0223
      3        [36m0.2309[0m  0.0215
      4        0.2322  0.0205
      5        0.2321  0.0202
      6        [36m0.2306[0m  0.0217
      7        [36m0.2305[0m  0.0217
      8        [36m0.2304[0m  0.0210
      9        0.2314  0.0204
     10        [36m0.2301[0m  0.0215


Pipeline(steps=[('preprocessor',
                 Pipeline(steps=[('target',
                                  GroupColumnTransformer(group_ids=['group_id__0',
                                                                    'group_id__1'],
                                                         transformers=[('cont',
                                                                        MinMaxScaler(),
                                                                        ['target'])])),
                                 ('non_group',
                                  ColumnTransformer(transformers=[('cat',
                                                                   OneHotEncoder(),
                                                                   <mooncake.utils.data.column_selector object at 0x2812f81f0>),
                                                                  ('identity',
                                                                   IdentityTransformer(

In [40]:
predictions_df = pipeline.predict(X, raw=False)

In [None]:
preprocessor

In [42]:
preprocessor.inverse_transform(predictions_df)

Unnamed: 0,target,group_id__0,group_id__1,timestamp
0,162565.683996,14,92,2010-04-16
1,162516.063696,14,92,2010-04-23
2,162557.945904,14,92,2010-04-30
3,162532.164512,14,92,2010-05-02
4,163225.942377,14,92,2010-05-03
...,...,...,...,...
128,150787.154489,14,92,2012-10-08
129,150367.171330,14,92,2012-10-19
130,150133.745017,14,92,2012-10-26
131,150007.995513,14,92,2012-11-05
