In [1]:
# Add mooncake to sys.path so the local checkout can be imported.

import os
import sys

# The checkout location can be overridden through the MOONCAKE_PATH
# environment variable; the previously hard-coded path stays the default.
mooncake_path = os.environ.get(
    'MOONCAKE_PATH', '/Users/ramonamezquita/Projects/mooncake')

# Guard the append so re-running this cell does not pile up duplicate entries.
if mooncake_path not in sys.path:
    sys.path.append(mooncake_path)

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
def select_top_n_groups(X, n, group_ids, wrt, aggfunc=sum):
    """Keep only the rows of the ``n`` groups with the largest aggregate.

    Parameters
    ----------
    X : pd.DataFrame
        Frame containing the ``group_ids`` column(s) and the ``wrt`` column.
    n : int
        Number of top-ranked groups to keep.
    group_ids : str or list of str
        Column(s) whose values identify a group.
    wrt : str
        Column whose per-group aggregate is used to rank the groups.
    aggfunc : callable or str, default sum
        Aggregation applied to ``wrt`` inside each group.

    Returns
    -------
    pd.DataFrame
        Rows of ``X`` that belong to the selected groups. Because of the
        ``set_index``/``reset_index`` round trip the group columns come
        first and the index is renumbered.
    """
    # pandas >= 2.1 deprecates passing the builtin ``sum`` to ``agg``: today
    # it is silently swapped for the pandas reduction, later it will be
    # called directly. Use the string alias to keep the current behaviour
    # and avoid the FutureWarning.
    if aggfunc is sum:
        aggfunc = 'sum'

    group_totals = X.groupby(group_ids).agg({wrt: aggfunc})
    top_group_ids = group_totals.nlargest(n, wrt).index.tolist()

    return X.set_index(group_ids).loc[top_group_ids].reset_index()

In [5]:
# Data and description

# Column names as they appear in the raw CSV file.
GROUP_IDS = ['Store', 'Dept']
date, target = 'Date', 'Weekly_Sales'

X = pd.read_csv('sales_example/sales_forecasting_dataset.csv')

# Select subset: keep only the group(s) ranked highest by their aggregated
# target (select_top_n_groups aggregates with `sum` by default).
n_groups_to_use = 1
X = select_top_n_groups(
    X, n=n_groups_to_use, group_ids=GROUP_IDS, wrt=target)

In [6]:
# Rename

# Mapping from the raw CSV column names to the generic names used from here on.
columns = {
    'Store': 'group_id__0',
    'Dept': 'group_id__1',
    'Date': 'timestamp',
    'Weekly_Sales': 'target'
}

# Build a new frame instead of mutating in place, and tolerate an already
# removed 'IsHoliday' column, so that re-running this cell is a no-op rather
# than a KeyError.
X = X.rename(columns=columns).drop(columns=['IsHoliday'], errors='ignore')

# Keep the helper variables in sync with the renamed columns.
GROUP_IDS = ['group_id__0', 'group_id__1']
date = 'timestamp'
target = 'target'

In [7]:
# Correct dtypes

# Parse the timestamp column into pandas datetimes.
X['timestamp'] = pd.to_datetime(X['timestamp'])

# Cast every group-id column to string in a single assignment.
X[GROUP_IDS] = X[GROUP_IDS].astype(str)

In [8]:
# Order the rows by group ids and then by date, and renumber the index.
X = X.sort_values(by=[*GROUP_IDS, date]).reset_index(drop=True)

In [9]:
# Preview the first rows of the renamed, typed and sorted frame.
X.head()

Unnamed: 0,group_id__0,group_id__1,timestamp,target
0,14,92,2010-01-10,174219.51
1,14,92,2010-02-04,203457.42
2,14,92,2010-02-07,198349.17
3,14,92,2010-02-19,192282.19
4,14,92,2010-02-26,205250.96


# Preprocess

In [10]:
from mooncake.preprocessing import (
    TimeIndex, 
    GroupColumnTransformer, 
    CyclicalDates, 
    ColumnTransformer,
    IdentityTransformer
)
from mooncake.utils.data import column_selector
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

In [11]:
def create_column_selector(dtype_include, to_exclude):
    """Build a mooncake ``column_selector`` with the group ids always excluded.

    ``dtype_include`` is forwarded unchanged. The ``pattern_exclude`` argument
    is the notebook-level ``GROUP_IDS`` list followed by ``to_exclude``, which
    therefore has to be a list as well (the two are joined with ``+``).
    """
    excluded_patterns = GROUP_IDS + to_exclude
    return column_selector(
        dtype_include=dtype_include,
        pattern_exclude=excluded_patterns,
    )

In [12]:
# Target transformer
# ------------------
# Min-max scaling of the target column, wrapped in a GroupColumnTransformer
# that also receives GROUP_IDS.
# NOTE(review): presumably the scaler is fitted separately per group — confirm
# against mooncake's GroupColumnTransformer.

scaler = MinMaxScaler()
# Despite its name this is a list holding a single (name, transformer, columns) triplet.
target_transformer_triplet = [('cont', scaler, [target])]
target_transformer = GroupColumnTransformer(target_transformer_triplet, GROUP_IDS)

In [13]:
# GroupColumnTransformer
# ----------------------
# Collects (name, transformer, columns) triplets and hands them, together with
# GROUP_IDS, to GroupColumnTransformer.

transformers = []
TIME_INDEX = 'time_index'
# NOTE(review): `to_exclude` is never read in this cell (the selector below is
# given `[target]` instead); a later cell assigns it again before using it.
to_exclude = GROUP_IDS

# Time index
# TimeIndex is applied to the column named by TIME_INDEX.
# NOTE(review): none of the cells above adds a 'time_index' column to X;
# presumably an earlier pipeline step creates it — confirm.
time_index_triplet = ('time_index', TimeIndex(), TIME_INDEX)
transformers.append(time_index_triplet)

# Scaler
# A fresh MinMaxScaler (this rebinds `scaler`; it is not the instance used for
# the target). The selector asks for 'float' columns with GROUP_IDS and the
# target in its exclusion list.
scaler = MinMaxScaler()
selector = create_column_selector(['float'], to_exclude=[target])
scaler_triplet = ('cont', scaler, selector)
transformers.append(scaler_triplet)

gct = GroupColumnTransformer(transformers, GROUP_IDS)

In [14]:
# ColumnTransformer
# -----------------
# Triplets for mooncake's ColumnTransformer, which (unlike the
# GroupColumnTransformer cells) is built without GROUP_IDS.

to_exclude = GROUP_IDS
# NOTE(review): this empty list is replaced by the assignment in the "Encoder"
# section below before it is ever read.
transformers = []
timestamp = 'timestamp'

# Encoder
# One-hot encoding for the columns chosen by the selector ('object' dtype).
# NOTE(review): create_column_selector already puts GROUP_IDS in the exclusion
# list, so passing GROUP_IDS as `to_exclude` lists every group id twice —
# presumably harmless; confirm against column_selector.
encoder = OneHotEncoder()
selector = create_column_selector(['object'], to_exclude)
transformers = [('cat', encoder, selector)]

# Time index
# IdentityTransformer is applied to the timestamp column and is configured
# with TIME_INDEX, cast_to_object=True and numpy's datetime64[ns] dtype.
# NOTE(review): presumably this passes the timestamps through under the
# TIME_INDEX name for the later TimeIndex step — confirm against mooncake.
identity_transformer = IdentityTransformer(TIME_INDEX, cast_to_object=True, dtype=np.dtype('<M8[ns]'))
identity_triplet = ('identity', identity_transformer, [timestamp])
transformers.append(identity_triplet)

# Cyclical dates
# CyclicalDates is applied to the same timestamp column.
transformers.append(('cyclical_dates', CyclicalDates(), timestamp))

ct = ColumnTransformer(transformers)

In [15]:
# Preprocessing runs in this order: target transformer, then the plain
# ColumnTransformer, then the GroupColumnTransformer.
steps = [
    ('target', target_transformer),
    ('non_group', ct),
    ('group', gct),
]
preprocessor = Pipeline(steps=steps)

# Estimator

In [16]:
from mooncake.nn import SeqToSeq

In [17]:
# Column roles passed to the SeqToSeq estimator in the next cell.
# `target` repeats the value already assigned in the rename cell.
target = 'target'
# Only the target itself is listed as a time-varying unknown real; the other
# two role lists are left empty.
# NOTE(review): the exact meaning of each role is defined by mooncake's
# SeqToSeq — confirm there.
time_varying_unknown_reals = ['target']
time_varying_known_reals = []
static_categoricals = []

In [18]:
# Sequence-to-sequence estimator configured with the column roles defined above.
estimator = SeqToSeq(
    group_ids=GROUP_IDS,
    # Same 'time_index' name the preprocessing transformers were given.
    time_idx=TIME_INDEX,
    target=target,
    max_prediction_length=10,
    max_encoder_length=10,
    time_varying_known_reals=time_varying_known_reals,
    time_varying_unknown_reals=time_varying_unknown_reals,
    static_categoricals=static_categoricals,
    max_epochs=10,
)

# Pipeline

In [19]:
# Chain the preprocessing pipeline and the estimator into a single pipeline.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('estimator', estimator),
    ]
)

In [20]:
# Fit the preprocessing steps and the estimator on the prepared frame; the
# fitted pipeline is the cell's displayed value.
pipeline.fit(X)

  epoch    train_loss     dur
-------  ------------  ------
      1        [36m0.2781[0m  0.0233
      2        [36m0.2780[0m  0.0216
      3        [36m0.2779[0m  0.0216
      4        [36m0.2750[0m  0.0202
      5        0.2778  0.0208
      6        [36m0.2747[0m  0.0202
      7        0.2775  0.0206
      8        0.2774  0.0208
      9        0.2771  0.0203
     10        [36m0.2744[0m  0.0206


Pipeline(steps=[('preprocessor',
                 Pipeline(steps=[('target',
                                  GroupColumnTransformer(group_ids=['group_id__0',
                                                                    'group_id__1'],
                                                         transformers=[('cont',
                                                                        MinMaxScaler(),
                                                                        ['target'])])),
                                 ('non_group',
                                  ColumnTransformer(transformers=[('cat',
                                                                   OneHotEncoder(),
                                                                   <mooncake.utils.data.column_selector object at 0x1207a2580>),
                                                                  ('identity',
                                                                   IdentityTransformer(

In [21]:
# Predict on the same frame the pipeline was fitted on.
# NOTE(review): `raw=False` presumably requests a DataFrame rather than raw
# model output — confirm against mooncake.
predictions_df = pipeline.predict(X, raw=False)

In [23]:
# Run the predictions back through the preprocessor's inverse transform and
# display the resulting frame.
preprocessor.inverse_transform(predictions_df)

Unnamed: 0,target,group_id__0,group_id__1,timestamp
0,143434.627347,14,92,2010-04-16
1,143491.182289,14,92,2010-04-23
2,143474.755685,14,92,2010-04-30
3,143504.418370,14,92,2010-05-02
4,142869.498014,14,92,2010-05-03
...,...,...,...,...
128,143199.815819,14,92,2012-10-08
129,143051.535461,14,92,2012-10-19
130,142954.918318,14,92,2012-10-26
131,142893.875821,14,92,2012-11-05
