In [1]:
# Make the local mooncake checkout importable.
#
# The checkout location can be overridden with the MOONCAKE_PATH environment
# variable; when it is unset, the original hard-coded path is used.

import os
import sys

mooncake_path = os.environ.get(
    'MOONCAKE_PATH', '/Users/ramonamezquita/Projects/mooncake')

# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if mooncake_path not in sys.path:
    sys.path.append(mooncake_path)

In [2]:
import pandas as pd
import numpy as np

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
def select_top_n_groups(X, n, group_ids, wrt, aggfunc=sum):
    """Keep only the rows that belong to the ``n`` highest-ranked groups.

    Rows of ``X`` are grouped by ``group_ids``, the ``wrt`` column of each
    group is reduced with ``aggfunc``, and the ``n`` groups with the largest
    reduced value are retained.

    Parameters
    ----------
    X : pd.DataFrame
        Input frame containing the ``group_ids`` and ``wrt`` columns.
    n : int
        Number of groups to keep.
    group_ids : list of str
        Columns identifying a group.
    wrt : str
        Column whose aggregated value ranks the groups.
    aggfunc : callable or str, default=sum
        Aggregation passed to ``DataFrameGroupBy.agg`` for the ``wrt`` column.

    Returns
    -------
    pd.DataFrame
        Rows of the selected groups with a fresh default index. The group
        columns are round-tripped through the index, so they come first in
        the result.
    """
    # Score every group, then keep the labels of the n best ones.
    group_scores = X.groupby(group_ids).agg({wrt: aggfunc})
    best_groups = group_scores.nlargest(n, wrt)
    best_labels = best_groups.index.tolist()

    # Label-based lookup on the group index pulls out all rows of those groups.
    by_group = X.set_index(group_ids)
    return by_group.loc[best_labels].reset_index()

In [5]:
# Dataset description: raw column names of the sales CSV.
GROUP_IDS = ['Store', 'Dept']
date = 'Date'
target = 'Weekly_Sales'

# Number of (Store, Dept) groups kept for this example.
n_groups_to_use = 1

# Load the raw data and keep only the groups with the largest summed target
# (select_top_n_groups aggregates with `sum` by default).
X = select_top_n_groups(
    pd.read_csv('sales_example/sales_forecasting_dataset.csv'),
    n_groups_to_use,
    GROUP_IDS,
    target,
)

In [6]:
# Rename the raw columns to the generic names used by the rest of the notebook.

columns = {
    'Store': 'group_id__0',
    'Dept': 'group_id__1',
    'Date': 'timestamp',
    'Weekly_Sales': 'target'
}

# Build a new frame instead of mutating in place so this cell can be re-run:
# `rename` silently skips labels that were already renamed, and
# `errors='ignore'` lets `drop` skip 'IsHoliday' once it has been removed
# (the previous `del X['IsHoliday']` raised KeyError on a second run).
X = X.rename(columns=columns).drop(columns=['IsHoliday'], errors='ignore')

# From here on, refer to the columns by their new names.
GROUP_IDS = ['group_id__0', 'group_id__1']
date = 'timestamp'
target = 'target'

In [7]:
# Enforce dtypes: datetime timestamps and string-typed group identifiers.

X['timestamp'] = pd.to_datetime(X['timestamp'])
X = X.astype({group_col: str for group_col in GROUP_IDS})

In [10]:
# Order rows by group and then by date, renumbering the index from 0.
X = X.sort_values(by=GROUP_IDS + [date], ignore_index=True)

In [11]:
# Preview the first rows of the renamed, typed and sorted frame.
X.head()

Unnamed: 0,group_id__0,group_id__1,timestamp,target
0,14,92,2010-01-10,174219.51
1,14,92,2010-02-04,203457.42
2,14,92,2010-02-07,198349.17
3,14,92,2010-02-19,192282.19
4,14,92,2010-02-26,205250.96


# IdentityTransformer

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array

In [33]:
class IdentityTransformer(BaseEstimator, TransformerMixin):
    """Pass-through transformer that returns its input values unchanged.

    Parameters
    ----------
    out_feature : str
        Name reported by ``get_feature_names`` for the output column.
    cast_to_object : bool, default=False
        When true, ``transform`` casts the values to ``object`` dtype.
    dtype : dtype, default=None
        Stored as given; no method of this class reads it.
        NOTE(review): presumably consumed by the surrounding mooncake
        ColumnTransformer -- confirm.
    inverse_func : callable, default=None
        Function applied by ``inverse_transform``. When None, the inverse is
        also the identity.
    """

    def __init__(self, out_feature, cast_to_object=False, dtype=None, inverse_func=None):
        # Constructor arguments are stored verbatim, without validation.
        self.inverse_func = inverse_func
        self.dtype = dtype
        self.cast_to_object = cast_to_object
        self.out_feature = out_feature

    def fit(self, X, y=None):
        """Nothing is learned from ``X``; return the estimator itself."""
        return self

    def transform(self, X):
        """Return ``X`` as-is (DataFrames are unwrapped to their ``.values``).

        The values are additionally cast to ``object`` dtype when
        ``cast_to_object`` is set.
        """
        # Validation only: the array returned by check_array is discarded.
        check_array(X)

        values = X.values if isinstance(X, pd.DataFrame) else X
        return values.astype(object) if self.cast_to_object else values

    def get_feature_names(self):
        """Return a one-element array holding ``out_feature``."""
        names = [self.out_feature]
        return np.array(names)

    def inverse_transform(self, X):
        """Apply ``inverse_func`` to ``X``, or return ``X`` when it is None."""
        undo = self.inverse_func
        return X if undo is None else undo(X)


# Preprocess

In [34]:
from mooncake.preprocessing import TimeIndex, GroupColumnTransformer, CyclicalDates, ColumnTransformer
from mooncake.utils.data import column_selector
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [35]:
def create_column_selector(dtype_include, to_exclude):
    """Build a mooncake ``column_selector`` that always excludes the group ids.

    Parameters
    ----------
    dtype_include : list
        Forwarded unchanged as ``dtype_include``.
    to_exclude : list
        Extra entries appended to the module-level ``GROUP_IDS`` (read at call
        time) to form ``pattern_exclude``.

    Returns
    -------
    The object returned by ``column_selector``.
    """
    return column_selector(
        pattern_exclude=GROUP_IDS + to_exclude,
        dtype_include=dtype_include,
    )

In [36]:
# GroupColumnTransformer
# ----------------------
# Steps handed to mooncake's GroupColumnTransformer together with GROUP_IDS
# (presumably fitted per group -- confirm against the mooncake docs).

TIME_INDEX = 'time_index'
to_exclude = GROUP_IDS

# Time index: TimeIndex applied to the TIME_INDEX column.
time_index_triplet = ('time_index', TimeIndex(), TIME_INDEX)

# Scaler: MinMaxScaler over the float columns picked by the selector.
scaler = MinMaxScaler()
selector = create_column_selector(['float'], to_exclude)
scaler_triplet = ('cont', scaler, selector)

transformers = [time_index_triplet, scaler_triplet]
gct = GroupColumnTransformer(transformers, GROUP_IDS)

In [45]:
# ColumnTransformer
# -----------------
# Column-wise steps: one-hot encoding, timestamp pass-through and cyclical
# date features.

to_exclude = GROUP_IDS
timestamp = 'timestamp'

# Encoder: OneHotEncoder over the object columns picked by the selector.
encoder = OneHotEncoder()
selector = create_column_selector(['object'], to_exclude)

# Time index: pass the timestamp column through (cast to object values) and
# report it under the TIME_INDEX name.
identity_transformer = IdentityTransformer(
    TIME_INDEX, cast_to_object=True, dtype=np.dtype('<M8[ns]'))
identity_triplet = ('identity', identity_transformer, [timestamp])

# Assemble the steps; the cyclical date features come last.
transformers = [
    ('cat', encoder, selector),
    identity_triplet,
    ('cyclical_dates', CyclicalDates(), timestamp),
]

ct = ColumnTransformer(transformers)

In [46]:
# Show the input frame once more, right before it is transformed.
X.head()

Unnamed: 0,group_id__0,group_id__1,timestamp,target
0,14,92,2010-01-10,174219.51
1,14,92,2010-02-04,203457.42
2,14,92,2010-02-07,198349.17
3,14,92,2010-02-19,192282.19
4,14,92,2010-02-26,205250.96


In [47]:
# Fit the ColumnTransformer on X and apply it, then preview the result.
Xt = ct.fit_transform(X)
Xt.head()

Unnamed: 0,time_index,day_sine,day_cos,month_sine,month_cos,group_id__0,group_id__1,target
0,2010-01-10,0.897805,0.724793,0.5,0.866025,14,92,174219.51
1,2010-02-04,0.988468,-0.651372,0.866025,0.866025,14,92,203457.42
2,2010-02-07,-0.848644,0.968077,0.866025,1.0,14,92,198349.17
3,2010-02-19,0.651372,-0.651372,1.0,1.0,14,92,192282.19
4,2010-02-26,-0.848644,0.937752,1.0,0.866025,14,92,205250.96


In [48]:
# Fit the GroupColumnTransformer on the ColumnTransformer output and apply it.
Xu = gct.fit_transform(Xt)
Xu.head()

Unnamed: 0,time_index,day_sine,day_cos,month_sine,month_cos,target,group_id__0,group_id__1
0,0,0.948869,0.862308,0.75,0.933013,0.271552,14,92
1,1,0.99423,0.173784,0.933013,0.933013,0.449413,14,92
2,2,0.075084,0.984028,0.933013,1.0,0.418338,14,92
3,3,0.825574,0.173784,1.0,1.0,0.381431,14,92
4,4,0.075084,0.968856,1.0,0.933013,0.460324,14,92


In [49]:
# Undo the GroupColumnTransformer step.
Xu_inv = gct.inverse_transform(Xu)
Xu_inv.head()

Unnamed: 0,time_index,day_sine,day_cos,month_sine,month_cos,target,group_id__0,group_id__1
0,2010-01-10,0.897805,0.724793,0.5,0.866025,174219.51,14,92
1,2010-02-04,0.988468,-0.651372,0.866025,0.866025,203457.42,14,92
2,2010-02-07,-0.848644,0.968077,0.866025,1.0,198349.17,14,92
3,2010-02-19,0.651372,-0.651372,1.0,1.0,192282.19,14,92
4,2010-02-26,-0.848644,0.937752,1.0,0.866025,205250.96,14,92


In [53]:
# Undo the ColumnTransformer step. The result of inverse_transform is unpacked
# into two values here; only the second one is previewed below.
inv_tr, Xu_inv_inv = ct.inverse_transform(Xu_inv)
Xu_inv_inv.head()

Unnamed: 0,timestamp,group_id__0,group_id__1,target
0,2010-01-10,14,92,174219.51
1,2010-02-04,14,92,203457.42
2,2010-02-07,14,92,198349.17
3,2010-02-19,14,92,192282.19
4,2010-02-26,14,92,205250.96
