In [None]:
# import numpy as np
import numpy as np
import pandas as pd

from sktime.datatypes import get_examples

# Hierarchical dataset

In [None]:
# Use the first of sktime's example frames for the hierarchical
# "pd_multiindex_hier" mtype.
df = get_examples(mtype="pd_multiindex_hier", as_scitype="Hierarchical")[0]

df

## Aggregate Hierarchy

In [None]:
def aggregate_hierarchy(df_hier, flatten_single_levels=True):
    """Add the aggregate ("__total") rows of a hierarchy to a hierarchical frame.

    The top level sums over all hierarchy levels per time point. Each mid
    level sums over everything below a prefix of the hierarchy levels. Levels
    that were summed over carry the label ``"__total"`` in the output index.

    Parameters
    ----------
    df_hier : pd.DataFrame
        Frame with a ``pd.MultiIndex`` that contains a level named
        ``"timepoints"``. The prefixes of the index level names are used as
        the mid-level groupings, so the hierarchy levels are expected to come
        first (outermost first) and ``"timepoints"`` last.
    flatten_single_levels : bool, default=True
        If True, an aggregate is only built for groups (hierarchy prefix plus
        time point) that hold more than one non-missing observation in every
        column, i.e. aggregates that would merely repeat a single child are
        left out. If False, every group is aggregated.

    Returns
    -------
    pd.DataFrame
        The rows of ``df_hier`` together with the aggregate rows, with the
        same index level order as ``df_hier`` and sorted by index.
    """
    hier_names = list(df_hier.index.names)

    def _aggregate(group_levels):
        """Sum ``df_hier`` within ``group_levels``; other levels become "__total"."""
        source = df_hier
        if flatten_single_levels:
            # Per-row size of the group the row belongs to. Testing the whole
            # group key at once (rather than each level separately) keeps a
            # single-child group from slipping through just because each of
            # its labels also occurs in some other, larger group.
            group_sizes = df_hier.groupby(level=group_levels).transform("count")
            has_siblings = (group_sizes > 1).all(axis=1).to_numpy()
            source = df_hier.loc[has_siblings]
        agg = source.groupby(level=group_levels).sum()

        # Re-insert the levels that were summed over, then restore level order.
        collapsed = [name for name in hier_names if name not in group_levels]
        for name in collapsed:
            agg[name] = "__total"
        return agg.set_index(collapsed, append=True).reorder_levels(hier_names)

    # top level first, then the original (bottom-level) rows
    pieces = [_aggregate(["timepoints"]), df_hier]

    # mid levels exist only if there are at least two hierarchy levels
    for depth in range(1, len(hier_names) - 1):
        pieces.append(_aggregate(hier_names[:depth] + ["timepoints"]))

    return pd.concat(pieces).sort_index()

Now we have the full forecasting dataset

In [None]:
# Display the example frame with all aggregate rows added
# (flatten_single_levels defaults to True).
aggregate_hierarchy(df)

Let's test with bottom levels that span two nodes

- i.e. mid levels that are only present at a subset of bottom nodes

In [None]:
# Column layout: three hierarchy levels, the time index, then two value columns.
cols = ["foo", "foo2", "bar", "timepoints", *(f"var_{i}" for i in range(2))]

# One frame per bottom-level series. Every series has timepoints 0, 1, 2 and
# var_0 equal to timepoint + 1; only the var_1 values differ between series.
Xlist = [
    pd.DataFrame(
        [[*node, t, t + 1, v1] for t, v1 in enumerate(var_1)],
        columns=cols,
    )
    for node, var_1 in [
        (("a", "a1", 0), (4, 5, 6)),
        (("a", "a1", 1), (4, 55, 6)),
        (("a", "a2", 2), (42, 5, 6)),
        (("b", "b1", 0), (4, 5, 6)),
        (("b", "b2", 1), (4, 55, 6)),
        (("b", "b2", 2), (42, 5, 6)),
    ]
]
X = pd.concat(Xlist).set_index(cols[:4])

X

Note flatten single levels is the default option

- see that `(a, a2, 2, *)` and `(b, b1, 0, *)` don't contain `__total`

In [None]:
# Aggregate the test frame, leaving out aggregates that have a single child.
aggregate_hierarchy(X, flatten_single_levels=True)

# Forecasting Example

Let's generate a hierarchical dataset similar to the last example from the flights dataset

- Generate dataset
- Generate full hierarchy
- Forecast each level
- Reconcile

## Generate Dataset

In [None]:
from sktime.datasets import load_airline
from sktime.utils.plotting import plot_series

In [None]:
# Load the airline series; the cells below use it as "zone1" and derive the
# other zones from it.
zone1 = load_airline()

zone1

In [None]:
# plotting for visualization
plot_series(
    zone1,
    10 + zone1 * 5,
    -50 + zone1 * 0.9,
    zone1 ** 1.5,
    -20 + 10 * zone1,
    10 + (10 * zone1) + (0.05 * (zone1 ** 2)),
    labels=["zone1", "zone2", "zone3", "zone4", "zone5", "zone6"],
)

In [None]:
# Wide frame: the airline series as zone1 plus five series derived from it.
df = pd.DataFrame(zone1, index=zone1.index).rename(
    columns={"Number of airline passengers": "zone1"}
)
df = df.assign(
    zone2=10 + zone1 * 5,
    zone3=zone1 * 0.9 - 50,
    zone4=zone1 ** 1.5,
    zone5=zone1 * 10 - 500,
    zone6=10 + (10 * zone1) + (0.05 * (zone1 ** 2)),
)

# Long format, indexed by (airport, timepoints) with a single value column.
df = (
    df.melt(ignore_index=False)
    .pipe(lambda long: long.set_index(["variable", long.index]))
    .rename_axis(["airport", "timepoints"], axis=0)
    .rename(columns={"value": "passengers"})
)

# Attach a state and a city to every airport.
airport = df.index.get_level_values(level="airport")
for column, label, zones in (
    ("state", "CA", ["zone1", "zone2", "zone3"]),
    ("city", "LA", ["zone1", "zone2"]),
    ("city", "SF", ["zone3"]),
    ("state", "NY", ["zone4", "zone5", "zone6"]),
    ("city", "NYC", ["zone4", "zone5"]),
    ("city", "BF", ["zone6"]),
):
    df.loc[airport.isin(zones), column] = label

# Final index order: state, city, airport, timepoints.
df = df.set_index(["state", "city", df.index])
df

## Generate full hierarchy

In [None]:
# Full hierarchy: bottom-level series plus every aggregate level; aggregates
# with only a single child are left out.
df_fh = aggregate_hierarchy(df, flatten_single_levels=True)

df_fh

## Forecast each level

Here we fit a separate forecast for each unique combination of index levels other than `timepoints`.

In [None]:
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.exp_smoothing import ExponentialSmoothing
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.performance_metrics.forecasting import mean_absolute_percentage_error

In [None]:
# One model per unique combination of the non-time index levels.
model_ids = df_fh.index.droplevel("timepoints").unique()

model_ids

Now set up the forecasting loop

In [None]:
# Fit one exponential smoothing model per node of the hierarchy. The fitted
# models and their forecasts are stored under the node's index tuple.
mods, prds = {}, {}

for i in model_ids:
    # split this node's series, using test_size=36 for the test part
    y_train, y_test = temporal_train_test_split(df_fh.loc[i], test_size=36)
    fh = ForecastingHorizon(y_test.index, is_relative=False)

    forecaster = ExponentialSmoothing(trend="add", seasonal="additive", sp=12)
    mods[i] = forecaster.fit(y_train)
    prds[i] = forecaster.predict(fh)

    # report the node followed by its symmetric MAPE on the test part
    smape = mean_absolute_percentage_error(y_test, prds[i], symmetric=True)
    print(i, smape, sep="\n")

Extract forecasts

In [None]:
# Stack the per-node forecasts into a single frame indexed like df_fh.
forecasts = pd.concat(prds)
forecasts = forecasts.rename_axis(df_fh.index.names, axis=0).rename(
    columns={"passengers": "y_pred"}
)

# Put the observed values next to the forecasts; the inner join keeps only
# index entries present in both frames.
prds = pd.concat([forecasts, df_fh], axis=1, join="inner").rename(
    columns={"passengers": "y_true"}
)

prds

## Reconcile - Bottom Up

Bottom-up is easy: we just sum the bottom levels, much like the aggregate function does.


In [None]:
# Bottom-level forecasts are the rows whose airport is not an aggregate label.
is_bottom = prds.index.get_level_values(level="airport") != "__total"
df_bu = prds.loc[is_bottom, ["y_pred"]]

# Summing the bottom-level forecasts upwards gives the reconciled forecasts.
df_bu = aggregate_hierarchy(df_bu, flatten_single_levels=True)
df_bu = df_bu.rename(columns={"y_pred": "y_reco"})

prds = pd.concat([prds, df_bu], axis=1)

prds

Maybe need some significance testing here :p

In [None]:
# For every node print its id, then the symmetric MAPE of the base forecast
# (y_pred) followed by that of the bottom-up reconciled forecast (y_reco).
for i in model_ids:
    print(i)
    for forecast_col in ("y_pred", "y_reco"):
        print(
            mean_absolute_percentage_error(
                prds.loc[i, "y_true"], prds.loc[i, forecast_col], symmetric=True
            )
        )

But we want it to be compatible with other methods, which work as follows:

- get the S matrix from the index
- reconcile the forecasts

In [None]:
def get_s_matrix(df):
    """Build the summation (S) matrix of a hierarchy from a frame's index.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose index contains a level named ``"timepoints"``; the other
        levels describe the hierarchy, outermost first. Aggregate nodes carry
        the label ``"__total"`` in every level that was summed over.

    Returns
    -------
    pd.DataFrame
        One row per unique node (index of ``df`` without ``"timepoints"``, in
        order of first appearance) and one column per bottom-level node,
        labelled with the node's last-level value. An entry is 1.0 if the
        row's node is the bottom-level node itself or one of its aggregates,
        otherwise 0.0.
    """
    total = "__total"

    # all nodes of the hierarchy, as tuples (also for a single-level index)
    node_index = df.index.droplevel("timepoints").unique()
    nodes = [node if isinstance(node, tuple) else (node,) for node in node_index]
    row_of = {node: pos for pos, node in enumerate(nodes)}

    # bottom-level nodes are those whose last level is not an aggregate label
    bottom_nodes = [node for node in nodes if node[-1] != total]

    values = np.zeros((len(nodes), len(bottom_nodes)), dtype=float)
    for col, node in enumerate(bottom_nodes):
        depth = len(node)
        # Replacing the last 0..depth levels by the aggregate label walks from
        # the node itself up to the grand total. Working on the full tuple
        # keeps nodes apart that share a last-level label but differ in parent.
        for n_collapsed in range(depth + 1):
            ancestor = node[: depth - n_collapsed] + (total,) * n_collapsed
            row = row_of.get(ancestor)
            # aggregates that are absent from the index are skipped
            if row is not None:
                values[row, col] = 1.0

    return pd.DataFrame(
        values, index=node_index, columns=[node[-1] for node in bottom_nodes]
    )

seems to work lol

In [None]:
# S matrix for the hierarchy found in the index of the forecast frame.
s_test = get_s_matrix(prds)

s_test

In [None]:
# Keep the rows whose second-to-last index level is not an aggregate label.
is_bottom = ~prds.index.get_level_values(level=-2).isin(["__total"])
prds_bm = prds.loc[is_bottom]

# Take a single time point and drop every index level except position 2.
at_timepoint = prds_bm.index.get_level_values(level=-1) == "1958-01"
tst = prds_bm[at_timepoint].droplevel([0, 1, 3])

# Multiply the S matrix by the selected base forecasts.
np.dot(s_test, tst["y_pred"])

In [None]:
# Show all rows of the forecast frame for the same single time point.
prds[prds.index.get_level_values(level=-1) == "1958-01"]

In [None]:
from numpy.linalg import inv

In [None]:
# OLS projection matrix G = (S'S)^-1 S', computed on the plain array of S.
s_values = s_test.to_numpy()
gram = np.dot(s_values.T, s_values)
g_ols = pd.DataFrame(np.dot(inv(gram), s_values.T))

g_ols

In [None]:
# Take an explicit copy of the selected time point so that the new column is
# added to an independent frame, not assigned onto a slice of `prds`.
tst = prds.loc[prds.index.get_level_values(level=-1) == "1958-01"].copy()

# OLS reconciliation: apply g_ols to the base forecasts, then multiply the
# result by the S matrix to get a value for every node.
tst["y_ols"] = np.dot(s_test, np.dot(g_ols, tst["y_pred"]))

tst

In [None]:
# Show the series of the last node in model_ids.
df_fh.loc[model_ids[-1]]