In [1]:
# import numpy as np
import pandas as pd

from sktime.datatypes import get_examples

In [2]:
# Take the first example returned for the "pd_multiindex_hier" mtype.
df = get_examples(mtype="pd_multiindex_hier", as_scitype="Hierarchical")[0]

df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,var_0,var_1
foo,bar,timepoints,Unnamed: 3_level_1,Unnamed: 4_level_1
a,0,0,1,4
a,0,1,2,5
a,0,2,3,6
a,1,0,1,4
a,1,1,2,55
a,1,2,3,6
a,2,0,1,42
a,2,1,2,5
a,2,2,3,6
b,0,0,1,4


In [3]:
# Grand total per time point: sum over every series, then label both
# collapsed levels "__total" and restore the original index level order.
top = (
    df.groupby(level=["timepoints"])
    .sum()
    .assign(foo="__total", bar="__total")
    .set_index(["foo", "bar"], append=True)
    .reorder_levels(df.index.names)
)

top

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,var_0,var_1
foo,bar,timepoints,Unnamed: 3_level_1,Unnamed: 4_level_1
__total,__total,0,6,100
__total,__total,1,12,130
__total,__total,2,18,36


In [4]:
# Per-"foo" totals: sum over "bar" within each ("foo", "timepoints") pair,
# label the collapsed "bar" level "__total" and restore the level order.
mid = (
    df.groupby(level=["foo", "timepoints"])
    .sum()
    .assign(bar="__total")
    .set_index(["bar"], append=True)
    .reorder_levels(df.index.names)
)

mid

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,var_0,var_1
foo,bar,timepoints,Unnamed: 3_level_1,Unnamed: 4_level_1
a,__total,0,3,50
a,__total,1,6,65
a,__total,2,9,18
b,__total,0,3,50
b,__total,1,6,65
b,__total,2,9,18


In [5]:
# Grand totals, per-"foo" totals and the raw rows in one frame, sorted by index.
wh_df = pd.concat((top, mid, df)).sort_index()
wh_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,var_0,var_1
foo,bar,timepoints,Unnamed: 3_level_1,Unnamed: 4_level_1
__total,__total,0,6,100
__total,__total,1,12,130
__total,__total,2,18,36
a,0,0,1,4
a,0,1,2,5
a,0,2,3,6
a,1,0,1,4
a,1,1,2,55
a,1,2,3,6
a,2,0,1,42


In [6]:
# Select everything under foo == "a" (a plain label; no tuple is needed).
wh_df.loc["a"]

Unnamed: 0_level_0,Unnamed: 1_level_0,var_0,var_1
bar,timepoints,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,1,4
0,1,2,5
0,2,3,6
1,0,1,4
1,1,2,55
1,2,3,6
2,0,1,42
2,1,2,5
2,2,3,6
__total,0,3,50


In [7]:
def aggregate_hierarchy(df_hier, time_level="timepoints"):
    """Add "__total" aggregate rows for every level of a hierarchical frame.

    For each prefix of the non-time index levels (starting with the empty
    prefix, i.e. the grand total) the data is summed per time point over the
    remaining levels, and those collapsed levels are labelled ``"__total"``.

    Parameters
    ----------
    df_hier : pd.DataFrame
        Frame whose index levels include ``time_level``; every other index
        level is treated as a hierarchy level, outermost first.
    time_level : str, default="timepoints"
        Name of the index level that holds the time points.

    Returns
    -------
    pd.DataFrame
        New frame holding the rows of ``df_hier`` plus the aggregate rows,
        with the same index level order, sorted by index. ``df_hier`` itself
        is not modified.
    """
    hier_names = list(df_hier.index.names)
    # Hierarchy levels in index order (a set would give an arbitrary order).
    group_levels = [name for name in hier_names if name != time_level]

    aggregates = []
    for depth in range(len(group_levels)):
        kept = group_levels[:depth]  # depth 0 -> grand total
        collapsed = group_levels[depth:]
        # Pass a flat list of level names to groupby.
        agg = df_hier.groupby(level=[*kept, time_level]).sum()
        for name in collapsed:
            agg[name] = "__total"
        aggregates.append(
            agg.set_index(collapsed, append=True).reorder_levels(hier_names)
        )

    # Concatenate once, in the order: grand total, raw rows, mid-level totals.
    pieces = aggregates[:1] + [df_hier] + aggregates[1:]
    return pd.concat(pieces).sort_index()

In [8]:
# Same frame as the manual top/mid/concat steps above, via the helper.
aggregate_hierarchy(df_hier=df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,var_0,var_1
foo,bar,timepoints,Unnamed: 3_level_1,Unnamed: 4_level_1
__total,__total,0,6,100
__total,__total,1,12,130
__total,__total,2,18,36
a,0,0,1,4
a,0,1,2,5
a,0,2,3,6
a,1,0,1,4
a,1,1,2,55
a,1,2,3,6
a,2,0,1,42


In [9]:
cols = ["foo", "foo2", "bar", "timepoints", *(f"var_{i}" for i in range(2))]

# One frame per (foo, foo2, bar) series with three time points each;
# var_0 is always 1, 2, 3 and the listed values are var_1.
Xlist = [
    pd.DataFrame(
        [[foo, foo2, bar, t, t + 1, var_1] for t, var_1 in enumerate(var_1_values)],
        columns=cols,
    )
    for foo, foo2, bar, var_1_values in [
        ("a", "a1", 0, [4, 5, 6]),
        ("a", "a1", 1, [4, 55, 6]),
        ("a", "a2", 2, [42, 5, 6]),
        ("b", "b1", 0, [4, 5, 6]),
        ("b", "b2", 1, [4, 55, 6]),
        ("b", "b2", 2, [42, 5, 6]),
    ]
]
X = pd.concat(Xlist).set_index(["foo", "foo2", "bar", "timepoints"])

X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,var_0,var_1
foo,foo2,bar,timepoints,Unnamed: 4_level_1,Unnamed: 5_level_1
a,a1,0,0,1,4
a,a1,0,1,2,5
a,a1,0,2,3,6
a,a1,1,0,1,4
a,a1,1,1,2,55
a,a1,1,2,3,6
a,a2,2,0,1,42
a,a2,2,1,2,5
a,a2,2,2,3,6
b,b1,0,0,1,4


In [11]:
# Count rows per (foo, foo2, timepoints) group; a count of 1 means the
# foo2 node has a single series below it at that time point.
single_df = X.groupby(["foo", "foo2", "timepoints"]).count()

# Rows whose "foo" label occurs in at least one multi-row group.
mask1 = X.index.get_level_values("foo").isin(
    single_df[(single_df > 1).all(axis=1)].index.get_level_values("foo").unique()
)

# Rows whose "foo2" label occurs in at least one multi-row group.
mask2 = X.index.get_level_values("foo2").isin(
    single_df[(single_df > 1).all(axis=1)].index.get_level_values("foo2").unique()
)

X.loc[mask1 & mask2]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,var_0,var_1
foo,foo2,bar,timepoints,Unnamed: 4_level_1,Unnamed: 5_level_1
a,a1,0,0,1,4
a,a1,0,1,2,5
a,a1,0,2,3,6
a,a1,1,0,1,4
a,a1,1,1,2,55
a,a1,1,2,3,6
b,b2,1,0,1,4
b,b2,1,1,2,55
b,b2,1,2,3,6
b,b2,2,0,1,42


In [14]:
import numpy as np


def aggregate_hierarchy(df_hier, flatten_single_levels=True, time_level="timepoints"):
    """Add "__total" aggregate rows for every level of a hierarchical frame.

    For each prefix of the non-time index levels (starting with the empty
    prefix, i.e. the grand total) the data is summed per time point over the
    remaining levels, and those collapsed levels are labelled ``"__total"``.

    Parameters
    ----------
    df_hier : pd.DataFrame
        Frame whose index levels include ``time_level``; every other index
        level is treated as a hierarchy level, outermost first.
    flatten_single_levels : bool, default=True
        If True, an aggregate row is only produced for a (group, time point)
        combination that is backed by more than one row of ``df_hier``
        (non-null count > 1 in every column). An aggregate over a single row
        would merely repeat that row. If False, every aggregate is produced.
    time_level : str, default="timepoints"
        Name of the index level that holds the time points.

    Returns
    -------
    pd.DataFrame
        New frame holding the rows of ``df_hier`` plus the aggregate rows,
        with the same index level order, sorted by index. ``df_hier`` itself
        is not modified.
    """
    hier_names = list(df_hier.index.names)
    # Hierarchy levels in index order (a set would give an arbitrary order).
    group_levels = [name for name in hier_names if name != time_level]

    aggregates = []
    for depth in range(len(group_levels)):
        agg_levels = [*group_levels[:depth], time_level]  # depth 0 -> grand total
        collapsed = group_levels[depth:]

        source = df_hier
        if flatten_single_levels:
            # Per-row size of the group the row belongs to. Filtering on the
            # exact group (rather than on each level's labels independently)
            # keeps single-row groups out even when labels repeat across
            # parents.
            group_sizes = df_hier.groupby(level=agg_levels).transform("count")
            in_multi_row_group = (group_sizes > 1).all(axis=1).to_numpy()
            source = df_hier.loc[in_multi_row_group]

        agg = source.groupby(level=agg_levels).sum()
        if len(agg) == 0:
            # Nothing left to aggregate at this depth.
            continue
        for name in collapsed:
            agg[name] = "__total"
        aggregates.append(
            agg.set_index(collapsed, append=True).reorder_levels(hier_names)
        )

    # Concatenate once, in the order: grand total, raw rows, mid-level totals.
    pieces = aggregates[:1] + [df_hier] + aggregates[1:]
    return pd.concat(pieces).sort_index()

In [15]:
# Full aggregate hierarchy for the four-level example frame.
aggregate_hierarchy(df_hier=X)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,var_0,var_1
foo,foo2,bar,timepoints,Unnamed: 4_level_1,Unnamed: 5_level_1
__total,__total,__total,0,6,100
__total,__total,__total,1,12,130
__total,__total,__total,2,18,36
a,__total,__total,0,3,50
a,__total,__total,1,6,65
a,__total,__total,2,9,18
a,a1,0,0,1,4
a,a1,0,1,2,5
a,a1,0,2,3,6
a,a1,1,0,1,4
