In [4]:
import os

# Step up one directory when the package folder is not listed in the current
# working directory, so that the relative paths and the package import used
# further down resolve. Presumably the notebook is launched from a sub-folder
# of the repository root -- TODO confirm.
package_dir = "temporal_fusion_transformer_pytorch"
cwd_entries = os.listdir()
if package_dir not in cwd_entries:
    os.chdir(os.pardir)

In [19]:
from pathlib import Path
from leapfrog.etl import clean_column_names, optimize_memory
import pandas as pd


def parse_yearmonth(df):
    """Replace the ``yearmonth`` column of ``df`` with a parsed ``date`` column.

    ``yearmonth`` is parsed with the format ``%Y%m``. The input frame is not
    modified; a new frame is returned with ``date`` added and ``yearmonth``
    removed.
    """
    parsed_dates = pd.to_datetime(df["yearmonth"], format="%Y%m")
    without_raw_column = df.drop(columns="yearmonth")
    return without_raw_column.assign(date=parsed_dates)

data_path = Path("examples/data/stallion")


def _load_stallion_table(name, monthly=True):
    """Read ``<data_path>/<name>.csv`` and prepare it for joining.

    Column names are passed through ``clean_column_names``. When ``monthly`` is
    true, the ``yearmonth`` column is replaced by a parsed ``date`` column via
    ``parse_yearmonth``.
    """
    table = clean_column_names(pd.read_csv(data_path / f"{name}.csv"))
    return parse_yearmonth(table) if monthly else table


# one frame per source file, indexed by the key(s) it is joined on below
weather = _load_stallion_table("weather").set_index(["date", "agency"])
price_sales_promotion = (
    _load_stallion_table("price_sales_promotion")
    .rename(columns={"sales": "price_actual", "price": "price_regular", "promotions": "discount"})
    .set_index(["date", "sku", "agency"])
)
industry_volume = _load_stallion_table("industry_volume").set_index("date")
industry_soda_sales = _load_stallion_table("industry_soda_sales").set_index("date")
historical_volume = _load_stallion_table("historical_volume")
event_calendar = _load_stallion_table("event_calendar").set_index("date")
# demographics has no yearmonth column to parse; it is keyed by agency only
demographics = _load_stallion_table("demographics", monthly=False).set_index("agency")

# combine the data: historical_volume is the base table and every other table
# is joined onto it (DataFrame.join defaults to a left join) using its index keys
data = (
    historical_volume
    .join(industry_volume, on="date")
    .join(industry_soda_sales, on="date")
    .join(weather, on=["date", "agency"])
    .join(price_sales_promotion, on=["date", "sku", "agency"])
    .join(demographics, on="agency")
    .join(event_calendar, on="date")
    # presumably shrinks dtypes -- see leapfrog.etl.optimize_memory for what unique_value_ratio=1 means
    .pipe(lambda x: optimize_memory(x, unique_value_ratio=1))
    .sort_values('date')
)

# minor feature engineering: discount as a percentage of the regular price
# (a missing ratio counts as 0 %) and the calendar month of each observation
data = data.assign(
    discount_in_percent=lambda x: (x.discount / x.price_regular).fillna(0) * 100,
    month=lambda x: x.date.dt.month,
)

# rows with any missing feature are dropped; target is volume for the remaining rows
features = data.drop(["volume"], axis=1).dropna()
target = data.volume[features.index]

data.head()

Unnamed: 0,agency,sku,volume,date,industry_volume,soda_volume,avg_max_temp,price_regular,price_actual,discount,...,labor_day,independence_day,revolution_day_memorial,regional_games,fifa_u_17_world_cup,football_gold_cup,beer_capital,music_fest,discount_in_percent,month
0,Agency_22,SKU_01,52.272,2013-01-01,492612703,718394219,25.845238,1168.903668,1069.166193,99.737475,...,0,0,0,0,0,0,0,0,8.532566,1
238,Agency_37,SKU_04,0.0,2013-01-01,492612703,718394219,26.505,1852.273642,1611.466298,240.807344,...,0,0,0,0,0,0,0,0,13.000635,1
237,Agency_59,SKU_03,812.9214,2013-01-01,492612703,718394219,22.219737,1270.795012,1197.18426,73.610752,...,0,0,0,0,0,0,0,0,5.792496,1
236,Agency_11,SKU_01,316.44,2013-01-01,492612703,718394219,25.36,1176.155397,1082.757488,93.397909,...,0,0,0,0,0,0,0,0,7.94095,1
235,Agency_05,SKU_05,420.9093,2013-01-01,492612703,718394219,24.079012,1327.003396,1207.822992,119.180404,...,0,0,0,0,0,0,0,0,8.981168,1


In [28]:
# Standard deviation of price_actual within each (agency, sku) group;
# observed=True is passed through to groupby unchanged.
series_keys = ["agency", "sku"]
data.groupby(series_keys, observed=True)["price_actual"].std()

agency     sku   
Agency_22  SKU_01     99.933925
           SKU_04    174.060578
           SKU_03    109.060261
           SKU_05    115.168426
           SKU_02    106.797365
                        ...    
Agency_42  SKU_01    125.452170
           SKU_04    116.459837
           SKU_03    130.943412
           SKU_05    150.786137
           SKU_02    114.710078
Name: price_actual, Length: 350, dtype: float64

In [22]:
# Show every column name of the combined frame as a plain Python list.
list(data.columns)

['agency',
 'sku',
 'volume',
 'date',
 'industry_volume',
 'soda_volume',
 'avg_max_temp',
 'price_regular',
 'price_actual',
 'discount',
 'avg_population_2017',
 'avg_yearly_household_income_2017',
 'easter_day',
 'good_friday',
 'new_year',
 'christmas',
 'labor_day',
 'independence_day',
 'revolution_day_memorial',
 'regional_games',
 'fifa_u_17_world_cup',
 'football_gold_cup',
 'beer_capital',
 'music_fest',
 'discount_in_percent',
 'month']

In [30]:
import temporal_fusion_transformer_pytorch.data
import importlib

# Reload the data module before use so that the current source of the module
# is what gets executed below.
importlib.reload(temporal_fusion_transformer_pytorch.data)
loader_cls = temporal_fusion_transformer_pytorch.data.TimeSeriesDataLoader

# NOTE(review): the saved output of this cell ends in
# "TypeError: 'float' object is not iterable". The root cause is not visible
# from the notebook -- investigate inside TimeSeriesDataLoader.

# holiday / event indicator columns, passed as time-varying known categoricals
holiday_and_event_flags = [
    "easter_day",
    'good_friday',
    'new_year',
    'christmas',
    'labor_day',
    'independence_day',
    'revolution_day_memorial',
    'regional_games',
    'fifa_u_17_world_cup',
    'football_gold_cup',
    'beer_capital',
    'music_fest',
]

# NOTE(review): avg_population_2017 and avg_yearly_household_income_2017 come
# from the agency-level demographics table, so they look constant per agency --
# confirm whether they should be static_reals instead.
known_real_features = [
    "price_regular",
    "price_actual",
    "discount",
    'avg_population_2017',
    'avg_yearly_household_income_2017',
    "discount_in_percent",
]

unknown_real_features = ["industry_volume", "soda_volume", "avg_max_temp"]

dl = loader_cls(
    data,
    time="date",
    time_granularity=(1, "M"),
    target="volume",
    group_ids=["agency", "sku"],
    max_encode_length=50,
    max_prediction_length=12,
    static_categoricals=["agency", "sku"],
    static_reals=[],
    time_varying_known_categoricals=holiday_and_event_flags,
    time_varying_known_reals=known_real_features,
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=unknown_real_features,
)

  df_index_first = g[self.time].transform("nth", 0).to_frame("time_first")
  df_index_diff_to_next = (g[self.time].diff() / pd.Timedelta(*self.time_granularity)).fillna(0)
  df_index = pd.concat([df_index_first, df_index_last], axis=1)
  df_index["time_end_position"] = (


TypeError: 'float' object is not iterable