# Modelagem HTS

Neste notebook é feita a avaliação dos modelos que serão utilizados para compor a hierarquia. Além disso, a própria reconciliação das previsões também é feita aqui.

## Bibliotecas

In [22]:
import pandas as pd

from statsforecast import StatsForecast

from utilsforecast.losses import rmse, mae, smape, mase, scaled_crps, mqloss
from hierarchicalforecast.core import HierarchicalReconciliation
from hierarchicalforecast.evaluation import evaluate
from hierarchicalforecast.methods import BottomUp, TopDown, MiddleOut, MinTrace
from hierarchicalforecast.utils import aggregate, HierarchicalPlot

## Identificação dos melhores modelos por continente.

In [23]:
# Interactive lookup of the signature and docstring of rmse.
# NOTE(review): exploratory leftover — consider removing it from the final notebook.
help(rmse)

Help on function rmse in module utilsforecast.losses:

rmse(df: pandas.core.frame.DataFrame, models: List[str], id_col: str = 'unique_id', target_col: str = 'y') -> pandas.core.frame.DataFrame
    Root Mean Squared Error (RMSE)

    RMSE measures the relative prediction
    accuracy of a forecasting method by calculating the squared deviation
    of the prediction and the observed value at a given time and
    averages these devations over the length of the series.
    Finally the RMSE will be in the same scale
    as the original time series so its comparison with other
    series is possible only if they share a common scale.
    RMSE has a direct connection to the L2 norm.

    Parameters
    ----------
    df : pandas or polars DataFrame
        Input dataframe with id, actual values and predictions.
    models : list of str
        Columns that identify the models predictions.
    id_col : str (default='unique_id')
        Column that identifies each serie.
    target_col : str (d

### Importando DataFrames das previsões e dos valores reais (`actual`)

#### SARIMAX

In [48]:

# Which continents are taken from each SARIMAX forecast file:
#   11_sarimax_forecasts -> America_Central_E_Caribe, America_Do_Norte, Asia
#   01_sarimax_forecasts -> America_Do_Sul, Europa
#   00_sarimax_forecasts -> Africa
# NOTE(review): the original note also listed "total" for the 11 file, but none of
# the filters below keeps TOTAL rows — confirm whether TOTAL should be included.
SARIMAX_11_CONTINENTS = ["America_Central_E_Caribe", "America_Do_Norte", "Asia"]
SARIMAX_01_CONTINENTS = ["America_Do_Sul", "Europa"]
SARIMAX_00_CONTINENTS = ["Africa"]


def _load_sarimax_subset(csv_path, continents):
    """Read a SARIMAX forecast CSV and keep only the rows whose continent is listed."""
    forecasts = pd.read_csv(csv_path)
    return forecasts.loc[forecasts["continent"].isin(continents)]


# TEST
sarimax_11_df = _load_sarimax_subset("11_sarimax_forecasts.csv", SARIMAX_11_CONTINENTS)
sarimax_01_df = _load_sarimax_subset("01_sarimax_forecasts.csv", SARIMAX_01_CONTINENTS)
sarimax_00_df = _load_sarimax_subset("00_sarimax_forecasts.csv", SARIMAX_00_CONTINENTS)

# The three subsets cannot contain Oceania (it is in none of the lists above),
# so no extra Oceania filter is needed before appending the dedicated file.
sarimax_df = pd.concat([sarimax_11_df, sarimax_01_df, sarimax_00_df])

# Oceania forecasts come from their own file.
oceania_sarimax_df = pd.read_csv("oceania_sarimax_forecasts.csv")
sarimax_df = pd.concat([sarimax_df, oceania_sarimax_df])

# Give the forecast column a model-specific name.
sarimax_df = sarimax_df.rename(columns={"forecast": "sarimax_forecast"})

# TRAIN
# NOTE(review): these three reads use exactly the same CSV files as the TEST
# section above; only Oceania has a *_TRAIN file. Confirm whether dedicated
# training-period files were meant to be loaded here.
sarimax_11_train_df = _load_sarimax_subset("11_sarimax_forecasts.csv", SARIMAX_11_CONTINENTS)
sarimax_01_train_df = _load_sarimax_subset("01_sarimax_forecasts.csv", SARIMAX_01_CONTINENTS)
sarimax_00_train_df = _load_sarimax_subset("00_sarimax_forecasts.csv", SARIMAX_00_CONTINENTS)

oceania_sarimax_train_df = pd.read_csv("oceania_sarimax_forecasts_TRAIN.csv")

sarimax_train_full_df = pd.concat(
    [sarimax_11_train_df, sarimax_01_train_df, sarimax_00_train_df, oceania_sarimax_train_df]
)
sarimax_train_full_df = sarimax_train_full_df.rename(columns={"forecast": "sarimax_forecast"})

# RMSE of the SARIMAX forecasts, one row per continent.
rmse_sarimax = rmse(sarimax_df, models=["sarimax_forecast"], target_col="actual", id_col="continent")

rmse_sarimax

Unnamed: 0,continent,sarimax_forecast
0,Africa,118.878661
1,America_Central_E_Caribe,364.743257
2,America_Do_Norte,3501.26482
3,America_Do_Sul,21062.83303
4,Asia,957.00858
5,Europa,5885.345992
6,Oceania,393.809531


#### XGBOOST

In [49]:
def _load_xgboost_forecasts(csv_path):
    """Read an XGBoost forecast CSV, rename its forecast column and relabel the aggregate series as TOTAL."""
    frame = pd.read_csv(csv_path).rename(columns={"forecast": "xgboost_forecast"})
    frame.loc[frame["continent"] == "TOTAL (Agregado)", "continent"] = "TOTAL"
    return frame


# Test-period and train-period XGBoost forecasts.
xgboost_df = _load_xgboost_forecasts("xgboost_forecasts.csv")
xgboost_train_df = _load_xgboost_forecasts("xgboost_forecasts_TRAIN.csv")

# RMSE table, one row per continent.
rmse_xgboost = rmse(xgboost_df, models=["xgboost_forecast"], target_col="actual", id_col="continent")
# Same relabelling applied to the metric table (the label was already normalised in xgboost_df).
rmse_xgboost.loc[rmse_xgboost["continent"].eq("TOTAL (Agregado)"), "continent"] = "TOTAL"
rmse_xgboost

Unnamed: 0,continent,xgboost_forecast
0,Africa,79.142045
1,America_Central_E_Caribe,262.88711
2,America_Do_Norte,2725.095099
3,America_Do_Sul,13583.029284
4,Asia,918.125635
5,Europa,4340.333062
6,Oceania,216.755227
7,TOTAL,18387.481725


In [50]:
# Side-by-side RMSE per continent. The join is inner, so only continents scored
# by both models are kept.
xgboost_sarimax_df = pd.merge(rmse_xgboost, rmse_sarimax, how="inner", on="continent")
xgboost_sarimax_df

Unnamed: 0,continent,xgboost_forecast,sarimax_forecast
0,Africa,79.142045,118.878661
1,America_Central_E_Caribe,262.88711,364.743257
2,America_Do_Norte,2725.095099,3501.26482
3,America_Do_Sul,13583.029284,21062.83303
4,Asia,918.125635,957.00858
5,Europa,4340.333062,5885.345992
6,Oceania,216.755227,393.809531


#### ETS

In [51]:
def _load_ets_forecasts(csv_path, continent):
    """Read one forecast CSV, tag every row with `continent` and rename `forecast` to `ets_forecast`."""
    return (
        pd.read_csv(csv_path)
        .assign(continent=continent)
        .rename(columns={"forecast": "ets_forecast"})
    )


# One file per series.
# NOTE(review): file naming differs between series (forecast_ets_*, forecast_models_*,
# forecasts_comparison) — confirm each file really holds the ETS forecast.
ets_na_test = _load_ets_forecasts("forecast_ets_america_do_norte.csv", "America_Do_Norte")
ets_oc_test = _load_ets_forecasts("forecast_ets_oceania.csv", "Oceania")
ets_af_test = _load_ets_forecasts("forecast_ets_africa.csv", "Africa")
ets_ac_test = _load_ets_forecasts("forecast_models_america_central_e_caribe.csv", "America_Central_E_Caribe")
ets_as_test = _load_ets_forecasts("forecast_models_america_do_sul.csv", "America_Do_Sul")
ets_a_test = _load_ets_forecasts("forecast_models_asia.csv", "Asia")
ets_eu_test = _load_ets_forecasts("forecast_models_europa.csv", "Europa")
ets_total_test = _load_ets_forecasts("forecasts_comparison.csv", "TOTAL")

# Full dataset (all series stacked; original row indexes are kept).
ets_df = pd.concat([ets_na_test, ets_oc_test, ets_a_test, ets_ac_test, ets_as_test, ets_af_test, ets_eu_test, ets_total_test])

# Train/test split on the date column. The comparison is done on the values as
# read from the CSVs; it assumes ISO "YYYY-MM-DD" text so string order matches
# chronological order — TODO confirm.
ets_train_df = ets_df.loc[ets_df["date"] <= "2023-12-01", :]
ets_test_df = ets_df.loc[ets_df["date"] > "2023-12-01", :]

# NOTE(review): the two file names differ ("forecasts" vs "forecast"); kept
# unchanged because other notebooks may read these exact paths.
ets_test_df.to_csv("ets_forecasts_TEST.csv", index=False)
ets_train_df.to_csv("ets_forecast_TRAIN.csv", index=False)

# RMSE on the test split, one row per continent.
rmse_ets = rmse(ets_test_df, models=["ets_forecast"], target_col="actual", id_col="continent")
rmse_ets

Unnamed: 0,continent,ets_forecast
0,Africa,124.573622
1,America_Central_E_Caribe,450.534789
2,America_Do_Norte,6161.036595
3,America_Do_Sul,17860.450726
4,Asia,1248.91571
5,Europa,5508.584443
6,Oceania,469.265911
7,TOTAL,32881.407645


In [52]:
# Append the ETS RMSE column; the inner join keeps only continents present in both tables.
full_comparison_df = pd.merge(xgboost_sarimax_df, rmse_ets, how="inner", on="continent")


In [53]:
# Display the per-continent RMSE of the XGBoost, SARIMAX and ETS forecasts side by side.
full_comparison_df

Unnamed: 0,continent,xgboost_forecast,sarimax_forecast,ets_forecast
0,Africa,79.142045,118.878661,124.573622
1,America_Central_E_Caribe,262.88711,364.743257,450.534789
2,America_Do_Norte,2725.095099,3501.26482,6161.036595
3,America_Do_Sul,13583.029284,21062.83303,17860.450726
4,Asia,918.125635,957.00858,1248.91571
5,Europa,4340.333062,5885.345992,5508.584443
6,Oceania,216.755227,393.809531,469.265911


O XGBoost vence em todos os cenários. Considerando apenas o SARIMAX e o ETS, temos:

* ETS vence nos continentes: América do Sul e Europa.
* SARIMAX vence nos continentes: África, América Central e Caribe, América do Norte, Ásia e Oceania.

### HTS 1: XGBoost em todos, menos na Ásia. Ásia -> SARIMAX
Assume-se que os modelos escolhidos já passaram no teste de Ljung-Box.

In [30]:
def create_hts1_df(xgb_df, ets_df):
    """Build the HTS 1 base-forecast table in long format.

    Every series except Asia takes its forecast from ``xgb_df``; the Asia rows
    are taken from the second frame, using its ``sarimax_forecast`` column.

    NOTE(review): the second parameter keeps its historical name ``ets_df`` so
    existing callers keep working, but the column read from it is
    ``sarimax_forecast`` — callers must pass a SARIMAX forecast frame.

    Parameters
    ----------
    xgb_df : pandas.DataFrame
        Must contain ``date``, ``continent``, ``actual`` and ``xgboost_forecast``.
    ets_df : pandas.DataFrame
        Must contain ``date``, ``continent``, ``actual`` and ``sarimax_forecast``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``continent``, ``actual`` and ``forecast``: the
        non-Asia rows of ``xgb_df`` followed by the Asia rows of ``ets_df``.
        Original row indexes are kept and the inputs are not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing from an input frame.
    """
    key_cols = ["date", "continent", "actual"]

    # XGBoost supplies every series except Asia. Selecting with .loc returns a
    # new frame, so the caller's data is left untouched.
    xgb_part = xgb_df.loc[xgb_df["continent"] != "Asia", key_cols + ["xgboost_forecast"]]
    xgb_part = xgb_part.rename(columns={"xgboost_forecast": "forecast"})

    # Asia comes from the SARIMAX forecasts carried by the second frame.
    asia_part = ets_df.loc[ets_df["continent"] == "Asia", key_cols + ["sarimax_forecast"]]
    asia_part = asia_part.rename(columns={"sarimax_forecast": "forecast"})

    return pd.concat([xgb_part, asia_part])


In [31]:
def create_hts2_df(sar_df, ets_df):
    """Build the HTS 2 base-forecast table in long format.

    America_Do_Sul and Europa take their forecast from the ``ets_forecast``
    column of ``ets_df``; America_Central_E_Caribe, Africa, Asia,
    America_Do_Norte and Oceania take it from the ``sarimax_forecast`` column
    of ``sar_df``. Any other series (for example TOTAL) is left out.

    Returns a new frame with columns ``date``, ``continent``, ``actual`` and
    ``forecast``: the SARIMAX rows first, then the ETS rows, each keeping its
    original row index. The input frames are not modified.
    """
    key_cols = ["date", "continent", "actual"]
    ets_continents = ["America_Do_Sul", "Europa"]
    sarimax_continents = ["America_Central_E_Caribe", "Africa", "Asia", "America_Do_Norte", "Oceania"]

    # ETS share of the hierarchy.
    from_ets = ets_df["continent"].isin(ets_continents)
    ets_part = ets_df.loc[from_ets, key_cols + ["ets_forecast"]]
    ets_part = ets_part.rename(columns={"ets_forecast": "forecast"})

    # SARIMAX share of the hierarchy.
    from_sarimax = sar_df["continent"].isin(sarimax_continents)
    sarimax_part = sar_df.loc[from_sarimax, key_cols + ["sarimax_forecast"]]
    sarimax_part = sarimax_part.rename(columns={"sarimax_forecast": "forecast"})

    return pd.concat([sarimax_part, ets_part])


In [36]:
# HTS 1
# The second argument is a SARIMAX frame: create_hts1_df reads the Asia rows
# from its "sarimax_forecast" column.
hts1_train_df = create_hts1_df(xgboost_train_df, sarimax_train_full_df)
hts1_test_df = create_hts1_df(xgboost_df, sarimax_df)
# Persist both tables as CSV, without the row index.
hts1_test_df.to_csv("hts1_test_df.csv", index=False)
hts1_train_df.to_csv("hts1_train_df.csv", index=False)

# HTS 2:
# SARIMAX frame first, ETS frame second (train and test splits respectively).
hts2_train_df = create_hts2_df(sarimax_train_full_df, ets_train_df)
hts2_test_df = create_hts2_df(sarimax_df, ets_test_df)
# Persist both tables as CSV, without the row index.
hts2_train_df.to_csv("hts2_train_df.csv", index=False)
hts2_test_df.to_csv("hts2_test_df.csv", index=False)

In [37]:
# Sanity check: drop the TOTAL rows and sum the remaining forecasts for each date.
a = hts1_test_df[hts1_test_df["continent"] != "TOTAL"]
a.groupby("date")["forecast"].sum()

date
2024-01-01    141500.359237
2024-02-01    148564.770456
2024-03-01    140091.719848
2024-04-01    100946.212135
2024-05-01     84012.855983
2024-06-01     78338.005832
2024-07-01     90717.549668
2024-08-01    103066.077668
2024-09-01     95565.644110
2024-10-01    118832.114096
2024-11-01    141229.394150
2024-12-01    146590.934552
Name: forecast, dtype: float64

In [38]:
# Inspect the ETS rows dated up to and including 2023-12-01 (the train split).
ets_train_df

Unnamed: 0,date,continent,actual,ets_forecast,error,abs_error,pct_error,ets_model,model_type,arima_model
0,2000-01-01,America_Do_Norte,22202.0,18082.070165,-4119.929835,4119.929835,-18.556571,"ETS(M,N,M)",AutoETS,
1,2000-02-01,America_Do_Norte,19060.0,19986.541950,926.541950,926.541950,4.861185,"ETS(M,N,M)",AutoETS,
2,2000-03-01,America_Do_Norte,17767.0,19325.610283,1558.610283,1558.610283,8.772501,"ETS(M,N,M)",AutoETS,
3,2000-04-01,America_Do_Norte,14761.0,13363.934207,-1397.065793,1397.065793,-9.464574,"ETS(M,N,M)",AutoETS,
4,2000-05-01,America_Do_Norte,12601.0,12939.110184,338.110184,338.110184,2.683201,"ETS(M,N,M)",AutoETS,
...,...,...,...,...,...,...,...,...,...,...
283,2023-08-01,TOTAL,80697.0,87601.714505,-6904.714505,6904.714505,-8.556346,"ETS(M,A,M)",HYBRID,"(1, 0, 0, 0, 1, 0, 0)"
284,2023-09-01,TOTAL,82149.0,74041.895005,8107.104995,8107.104995,9.868781,"ETS(M,A,M)",HYBRID,"(1, 0, 0, 0, 1, 0, 0)"
285,2023-10-01,TOTAL,92918.0,88634.817681,4283.182319,4283.182319,4.609637,"ETS(M,A,M)",HYBRID,"(1, 0, 0, 0, 1, 0, 0)"
286,2023-11-01,TOTAL,114550.0,94906.380829,19643.619171,19643.619171,17.148511,"ETS(M,A,M)",HYBRID,"(1, 0, 0, 0, 1, 0, 0)"


In [39]:
# NOTE(review): `sarimax_train_df` is not assigned in any cell visible in this
# part of the notebook (the SARIMAX cell defines `sarimax_train_full_df`).
# This cell presumably relies on leftover kernel state and may raise NameError
# after Restart & Run All — confirm where the variable comes from.
sarimax_train_df

Unnamed: 0,date,continent,actual,forecast,error,abs_error,pct_error
0,2000-01-01,TOTAL,91318.0,0.000000,91318.000000,91318.000000,100.000000
1,2000-02-01,TOTAL,80424.0,86345.922797,-5921.922797,5921.922797,-7.363378
2,2000-03-01,TOTAL,75716.0,79209.338286,-3493.338286,3493.338286,-4.613739
3,2000-04-01,TOTAL,62957.0,75853.390173,-12896.390173,12896.390173,-20.484442
4,2000-05-01,TOTAL,53855.0,67094.665899,-13239.665899,13239.665899,-24.583912
...,...,...,...,...,...,...,...
2011,2023-08-01,Europa,20349.0,16467.347129,3881.652871,3881.652871,19.075399
2012,2023-09-01,Europa,16368.0,14823.604256,1544.395744,1544.395744,9.435458
2013,2023-10-01,Europa,22743.0,18042.304760,4700.695240,4700.695240,20.668756
2014,2023-11-01,Europa,27200.0,22980.309523,4219.690477,4219.690477,15.513568


In [44]:
# Inspect the HTS 2 test table returned by create_hts2_df.
hts2_test_df

Unnamed: 0,date,continent,actual,forecast
24,2024-01-01,America_Central_E_Caribe,912.0,1296.823094
25,2024-02-01,America_Central_E_Caribe,843.0,1377.565111
26,2024-03-01,America_Central_E_Caribe,1302.0,1621.364300
27,2024-04-01,America_Central_E_Caribe,1082.0,1456.391123
28,2024-05-01,America_Central_E_Caribe,1155.0,1251.594965
...,...,...,...,...
295,2024-08-01,Europa,24719.0,24266.246572
296,2024-09-01,Europa,22025.0,18274.542567
297,2024-10-01,Europa,28376.0,23508.461584
298,2024-11-01,Europa,34420.0,24836.935039
