In [None]:
from datetime import datetime

import numpy as np
import pandas as pd

In [None]:
# Today's date as a YYYY-MM-DD stamp; the results file name in a later cell is built from it.
time_now = f"{datetime.now():%Y-%m-%d}"
print("Time: " + time_now)

In [None]:
# Allow-lists of categorical values. The cleaning cell matches several of them
# with `.isin(...)` against the results table when it builds `filtered_df`.
# Commented-out entries are options that are currently switched off.

# Matched against the `model` column.
MODELS = [
    "PyBoost",
    "DLinear_NN",
    "PatchTST_NN",
    "GPT4TS_NN",
    # "TimesNet_NN",
    # "TimeMixer_NN",
    "CycleNet_NN",
]

# Matched against the `dataset` column.
DATASETS = [
    # "ILI",
    "WTH",
    "nn5",
    "demand_forecasting_kernels",
    "fred_md",
]

# Matched against `strategy_time` after the model horizon has been appended to it
# ("<strategy>__model_horizon_<horizon>").
STRATEGY_TIMES = [
    "FlatWideMIMOStrategy__model_horizon_NaN",
    "MIMOStrategy__model_horizon_NaN",
    "RecursiveStrategy__model_horizon_1.0",
    "RecursiveStrategy__model_horizon_6.0",
]

# Matched against the `datetime` and `id` columns respectively.
DATETIMES = ["False", "with_normalization_over_all"]
IDS = ["False", "with_le_normalization_over_all"]

# Matched against the `transformer_name` column.
TRANSFORMER_NAMES = [
    "NaN",
    "DifferenceNormalizer",
    "LastKnownNormalizer",
    "LastKnownNormalizer_wo_standardscaler",
    "DifferenceNormalizer_wo_standardscaler",
]

# Matched against the `transformer_regime` column.
TRANSFORMER_REGIMES = ["NaN", "delta", "ratio"]

# NOTE(review): the four lists below are not referenced by the filter in the
# visible cells (the corresponding columns are dropped or merged into `mode`
# before filtering) - confirm whether they are still needed.
TRANSFORMER_TR_FEATURES = ["True"]
TRANSFORMER_TR_TARGETS = ["True"]
REGIMES = ["multivariate", "global"]
CI = ["False", "True"]

In [None]:
# Path of the aggregated-results CSV whose name carries the `time_now` date stamp.
df_path = "agg_results_{}__normalized_True.csv".format(time_now)

In [None]:
# Load the aggregated results and normalise them into the "cleaned" table.
raw_df = pd.read_csv(df_path)

# Boolean columns are turned into the strings "True"/"False" so that they can be
# compared against string literals further down.
for col in raw_df.columns:
    if raw_df[col].dtype != bool:
        continue
    raw_df[col] = raw_df[col].astype(str)
    print(f"Converted {col} to object")

# Rows missing any of these test/validation metrics are discarded.
required_metrics = [
    f"{metric}_{split}"
    for split in ("test", "val")
    for metric in ("mae", "rmse", "fit_time", "forecast_time")
]

raw_df = (
    raw_df.dropna(subset=required_metrics)
    # Every remaining missing value becomes the literal string "NaN".
    .fillna("NaN")
    # Prefix the transformer-related columns with "transformer_".
    # "dateteime" is presumably how the source CSV spells that header - verify.
    .rename(
        columns={
            "dateteime": "datetime",
            "regime": "transformer_regime",
            "transformer": "transformer_name",
            "tr_target": "transformer_tr_target",
            "tr_features": "transformer_tr_features",
        }
    )
)

# `mult` ("False"/"True") -> `regime` ("global"/"multivariate").
is_global = raw_df["mult"] == "False"
raw_df["regime"] = np.where(is_global, "global", "multivariate")

# Append the model horizon to the strategy name.
horizon_suffix = "__model_horizon_" + raw_df["model_hor"].astype(str)
raw_df["strategy_time"] = raw_df["strategy_time"] + horizon_suffix

# Columns that are no longer needed once the derived columns exist.
raw_df = raw_df.drop(
    columns=[
        "model_hor",
        "hor",
        "hist",
        "mult",
        "transformer_tr_target",
        "transformer_tr_features",
    ]
)

# Build `mode` from `regime` and `ci`: "True" -> "CI", "False" -> "CM";
# both global variants collapse to plain "global".
ci_label = raw_df["ci"].replace({"True": "CI", "False": "CM"})
mode_label = (raw_df["regime"] + " " + ci_label).replace(
    {"global CI": "global", "global CM": "global"}
)
# PyBoost rows labelled "multivariate CI" are relabelled "multivariate CM".
pyboost_ci = (raw_df["model"] == "PyBoost") & (mode_label == "multivariate CI")
raw_df["mode"] = mode_label.mask(pyboost_ci, "multivariate CM")
raw_df = raw_df.drop(columns=["regime", "ci"])

# Keep only rows whose values appear in the allow-lists defined at the top of
# the notebook.
# NOTE(review): `filtered_df` is not used again in the visible cells; the
# de-duplication below and the later save operate on the unfiltered `raw_df`.
# Confirm whether the filter was meant to apply to the saved table.
allowed_values = {
    "model": MODELS,
    "dataset": DATASETS,
    "strategy_time": STRATEGY_TIMES,
    "transformer_name": TRANSFORMER_NAMES,
    "transformer_regime": TRANSFORMER_REGIMES,
    "datetime": DATETIMES,
    "id": IDS,
}
row_mask = pd.Series(True, index=raw_df.index, dtype=bool)
for column, allowed in allowed_values.items():
    row_mask = row_mask & raw_df[column].isin(allowed)
filtered_df = raw_df[row_mask]

# One row per experiment configuration (the first occurrence is kept).
config_keys = [
    "dataset",
    "model",
    "strategy_time",
    "datetime",
    "id",
    "transformer_name",
    "mode",
    "transformer_regime",
]
raw_df = raw_df.drop_duplicates(subset=config_keys)

In [None]:
# Print dtype and value counts for every column except the metric/timing ones.
excluded_metrics = {
    f"{metric}_{split}"
    for split in ("test", "val")
    for metric in ("mae", "rmse", "mape", "fit_time", "forecast_time")
}
cols_for_check = [name for name in raw_df.columns if name not in excluded_metrics]

for col in cols_for_check:
    column_values = raw_df[col]
    print("Column: ", col)
    print("Type of data: ", column_values.dtype)

    # Missing values are counted too.
    print(column_values.value_counts(dropna=False))
    print()

In [None]:
# Save next to the input file: the last four characters of `df_path` (".csv")
# are replaced with "_cleaned.csv".
cleaned_path = df_path[:-4] + "_cleaned.csv"
raw_df.to_csv(cleaned_path, index=False)