In [1]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
np.random.seed(42)
import pprint
import s3fs

import sys
sys.path.append("../../")
from robust_pca.benchmark import cross_validation, comparator
from robust_pca.imputations import models

In [2]:
# Load the raw parquet file into `df`.
# The path is an s3:// URL; `s3fs` is imported in the first cell, presumably so that
# pandas can resolve it — TODO confirm credentials/bucket access before a fresh run.
path = "s3://aifluence-data2/data/02_intermediate/chatelet_pdt/H.parq"
df = pd.read_parquet(path, engine="pyarrow")
# Alternative: read a local copy of the same file instead of going through S3.
#df = pd.read_parquet("../../../data/H.parq", engine="pyarrow")

**data preparation**

In [3]:
def nan_rule(df, columns, hours):
    """Fill NaN with 0 for rows whose timestamp falls in the given hours.

    Intended for "true" zeros, e.g. periods where no train runs, so a missing
    value really means zero rather than unknown.

    Parameters
    ----------
    df : pd.DataFrame
        "Spatiotemporal" dataframe whose index has a level named "datetime"
        holding datetime values. It is not modified.
    columns : List[str]
        Names of the columns in which NaN is replaced by zero.
    hours : List[int]
        Hours of the day (compared against the ``.hour`` of the "datetime"
        level) for which missing values are treated as true zeros.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the selected cells filled.
    """
    filled = df.copy()
    # Boolean row mask: True where the timestamp's hour is one of `hours`.
    timestamps = filled.index.get_level_values("datetime")
    in_selected_hours = timestamps.hour.isin(hours)
    # Only the masked rows / requested columns are touched; everything else keeps its NaN.
    patch = filled.loc[in_selected_hours, columns].fillna(0)
    filled.loc[in_selected_hours, columns] = patch
    return filled


# Build `data` from the raw frame in one pass: flatten the index, order the rows,
# index by (line, station, datetime, direction) so nan_rule can read the "datetime"
# level, zero-fill missing "load" during hours 0-3, then flatten the index again.
# (An earlier variant dropped "datetime" and renamed "datetime_theo_pdt" to "datetime".)
data = (
    df.reset_index()
    .sort_values(by=["station", "datetime", "position"])
    .set_index(["line", "station", "datetime", "direction"])
    .pipe(nan_rule, ["load"], [0, 1, 2, 3])
    .reset_index()
)

**Comparator**

In [189]:
# Imputation models to benchmark against each other.
models_to_test = [
    models.ImputeByMean(["datetime.dt.dayofweek"]),  # alternative key: "datetime.dt.round('10min')"
    models.ImputeByMedian(["datetime.dt.dayofweek"]),
    models.RandomImpute(),
    models.ImputeLOCF(),
    models.ImputeNOCB(),
    # models.ImputeKNN(),
    models.ImputeRPCA(
        method="temporal",
        # aggregate_time="10min",
        multivariate=False,
        # 24 * 12 * 7 = number of 5-minute steps in one week, matching the
        # 5-minute resampling applied to `dataset` below.
        n_rows=24 * 12 * 7,
        maxIter=10,
    ),
    # models.ImputeProphet(
    #     weekly_seasonality=True,
    #     yearly_seasonality=True,
    #     interval_width=0.95,
    # ),
    # models.ImputeRPCA(),
]

# Hyper-parameter search spaces, keyed by what look like model class names
# (presumably matched to the entries of `models_to_test` by the Comparator — TODO confirm).
# Entries for models that are currently disabled above are kept for convenience.
search_params = {
    "ImputeKNN": {"k": {"min": 2, "max": 3, "type": "Integer"}},
    "ImputeProphet": {
        "daily_seasonality": {"categories": [True, False], "type": "Categorical"},
    },
    "ImputeRPCA": {
        "lam": {"min": 0.5, "max": 1, "type": "Real"},
        "tau": {"min": 1, "max": 1.5, "type": "Real"},
    },
}

# Series under study: one line / station / direction.
line = "H"
station = "VALMONDOIS"
direction = "Paris"
cols_to_impute = ["load"]

is_selected = (
    (data["line"] == line)
    & (data["station"] == station)
    & (data["direction"] == direction)
)
# Select the "load" column explicitly instead of `.squeeze()`: squeeze would collapse a
# one-row selection to a scalar and break the resampling step.
# `sum(min_count=1)` yields NaN for a 5-minute bin with no observed value (empty or all-NaN)
# and the sum of the observed values otherwise, so missing periods stay missing instead of
# becoming 0. It replaces an equivalent but much slower per-bin Python lambda.
dataset = (
    data.loc[is_selected, ["datetime", "load"]]
    .set_index("datetime")["load"]
    .resample("5min")
    .sum(min_count=1)
)
print(dataset.isna().sum())
# The Comparator is given a DataFrame containing at least the columns in `cols_to_impute`
# (earlier variants of this cell indexed it by line/station/datetime/direction).
dataset = dataset.to_frame()

# Fraction passed to the Comparator as `prop_nan`
# (presumably the share of values artificially masked for scoring — TODO confirm).
prop_nan = 0.05

277134


In [190]:
# Benchmark every model in `models_to_test` on `dataset` and collect the per-model scores.
comparison = comparator.Comparator(
    dataset, prop_nan, models_to_test, cols_to_impute, search_params=search_params
)
results = comparison.compare()

# NOTE(review): in the recorded output every 'mae' is far larger than the matching 'rmse',
# which cannot happen for a mean absolute error — check whether the metric is summed rather
# than averaged. ImputeRPCA also reports a wmape of exactly 1.0, which deserves a closer look.
pp = pprint.PrettyPrinter(depth=4)
pp.pprint(results)

ImputeByMean
# nan before imputation: 277134
# nan after imputation...: 0
ImputeByMedian
# nan before imputation: 277134
# nan after imputation...: 0
RandomImpute
# nan before imputation: 277134
# nan after imputation...: 0
ImputeLOCF
# nan before imputation: 277134
# nan after imputation...: 0
ImputeNOCB
# nan before imputation: 277134
# nan after imputation...: 0
ImputeRPCA
# nan before imputation: 277134


  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  resu

# nan after imputation...: 0
{'ImputeByMean': {'mae': 40092.275, 'rmse': 1583.3222, 'wmape': 0.5709},
 'ImputeByMedian': {'mae': 38835.0, 'rmse': 1785.6184, 'wmape': 0.5443},
 'ImputeLOCF': {'mae': 49505.0, 'rmse': 2006.5276, 'wmape': 0.6972},
 'ImputeNOCB': {'mae': 49029.0, 'rmse': 2008.6276, 'wmape': 0.6773},
 'ImputeRPCA': {'mae': 73303.0, 'rmse': 2729.3393, 'wmape': 1.0},
 'RandomImpute': {'mae': 56980.0, 'rmse': 2371.4574, 'wmape': 0.7823}}


In [186]:
# `dataset` is already a DataFrame here (the Comparator setup cell ends with
# `dataset = dataset.to_frame()`), and DataFrame has no `.to_frame()` method, so calling it
# again fails on a fresh Restart & Run All. Display the frame directly instead.
dataset

Unnamed: 0_level_0,load
datetime,Unnamed: 1_level_1
2019-01-01 04:45:00,25.0
2019-01-01 04:50:00,
2019-01-01 04:55:00,
2019-01-01 05:00:00,
2019-01-01 05:05:00,
...,...
2021-11-06 21:25:00,
2021-11-06 21:30:00,
2021-11-06 21:35:00,24.0
2021-11-06 21:40:00,
