In [6]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
np.random.seed(42)
import pprint
import s3fs

import sys
sys.path.append("../../")
from robust_pca.benchmark import comparator
from robust_pca.imputations import models

In [7]:
# Load the intermediate parquet extract from S3 (file "H.parq" — presumably the data for line H).
# NOTE(review): reading an s3:// URL presumably relies on the s3fs package imported in the
# first cell and on AWS credentials being available in the environment — confirm.
path = "s3://aifluence-data2/data/02_intermediate/chatelet_pdt/H.parq"
df = pd.read_parquet(path, engine="pyarrow")

  warn('Trying to detect encoding from a tiny portion of ({}) byte(s).'.format(length))


**Data preparation**

In [8]:
def nan_rule(df, columns, hours):
    """Replace NaN values by zero when they are "true" zeros.

    For some hours of the day (e.g. when no train runs during this period) a
    missing value really means zero, so it is filled with 0 in the selected
    columns. Rows falling in any other hour are left unchanged.

    Parameters
    ----------
    df : pd.DataFrame
        "spatiotemporal" dataframe whose index contains a level named
        ``"datetime"`` holding datetime values.
    columns : List[str]
        list of column names in which NaNs have to be replaced by zero
    hours : List[int]
        list of hours of the day (integers, matched against the ``.hour`` of
        the ``"datetime"`` level) for which we know there is no train
        -> true zeros

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the NaNs filled; ``df`` itself is not modified.
    """
    data = df.copy()
    # Build the boolean row mask once; it is needed on both sides of the assignment.
    in_zero_hours = data.index.get_level_values("datetime").hour.isin(hours)
    data.loc[in_zero_hours, columns] = data.loc[in_zero_hours, columns].fillna(0)
    return data


# Flatten the index, order the rows, then fill the "true zero" loads for hours 0-3
# before turning the index levels back into regular columns.
# NOTE(review): using `datetime_theo_pdt` as the time axis instead (drop `datetime`,
# rename `datetime_theo_pdt` to `datetime`) was tried here and left disabled.
data = (
    df.reset_index()
    .sort_values(by=["station", "datetime", "position"])
    .set_index(["line", "station", "datetime", "direction"])
    .pipe(nan_rule, ["load"], [0, 1, 2, 3])
    .reset_index()
)

In [9]:
# Preview the first rows of the prepared frame (index levels are regular columns again).
data.head()

Unnamed: 0,line,station,datetime,direction,datetime_theo_pdt,position,index,train,date_count,n_coaches,mission,datetime_theo_cht,loading,unloading,load,origin_cht,destination_cht,n_rows_by_stop,ORIGIN,DESTINATION
0,H,AUVERS SUR OISE,2019-01-01 07:15:46,Suburb,2019-01-01 07:15:20,1.0,8399.0,121009,2019-01-01,1,TSOL,2019-01-01 07:14:40,0.0,0.0,6.0,Pontoise,Creil,1,PERSAN BEAUMONT,CREIL
1,H,AUVERS SUR OISE,2019-01-01 07:41:13,Paris,2019-01-01 07:40:20,1.0,8400.0,121012,2019-01-01,1,TOLI,2019-01-01 07:39:40,4.0,1.0,20.0,Creil,Pontoise,1,PERSAN BEAUMONT,PONTOISE
2,H,AUVERS SUR OISE,2019-01-01 08:14:16,Suburb,2019-01-01 08:15:20,1.0,8401.0,121015,2019-01-01,1,TSOL,2019-01-01 08:14:40,1.0,3.0,16.0,Pontoise,Creil,1,PERSAN BEAUMONT,CREIL
3,H,AUVERS SUR OISE,2019-01-01 08:40:22,Paris,2019-01-01 08:40:40,1.0,8402.0,121020,2019-01-01,1,TIMA,2019-01-01 08:40:00,0.0,2.0,11.0,Persan-Beaumont,Pontoise,1,PERSAN BEAUMONT,PONTOISE
4,H,AUVERS SUR OISE,2019-01-01 09:14:37,Suburb,2019-01-01 09:15:20,1.0,8403.0,121023,2019-01-01,1,TSOL,2019-01-01 09:14:40,2.0,6.0,20.0,Pontoise,Creil,1,PERSAN BEAUMONT,CREIL


In [10]:
# Count the missing `load` values on rows with n_coaches == 2, for position 2 then position 1.
for position in (2, 1):
    rows = (data["position"] == position) & (data["n_coaches"] == 2)
    print(data.loc[rows, "load"].isna().sum())

0
0


In [11]:
# Which `position` values occur on rows with n_coaches == 2 and a missing `load`?
missing_load_two_coaches = (data["n_coaches"] == 2) & data["load"].isna()
data.loc[missing_load_two_coaches, "position"].unique()

array([nan])

**Comparator**

In [24]:
# Imputation models handed to the comparator; uncomment an entry to include it in the run.
models_to_test = [
    # models.ImputeByMean(["datetime.dt.dayofweek", "datetime.dt.round('10min')"]),
    # models.ImputeByMedian(["datetime.dt.dayofweek", "datetime.dt.round('10min')"]),
    # models.RandomImpute(),
    # models.ImputeLOCF(),
    # models.ImputeNOCB(),
    # models.ImputeByInterpolation(**{"method": "linear"}),
    # models.ImputeBySpline(),
    # models.ImputeKNN(),
    models.ImputeRPCA(
        method="temporal",
        aggregate_time="10min",
        multivariate=False,
        # NOTE(review): 24 * 12 * 7 is the number of 5-minute steps in one week, whereas
        # aggregate_time is "10min" (one week = 24 * 6 * 7 steps) — confirm n_rows is intended.
        n_rows=24 * 12 * 7,
        maxIter=10,
        tau=1,
        lam=0.8,
    ),
    # models.ImputeProphet(**{
    #     "weekly_seasonality": True,
    #     "yearly_seasonality": True,
    #     "interval_width": 0.95,
    # })
    # models.ImputeRPCA(),
]

# Hyper-parameter search spaces passed to the comparator, keyed by model name.
# Each parameter is described by a "type" plus either "min"/"max" bounds or a list of "categories".
search_params = {
    "ImputeKNN": {
        "k": {"min": 2, "max": 3, "type": "Integer"},
    },
    "ImputeProphet": {
        "daily_seasonality": {"categories": [True, False], "type": "Categorical"},
    },
    # "ImputeRPCA": {
    #     "lam": {"min": 0.5, "max": 1, "type": "Real"},
    #     "tau": {"min": 1, "max": 1.5, "type": "Real"},
    # },
}

# Series under study: a single (line, station, direction) combination.
line = "H"
station = "VALMONDOIS"
direction = "Paris"
cols_to_impute = ["load"]

# dataset = dataframe with a multi-index and, at least, the columns in cols_to_impute.
# NOTE(review): two alternatives were tried here and left disabled — a datetime-only series
# resampled to 5 minutes (NaN kept when a whole bin is NaN, sum otherwise), and a
# ["datetime", "direction"] index instead of the four-level one below.
dataset = (
    data.loc[
        data["line"].eq(line)
        & data["station"].eq(station)
        & data["direction"].eq(direction)
    ]
    .set_index(["line", "station", "datetime", "direction"])
)

# Settings passed to the Comparator together with the dataset.
prop_nan = 0.05
filter_value_nan = 20

In [79]:
# Run the benchmark: build a Comparator from the dataset, the models and the settings defined
# in the previous cell, then pretty-print whatever compare() returns (nesting limited to depth 4).
# NOTE(review): the recorded run of this cell stops with
# "ValueError: cannot join with no overlapping index names" — presumably a frame produced during
# imputation no longer carries the index level names of `dataset`; check Comparator / ImputeRPCA.
comparison = comparator.Comparator(
          dataset,
          prop_nan, 
          models_to_test, 
          cols_to_impute,
          search_params=search_params,
          filter_value_nan=filter_value_nan,
)
results = comparison.compare()

pp = pprint.PrettyPrinter(depth=4)
pp.pprint(results)

ImputeRPCA
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_entropy successed !!!
impute_by_max_ent

ValueError: cannot join with no overlapping index names