Merge pull request #62 from predict-idlab/bound_bug

🐛 fix bug with bound_method + ✨ new integrations
predict-idlab · Jun 7, 2022 · 468f2b3 · 468f2b3
2 parents 2df1d47 + d19ae40
commit 468f2b3
Show file tree

Hide file tree

Showing 11 changed files with 155 additions and 38 deletions.
diff --git a/examples/README.md b/examples/README.md
@@ -21,6 +21,7 @@ tsflex is a domain independent package for time series processing & feature extr
 | Climate modelling | [Ozone level detection](https://archive.ics.uci.edu/ml/datasets/Ozone%20Level%20Detection) | [example_ozone_level_detection.ipynb](https://github.com/predict-idlab/tsflex/blob/main/examples/example_ozone_level_detection.ipynb) |  
 | Household data | [Electric power consumption](https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption) | [example_power_consumption_estimation.ipynb](example_power_consumption_estimation.ipynb) |
 | Clinical data | [Sleep-EDF Database Expanded](https://physionet.org/content/sleep-edfx/1.0.0/) | [example_sleep_staging.ipynb](example_sleep_staging.ipynb) |
+| Kaggle competition | [Tabular Playground Series - Apr 2022](https://www.kaggle.com/competitions/tabular-playground-series-apr-2022) | [tpsapr22-best-non-dl-model-tsflex-powershap](https://www.kaggle.com/code/jeroenvdd/tpsapr22-best-non-dl-model-tsflex-powershap) |
 
 
 <!-- | Wearable data | [WESAD - Wearable stress & affect detection](https://archive.ics.uci.edu/ml/datasets/WESAD+%28Wearable+Stress+and+Affect+Detection%29) | [verbose_example.ipynb](verbose_example.ipynb) - <br>`TODO` create a notebook that uses the whole wesad dataset -->

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "tsflex"
-version = "0.2.3.6"  # Do not forget to update the __init__.py __version__ variable
+version = "0.2.3.7"  # Do not forget to update the __init__.py __version__ variable
 description = "Toolkit for flexible processing & feature extraction on time-series data"
 authors = ["Jonas Van Der Donckt, Jeroen Van Der Donckt, Emiel Deprost"]
 readme = "README.md"
@@ -33,6 +33,7 @@ tsfresh = "^0.18.0"
 tsfel = "^0.1.4"
 statsmodels = "0.12.2"  # Added bc of this: https://github.com/blue-yonder/tsfresh/issues/897
 fastparquet = "0.8.0"  # Lock to this version to resolve issue on macos with python 3.7
+catch22 = "^0.2.0"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]

diff --git a/tests/test_features_feature_collection.py b/tests/test_features_feature_collection.py
@@ -960,6 +960,29 @@ def windowed_diff(x1, x2):
     assert np.all(res["EDA|TMP__windowed_diff"+p].values == manual_diff)
 
 
+### Test feature extraction length
+
+def test_feature_extraction_length():  # plain and vectorized np.max must give equally long, identical outputs
+    s = pd.Series(np.arange(10), name="dummy")  # 10-sample integer series named "dummy"
+    assert len(s) == 10
+
+    fc = FeatureCollection(
+        feature_descriptors=[
+            FeatureDescriptor(np.max, "dummy", 2, 2),  # plain np.max; the `2, 2` args are presumably window & stride - confirm
+            FeatureDescriptor(  # same function and arguments, but wrapped as a vectorized FuncWrapper
+                FuncWrapper(np.max, output_names="max_", vectorized=True, axis=-1),
+                "dummy",  2, 2,
+            )
+        ]
+    )
+    res = fc.calculate(s)
+
+    assert len(res) == 2  # one output per feature descriptor
+    assert (len(res[0]) == 5) and (len(res[1]) == 5)  # both variants yield 5 values for the 10 samples
+    assert np.all(res[0].index == res[1].index)  # ... on the same output index
+    assert np.all(res[0].values == res[1].values)  # ... with the same feature values
+
+
 ### Test 'error' use-cases
 
 
@@ -1193,7 +1216,7 @@ def test_serialization(dummy_data):
 
     df_tmp = dummy_data["TMP"].reset_index(drop=True)
     df_eda = dummy_data["EDA"].reset_index(drop=True)
-    out = fc.calculate([df_tmp, df_eda], window_idx="end", return_df=True)
+    out = fc.calculate([df_tmp, df_eda], return_df=True)
     col_order = out.columns
 
     save_path = Path("featurecollection.pkl")

diff --git a/tests/test_features_integration.py b/tests/test_features_integration.py
@@ -20,6 +20,7 @@
     tsfel_feature_dict_wrapper,
     tsfresh_combiner_wrapper,
     tsfresh_settings_wrapper,
+    catch22_wrapper,
 )
 
 
@@ -281,3 +282,20 @@ def test_tsfel_feature_dict_wrapper(dummy_data):
 
     res_df = feature_collection.calculate(dummy_data.first("15min"), return_df=True)
     assert (res_df.shape[0] > 0) and (res_df.shape[1]) > 0
+
+
+## CATCH22
+
+def test_catch22_all_features(dummy_data):
+    # Smoke test for the catch22 integration: wrap `catch22_all` with `catch22_wrapper`
+    from catch22 import catch22_all  # imported inside the test body rather than at module level
+
+    catch22_feats = MultipleFeatureDescriptors(
+        functions=catch22_wrapper(catch22_all),
+        series_names=["EDA", "TMP"],  # the wrapped function is applied to both series
+        windows="2.5min", strides="10min",
+    )
+    feature_collection = FeatureCollection(catch22_feats)
+
+    res_df = feature_collection.calculate(dummy_data.first("15min"), return_df=True)  # only the first 15 minutes
+    assert (res_df.shape[0] > 0) and (res_df.shape[1]) > 0  # result has at least one row and one column
diff --git a/tests/test_strided_rolling.py b/tests/test_strided_rolling.py
@@ -62,20 +62,31 @@ def stroll_apply_dummy_func(data, window, stride) -> pd.DataFrame:
         stroll = SequenceStridedRolling(data, window, stride, window_idx="end")
         return stroll.apply_func(FuncWrapper(np.min))
 
+    out = stroll_apply_dummy_func(df_eda[:2198], window=1000, stride=200)
+    assert out.index[-1] == 2000
+    out = stroll_apply_dummy_func(df_eda[:2199], window=1000, stride=200)
+    assert out.index[-1] == 2000
+    out = stroll_apply_dummy_func(df_eda[:2200], window=1000, stride=200)
+    assert out.index[-1] == 2200
     out = stroll_apply_dummy_func(df_eda[:2201], window=1000, stride=200)
     assert out.index[-1] == 2200
-
-    out = stroll_apply_dummy_func(df_eda[:2399], window=1000, stride=200)
+    out = stroll_apply_dummy_func(df_eda[:2202], window=1000, stride=200)
     assert out.index[-1] == 2200
 
-    # -> slicing is include left bound, discard right bound -> so UNTIL index 2200
-    # i.e. last index in sequence is 2199 -> last valid full range 2200
-    out = stroll_apply_dummy_func(df_eda[:2400], window=1000, stride=200)
-    assert out.index[-1] == 2200
-    out = stroll_apply_dummy_func(df_eda[:2401], window=1000, stride=200)
-    assert out.index[-1] == 2400
-    out = stroll_apply_dummy_func(df_eda[:2530], window=1000, stride=200)
-    assert out.index[-1] == 2400
+    def stroll_apply_dummy_func(data, window, stride) -> pd.DataFrame:
+        stroll = SequenceStridedRolling(data, window, stride, window_idx="begin")
+        return stroll.apply_func(FuncWrapper(np.min))
+
+    out = stroll_apply_dummy_func(df_eda[:2198], window=1000, stride=200)
+    assert out.index[-1] == 1000
+    out = stroll_apply_dummy_func(df_eda[:2199], window=1000, stride=200)
+    assert out.index[-1] == 1000
+    out = stroll_apply_dummy_func(df_eda[:2200], window=1000, stride=200)
+    assert out.index[-1] == 1200
+    out = stroll_apply_dummy_func(df_eda[:2201], window=1000, stride=200)
+    assert out.index[-1] == 1200
+    out = stroll_apply_dummy_func(df_eda[:2202], window=1000, stride=200)
+    assert out.index[-1] == 1200
 
 
 def test_time_stroll_last_window_full(dummy_data):

diff --git a/tsflex/__init__.py b/tsflex/__init__.py
@@ -9,7 +9,7 @@
 
 __docformat__ = 'numpy'
 __author__ = "Jonas Van Der Donckt, Jeroen Van Der Donckt, Emiel Deprost"
-__version__ = '0.2.3.6'
+__version__ = '0.2.3.7'
 __pdoc__ = {
     # do not show the utils module
     'tsflex.utils': False,

diff --git a/tsflex/features/feature_collection.py b/tsflex/features/feature_collection.py
@@ -256,7 +256,7 @@ def calculate(
         self,
         data: Union[pd.Series, pd.DataFrame, List[Union[pd.Series, pd.DataFrame]]],
         return_df: Optional[bool] = False,
-        window_idx: Optional[str] = "end",
+        window_idx: Optional[str] = "begin",
         bound_method: Optional[str] = "inner",
         approve_sparsity: Optional[bool] = False,
         show_progress: Optional[bool] = False,
@@ -292,7 +292,7 @@ def calculate(
         window_idx : str, optional
             The window's index position which will be used as index for the
             feature_window aggregation. Must be either of: `["begin", "middle", "end"]`.
-            by default "end". All features in this collection will use the same
+            by default "begin". All features in this collection will use the same
             window_idx.
         bound_method: str, optional
             The start-end bound methodology which is used to generate the slice ranges
@@ -367,7 +367,7 @@ def calculate(
         # determine the bounds of the series dict items and slice on them
         start, end = _determine_bounds(bound_method, list(series_dict.values()))
         series_dict = {
-            n: s[s.index.dtype.type(start) : s.index.dtype.type(end)]
+            n: s.loc[s.index.dtype.type(start) : s.index.dtype.type(end)]  # TODO: check memory efficiency of this
             for n, s, in series_dict.items()
         }
 

diff --git a/tsflex/features/integrations.py b/tsflex/features/integrations.py
@@ -42,7 +42,7 @@ def wrap_func(x: np.ndarray):
     output_names = _get_name(func) if func_name is None else func_name
     # A bit hacky (hard coded), bc hist is only func that returns multiple values
     if hasattr(func, "bins"):
-        output_names = [output_names+f"_bin{idx}" for idx in range(1, func.bins+1)]
+        output_names = [output_names + f"_bin{idx}" for idx in range(1, func.bins + 1)]
     return FuncWrapper(wrap_func, output_names=output_names)
 
 
@@ -110,7 +110,7 @@ def tsfel_feature_dict_wrapper(features_dict: Dict) -> List[Callable]:
     .. Note::
         This wrapper wraps the output of tsfel its `get_features_by_domain` or
         `get_features_by_tag`. <br>
-        Se more [here](https://github.com/fraunhoferportugal/tsfel/blob/master/tsfel/feature_extraction/features_settings.py).
+        See more [here](https://github.com/fraunhoferportugal/tsfel/blob/master/tsfel/feature_extraction/features_settings.py).
 
     Example
     -------
@@ -153,14 +153,14 @@ def get_output_names(config: dict):
             if nb_outputs == 1:
                 return func_name
             else:
-                return [func_name+f"_{idx}" for idx in range(1,nb_outputs+1)]
+                return [func_name + f"_{idx}" for idx in range(1, nb_outputs + 1)]
         output_param = eval(config["parameters"][nb_outputs])
-        return [func_name+f"_{nb_outputs}=v" for v in output_param]
+        return [func_name + f"_{nb_outputs}={v}" for v in output_param]
 
     functions = []
     tsfel_mod = importlib.import_module("tsfel.feature_extraction")
-    for donain_feats in features_dict.values():  # Iterate over feature domains
-        for config in donain_feats.values():  # Iterate over function configs
+    for domain_feats in features_dict.values():  # Iterate over feature domains
+        for config in domain_feats.values():  # Iterate over function configs
             func = getattr(tsfel_mod, config["function"].split(".")[-1])
             params = config["parameters"] if config["parameters"] else {}
             output_names = get_output_names(config)
@@ -223,7 +223,7 @@ def tsfresh_settings_wrapper(settings: Dict) -> List[Callable]:
         This wrapper wraps the output of tsfresh its `MinimalFCParameters()`, 
         `EfficientFCParameters()`, `IndexBasedFCParameters()`, 
         `TimeBasedFCParameters()`, or `ComprehensiveFCParameters()`. <br>
-        Se more [here](https://github.com/blue-yonder/tsfresh/blob/main/tsfresh/feature_extraction/settings.py).
+        See more [here](https://github.com/blue-yonder/tsfresh/blob/main/tsfresh/feature_extraction/settings.py).
 
     Example
     -------
@@ -264,5 +264,63 @@ def tsfresh_settings_wrapper(settings: Dict) -> List[Callable]:
             functions.append(tsfresh_combiner_wrapper(func, param))
         else:
             for kwargs in param:
-                functions.append(FuncWrapper(func, output_names=f"{func.__name__}_{str(kwargs)}", **kwargs))
+                functions.append(
+                    FuncWrapper(
+                        func, output_names=f"{func.__name__}_{str(kwargs)}", **kwargs
+                    )
+                )
     return functions
+
+
+# ------------------------------------- CATCH22 -------------------------------------
+def catch22_wrapper(catch22_all: Callable) -> FuncWrapper:
+    """Wrapper enabling compatibility with catch22.
+
+    [catch22](https://github.com/chlubba/catch22) is a collection of 22 time series
+    features that are a high-performing subset of the over 7000 features in hctsa.
+
+    By using this wrapper, we can plug the catch22 features in a tsflex
+    ``FeatureCollection``.
+    This makes it easy to extract the catch22 features while leveraging the
+    flexibility of tsflex.
+
+    .. Note::
+        This wrapper wraps the `catch22_all` function from `catch22`.
+        See more [here](https://github.com/chlubba/catch22/blob/master/wrap_Python/catch22/catch22.py).
+
+    .. Note::
+        `catch22_all` is called once, on a single-sample input, when the wrapper is
+        created; only the `"names"` entry of that result is used (as output names).
+
+    Example
+    -------
+    ```python
+    from tsflex.features import FeatureCollection, MultipleFeatureDescriptors
+    from tsflex.features.integrations import catch22_wrapper
+    from catch22 import catch22_all
+
+    catch22_feats = MultipleFeatureDescriptors(
+        functions=catch22_wrapper(catch22_all),
+        series_names=["sig_0", "sig_1"],  # list of signal names
+        windows="15min", strides="2min",
+    )
+
+    fc = FeatureCollection(catch22_feats)
+    fc.calculate(data)  # calculate the features on your data
+    ```
+
+    Parameters
+    ----------
+    catch22_all: Callable
+        The `catch22_all` function from the `catch22` package.
+
+    Returns
+    -------
+    FuncWrapper
+        The wrapped `catch22_all` function that is compatible with tsflex.
+        This FuncWrapper will output the 22 catch22 features.
+
+    """
+    catch22_names = catch22_all([0])["names"]  # probe call on a dummy input, only to read the feature names
+
+    def wrap_catch22_all(x):
+        return catch22_all(x)["values"]  # keep only the feature values; the names are passed separately below
+
+    wrap_catch22_all.__name__ = "[wrapped]__" + _get_name(catch22_all)  # name the wrapper after the wrapped function
+    return FuncWrapper(wrap_catch22_all, output_names=catch22_names)
diff --git a/tsflex/features/segmenter/strided_rolling.py b/tsflex/features/segmenter/strided_rolling.py
@@ -112,7 +112,7 @@ def __init__(
         self.data_type = func_data_type
 
         # 0. Standardize the input
-        series_list: List[pd.Series] = to_series_list(data)
+        series_list: List[pd.Series] = to_series_list(data)  # TODO: isn't it always a list of series?
         self.series_dtype = AttributeParser.determine_type(series_list)
         self.series_key: Tuple[str, ...] = tuple([str(s.name) for s in series_list])
 
@@ -142,17 +142,12 @@ def __init__(
 
         # 4. Check the sparsity assumption
         if not self.approve_sparsity:
-            qs = [0, 0.1, 0.5, 0.9, 1]
             for container in self.series_containers:
-                series_idx_stats = np.quantile(
-                    container.end_indexes - container.start_indexes, q=qs
-                )
-                q_str = ", ".join([f"q={q}: {v}" for q, v in zip(qs, series_idx_stats)])
                 # Warn when min != max
-                if not all(series_idx_stats == series_idx_stats[-1]):
+                if np.ptp(container.end_indexes - container.start_indexes) != 0:
                     warnings.warn(
                         f"There are gaps in the sequence of the {container.name}"
-                        f"-series;\n \t Quantiles of nb values in window: {q_str}",
+                        f"-series!",
                         RuntimeWarning,
                     )
 
@@ -382,7 +377,7 @@ def _construct_output_index(self, series: pd.Series) -> pd.Index:
         window_offset = self._get_window_offset(self.window)
         # bool which indicates whether the `end` lies on the boundary
         # and as arange does not include the right boundary -> use it to enlarge `stop`
-        boundary = (self.end - self.start - self.window) % self.stride == 0
+        boundary = (self.end + 1 - self.start - self.window) % self.stride <= 1
         return pd.Index(
             data=np.arange(
                 start=self.start + window_offset,
@@ -525,7 +520,6 @@ def _construct_output_index(self, series: pd.Series) -> pd.DatetimeIndex:
         # bool which indicates whether the `end` lies on the boundary
         # and as arange does not include the right boundary -> use it to enlarge `stop`
         boundary = (self.end - self.start - self.window) % self.stride == 0
-
         return series.iloc[
             np.arange(
                 start=int(window_offset),
@@ -581,7 +575,7 @@ def _sliding_strided_window_1d(data: np.ndarray, window: int, step: int):
     assert (step >= 1) & (window < len(data))
 
     shape = [
-        np.ceil(len(data) / step - window / step).astype(int),
+        np.floor(len(data) / step - window / step + 1).astype(int),
         window,
     ]
 

diff --git a/tsflex/features/utils.py b/tsflex/features/utils.py
@@ -141,7 +141,7 @@ def _make_single_func_robust(
 
     output_names = func_wrapper_kwargs.get("output_names")
 
-    def wrap_func(*series: Union[np.ndarray, pd.Series], **kwargs) -> FuncWrapper:
+    def wrap_func(*series: Union[np.ndarray, pd.Series], **kwargs) -> Callable:
         if not passthrough_nans:
             series = [s[~np.isnan(s)] for s in series]
         if any([len(s) < min_nb_samples for s in series]):