Skip to content

Commit

Permalink
Merge pull request #62 from predict-idlab/bound_bug
Browse files Browse the repository at this point in the history
🐛 fix bug with bound_method + ✨ new integrations
  • Loading branch information
jonasvdd committed Jun 7, 2022
2 parents 2df1d47 + d19ae40 commit 468f2b3
Show file tree
Hide file tree
Showing 11 changed files with 155 additions and 38 deletions.
1 change: 1 addition & 0 deletions examples/README.md
Expand Up @@ -21,6 +21,7 @@ tsflex is a domain independent package for time series processing & feature extr
| Climate modelling | [Ozone level detection](https://archive.ics.uci.edu/ml/datasets/Ozone%20Level%20Detection) | [example_ozone_level_detection.ipynb](https://github.com/predict-idlab/tsflex/blob/main/examples/example_ozone_level_detection.ipynb) |
| Household data | [Electric power consumption](https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption) | [example_power_consumption_estimation.ipynb](example_power_consumption_estimation.ipynb) |
| Clinical data | [Sleep-EDF Database Expanded](https://physionet.org/content/sleep-edfx/1.0.0/) | [example_sleep_staging.ipynb](example_sleep_staging.ipynb) |
| Kaggle competition | [Tabular Playground Series - Apr 2022](https://www.kaggle.com/competitions/tabular-playground-series-apr-2022) | [tpsapr22-best-non-dl-model-tsflex-powershap](https://www.kaggle.com/code/jeroenvdd/tpsapr22-best-non-dl-model-tsflex-powershap) |


<!-- | Wearable data | [WESAD - Wearable stress & affect detection](https://archive.ics.uci.edu/ml/datasets/WESAD+%28Wearable+Stress+and+Affect+Detection%29) | [verbose_example.ipynb](verbose_example.ipynb) - <br>`TODO` create a notebook that uses the whole wesad dataset -->
Expand Down
15 changes: 13 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "tsflex"
version = "0.2.3.6" # Do not forget to update the __init__.py __version__ variable
version = "0.2.3.7" # Do not forget to update the __init__.py __version__ variable
description = "Toolkit for flexible processing & feature extraction on time-series data"
authors = ["Jonas Van Der Donckt, Jeroen Van Der Donckt, Emiel Deprost"]
readme = "README.md"
Expand Down Expand Up @@ -33,6 +33,7 @@ tsfresh = "^0.18.0"
tsfel = "^0.1.4"
statsmodels = "0.12.2" # Added bc of this: https://github.com/blue-yonder/tsfresh/issues/897
fastparquet = "0.8.0" # Lock to this version to resolve issue on macos with python 3.7
catch22 = "^0.2.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
Expand Down
25 changes: 24 additions & 1 deletion tests/test_features_feature_collection.py
Expand Up @@ -960,6 +960,29 @@ def windowed_diff(x1, x2):
assert np.all(res["EDA|TMP__windowed_diff"+p].values == manual_diff)


### Test feature extraction length

def test_feature_extraction_length():
    """Compare a plain and a vectorized ``np.max`` feature on a 10-sample series.

    Both descriptors are built with the same ``"dummy", 2, 2`` arguments, so
    their outputs are expected to have the same length (5), index and values.
    """
    s = pd.Series(np.arange(10), name="dummy")
    assert len(s) == 10

    # Same function twice: once passed as-is, once wrapped as a vectorized call.
    plain_max = FeatureDescriptor(np.max, "dummy", 2, 2)
    vectorized_max = FeatureDescriptor(
        FuncWrapper(np.max, output_names="max_", vectorized=True, axis=-1),
        "dummy",
        2,
        2,
    )
    fc = FeatureCollection(feature_descriptors=[plain_max, vectorized_max])
    res = fc.calculate(s)

    assert len(res) == 2
    assert len(res[0]) == 5
    assert len(res[1]) == 5
    assert np.all(res[0].index == res[1].index)
    assert np.all(res[0].values == res[1].values)


### Test 'error' use-cases


Expand Down Expand Up @@ -1193,7 +1216,7 @@ def test_serialization(dummy_data):

df_tmp = dummy_data["TMP"].reset_index(drop=True)
df_eda = dummy_data["EDA"].reset_index(drop=True)
out = fc.calculate([df_tmp, df_eda], window_idx="end", return_df=True)
out = fc.calculate([df_tmp, df_eda], return_df=True)
col_order = out.columns

save_path = Path("featurecollection.pkl")
Expand Down
18 changes: 18 additions & 0 deletions tests/test_features_integration.py
Expand Up @@ -20,6 +20,7 @@
tsfel_feature_dict_wrapper,
tsfresh_combiner_wrapper,
tsfresh_settings_wrapper,
catch22_wrapper,
)


Expand Down Expand Up @@ -281,3 +282,20 @@ def test_tsfel_feature_dict_wrapper(dummy_data):

res_df = feature_collection.calculate(dummy_data.first("15min"), return_df=True)
assert (res_df.shape[0] > 0) and (res_df.shape[1]) > 0


## CATCH22

def test_catch22_all_features(dummy_data):
    """Smoke-test the catch22 integration on the ``EDA`` and ``TMP`` series."""
    from catch22 import catch22_all

    wrapped_catch22 = catch22_wrapper(catch22_all)
    descriptors = MultipleFeatureDescriptors(
        functions=wrapped_catch22,
        series_names=["EDA", "TMP"],
        windows="2.5min",
        strides="10min",
    )
    fc = FeatureCollection(descriptors)

    # Only the first 15 minutes of the fixture are used.
    res_df = fc.calculate(dummy_data.first("15min"), return_df=True)
    assert res_df.shape[0] > 0 and res_df.shape[1] > 0
31 changes: 21 additions & 10 deletions tests/test_strided_rolling.py
Expand Up @@ -62,20 +62,31 @@ def stroll_apply_dummy_func(data, window, stride) -> pd.DataFrame:
stroll = SequenceStridedRolling(data, window, stride, window_idx="end")
return stroll.apply_func(FuncWrapper(np.min))

out = stroll_apply_dummy_func(df_eda[:2198], window=1000, stride=200)
assert out.index[-1] == 2000
out = stroll_apply_dummy_func(df_eda[:2199], window=1000, stride=200)
assert out.index[-1] == 2000
out = stroll_apply_dummy_func(df_eda[:2200], window=1000, stride=200)
assert out.index[-1] == 2200
out = stroll_apply_dummy_func(df_eda[:2201], window=1000, stride=200)
assert out.index[-1] == 2200

out = stroll_apply_dummy_func(df_eda[:2399], window=1000, stride=200)
out = stroll_apply_dummy_func(df_eda[:2202], window=1000, stride=200)
assert out.index[-1] == 2200

# -> slicing includes the left bound and discards the right bound -> so UNTIL index 2200
# i.e., the last index in the sequence is 2199 -> the last valid full range is 2200
out = stroll_apply_dummy_func(df_eda[:2400], window=1000, stride=200)
assert out.index[-1] == 2200
out = stroll_apply_dummy_func(df_eda[:2401], window=1000, stride=200)
assert out.index[-1] == 2400
out = stroll_apply_dummy_func(df_eda[:2530], window=1000, stride=200)
assert out.index[-1] == 2400
def stroll_apply_dummy_func(data, window, stride) -> pd.DataFrame:
stroll = SequenceStridedRolling(data, window, stride, window_idx="begin")
return stroll.apply_func(FuncWrapper(np.min))

out = stroll_apply_dummy_func(df_eda[:2198], window=1000, stride=200)
assert out.index[-1] == 1000
out = stroll_apply_dummy_func(df_eda[:2199], window=1000, stride=200)
assert out.index[-1] == 1000
out = stroll_apply_dummy_func(df_eda[:2200], window=1000, stride=200)
assert out.index[-1] == 1200
out = stroll_apply_dummy_func(df_eda[:2201], window=1000, stride=200)
assert out.index[-1] == 1200
out = stroll_apply_dummy_func(df_eda[:2202], window=1000, stride=200)
assert out.index[-1] == 1200


def test_time_stroll_last_window_full(dummy_data):
Expand Down
2 changes: 1 addition & 1 deletion tsflex/__init__.py
Expand Up @@ -9,7 +9,7 @@

__docformat__ = 'numpy'
__author__ = "Jonas Van Der Donckt, Jeroen Van Der Donckt, Emiel Deprost"
__version__ = '0.2.3.6'
__version__ = '0.2.3.7'
__pdoc__ = {
# do not show the utils module
'tsflex.utils': False,
Expand Down
6 changes: 3 additions & 3 deletions tsflex/features/feature_collection.py
Expand Up @@ -256,7 +256,7 @@ def calculate(
self,
data: Union[pd.Series, pd.DataFrame, List[Union[pd.Series, pd.DataFrame]]],
return_df: Optional[bool] = False,
window_idx: Optional[str] = "end",
window_idx: Optional[str] = "begin",
bound_method: Optional[str] = "inner",
approve_sparsity: Optional[bool] = False,
show_progress: Optional[bool] = False,
Expand Down Expand Up @@ -292,7 +292,7 @@ def calculate(
window_idx : str, optional
The window's index position which will be used as index for the
feature_window aggregation. Must be either of: `["begin", "middle", "end"]`.
by default "end". All features in this collection will use the same
by default "begin". All features in this collection will use the same
window_idx.
bound_method: str, optional
The start-end bound methodology which is used to generate the slice ranges
Expand Down Expand Up @@ -367,7 +367,7 @@ def calculate(
# determine the bounds of the series dict items and slice on them
start, end = _determine_bounds(bound_method, list(series_dict.values()))
series_dict = {
n: s[s.index.dtype.type(start) : s.index.dtype.type(end)]
n: s.loc[s.index.dtype.type(start) : s.index.dtype.type(end)] # TODO: check memory efficiency of ths
for n, s, in series_dict.items()
}

Expand Down
74 changes: 66 additions & 8 deletions tsflex/features/integrations.py
Expand Up @@ -42,7 +42,7 @@ def wrap_func(x: np.ndarray):
output_names = _get_name(func) if func_name is None else func_name
# A bit hacky (hard coded), bc hist is only func that returns multiple values
if hasattr(func, "bins"):
output_names = [output_names+f"_bin{idx}" for idx in range(1, func.bins+1)]
output_names = [output_names + f"_bin{idx}" for idx in range(1, func.bins + 1)]
return FuncWrapper(wrap_func, output_names=output_names)


Expand Down Expand Up @@ -110,7 +110,7 @@ def tsfel_feature_dict_wrapper(features_dict: Dict) -> List[Callable]:
.. Note::
This wrapper wraps the output of tsfel its `get_features_by_domain` or
`get_features_by_tag`. <br>
Se more [here](https://github.com/fraunhoferportugal/tsfel/blob/master/tsfel/feature_extraction/features_settings.py).
See more [here](https://github.com/fraunhoferportugal/tsfel/blob/master/tsfel/feature_extraction/features_settings.py).
Example
-------
Expand Down Expand Up @@ -153,14 +153,14 @@ def get_output_names(config: dict):
if nb_outputs == 1:
return func_name
else:
return [func_name+f"_{idx}" for idx in range(1,nb_outputs+1)]
return [func_name + f"_{idx}" for idx in range(1, nb_outputs + 1)]
output_param = eval(config["parameters"][nb_outputs])
return [func_name+f"_{nb_outputs}=v" for v in output_param]
return [func_name + f"_{nb_outputs}={v}" for v in output_param]

functions = []
tsfel_mod = importlib.import_module("tsfel.feature_extraction")
for donain_feats in features_dict.values(): # Iterate over feature domains
for config in donain_feats.values(): # Iterate over function configs
for domain_feats in features_dict.values(): # Iterate over feature domains
for config in domain_feats.values(): # Iterate over function configs
func = getattr(tsfel_mod, config["function"].split(".")[-1])
params = config["parameters"] if config["parameters"] else {}
output_names = get_output_names(config)
Expand Down Expand Up @@ -223,7 +223,7 @@ def tsfresh_settings_wrapper(settings: Dict) -> List[Callable]:
This wrapper wraps the output of tsfresh its `MinimalFCParameters()`,
`EfficientFCParameters()`, `IndexBasedFCParameters()`,
`TimeBasedFCParameters()`, or `ComprehensiveFCParameters()`. <br>
Se more [here](https://github.com/blue-yonder/tsfresh/blob/main/tsfresh/feature_extraction/settings.py).
See more [here](https://github.com/blue-yonder/tsfresh/blob/main/tsfresh/feature_extraction/settings.py).
Example
-------
Expand Down Expand Up @@ -264,5 +264,63 @@ def tsfresh_settings_wrapper(settings: Dict) -> List[Callable]:
functions.append(tsfresh_combiner_wrapper(func, param))
else:
for kwargs in param:
functions.append(FuncWrapper(func, output_names=f"{func.__name__}_{str(kwargs)}", **kwargs))
functions.append(
FuncWrapper(
func, output_names=f"{func.__name__}_{str(kwargs)}", **kwargs
)
)
return functions


# ------------------------------------- CATCH22 -------------------------------------
def catch22_wrapper(catch22_all: Callable) -> FuncWrapper:
    """Make the catch22 feature set usable within tsflex.

    [catch22](https://github.com/chlubba/catch22) is a collection of 22 time series
    features that are a high-performing subset of the over 7000 features in hctsa.

    Wrapping `catch22_all` with this function lets you plug the catch22 features
    into a tsflex ``FeatureCollection``, so they can be extracted easily while
    keeping the flexibility of tsflex.

    .. Note::
        This wrapper wraps the `catch22_all` function from `catch22`.
        See more [here](https://github.com/chlubba/catch22/blob/master/wrap_Python/catch22/catch22.py).

    Example
    -------
    ```python
    from tsflex.features import FeatureCollection, MultipleFeatureDescriptors
    from tsflex.features.integrations import catch22_wrapper
    from catch22 import catch22_all

    catch22_feats = MultipleFeatureDescriptors(
        functions=catch22_wrapper(catch22_all),
        series_names=["sig_0", "sig_1"],  # list of signal names
        windows="15min", strides="2min",
    )

    fc = FeatureCollection(catch22_feats)
    fc.calculate(data)  # calculate the features on your data
    ```

    Parameters
    ----------
    catch22_all: Callable
        The `catch22_all` function from the `catch22` package.

    Returns
    -------
    FuncWrapper
        The wrapped `catch22_all` function that is compatible with tsflex.
        This FuncWrapper will output the 22 catch22 features.

    """
    # Call the function once on a dummy input, only to read the feature names.
    feature_names = catch22_all([0])["names"]

    def wrap_catch22_all(x):
        # Keep only the feature values; the names are passed as output_names.
        result = catch22_all(x)
        return result["values"]

    wrap_catch22_all.__name__ = "[wrapped]__" + _get_name(catch22_all)
    return FuncWrapper(wrap_catch22_all, output_names=feature_names)
16 changes: 5 additions & 11 deletions tsflex/features/segmenter/strided_rolling.py
Expand Up @@ -112,7 +112,7 @@ def __init__(
self.data_type = func_data_type

# 0. Standardize the input
series_list: List[pd.Series] = to_series_list(data)
series_list: List[pd.Series] = to_series_list(data) # TODO: isn't it always a list of series?
self.series_dtype = AttributeParser.determine_type(series_list)
self.series_key: Tuple[str, ...] = tuple([str(s.name) for s in series_list])

Expand Down Expand Up @@ -142,17 +142,12 @@ def __init__(

# 4. Check the sparsity assumption
if not self.approve_sparsity:
qs = [0, 0.1, 0.5, 0.9, 1]
for container in self.series_containers:
series_idx_stats = np.quantile(
container.end_indexes - container.start_indexes, q=qs
)
q_str = ", ".join([f"q={q}: {v}" for q, v in zip(qs, series_idx_stats)])
# Warn when min != max
if not all(series_idx_stats == series_idx_stats[-1]):
if np.ptp(container.end_indexes - container.start_indexes) != 0:
warnings.warn(
f"There are gaps in the sequence of the {container.name}"
f"-series;\n \t Quantiles of nb values in window: {q_str}",
f"-series!",
RuntimeWarning,
)

Expand Down Expand Up @@ -382,7 +377,7 @@ def _construct_output_index(self, series: pd.Series) -> pd.Index:
window_offset = self._get_window_offset(self.window)
# bool which indicates whether the `end` lies on the boundary
# and as arange does not include the right boundary -> use it to enlarge `stop`
boundary = (self.end - self.start - self.window) % self.stride == 0
boundary = (self.end + 1 - self.start - self.window) % self.stride <= 1
return pd.Index(
data=np.arange(
start=self.start + window_offset,
Expand Down Expand Up @@ -525,7 +520,6 @@ def _construct_output_index(self, series: pd.Series) -> pd.DatetimeIndex:
# bool which indicates whether the `end` lies on the boundary
# and as arange does not include the right boundary -> use it to enlarge `stop`
boundary = (self.end - self.start - self.window) % self.stride == 0

return series.iloc[
np.arange(
start=int(window_offset),
Expand Down Expand Up @@ -581,7 +575,7 @@ def _sliding_strided_window_1d(data: np.ndarray, window: int, step: int):
assert (step >= 1) & (window < len(data))

shape = [
np.ceil(len(data) / step - window / step).astype(int),
np.floor(len(data) / step - window / step + 1).astype(int),
window,
]

Expand Down
2 changes: 1 addition & 1 deletion tsflex/features/utils.py
Expand Up @@ -141,7 +141,7 @@ def _make_single_func_robust(

output_names = func_wrapper_kwargs.get("output_names")

def wrap_func(*series: Union[np.ndarray, pd.Series], **kwargs) -> FuncWrapper:
def wrap_func(*series: Union[np.ndarray, pd.Series], **kwargs) -> Callable:
if not passthrough_nans:
series = [s[~np.isnan(s)] for s in series]
if any([len(s) < min_nb_samples for s in series]):
Expand Down

0 comments on commit 468f2b3

Please sign in to comment.