Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 fix bug with bound_method + ✨ new integrations #62

Merged
merged 11 commits into from Jun 7, 2022
Merged
1 change: 1 addition & 0 deletions examples/README.md
Expand Up @@ -21,6 +21,7 @@ tsflex is a domain independent package for time series processing & feature extr
| Climate modelling | [Ozone level detection](https://archive.ics.uci.edu/ml/datasets/Ozone%20Level%20Detection) | [example_ozone_level_detection.ipynb](https://github.com/predict-idlab/tsflex/blob/main/examples/example_ozone_level_detection.ipynb) |
| Household data | [Electric power consumption](https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption) | [example_power_consumption_estimation.ipynb](example_power_consumption_estimation.ipynb) |
| Clinical data | [Sleep-EDF Database Expanded](https://physionet.org/content/sleep-edfx/1.0.0/) | [example_sleep_staging.ipynb](example_sleep_staging.ipynb) |
| Kaggle competition | [Tabular Playground Series - Apr 2022](https://www.kaggle.com/competitions/tabular-playground-series-apr-2022) | https://www.kaggle.com/code/jeroenvdd/tpsapr22-best-non-dl-model-tsflex-powershap |
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: maybe state here that the data was already segmented -> so here you can find an example on how to use tsflex on already segmented data?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm not very proud about how we did it (considering the long table as one large series and having a stride that is equal to your sample size)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

okay, will create an issue or extend an existing one with this topic



<!-- | Wearable data | [WESAD - Wearable stress & affect detection](https://archive.ics.uci.edu/ml/datasets/WESAD+%28Wearable+Stress+and+Affect+Detection%29) | [verbose_example.ipynb](verbose_example.ipynb) - <br>`TODO` create a notebook that uses the whole wesad dataset -->
Expand Down
15 changes: 13 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "tsflex"
version = "0.2.3.6" # Do not forget to update the __init__.py __version__ variable
version = "0.2.3.7" # Do not forget to update the __init__.py __version__ variable
description = "Toolkit for flexible processing & feature extraction on time-series data"
authors = ["Jonas Van Der Donckt, Jeroen Van Der Donckt, Emiel Deprost"]
readme = "README.md"
Expand Down Expand Up @@ -33,6 +33,7 @@ tsfresh = "^0.18.0"
tsfel = "^0.1.4"
statsmodels = "0.12.2" # Added bc of this: https://github.com/blue-yonder/tsfresh/issues/897
fastparquet = "0.8.0" # Lock to this version to resolve issue on macos with python 3.7
catch22 = "^0.2.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
Expand Down
25 changes: 24 additions & 1 deletion tests/test_features_feature_collection.py
Expand Up @@ -960,6 +960,29 @@ def windowed_diff(x1, x2):
assert np.all(res["EDA|TMP__windowed_diff"+p].values == manual_diff)


### Test feature extraction length

def test_feature_extraction_length():
    """Check the output length of a plain and a vectorized ``np.max`` feature.

    Both descriptors use window=2 and stride=2 on a 10-sample series; the test
    asserts that each produces 5 outputs with identical index and values.
    """
    s = pd.Series(np.arange(10), name="dummy")
    assert len(s) == 10

    # Same function twice: once passed as-is, once wrapped as vectorized.
    plain_max = FeatureDescriptor(np.max, "dummy", 2, 2)
    vectorized_max = FeatureDescriptor(
        FuncWrapper(np.max, output_names="max_", vectorized=True, axis=-1),
        "dummy",
        2,
        2,
    )
    fc = FeatureCollection(feature_descriptors=[plain_max, vectorized_max])
    res = fc.calculate(s)

    assert len(res) == 2
    assert len(res[0]) == 5
    assert len(res[1]) == 5
    # The two variants must agree on both the index and the computed values.
    assert np.all(res[0].index == res[1].index)
    assert np.all(res[0].values == res[1].values)


### Test 'error' use-cases


Expand Down Expand Up @@ -1193,7 +1216,7 @@ def test_serialization(dummy_data):

df_tmp = dummy_data["TMP"].reset_index(drop=True)
df_eda = dummy_data["EDA"].reset_index(drop=True)
out = fc.calculate([df_tmp, df_eda], window_idx="end", return_df=True)
out = fc.calculate([df_tmp, df_eda], return_df=True)
jvdd marked this conversation as resolved.
Show resolved Hide resolved
col_order = out.columns

save_path = Path("featurecollection.pkl")
Expand Down
18 changes: 18 additions & 0 deletions tests/test_features_integration.py
Expand Up @@ -20,6 +20,7 @@
tsfel_feature_dict_wrapper,
tsfresh_combiner_wrapper,
tsfresh_settings_wrapper,
catch22_wrapper,
)


Expand Down Expand Up @@ -281,3 +282,20 @@ def test_tsfel_feature_dict_wrapper(dummy_data):

res_df = feature_collection.calculate(dummy_data.first("15min"), return_df=True)
assert (res_df.shape[0] > 0) and (res_df.shape[1]) > 0


## CATCH22

def test_catch22_all_features(dummy_data):
    """Smoke-test the catch22 integration on the ``EDA`` and ``TMP`` series.

    Passes as soon as the calculated feature dataframe has at least one row
    and at least one column.
    """
    # Function-local import of the catch22 entry point
    from catch22 import catch22_all

    descriptors = MultipleFeatureDescriptors(
        functions=catch22_wrapper(catch22_all),
        series_names=["EDA", "TMP"],
        windows="2.5min",
        strides="10min",
    )
    fc = FeatureCollection(descriptors)

    # Only the first 15 minutes of the dummy data are used.
    res_df = fc.calculate(dummy_data.first("15min"), return_df=True)
    n_rows = res_df.shape[0]
    n_cols = res_df.shape[1]
    assert (n_rows > 0) and n_cols > 0
31 changes: 21 additions & 10 deletions tests/test_strided_rolling.py
Expand Up @@ -62,20 +62,31 @@ def stroll_apply_dummy_func(data, window, stride) -> pd.DataFrame:
stroll = SequenceStridedRolling(data, window, stride, window_idx="end")
return stroll.apply_func(FuncWrapper(np.min))

out = stroll_apply_dummy_func(df_eda[:2198], window=1000, stride=200)
assert out.index[-1] == 2000
out = stroll_apply_dummy_func(df_eda[:2199], window=1000, stride=200)
assert out.index[-1] == 2000
out = stroll_apply_dummy_func(df_eda[:2200], window=1000, stride=200)
assert out.index[-1] == 2200
out = stroll_apply_dummy_func(df_eda[:2201], window=1000, stride=200)
assert out.index[-1] == 2200

out = stroll_apply_dummy_func(df_eda[:2399], window=1000, stride=200)
out = stroll_apply_dummy_func(df_eda[:2202], window=1000, stride=200)
assert out.index[-1] == 2200

# -> slicing is include left bound, discard right bound -> so UNTIL index 2200
# i.e. last index in sequence is 2199 -> last valid full range 2200
out = stroll_apply_dummy_func(df_eda[:2400], window=1000, stride=200)
assert out.index[-1] == 2200
out = stroll_apply_dummy_func(df_eda[:2401], window=1000, stride=200)
assert out.index[-1] == 2400
out = stroll_apply_dummy_func(df_eda[:2530], window=1000, stride=200)
assert out.index[-1] == 2400
def stroll_apply_dummy_func(data, window, stride) -> pd.DataFrame:
stroll = SequenceStridedRolling(data, window, stride, window_idx="begin")
return stroll.apply_func(FuncWrapper(np.min))

out = stroll_apply_dummy_func(df_eda[:2198], window=1000, stride=200)
assert out.index[-1] == 1000
out = stroll_apply_dummy_func(df_eda[:2199], window=1000, stride=200)
assert out.index[-1] == 1000
out = stroll_apply_dummy_func(df_eda[:2200], window=1000, stride=200)
assert out.index[-1] == 1200
out = stroll_apply_dummy_func(df_eda[:2201], window=1000, stride=200)
assert out.index[-1] == 1200
out = stroll_apply_dummy_func(df_eda[:2202], window=1000, stride=200)
assert out.index[-1] == 1200


def test_time_stroll_last_window_full(dummy_data):
Expand Down
2 changes: 1 addition & 1 deletion tsflex/__init__.py
Expand Up @@ -9,7 +9,7 @@

__docformat__ = 'numpy'
__author__ = "Jonas Van Der Donckt, Jeroen Van Der Donckt, Emiel Deprost"
__version__ = '0.2.3.6'
__version__ = '0.2.3.7'
__pdoc__ = {
# do not show the utils module
'tsflex.utils': False,
Expand Down
6 changes: 3 additions & 3 deletions tsflex/features/feature_collection.py
Expand Up @@ -256,7 +256,7 @@ def calculate(
self,
data: Union[pd.Series, pd.DataFrame, List[Union[pd.Series, pd.DataFrame]]],
return_df: Optional[bool] = False,
window_idx: Optional[str] = "end",
window_idx: Optional[str] = "begin",
bound_method: Optional[str] = "inner",
approve_sparsity: Optional[bool] = False,
show_progress: Optional[bool] = False,
Expand Down Expand Up @@ -292,7 +292,7 @@ def calculate(
window_idx : str, optional
The window's index position which will be used as index for the
feature_window aggregation. Must be either of: `["begin", "middle", "end"]`.
by default "end". All features in this collection will use the same
by default "begin". All features in this collection will use the same
window_idx.
bound_method: str, optional
The start-end bound methodology which is used to generate the slice ranges
Expand Down Expand Up @@ -367,7 +367,7 @@ def calculate(
# determining the bounds of the series dict items and slice on them
start, end = _determine_bounds(bound_method, list(series_dict.values()))
series_dict = {
n: s[s.index.dtype.type(start) : s.index.dtype.type(end)]
n: s.loc[s.index.dtype.type(start) : s.index.dtype.type(end)] # TODO: check memory efficiency of ths
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Check memory efficiency of this

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Runtime is the same as previous implementation ✔️
image

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Memory profiling indicates no memory peak ✔️
image

for n, s, in series_dict.items()
}

Expand Down
74 changes: 66 additions & 8 deletions tsflex/features/integrations.py
Expand Up @@ -42,7 +42,7 @@ def wrap_func(x: np.ndarray):
output_names = _get_name(func) if func_name is None else func_name
# A bit hacky (hard coded), bc hist is only func that returns multiple values
if hasattr(func, "bins"):
output_names = [output_names+f"_bin{idx}" for idx in range(1, func.bins+1)]
output_names = [output_names + f"_bin{idx}" for idx in range(1, func.bins + 1)]
return FuncWrapper(wrap_func, output_names=output_names)


Expand Down Expand Up @@ -110,7 +110,7 @@ def tsfel_feature_dict_wrapper(features_dict: Dict) -> List[Callable]:
.. Note::
This wrapper wraps the output of tsfel its `get_features_by_domain` or
`get_features_by_tag`. <br>
Se more [here](https://github.com/fraunhoferportugal/tsfel/blob/master/tsfel/feature_extraction/features_settings.py).
See more [here](https://github.com/fraunhoferportugal/tsfel/blob/master/tsfel/feature_extraction/features_settings.py).

Example
-------
Expand Down Expand Up @@ -153,14 +153,14 @@ def get_output_names(config: dict):
if nb_outputs == 1:
return func_name
else:
return [func_name+f"_{idx}" for idx in range(1,nb_outputs+1)]
return [func_name + f"_{idx}" for idx in range(1, nb_outputs + 1)]
output_param = eval(config["parameters"][nb_outputs])
return [func_name+f"_{nb_outputs}=v" for v in output_param]
return [func_name + f"_{nb_outputs}={v}" for v in output_param]

functions = []
tsfel_mod = importlib.import_module("tsfel.feature_extraction")
for donain_feats in features_dict.values(): # Iterate over feature domains
for config in donain_feats.values(): # Iterate over function configs
for domain_feats in features_dict.values(): # Iterate over feature domains
for config in domain_feats.values(): # Iterate over function configs
func = getattr(tsfel_mod, config["function"].split(".")[-1])
params = config["parameters"] if config["parameters"] else {}
output_names = get_output_names(config)
Expand Down Expand Up @@ -223,7 +223,7 @@ def tsfresh_settings_wrapper(settings: Dict) -> List[Callable]:
This wrapper wraps the output of tsfresh its `MinimalFCParameters()`,
`EfficientFCParameters()`, `IndexBasedFCParameters()`,
`TimeBasedFCParameters()`, or `ComprehensiveFCParameters()`. <br>
Se more [here](https://github.com/blue-yonder/tsfresh/blob/main/tsfresh/feature_extraction/settings.py).
See more [here](https://github.com/blue-yonder/tsfresh/blob/main/tsfresh/feature_extraction/settings.py).

Example
-------
Expand Down Expand Up @@ -264,5 +264,63 @@ def tsfresh_settings_wrapper(settings: Dict) -> List[Callable]:
functions.append(tsfresh_combiner_wrapper(func, param))
else:
for kwargs in param:
functions.append(FuncWrapper(func, output_names=f"{func.__name__}_{str(kwargs)}", **kwargs))
functions.append(
FuncWrapper(
func, output_names=f"{func.__name__}_{str(kwargs)}", **kwargs
)
)
return functions


# ------------------------------------- CATCH22 -------------------------------------
def catch22_wrapper(catch22_all: Callable) -> FuncWrapper:
    """Make the catch22 feature function usable inside tsflex.

    [catch22](https://github.com/chlubba/catch22) is a collection of 22 time series
    features that are a high-performing subset of the over 7000 features in hctsa.

    Wrapping `catch22_all` with this function yields a ``FuncWrapper`` that can be
    plugged into a tsflex ``FeatureCollection``, so the catch22 features can be
    extracted while leveraging the flexibility of tsflex.

    .. Note::
        This wrapper wraps the `catch22_all` function from `catch22`.
        See more [here](https://github.com/chlubba/catch22/blob/master/wrap_Python/catch22/catch22.py).

    Example
    -------
    ```python
    from tsflex.features import FeatureCollection, MultipleFeatureDescriptors
    from tsflex.features.integrations import catch22_wrapper
    from catch22 import catch22_all

    catch22_feats = MultipleFeatureDescriptors(
        functions=catch22_wrapper(catch22_all),
        series_names=["sig_0", "sig_1"],  # list of signal names
        windows="15min", strides="2min",
    )

    fc = FeatureCollection(catch22_feats)
    fc.calculate(data)  # calculate the features on your data
    ```

    Parameters
    ----------
    catch22_all: Callable
        The `catch22_all` function from the `catch22` package.

    Returns
    -------
    FuncWrapper
        The wrapped `catch22_all` function that is compatible with tsflex.
        This FuncWrapper will output the 22 catch22 features.

    """
    # Call the function once on a dummy one-sample input; only the "names"
    # entry of the returned mapping is kept, to serve as the output names.
    dummy_result = catch22_all([0])
    feature_names = dummy_result["names"]

    def wrap_catch22_all(x):
        # Forward only the "values" entry of the catch22 result.
        result = catch22_all(x)
        return result["values"]

    wrapped_name = "[wrapped]__" + _get_name(catch22_all)
    wrap_catch22_all.__name__ = wrapped_name
    return FuncWrapper(wrap_catch22_all, output_names=feature_names)
16 changes: 5 additions & 11 deletions tsflex/features/segmenter/strided_rolling.py
Expand Up @@ -112,7 +112,7 @@ def __init__(
self.data_type = func_data_type

# 0. Standardize the input
series_list: List[pd.Series] = to_series_list(data)
series_list: List[pd.Series] = to_series_list(data) # TODO: isn't it always a list of series?
jvdd marked this conversation as resolved.
Show resolved Hide resolved
self.series_dtype = AttributeParser.determine_type(series_list)
self.series_key: Tuple[str, ...] = tuple([str(s.name) for s in series_list])

Expand Down Expand Up @@ -142,17 +142,12 @@ def __init__(

# 4. Check the sparsity assumption
if not self.approve_sparsity:
qs = [0, 0.1, 0.5, 0.9, 1]
for container in self.series_containers:
series_idx_stats = np.quantile(
container.end_indexes - container.start_indexes, q=qs
)
q_str = ", ".join([f"q={q}: {v}" for q, v in zip(qs, series_idx_stats)])
# Warn when min != max
if not all(series_idx_stats == series_idx_stats[-1]):
if np.ptp(container.end_indexes - container.start_indexes) != 0:
jvdd marked this conversation as resolved.
Show resolved Hide resolved
warnings.warn(
f"There are gaps in the sequence of the {container.name}"
f"-series;\n \t Quantiles of nb values in window: {q_str}",
f"-series!",
RuntimeWarning,
)

Expand Down Expand Up @@ -382,7 +377,7 @@ def _construct_output_index(self, series: pd.Series) -> pd.Index:
window_offset = self._get_window_offset(self.window)
# bool which indicates whether the `end` lies on the boundary
# and as arange does not include the right boundary -> use it to enlarge `stop`
boundary = (self.end - self.start - self.window) % self.stride == 0
boundary = (self.end + 1 - self.start - self.window) % self.stride <= 1
return pd.Index(
data=np.arange(
start=self.start + window_offset,
Expand Down Expand Up @@ -525,7 +520,6 @@ def _construct_output_index(self, series: pd.Series) -> pd.DatetimeIndex:
# bool which indicates whether the `end` lies on the boundary
# and as arange does not include the right boundary -> use it to enlarge `stop`
boundary = (self.end - self.start - self.window) % self.stride == 0

return series.iloc[
np.arange(
start=int(window_offset),
Expand Down Expand Up @@ -581,7 +575,7 @@ def _sliding_strided_window_1d(data: np.ndarray, window: int, step: int):
assert (step >= 1) & (window < len(data))

shape = [
np.ceil(len(data) / step - window / step).astype(int),
np.floor(len(data) / step - window / step + 1).astype(int),
window,
]

Expand Down
2 changes: 1 addition & 1 deletion tsflex/features/utils.py
Expand Up @@ -141,7 +141,7 @@ def _make_single_func_robust(

output_names = func_wrapper_kwargs.get("output_names")

def wrap_func(*series: Union[np.ndarray, pd.Series], **kwargs) -> FuncWrapper:
def wrap_func(*series: Union[np.ndarray, pd.Series], **kwargs) -> Callable:
if not passthrough_nans:
series = [s[~np.isnan(s)] for s in series]
if any([len(s) < min_nb_samples for s in series]):
Expand Down