Skip to content

Commit

Permalink
♻️ update code based on review
Browse files Browse the repository at this point in the history
  • Loading branch information
jvdd committed Oct 10, 2022
1 parent 9da85b6 commit cc2c6e8
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 84 deletions.
24 changes: 17 additions & 7 deletions tests/test_features_feature_collection.py
Expand Up @@ -1516,7 +1516,7 @@ def test_mixed_featuredescriptors_time_data(dummy_data):
for warn in w])

out = fc.calculate([df_eda, df_tmp], return_df=True)
assert all(out.notna().sum(axis=0)) # TODO what do we want to assert here?
assert all(out.notna().all())


### Test vectorized features
Expand Down Expand Up @@ -1948,13 +1948,23 @@ def test_not_sorted_fc(dummy_data):
df_tmp = dummy_data["TMP"].reset_index(drop=True)
df_eda = dummy_data["EDA"].reset_index(drop=True).sample(frac=1)
assert not df_eda.index.is_monotonic_increasing
out = fc.calculate([df_tmp, df_eda], window_idx="end", return_df=True)
assert not df_eda.index.is_monotonic_increasing
with warnings.catch_warnings(record=True) as w:
out = fc.calculate([df_tmp, df_eda], return_df=True)
assert len(w) == 1
assert "not monotonic increasing" in str(w[0])
assert issubclass(w[0].category, RuntimeWarning)
assert not df_eda.index.is_monotonic_increasing
assert out.index.is_monotonic_increasing

df_eda.index = df_eda.index.astype(float)
assert not df_eda.index.is_monotonic_increasing
out = fc.calculate([df_tmp, df_eda], window_idx="end", return_df=True)
assert not df_eda.index.is_monotonic_increasing
with warnings.catch_warnings(record=True) as w:
out = fc.calculate([df_tmp, df_eda], window_idx="end", return_df=True)
assert len(w) == 1
assert "not monotonic increasing" in str(w[0])
assert issubclass(w[0].category, RuntimeWarning)
assert not df_eda.index.is_monotonic_increasing
assert out.index.is_monotonic_increasing


def test_serialization(dummy_data):
Expand Down Expand Up @@ -2121,13 +2131,13 @@ def test_feature_collection_various_timezones_segment_start_idxs():
assert np.all(res.values.ravel() == [0, 1, 2])

fc = FeatureCollection(
FeatureDescriptor(np.min, "s_usa", "3h", "3h")
FeatureDescriptor(len, "s_usa", "3h", "3h") # len bc it works on empty arrays
)
res = fc.calculate(s_usa, segment_start_idxs=s_eu.index[:3].values, n_jobs=0, return_df=True)
assert np.all(res.values == [])

fc = FeatureCollection(
FeatureDescriptor(np.min, "s_usa", "3h", "3h")
FeatureDescriptor(len, "s_usa", "3h", "3h") # len bc it works on empty arrays
)
res = fc.calculate(s_usa, segment_start_idxs=s_none.index[:3].values, n_jobs=0, return_df=True)
assert np.all(res.values == [])
20 changes: 19 additions & 1 deletion tests/test_features_feature_descriptor.py
Expand Up @@ -100,7 +100,25 @@ def sum_func(sig: np.ndarray) -> float:
assert fd.get_nb_output_features() == 1
assert isinstance(fd.function, FuncWrapper)

# TODO -> add new test in which floats represent the float position

def test_simple_feature_descriptor_floats():
    """A FeatureDescriptor built with plain float window/stride keeps those values."""

    def _sum(sig: np.ndarray) -> float:
        # Trivial aggregate used only to have a callable feature function.
        return sum(sig)

    fd = FeatureDescriptor(
        function=_sum,
        series_name="EDA",
        window=5.0,
        stride=2.5,
    )

    # The series name is normalized to a 1-tuple; window stays a float and the
    # stride is wrapped in a list.
    assert fd.series_name == ("EDA",)
    assert fd.window == 5.0
    assert fd.stride == [2.5]
    assert fd.get_required_series() == ["EDA"]
    assert fd.get_nb_output_features() == 1
    # The raw callable is wrapped in a FuncWrapper on construction.
    assert isinstance(fd.function, FuncWrapper)


def test_simple_feature_descriptor_str_str_seconds():
def sum_func(sig: np.ndarray) -> float:
Expand Down
54 changes: 24 additions & 30 deletions tests/test_strided_rolling.py
Expand Up @@ -418,28 +418,28 @@ def test_sequence_stroll_indexing_segment_start_idxs_outside_valid_range():
## No Force
sr = SequenceStridedRolling(s, window=3, segment_start_idxs=segment_start_idxs, window_idx="begin")
assert sr.strides is None
assert np.all(sr.index == segment_start_idxs[:4])
assert np.all(sr.index == segment_start_idxs)

sr = SequenceStridedRolling(s, window=3, strides=[3, 5], segment_start_idxs=segment_start_idxs, window_idx="begin")
assert sr.strides is None
assert np.all(sr.index == segment_start_idxs[:4])
assert np.all(sr.index == segment_start_idxs)

sr = SequenceStridedRolling(s, window=3, strides=[3, 5], segment_start_idxs=segment_start_idxs, window_idx="end")
assert sr.strides is None
assert np.all(sr.index == segment_start_idxs[:4] + 3)
assert np.all(sr.index == segment_start_idxs + 3)

## Force
sr = SequenceStridedRolling(s, window=3, segment_start_idxs=segment_start_idxs, window_idx="begin", include_final_window=True)
assert sr.strides is None
assert np.all(sr.index == segment_start_idxs[:4])
assert np.all(sr.index == segment_start_idxs)

sr = SequenceStridedRolling(s, window=3, strides=[3, 5], segment_start_idxs=segment_start_idxs, window_idx="begin", include_final_window=True)
assert sr.strides is None
assert np.all(sr.index == segment_start_idxs[:4])
assert np.all(sr.index == segment_start_idxs)

sr = SequenceStridedRolling(s, window=3, strides=[3, 5], segment_start_idxs=segment_start_idxs, window_idx="end", include_final_window=True)
assert sr.strides is None
assert np.all(sr.index == segment_start_idxs[:4] + 3)
assert np.all(sr.index == segment_start_idxs + 3)


def test_sequence_stroll_indexing_segment_end_idxs_outside_valid_range():
Expand All @@ -449,28 +449,28 @@ def test_sequence_stroll_indexing_segment_end_idxs_outside_valid_range():
## No Force
sr = SequenceStridedRolling(s, window=3, segment_end_idxs=segment_end_idxs, window_idx="end")
assert sr.strides is None
assert np.all(sr.index == segment_end_idxs[:4])
assert np.all(sr.index == segment_end_idxs)

sr = SequenceStridedRolling(s, window=3, strides=[3, 5], segment_end_idxs=segment_end_idxs, window_idx="end")
assert sr.strides is None
assert np.all(sr.index == segment_end_idxs[:4])
assert np.all(sr.index == segment_end_idxs)

sr = SequenceStridedRolling(s, window=3, strides=[3, 5], segment_end_idxs=segment_end_idxs, window_idx="begin")
assert sr.strides is None
assert np.all(sr.index == segment_end_idxs[:4] - 3)
assert np.all(sr.index == segment_end_idxs - 3)

## Force
sr = SequenceStridedRolling(s, window=3, segment_end_idxs=segment_end_idxs, window_idx="end", include_final_window=True)
assert sr.strides is None
assert np.all(sr.index == segment_end_idxs[:4])
assert np.all(sr.index == segment_end_idxs)

sr = SequenceStridedRolling(s, window=3, strides=[3, 5], segment_end_idxs=segment_end_idxs, window_idx="end", include_final_window=True)
assert sr.strides is None
assert np.all(sr.index == segment_end_idxs[:4])
assert np.all(sr.index == segment_end_idxs)

sr = SequenceStridedRolling(s, window=3, strides=[3, 5], segment_end_idxs=segment_end_idxs, window_idx="begin", include_final_window=True)
assert sr.strides is None
assert np.all(sr.index == segment_end_idxs[:4] - 3)
assert np.all(sr.index == segment_end_idxs - 3)


def test_time_stroll_indexing_segment_start_idxs():
Expand Down Expand Up @@ -555,34 +555,31 @@ def test_time_stroll_indexing_segment_start_idxs_outside_valid_range():
segment_start_idxs = s.index[[0, 5, 7, 10]].values
segment_start_idxs = np.append(segment_start_idxs, (s.index[[0]] + pd.Timedelta(200, unit="h")).values)

def get_time_index(arr):
return [time_index[idx] for idx in arr]

## No Force
sr = TimeStridedRolling(s, window=pd.Timedelta(3, unit="h"), segment_start_idxs=segment_start_idxs, window_idx="begin")
assert sr.strides is None
assert np.all(sr.index == get_time_index([0, 5, 7, 10]))
assert np.all(sr.index == segment_start_idxs)

sr = TimeStridedRolling(s, window=pd.Timedelta(3, unit="h"), strides=[pd.Timedelta(3, unit="h"), pd.Timedelta(5, unit="h")], segment_start_idxs=segment_start_idxs, window_idx="begin")
assert sr.strides is None
assert np.all(sr.index == get_time_index([0, 5, 7, 10]))
assert np.all(sr.index == segment_start_idxs)

sr = TimeStridedRolling(s, window=pd.Timedelta(3, unit="h"), strides=[pd.Timedelta(3, unit="h"), pd.Timedelta(5, unit="h")], segment_start_idxs=segment_start_idxs, window_idx="end")
assert sr.strides is None
assert np.all(sr.index == [t + pd.Timedelta(3, unit="h") for t in get_time_index([0, 5, 7, 10])])
assert np.all(sr.index == [t + pd.Timedelta(3, unit="h") for t in segment_start_idxs])

## No Force
sr = TimeStridedRolling(s, window=pd.Timedelta(3, unit="h"), segment_start_idxs=segment_start_idxs, window_idx="begin", include_final_window=True)
assert sr.strides is None
assert np.all(sr.index == get_time_index([0, 5, 7, 10]))
assert np.all(sr.index == segment_start_idxs)

sr = TimeStridedRolling(s, window=pd.Timedelta(3, unit="h"), strides=[pd.Timedelta(3, unit="h"), pd.Timedelta(5, unit="h")], segment_start_idxs=segment_start_idxs, window_idx="begin", include_final_window=True)
assert sr.strides is None
assert np.all(sr.index == get_time_index([0, 5, 7, 10]))
assert np.all(sr.index == segment_start_idxs)

sr = TimeStridedRolling(s, window=pd.Timedelta(3, unit="h"), strides=[pd.Timedelta(3, unit="h"), pd.Timedelta(5, unit="h")], segment_start_idxs=segment_start_idxs, window_idx="end", include_final_window=True)
assert sr.strides is None
assert np.all(sr.index == [t + pd.Timedelta(3, unit="h") for t in get_time_index([0, 5, 7, 10])])
assert np.all(sr.index == [t + pd.Timedelta(3, unit="h") for t in segment_start_idxs])


def test_time_stroll_indexing_segment_end_idxs_outside_valid_range():
Expand All @@ -593,34 +590,31 @@ def test_time_stroll_indexing_segment_end_idxs_outside_valid_range():
segment_end_idxs = s.index[[0, 5, 7, 10]].values
segment_end_idxs = np.append(segment_end_idxs, (s.index[[0]] + pd.Timedelta(200, unit="h")).values)

def get_time_index(arr):
return [time_index[idx] for idx in arr]

## No Force
sr = TimeStridedRolling(s, window=pd.Timedelta(3, unit="h"), segment_end_idxs=segment_end_idxs, window_idx="end")
assert sr.strides is None
assert np.all(sr.index == get_time_index([0, 5, 7, 10]))
assert np.all(sr.index == segment_end_idxs)

sr = TimeStridedRolling(s, window=pd.Timedelta(3, unit="h"), strides=[pd.Timedelta(3, unit="h"), pd.Timedelta(5, unit="h")], segment_end_idxs=segment_end_idxs, window_idx="end")
assert sr.strides is None
assert np.all(sr.index == get_time_index([0, 5, 7, 10]))
assert np.all(sr.index == segment_end_idxs)

sr = TimeStridedRolling(s, window=pd.Timedelta(3, unit="h"), strides=[pd.Timedelta(3, unit="h"), pd.Timedelta(5, unit="h")], segment_end_idxs=segment_end_idxs, window_idx="begin")
assert sr.strides is None
assert np.all(sr.index == [t - pd.Timedelta(3, unit="h") for t in get_time_index([0, 5, 7, 10])])
assert np.all(sr.index == [t - pd.Timedelta(3, unit="h") for t in segment_end_idxs])

## No Force
sr = TimeStridedRolling(s, window=pd.Timedelta(3, unit="h"), segment_end_idxs=segment_end_idxs, window_idx="end", include_final_window=True)
assert sr.strides is None
assert np.all(sr.index == get_time_index([0, 5, 7, 10]))
assert np.all(sr.index == segment_end_idxs)

sr = TimeStridedRolling(s, window=pd.Timedelta(3, unit="h"), strides=[pd.Timedelta(3, unit="h"), pd.Timedelta(5, unit="h")], segment_end_idxs=segment_end_idxs, window_idx="end", include_final_window=True)
assert sr.strides is None
assert np.all(sr.index == get_time_index([0, 5, 7, 10]))
assert np.all(sr.index == segment_end_idxs)

sr = TimeStridedRolling(s, window=pd.Timedelta(3, unit="h"), strides=[pd.Timedelta(3, unit="h"), pd.Timedelta(5, unit="h")], segment_end_idxs=segment_end_idxs, window_idx="begin", include_final_window=True)
assert sr.strides is None
assert np.all(sr.index == [t - pd.Timedelta(3, unit="h") for t in get_time_index([0, 5, 7, 10])])
assert np.all(sr.index == [t - pd.Timedelta(3, unit="h") for t in segment_end_idxs])



Expand Down
56 changes: 20 additions & 36 deletions tsflex/features/feature_collection.py
Expand Up @@ -330,6 +330,17 @@ def _check_no_multiple_windows(self):
+ " can only have 1 window (or None)"
)

@staticmethod
def _process_segment_idxs(
segment_idxs: Union[list, np.ndarray, pd.Series, pd.Index]
) -> np.ndarray:
if hasattr(segment_idxs, "values"):
segment_idxs = segment_idxs.values
segment_idxs = np.asarray(segment_idxs)
if segment_idxs.ndim > 1:
segment_idxs = segment_idxs.squeeze() # remove singleton dimensions
return segment_idxs

def calculate(
self,
data: Union[pd.Series, pd.DataFrame, List[Union[pd.Series, pd.DataFrame]]],
Expand Down Expand Up @@ -515,17 +526,9 @@ def calculate(

# Convert to numpy array (if necessary)
if segment_start_idxs is not None:
if hasattr(segment_start_idxs, "values"):
segment_start_idxs = segment_start_idxs.values
segment_start_idxs = np.asarray(
segment_start_idxs
).squeeze() # remove singleton dimensions
segment_start_idxs = FeatureCollection._process_segment_idxs(segment_start_idxs)
if segment_end_idxs is not None:
if hasattr(segment_end_idxs, "values"):
segment_end_idxs = segment_end_idxs.values
segment_end_idxs = np.asarray(
segment_end_idxs
).squeeze() # remove singleton dimensions
segment_end_idxs = FeatureCollection._process_segment_idxs(segment_end_idxs)

if segment_start_idxs is not None and segment_end_idxs is not None:
# Check if segment indices have same length and whether every start idx
Expand All @@ -540,7 +543,7 @@ def calculate(
fd.window is not None
for fd in flatten(self._feature_desc_dict.values())
), (
"Each feature descriptor must have a window when not both"
"Each feature descriptor must have a window when not both "
+ "segment_start_idxs and segment_end_idxs are provided"
)

Expand Down Expand Up @@ -571,7 +574,11 @@ def calculate(
series_dict: Dict[str, pd.Series] = {}
for s in to_series_list(data):
if not s.index.is_monotonic_increasing:
# TODO -> maybe raise a warning?
warnings.warn(
f"The index of series '{s.name}' is not monotonic increasing. "
+ "The series will be sorted by the index.",
RuntimeWarning,
)
s = s.sort_index(ascending=True, inplace=False, ignore_index=False)

# Assert the assumptions we make!
Expand All @@ -590,31 +597,8 @@ def calculate(
for n, s, in series_dict.items()
}

# Trim the segment indices (if necessary)
# TODO: currently commented this as this also happens in StridedRolling
# if segment_start_idxs is not None:
# start_ = start; end_ = end
# if isinstance(start, pd.Timestamp) and start.tz is not None:
# assert isinstance(end, pd.Timestamp) and end.tz == start.tz
# start_ = start_.tz_convert(None)
# end_ = end_.tz_convert(None)
# mask = (segment_start_idxs >= start_) & (segment_start_idxs <= end_)
# segment_start_idxs = segment_start_idxs[mask]
# if segment_end_idxs is not None:
# start_ = start; end_ = end
# if isinstance(start, pd.Timestamp) and start.tz is not None:
# assert isinstance(end, pd.Timestamp) and end.tz == start.tz
# start_ = start_.tz_convert(None)
# end_ = end_.tz_convert(None)
# mask = (segment_end_idxs >= start_) & (segment_end_idxs <= end_)
# segment_end_idxs = segment_end_idxs[mask]
# if segment_start_idxs is not None and segment_end_idxs is not None:
# # Check if segment indices have same length and whether every start idx
# # <= end idx
# _check_start_end_array(segment_start_idxs, segment_end_idxs)

# Note: this variable has a global scope so this is shared in multiprocessing
# TODO: try to make this more efficient
# TODO: try to make this more efficient (but is not really the bottleneck)
global get_stroll_func
get_stroll_func = self._stroll_feat_generator(
series_dict,
Expand Down
4 changes: 2 additions & 2 deletions tsflex/features/integrations.py
Expand Up @@ -208,7 +208,7 @@ def wrap_func(x: Union[np.ndarray, pd.Series]):
)


def tsfresh_settings_wrapper(settings: Dict) -> List[Callable]:
def tsfresh_settings_wrapper(settings: Dict) -> List[Union[Callable, FuncWrapper]]:
"""Wrapper enabling compatibility with tsfresh feature extraction settings.
[tsfresh feature extraction settings](https://tsfresh.readthedocs.io/en/latest/text/feature_extraction_settings.html)
Expand Down Expand Up @@ -249,7 +249,7 @@ def tsfresh_settings_wrapper(settings: Dict) -> List[Callable]:
Returns
-------
List[Callable]
List[Union[Callable, FuncWrapper]]
List of the (wrapped) tsfresh functions that are now directly compatible with
with tsflex.
Expand Down

0 comments on commit cc2c6e8

Please sign in to comment.