Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Make TimeSince trafo faster by changing period diff calculation #4018

Merged
merged 1 commit into from Dec 30, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
49 changes: 35 additions & 14 deletions sktime/transformations/series/time_since.py
Expand Up @@ -229,26 +229,47 @@ def _transform(self, X, y=None):
freq_ = _remove_digits_from_str(self.freq_)
freq_period = get_period_alias(freq_)

# Convert `start` and datetime index to period.
start_period = pd.Period(start_, freq=freq_period)
# Convert `start` and `time_index` to period.
# Casting `start_` to PeriodIndex using pd.period_range
# here so we can later cast it to an int using
# `astype(int)` in _get_period_diff_as_int().
start_period = pd.period_range(
start=start_, periods=1, freq=freq_period
)
time_index_period = time_index.to_period(freq=freq_period)
# Compute time differences and convert to integers.
time_deltas_period = time_index_period - start_period
time_deltas = _period_to_int(time_deltas_period)
# Compute time differences.
time_deltas = _get_period_diff_as_int(
time_index_period, start_period
)

elif isinstance(time_index, pd.PeriodIndex):
if pd.__version__ < "1.5.0":
# Earlier versions of pandas returned incorrect result
# when taking a difference between Periods when the frequency
# is a multiple of a unit (e.g. "15T"). Solution is to
# cast to lowest frequency when taking difference (e.g., "T").
freq_ = _remove_digits_from_str(time_index.freqstr)
time_deltas_period = time_index.to_timestamp().to_period(
freq_
) - start_.to_timestamp().to_period(freq_)
freq_ = _remove_digits_from_str(self.freq_)

# Change freq of `start` and `time_index`.
# Casting `start_` to PeriodIndex using pd.period_range
# here so we can later cast it to an int using
# `astype(int)` in _get_period_diff_as_int().
start_period = pd.period_range(
start=start_.to_timestamp(), periods=1, freq=freq_
)
time_index = time_index.to_timestamp().to_period(freq_)
else:
time_deltas_period = time_index - start_
time_deltas = _period_to_int(time_deltas_period)
elif X.index.is_numeric():
# Casting `start_` to PeriodIndex using pd.period_range
# here so we can later cast it to an int using
# `astype(int)` in _get_period_diff_as_int().
start_period = pd.period_range(
start=start_, periods=1, freq=self.freq_
)

# Compute time differences.
time_deltas = _get_period_diff_as_int(time_index, start_period)

elif time_index.is_numeric():
time_deltas = time_index - start_
else:
time_deltas = time_index - start_
Expand Down Expand Up @@ -293,8 +314,8 @@ def get_test_params(cls, parameter_set="default"):
]


def _period_to_int(x: pd.PeriodIndex | list[pd.offsets.DateOffset]) -> int:
return x.map(lambda y: y.n)
def _get_period_diff_as_int(x: pd.PeriodIndex, y: pd.PeriodIndex) -> pd.Index:
return x.astype(int) - y.astype(int)


def _remove_digits_from_str(x: str) -> str:
Expand Down