Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Adding ADI/CV Feature Extractor #6336

Merged
merged 20 commits into from
May 19, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/source/api_reference/transformations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,14 @@ These transformers extract simple summary features.
RandomIntervalFeatureExtractor
FittedParamExtractor

.. currentmodule:: sktime.transformations.series.adi_cv

.. autosummary::
:toctree: auto_generated/
:template: class.rst

ADICVTransformer

Shapelets, wavelets, and convolution
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
217 changes: 217 additions & 0 deletions sktime/transformations/series/adi_cv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
"""Feature transformer that returns features of time series including categories."""

import pandas as pd

from sktime.transformations.base import BaseTransformer

# Module-level author credit; the same name is repeated in the "authors"
# entry of ``ADICVTransformer._tags``.
__author__ = ["shlok191"]


class ADICVTransformer(BaseTransformer):
    """Extract ADI, CV2 and the Syntetos/Boylan demand class of a time series.

    Computes two summary statistics of a univariate demand series and uses
    them to assign the series to one of four demand categories.

    1. Average Demand Interval (ADI): number of time periods in the series
       divided by the number of periods with non-zero demand, i.e., the
       average number of periods per demand occurrence.

    2. Squared coefficient of variation (CV2): variance of the non-zero
       demands divided by the square of their mean.

    3. Class: category obtained by comparing ADI and CV2 with their
       thresholds:

       * ``"smooth"``: ADI <= ``adi_threshold`` and CV2 <= ``cv_threshold``
       * ``"erratic"``: ADI <= ``adi_threshold`` and CV2 > ``cv_threshold``
       * ``"intermittent"``: ADI > ``adi_threshold`` and CV2 <= ``cv_threshold``
       * ``"lumpy"``: ADI > ``adi_threshold`` and CV2 > ``cv_threshold``

    Parameters
    ----------
    features : list of str or None, default=None
        Names of the output columns, in the order they should be returned.
        ``None`` is equivalent to ``["adi", "cv2", "class"]``. If a list is
        passed it must contain all of ``"adi"``, ``"cv2"`` and ``"class"``
        and no other names.

    adi_threshold : float, default=1.32
        ADI cut-off used for classifying the time series.

    cv_threshold : float, default=0.49
        CV2 cut-off used for classifying the time series.

    Examples
    --------
    >>> from sktime.transformations.series.adi_cv import ADICVTransformer
    >>> from sktime.datasets import load_airline
    >>> y = load_airline()
    >>> transformer = ADICVTransformer()
    >>> y_hat = transformer.fit_transform(y)

    References
    ----------
    [1]: John E. Boylan, Aris Syntetos: “The Accuracy of Intermittent
    Demand Estimates.” International Journal of Forecasting, 1 Apr. 2005
    """

    _tags = {
        "scitype:transform-input": "Series",
        "scitype:transform-output": "Primitives",
        "scitype:instancewise": False,
        "scitype:transform-labels": "None",
        "X_inner_mtype": "pd.DataFrame",
        "y_inner_mtype": "None",
        "univariate-only": True,  # Demand being the only variable
        "requires_y": False,
        "fit_is_empty": True,
        "capability:inverse_transform": False,
        "capability:unequal_length": False,
        "handles-missing-data": False,
        "authors": ["shlok191"],
        "maintainers": ["shlok191"],
    }

    def __init__(self, features=None, adi_threshold=1.32, cv_threshold=0.49):
        """Store the parameters and validate the requested feature list.

        Parameters
        ----------
        features : list of str or None, optional
            Output columns and their order. Defaults to None, which selects
            ``["adi", "cv2", "class"]``.

        adi_threshold : float, optional
            Threshold for the Average Demand Interval. Defaults to 1.32.

        cv_threshold : float, optional
            Threshold for the squared coefficient of variation.
            Defaults to 0.49.

        Raises
        ------
        ValueError
            If ``features`` is given and does not contain all of 'adi',
            'cv2' and 'class', or contains any other entry.
        """
        self.adi_threshold = adi_threshold
        self.cv_threshold = cv_threshold
        self.features = features

        self.features_internal = features

        # Initialize the parent class
        super().__init__()

        allowed = ("adi", "cv2", "class")

        if features is None:
            # _transform always works from an explicit list of column names.
            self.features_internal = list(allowed)
            return

        missing = [name for name in allowed if name not in features]
        # Unknown names are rejected here; otherwise they would only surface
        # as a KeyError when _transform selects the output columns.
        unknown = [name for name in features if name not in allowed]

        if missing or unknown:
            raise ValueError(
                "The features list must either be None or include adi, "
                "cv2, and class as elements. "
                f"Missing: {missing}; unknown: {unknown}."
            )

        # Copy so that later changes to the caller's list do not leak in,
        # and so that a tuple is not mistaken for a multi-level column key.
        self.features_internal = list(features)

    def _transform(self, X, y=None):
        """Compute ADI, CV2 and the demand class of a single series.

        private _transform containing core logic, called from transform

        Parameters
        ----------
        X : pd.DataFrame
            Demand series; only the first column is used (the
            "univariate-only" tag is set for this transformer).

        y : None, default=None
            Ignored.

        Returns
        -------
        X_transformed : pd.DataFrame
            One-row DataFrame whose columns are those named in the feature
            list given at construction, in that order. By default:

            1. ``"adi"``: Average Demand Interval
            2. ``"cv2"``: squared coefficient of variation of non-zero demand
            3. ``"class"``: one of "smooth", "erratic", "intermittent",
               "lumpy"

        Raises
        ------
        ZeroDivisionError
            If the series contains no non-zero value, in which case ADI is
            undefined.
        """
        demand = X.iloc[:, 0]

        # Select rows with a boolean mask. Passing the tuple returned by
        # ``ndarray.nonzero()`` of a 2D array to ``iloc`` would be read as
        # (row indexer, column indexer) and build a k x k frame.
        non_zero = demand[demand.to_numpy() != 0]
        n_non_zero = len(non_zero)

        if n_non_zero == 0:
            # Same exception type as a bare division by zero would raise,
            # but with a message that states the cause.
            raise ZeroDivisionError(
                "ADI is undefined: the series contains no non-zero demand."
            )

        # ADI: periods per demand occurrence. A series with demand in every
        # period therefore has ADI == 1.
        adi_value = len(demand) / n_non_zero

        # CV2: squared coefficient of variation of the non-zero demands.
        # With a single non-zero value the sample variance is NaN, so CV2 is
        # NaN and the comparison below evaluates to False.
        cv2_value = non_zero.var() / non_zero.mean() ** 2

        adi_low = bool(adi_value <= self.adi_threshold)
        cv2_low = bool(cv2_value <= self.cv_threshold)

        class_by_region = {
            (True, True): "smooth",
            (True, False): "erratic",
            (False, True): "intermittent",
            (False, False): "lumpy",
        }
        class_type = class_by_region[(adi_low, cv2_low)]

        computed = {"adi": adi_value, "cv2": cv2_value, "class": class_type}
        requested = list(self.features_internal)

        df = pd.DataFrame(
            {name: [value] for name, value in computed.items() if name in requested}
        )

        # Return the columns in the order requested at construction
        return df.loc[:, requested]

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests.
            If no special parameters are defined for a value, will return
            `"default"` set.

        Returns
        -------
        params : dict or list of dict, default = {}
            Parameters to create testing instances of the class
            Each dict are parameters to construct an "interesting" test case
        """
        params = [
            # Default configuration
            {"features": None, "adi_threshold": 1.32, "cv_threshold": 0.49},
            # Explicit, reordered feature list with both thresholds at zero
            {
                "features": ["class", "adi", "cv2"],
                "adi_threshold": 0.0,
                "cv_threshold": 0.0,
            },
        ]

        return params
132 changes: 132 additions & 0 deletions sktime/transformations/series/tests/test_adi_cv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
"""Tests for adi_cv transformers for time series Series."""

import numpy as np
import pandas as pd
import pytest

from sktime.transformations.series.adi_cv import ADICVTransformer


def _generate_smooth_series(size: int = 750):
"""Generates a demand time series of the "smooth" category.

Parameters
----------
size : int, optional
The size of the generated time series, by default 750

Returns
-------
Pandas.Series
Returns the generated series in the Pandas Series format.
"""

# Generating a smooth series, we keep variance low by keeping the
# standard deviation to 0.25. Denoted as scale according to numpy docs

smooth_series = np.random.normal(loc=10, scale=0.25, size=size)

return pd.Series(smooth_series)


def _generate_erratic_series(size: int = 750):
"""Generates a demand time series of the "erratic" category.

Parameters
----------
size : int, optional
The size of the generated time series, by default 750

Returns
-------
Pandas.Series
Returns the generated series in the Pandas Series format.
"""

# Generating an erratic series, we keep variance high by keeping the
# standard deviation to 1 and then squaring the values

erratic_series = np.random.normal(loc=10, scale=2.5, size=size) ** 2

return pd.Series(erratic_series)


def _generate_intermittent_series(size: int = 750):
"""Generates a demand time series of the "intermittent" category.

Parameters
----------
size : int, optional
The size of the generated time series, by default 750

Returns
-------
Pandas.Series
Returns the generated series in the Pandas Series format.
"""

# Generating an intermittent series, we keep ADI high by
# setting only 10% of all values to non-zero values

intermittent_series = np.zeros(shape=(size,))
non_zero_indices = np.random.choice(size, size=size // 10, replace=False)

intermittent_series[non_zero_indices] = np.random.normal(10, 0.25, size=size // 10)

return pd.Series(intermittent_series)


def _generate_lumpy_series(size: int = 750):
"""Generates a demand time series of the "lumpy" category.

Parameters
----------
size : int, optional
The size of the generated time series, by default 750

Returns
-------
Pandas.Series
Returns the generated series in the Pandas Series format.
"""

# Generating a lumpy series, we keep ADI high by
# setting only 10% of all values to non-zero values

lumpy_series = np.zeros(shape=(size,))
non_zero_indices = np.random.choice(size, size=size // 10, replace=False)

lumpy_series[non_zero_indices] = np.random.normal(10, 2.5, size=size // 10) ** 2

return pd.Series(lumpy_series)


# One parametrized case per demand category: (series generator, expected label)
@pytest.mark.parametrize(
    "series_generator, expected_class",
    [
        (_generate_smooth_series, "smooth"),
        (_generate_erratic_series, "erratic"),
        (_generate_intermittent_series, "intermittent"),
        (_generate_lumpy_series, "lumpy"),
    ],
)
def test_adi_cv_extractor(series_generator, expected_class):
    """Check that a generated series is assigned its intended demand class.

    Parameters
    ----------
    series_generator
        Callable returning a demand series as a pandas Series.

    expected_class
        Label the transformer is expected to put in the "class" column.
    """
    extractor = ADICVTransformer()
    features = extractor.fit_transform(series_generator())

    predicted_class = features["class"].iloc[0]
    assert predicted_class == expected_class