From 8c6ee1edfc5dacd0d9cff72f846a9e4a1c6ce2c8 Mon Sep 17 00:00:00 2001 From: Z-Fran <1396925302@qq.com> Date: Tue, 26 Mar 2024 08:00:58 +0800 Subject: [PATCH 1/3] [ENH] add test parameter sets for KNeighborsTimeSeriesRegressor --- .../distance_based/_time_series_neighbors.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/sktime/regression/distance_based/_time_series_neighbors.py b/sktime/regression/distance_based/_time_series_neighbors.py index 824561b0891..98dce25a20a 100644 --- a/sktime/regression/distance_based/_time_series_neighbors.py +++ b/sktime/regression/distance_based/_time_series_neighbors.py @@ -246,3 +246,40 @@ def _predict(self, X): y_pred = self.knn_estimator_.predict(dist_mat) return y_pred + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator. + + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return ``"default"`` set. + + Returns + ------- + params : dict or list of dict, default={} + Parameters to create testing instances of the class. + Each dict are parameters to construct an "interesting" test instance, i.e., + ``MyClass(**params)`` or ``MyClass(**params[i])`` creates a valid test + instance. + ``create_test_instance`` uses the first (or only) dictionary in ``params``. + """ + param1 = { + 'n_neighbors': 1, + 'weights': 'uniform', + 'algorithm': 'auto', + 'distance': 'euclidean', + 'distance_params': None, + 'n_jobs': None, + } + param2 = { + 'n_neighbors': 3, + 'weights': 'distance', + 'algorithm': 'ball_tree', + 'distance': 'dtw', + 'distance_params': {'window': 0.5}, + 'n_jobs': -1, + } + return [param1, param2] From 18183b555938fed5970795465363302d5888431c Mon Sep 17 00:00:00 2001 From: Z-Fran <1396925302@qq.com> Date: Wed, 27 Mar 2024 09:39:04 +0800 Subject: [PATCH 2/3] fix precommit --- .../distance_based/_time_series_neighbors.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/sktime/regression/distance_based/_time_series_neighbors.py b/sktime/regression/distance_based/_time_series_neighbors.py index 98dce25a20a..c93631faec9 100644 --- a/sktime/regression/distance_based/_time_series_neighbors.py +++ b/sktime/regression/distance_based/_time_series_neighbors.py @@ -267,19 +267,19 @@ def get_test_params(cls, parameter_set="default"): ``create_test_instance`` uses the first (or only) dictionary in ``params``. """ param1 = { - 'n_neighbors': 1, - 'weights': 'uniform', - 'algorithm': 'auto', - 'distance': 'euclidean', - 'distance_params': None, - 'n_jobs': None, + "n_neighbors": 1, + "weights": "uniform", + "algorithm": "auto", + "distance": "euclidean", + "distance_params": None, + "n_jobs": None, } param2 = { - 'n_neighbors': 3, - 'weights': 'distance', - 'algorithm': 'ball_tree', - 'distance': 'dtw', - 'distance_params': {'window': 0.5}, - 'n_jobs': -1, + "n_neighbors": 3, + "weights": "distance", + "algorithm": "ball_tree", + "distance": "dtw", + "distance_params": {"window": 0.5}, + "n_jobs": -1, } return [param1, param2] From 04a8a681b2a446bc20a8bf33a9b4c0ae6575f231 Mon Sep 17 00:00:00 2001 From: Z-Fran <1396925302@qq.com> Date: Thu, 28 Mar 2024 04:18:33 +0800 Subject: [PATCH 3/3] fix precomputed --- .../distance_based/_time_series_neighbors.py | 131 +++++++++++++++++- 1 file changed, 130 insertions(+), 1 deletion(-) diff --git a/sktime/regression/distance_based/_time_series_neighbors.py b/sktime/regression/distance_based/_time_series_neighbors.py index c93631faec9..477f600ccdc 100644 --- a/sktime/regression/distance_based/_time_series_neighbors.py +++ b/sktime/regression/distance_based/_time_series_neighbors.py @@ -11,8 +11,11 @@ __author__ = ["fkiraly"] __all__ = ["KNeighborsTimeSeriesRegressor"] +import numpy as np +import pandas as pd from sklearn.neighbors import KNeighborsRegressor +from sktime.datatypes import convert from sktime.distances import pairwise_distance from sktime.regression.base import BaseRegressor @@ -48,7 +51,21 @@ class KNeighborsTimeSeriesRegressor(BaseRegressor): one of: 'uniform', 'distance', or a callable function algorithm : str, optional. default = 'brute' search method for neighbours - one of {'auto', 'ball_tree', 'kd_tree', 'brute'} + one of {'auto', 'ball_tree', 'brute', 'brute_incr'} + + * 'brute' precomputes the distance matrix and applies + ``sklearn`` ``KNeighborsRegressor`` directly. + This algorithm is not memory efficient as it scales with the size + of the distance matrix, but may be more runtime efficient. + * 'brute_incr' passes the distance to ``sklearn`` ``KNeighborsRegressor``, + with ``algorithm='brute'``. This is useful for large datasets, + for memory efficiency, as the distance is used incrementally, + without precomputation. However, this may be less runtime efficient. + * 'ball_tree' uses a ball tree to find the nearest neighbors, + using ``KNeighborsRegressor`` from ``sklearn``. + May be more runtime and memory efficient on mid-to-large datasets, + however, the distance computation may be slower. + distance : str or callable, optional. default ='dtw' distance measure between time series if str, must be one of the following strings: @@ -172,6 +189,67 @@ def _distance(self, X, X2): else: return distance(X, X2) + def _one_element_distance_npdist(self, x, y, n_vars=None): + if n_vars is None: + n_vars = self.n_vars_ + x = np.reshape(x, (1, n_vars, -1)) + y = np.reshape(y, (1, n_vars, -1)) + return self._distance(x, y)[0, 0] + + def _one_element_distance_sktime_dist(self, x, y, n_vars=None): + if n_vars is None: + n_vars = self.n_vars_ + if n_vars == 1: + x = np.reshape(x, (1, n_vars, -1)) + y = np.reshape(y, (1, n_vars, -1)) + elif self._X_metadata["is_equal_length"]: + x = np.reshape(x, (-1, n_vars)) + y = np.reshape(y, (-1, n_vars)) + x_ix = pd.MultiIndex.from_product([[0], range(len(x))]) + y_ix = pd.MultiIndex.from_product([[0], range(len(y))]) + x = pd.DataFrame(x, index=x_ix) + y = pd.DataFrame(y, index=y_ix) + else: # multivariate, unequal length + # in _convert_X_to_sklearn, we have encoded the length as the first column + # this was coerced to float, so we round to avoid rounding errors + x_len = round(x[0]) + y_len = round(y[0]) + # pd.pivot switches the axes, compared to numpy + x = np.reshape(x[1:], (n_vars, -1)).T + y = np.reshape(y[1:], (n_vars, -1)).T + # cut to length + x = x[:x_len] + y = y[:y_len] + x_ix = pd.MultiIndex.from_product([[0], range(x_len)]) + y_ix = pd.MultiIndex.from_product([[0], range(y_len)]) + x = pd.DataFrame(x, index=x_ix) + y = pd.DataFrame(y, index=y_ix) + return self._distance(x, y)[0, 0] + + def _convert_X_to_sklearn(self, X): + """Convert X to 2D numpy for sklearn.""" + # special treatment for unequal length series + if not self._X_metadata["is_equal_length"]: + # then we know we are dealing with pd-multiindex + # as a trick to deal with unequal length data, + # we flatten encode the length as the first column + X_w_ix = X.reset_index(-1) + X_pivot = X_w_ix.pivot(columns=[X_w_ix.columns[0]]) + # fillna since this creates nan but sklearn does not accept these + # the fill value does not matter as the distance ignores it + X_pivot = X_pivot.fillna(0).to_numpy() + X_lens = X.groupby(X_w_ix.index).size().to_numpy() + # add the first column, encoding length of individual series + X_w_lens = np.concatenate([X_lens[:, None], X_pivot], axis=1) + return X_w_lens + + # equal length series case + if isinstance(X, np.ndarray): + X_mtype = "numpy3D" + else: + X_mtype = "pd-multiindex" + return convert(X, from_type=X_mtype, to_type="numpyflat") + def _fit(self, X, y): """Fit the model using X as training data and y as target values. @@ -181,6 +259,44 @@ def _fit(self, X, y): y : {array-like, sparse matrix} Target values of shape = [n_samples] """ + self.n_vars_ = X.shape[1] + if self.algorithm == "brute": + return self._fit_precomp(X=X, y=y) + else: + return self._fit_dist(X=X, y=y) + + def _fit_dist(self, X, y): + """Fit the model using adapted distance metric.""" + # sklearn wants distance callabel element-wise, + # numpy1D x numpy1D -> float + # sktime distance classes are Panel x Panel -> numpy2D + # and the numba distances are numpy3D x numpy3D -> numpy2D + # so we need to wrap the sktime distances + if isinstance(self.distance, str): + # numba distances + metric = self._one_element_distance_npdist + else: + # sktime distance classes + metric = self._one_element_distance_sktime_dist + + algorithm = self.algorithm + if algorithm == "brute_incr": + algorithm = "brute" + + self.knn_estimator_ = KNeighborsRegressor( + n_neighbors=self.n_neighbors, + algorithm=algorithm, + metric=metric, + leaf_size=self.leaf_size, + n_jobs=self.n_jobs, + ) + + X = self._convert_X_to_sklearn(X) + self.knn_estimator_.fit(X, y) + return self + + def _fit_precomp(self, X, y): + """Fit the model using precomputed distance matrix.""" # store full data as indexed X self._X = X @@ -240,6 +356,19 @@ def _predict(self, X): y : array of shape [n_samples] or [n_samples, n_outputs] Class labels for each data sample. """ + if self.algorithm == "brute": + return self._predict_precomp(X) + else: + return self._predict_dist(X) + + def _predict_dist(self, X): + """Predict using adapted distance metric.""" + X = self._convert_X_to_sklearn(X) + y_pred = self.knn_estimator_.predict(X) + return y_pred + + def _predict_precomp(self, X): + """Predict using precomputed distance matrix.""" # self._X should be the stored _X dist_mat = self._distance(X, self._X)