[ENH] k-nearest neighbors regressor: support for non-brute algorithms and non-precomputed mode to improve memory efficiency #6217

Merged
merged 3 commits on Apr 3, 2024
168 changes: 167 additions & 1 deletion sktime/regression/distance_based/_time_series_neighbors.py
@@ -11,8 +11,11 @@
__author__ = ["fkiraly"]
__all__ = ["KNeighborsTimeSeriesRegressor"]

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor

from sktime.datatypes import convert
from sktime.distances import pairwise_distance
from sktime.regression.base import BaseRegressor

@@ -48,7 +51,21 @@ class KNeighborsTimeSeriesRegressor(BaseRegressor):
one of: 'uniform', 'distance', or a callable function
algorithm : str, optional. default = 'brute'
search method for neighbours
one of {'auto', 'ball_tree', 'kd_tree', 'brute'}
one of {'auto', 'ball_tree', 'brute', 'brute_incr'}

* 'brute' precomputes the full distance matrix and applies
``sklearn`` ``KNeighborsRegressor`` to it directly.
This is not memory efficient, since memory scales with the size
of the distance matrix, but it may be more runtime efficient.
* 'brute_incr' passes the distance callable to ``sklearn``
``KNeighborsRegressor`` with ``algorithm='brute'``. Distances are
then computed incrementally rather than precomputed, which is
memory efficient on large datasets but may be less runtime efficient.
* 'ball_tree' uses a ball tree to find the nearest neighbors,
via ``KNeighborsRegressor`` from ``sklearn``. This may be more
runtime and memory efficient on mid-to-large datasets, although
individual distance computations may be slower.
(a usage sketch of all three modes follows at the end of this diff)

distance : str or callable, optional. default = 'dtw'
distance measure between time series
if str, must be one of the following strings:
@@ -172,6 +189,67 @@ def _distance(self, X, X2):
else:
return distance(X, X2)

def _one_element_distance_npdist(self, x, y, n_vars=None):
# element-wise distance, numpy1D x numpy1D -> float, wrapping the numba pairwise distance
if n_vars is None:
n_vars = self.n_vars_
x = np.reshape(x, (1, n_vars, -1))
y = np.reshape(y, (1, n_vars, -1))
return self._distance(x, y)[0, 0]
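
``sklearn`` hands a metric callable two flat 1D rows; the wrapper above reshapes each back into a ``(1, n_vars, n_timepoints)`` panel block before calling the numba distance. A minimal sketch of the round-trip this relies on, assuming row-major (C-order) flattening:

import numpy as np

n_vars, n_timepoints = 2, 5
x_panel = np.arange(10.0).reshape(1, n_vars, n_timepoints)  # one multivariate series

x_flat = x_panel.reshape(-1)                  # the flat row sklearn passes to the metric
x_back = np.reshape(x_flat, (1, n_vars, -1))  # the wrapper's inverse reshape

assert np.array_equal(x_back, x_panel)        # lossless for equal-length series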

def _one_element_distance_sktime_dist(self, x, y, n_vars=None):
# element-wise distance, numpy1D x numpy1D -> float, wrapping a sktime pairwise distance
if n_vars is None:
n_vars = self.n_vars_
if n_vars == 1:
x = np.reshape(x, (1, n_vars, -1))
y = np.reshape(y, (1, n_vars, -1))
elif self._X_metadata["is_equal_length"]:
x = np.reshape(x, (-1, n_vars))
y = np.reshape(y, (-1, n_vars))
x_ix = pd.MultiIndex.from_product([[0], range(len(x))])
y_ix = pd.MultiIndex.from_product([[0], range(len(y))])
x = pd.DataFrame(x, index=x_ix)
y = pd.DataFrame(y, index=y_ix)
else: # multivariate, unequal length
# in _convert_X_to_sklearn, we have encoded the length as the first column
# this was coerced to float, so we round to avoid rounding errors
x_len = round(x[0])
y_len = round(y[0])
# pd.pivot switches the axes, compared to numpy
x = np.reshape(x[1:], (n_vars, -1)).T
y = np.reshape(y[1:], (n_vars, -1)).T
# cut to length
x = x[:x_len]
y = y[:y_len]
x_ix = pd.MultiIndex.from_product([[0], range(x_len)])
y_ix = pd.MultiIndex.from_product([[0], range(y_len)])
x = pd.DataFrame(x, index=x_ix)
y = pd.DataFrame(y, index=y_ix)
return self._distance(x, y)[0, 0]
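
A worked sketch of the multivariate unequal-length branch, decoding one toy row in the format produced by ``_convert_X_to_sklearn`` below (true length in the first entry, variables stacked variable-major, zero padding):

import numpy as np
import pandas as pd

n_vars = 2
# a 2-variable series of true length 3, padded to length 4
row = np.array([3.0,              # encoded length
                1., 2., 3., 0.,   # variable 0, zero-padded
                4., 5., 6., 0.])  # variable 1, zero-padded

x_len = round(row[0])
x = np.reshape(row[1:], (n_vars, -1)).T[:x_len]  # (n_timepoints, n_vars), cut to length
x_ix = pd.MultiIndex.from_product([[0], range(x_len)])
print(pd.DataFrame(x, index=x_ix))  # a one-instance pd-multiindex panel again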

def _convert_X_to_sklearn(self, X):
"""Convert X to 2D numpy for sklearn."""
# special treatment for unequal length series
if not self._X_metadata["is_equal_length"]:
# then we know we are dealing with pd-multiindex
# as a trick to deal with unequal length data,
# we flatten the series and encode the true length as the first column
X_w_ix = X.reset_index(-1)
X_pivot = X_w_ix.pivot(columns=[X_w_ix.columns[0]])
# fillna since this creates nan but sklearn does not accept these
# the fill value does not matter as the distance ignores it
X_pivot = X_pivot.fillna(0).to_numpy()
X_lens = X.groupby(X_w_ix.index).size().to_numpy()
# add the first column, encoding length of individual series
X_w_lens = np.concatenate([X_lens[:, None], X_pivot], axis=1)
return X_w_lens

# equal length series case
if isinstance(X, np.ndarray):
X_mtype = "numpy3D"
else:
X_mtype = "pd-multiindex"
return convert(X, from_type=X_mtype, to_type="numpyflat")
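
A worked sketch of this encoding on a toy univariate panel of two unequal-length series in ``pd-multiindex`` format (column name ``v0`` is illustrative):

import numpy as np
import pandas as pd

ix = pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0), (1, 1), (1, 2)])
X = pd.DataFrame({"v0": [1.0, 2.0, 3.0, 4.0, 5.0]}, index=ix)

X_w_ix = X.reset_index(-1)  # time index becomes the first column
X_pivot = X_w_ix.pivot(columns=[X_w_ix.columns[0]]).fillna(0).to_numpy()
X_lens = X.groupby(X_w_ix.index).size().to_numpy()
print(np.concatenate([X_lens[:, None], X_pivot], axis=1))
# [[2. 1. 2. 0.]     length 2, values 1, 2, zero padding
#  [3. 3. 4. 5.]]    length 3, values 3, 4, 5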

def _fit(self, X, y):
"""Fit the model using X as training data and y as target values.

@@ -181,6 +259,44 @@ def _fit(self, X, y):
y : {array-like, sparse matrix}
Target values of shape = [n_samples]
"""
self.n_vars_ = X.shape[1]
if self.algorithm == "brute":
return self._fit_precomp(X=X, y=y)
else:
return self._fit_dist(X=X, y=y)

def _fit_dist(self, X, y):
"""Fit the model using adapted distance metric."""
# sklearn expects the distance callable element-wise,
# numpy1D x numpy1D -> float
# sktime distance classes are Panel x Panel -> numpy2D
# and the numba distances are numpy3D x numpy3D -> numpy2D
# so we need to wrap the sktime distances
if isinstance(self.distance, str):
# numba distances
metric = self._one_element_distance_npdist
else:
# sktime distance classes
metric = self._one_element_distance_sktime_dist

algorithm = self.algorithm
if algorithm == "brute_incr":
algorithm = "brute"

self.knn_estimator_ = KNeighborsRegressor(
n_neighbors=self.n_neighbors,
algorithm=algorithm,
metric=metric,
leaf_size=self.leaf_size,
n_jobs=self.n_jobs,
)

X = self._convert_X_to_sklearn(X)
self.knn_estimator_.fit(X, y)
return self
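
The underlying ``sklearn`` pattern, sketched standalone with a plain Euclidean stand-in for the wrapped sktime distance:

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

def toy_metric(x, y):
    # stand-in for the element-wise wrapper; sklearn calls it on pairs of flat rows
    return float(np.linalg.norm(x - y))

rng = np.random.default_rng(0)
X_flat, y = rng.normal(size=(10, 50)), rng.normal(size=10)  # 10 flattened series

knn = KNeighborsRegressor(n_neighbors=3, algorithm="brute", metric=toy_metric)
knn.fit(X_flat, y)
print(knn.predict(X_flat[:2]))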

def _fit_precomp(self, X, y):
"""Fit the model using precomputed distance matrix."""
# store full data as indexed X
self._X = X

@@ -240,9 +356,59 @@ def _predict(self, X):
y : array of shape [n_samples] or [n_samples, n_outputs]
Predicted target values for each data sample.
"""
if self.algorithm == "brute":
return self._predict_precomp(X)
else:
return self._predict_dist(X)

def _predict_dist(self, X):
"""Predict using adapted distance metric."""
X = self._convert_X_to_sklearn(X)
y_pred = self.knn_estimator_.predict(X)
return y_pred

def _predict_precomp(self, X):
"""Predict using precomputed distance matrix."""
# self._X is the full training data stored in _fit
dist_mat = self._distance(X, self._X)

y_pred = self.knn_estimator_.predict(dist_mat)

return y_pred
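
``_fit_precomp`` is elided above, but this predict path implies the standard ``sklearn`` ``metric='precomputed'`` pattern; a sketch with a Euclidean stand-in for ``pairwise_distance``:

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

def pairwise(A, B):
    # stand-in for sktime's pairwise_distance: (n, d) x (m, d) -> (n, m)
    return np.linalg.norm(A[:, None, :] - B[None, :, :], axis=-1)

rng = np.random.default_rng(0)
X_train, X_test = rng.normal(size=(10, 50)), rng.normal(size=(3, 50))
y_train = rng.normal(size=10)

knn = KNeighborsRegressor(n_neighbors=3, metric="precomputed")
knn.fit(pairwise(X_train, X_train), y_train)   # square train-vs-train matrix
print(knn.predict(pairwise(X_test, X_train)))  # test-vs-train cross matrix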

@classmethod
def get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator.

Parameters
----------
parameter_set : str, default="default"
Name of the set of test parameters to return, for use in tests. If no
special parameters are defined for a value, will return ``"default"`` set.

Returns
-------
params : dict or list of dict, default={}
Parameters to create testing instances of the class.
Each dict contains parameters to construct an "interesting" test instance, i.e.,
``MyClass(**params)`` or ``MyClass(**params[i])`` creates a valid test
instance.
``create_test_instance`` uses the first (or only) dictionary in ``params``.
"""
param1 = {
"n_neighbors": 1,
"weights": "uniform",
"algorithm": "auto",
"distance": "euclidean",
"distance_params": None,
"n_jobs": None,
}
param2 = {
"n_neighbors": 3,
"weights": "distance",
"algorithm": "ball_tree",
"distance": "dtw",
"distance_params": {"window": 0.5},
"n_jobs": -1,
}
return [param1, param2]
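
Taken together, a usage sketch of the three ``algorithm`` modes on toy ``numpy3D`` data (assuming the regressor's public import path ``sktime.regression.distance_based``):

import numpy as np
from sktime.regression.distance_based import KNeighborsTimeSeriesRegressor

rng = np.random.default_rng(0)
X, y = rng.normal(size=(20, 1, 50)), rng.normal(size=20)  # 20 series, 1 var, 50 points

# "brute" precomputes the 20 x 20 distance matrix; "brute_incr" and
# "ball_tree" compute distances inside sklearn, trading runtime for memory
for algorithm in ["brute", "brute_incr", "ball_tree"]:
    reg = KNeighborsTimeSeriesRegressor(
        n_neighbors=3, algorithm=algorithm, distance="dtw"
    )
    reg.fit(X, y)
    print(algorithm, reg.predict(X[:2]))

# the test parameter sets above also construct valid instances
reg = KNeighborsTimeSeriesRegressor(**KNeighborsTimeSeriesRegressor.get_test_params()[1])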