[ENH] k-nearest neighbors classifier: support for non-brute algorithms and non-precomputed mode to improve memory efficiency (#5937)

This PR adds code to `KNeighborsTimeSeriesClassifier` that passes a callable to the internal sklearn estimator to avoid out-of-memory (OOM) errors when precomputing the distance matrix, trading compute efficiency for memory efficiency. Adds the following algorithms - with test coverage - to `KNeighborsTimeSeriesClassifier`: * `ball_tree` * `brute_incr` - brute force, but distances are not all precomputed. These strategies also allow for unequal length time series, given the internal adapter encoding. Removes `kd_tree` from the docstring, which did not work previously and cannot be interfaced, as `sklearn` does not allow this to be called together with a custom distance. Potentially fixes #5914, and fixes #2774.
sktime · Feb 18, 2024 · 2442e5a · 2442e5a
1 parent 63a95f2
commit 2442e5a
Show file tree

Hide file tree

Showing 2 changed files with 175 additions and 30 deletions.
diff --git a/sktime/classification/distance_based/_time_series_neighbors.py b/sktime/classification/distance_based/_time_series_neighbors.py
@@ -21,10 +21,11 @@
 from inspect import signature
 
 import numpy as np
+import pandas as pd
 from sklearn.neighbors import KNeighborsClassifier
 
 from sktime.classification.base import BaseClassifier
-from sktime.datatypes import check_is_mtype
+from sktime.datatypes import convert
 from sktime.distances import pairwise_distance
 
 # add new distance string codes here
@@ -66,28 +67,45 @@ class KNeighborsTimeSeriesClassifier(BaseClassifier):
         - [callable] : a user-defined function which accepts an
           array of distances, and returns an array of the same shape
           containing the weights.
+
     algorithm : str, optional. default = 'brute'
         search method for neighbours
-        one of {'auto', 'ball_tree', 'kd_tree', 'brute'}
+        one of {'ball_tree', 'brute', 'brute_incr'}
+
+        * 'brute' precomputes the distance matrix and applies
+          ``sklearn`` ``KNeighborsClassifier`` directly.
+          This algorithm is not memory efficient as it scales with the size
+          of the distance matrix, but may be more runtime efficient.
+        * 'brute_incr' passes the distance to ``sklearn`` ``KNeighborsClassifier``,
+          with ``algorithm='brute'``. This is useful for large datasets,
+          for memory efficiency, as the distance is used incrementally,
+          without precomputation. However, this may be less runtime efficient.
+        * 'ball_tree' uses a ball tree to find the nearest neighbors,
+          using ``KNeighborsClassifier`` from ``sklearn``.
+          May be more runtime and memory efficient on mid-to-large datasets,
+          however, the distance computation may be slower.
+
     distance : str or callable, optional. default ='dtw'
         distance measure between time series
-        if str, must be one of the following strings:
-            'euclidean', 'squared', 'dtw', 'ddtw', 'wdtw', 'wddtw',
-            'lcss', 'edr', 'erp', 'msm', 'twe'
-        this will substitute a hard-coded distance metric from sktime.distances
-        If non-class callable, parameters can be passed via distance_params
-            Example: knn_dtw = KNeighborsTimeSeriesClassifier(
-                                    distance='dtw', distance_params={'epsilon':0.1})
-        if any callable, must be of signature (X: Panel, X2: Panel) -> np.ndarray
-            output must be mxn array if X is Panel of m Series, X2 of n Series
-            if distance_mtype is not set, must be able to take
-                X, X2 which are pd_multiindex and numpy3D mtype
-        can be pairwise panel transformer inheriting from BasePairwiseTransformerPanel
+
+        * if str, must be one of the following strings:
+          'euclidean', 'squared', 'dtw', 'ddtw', 'wdtw', 'wddtw',
+          'lcss', 'edr', 'erp', 'msm', 'twe'
+          this will substitute a hard-coded distance metric from sktime.distances
+        * if str or non-class callable, parameters can be passed via distance_params
+          Example: knn_dtw = KNeighborsTimeSeriesClassifier(
+          distance='dtw', distance_params={'epsilon':0.1})
+        * if any callable, must be of signature (X: Panel, X2: Panel) -> np.ndarray
+          output must be mxn array if X is Panel of m Series, X2 of n Series
+          if distance_mtype is not set, must be able to take
+          X, X2 which are pd_multiindex and numpy3D mtype
+          can be pairwise panel transformer inheriting from BasePairwiseTransformerPanel
+
     distance_params : dict, optional. default = None.
         dictionary for distance parameters, in case that distance is a str or callable
     distance_mtype : str, or list of str optional. default = None.
         mtype that distance expects for X and X2, if a callable
-            only set this if distance is not BasePairwiseTransformerPanel descendant
+        only set this if distance is not BasePairwiseTransformerPanel descendant
     pass_train_distances : bool, optional, default = False.
         Whether distances between training points are computed and passed to sklearn.
         Passing is superfluous for algorithm='brute', but may have impact otherwise.
@@ -212,6 +230,67 @@ def _distance(self, X, X2=None):
                 else:
                     return distance(X, **distance_params)
 
+    def _one_element_distance_npdist(self, x, y, n_vars=None):
+        if n_vars is None:
+            n_vars = self.n_vars_
+        x = np.reshape(x, (1, n_vars, -1))
+        y = np.reshape(y, (1, n_vars, -1))
+        return self._distance(x, y)[0, 0]
+
+    def _one_element_distance_sktime_dist(self, x, y, n_vars=None):
+        if n_vars is None:
+            n_vars = self.n_vars_
+        if n_vars == 1:
+            x = np.reshape(x, (1, n_vars, -1))
+            y = np.reshape(y, (1, n_vars, -1))
+        elif self._X_metadata["is_equal_length"]:
+            x = np.reshape(x, (-1, n_vars))
+            y = np.reshape(y, (-1, n_vars))
+            x_ix = pd.MultiIndex.from_product([[0], range(len(x))])
+            y_ix = pd.MultiIndex.from_product([[0], range(len(y))])
+            x = pd.DataFrame(x, index=x_ix)
+            y = pd.DataFrame(y, index=y_ix)
+        else:  # multivariate, unequal length
+            # in _convert_X_to_sklearn, we have encoded the length as the first column
+            # this was coerced to float, so we round to avoid rounding errors
+            x_len = round(x[0])
+            y_len = round(y[0])
+            # pd.pivot switches the axes, compared to numpy
+            x = np.reshape(x[1:], (n_vars, -1)).T
+            y = np.reshape(y[1:], (n_vars, -1)).T
+            # cut to length
+            x = x[:x_len]
+            y = y[:y_len]
+            x_ix = pd.MultiIndex.from_product([[0], range(x_len)])
+            y_ix = pd.MultiIndex.from_product([[0], range(y_len)])
+            x = pd.DataFrame(x, index=x_ix)
+            y = pd.DataFrame(y, index=y_ix)
+        return self._distance(x, y)[0, 0]
+
+    def _convert_X_to_sklearn(self, X):
+        """Convert X to 2D numpy for sklearn."""
+        # special treatment for unequal length series
+        if not self._X_metadata["is_equal_length"]:
+            # then we know we are dealing with pd-multiindex
+            # as a trick to deal with unequal length data,
+            # we flatten the series and encode the length as the first column
+            X_w_ix = X.reset_index(-1)
+            X_pivot = X_w_ix.pivot(columns=[X_w_ix.columns[0]])
+            # fillna since this creates nan but sklearn does not accept these
+            # the fill value does not matter as the distance ignores it
+            X_pivot = X_pivot.fillna(0).to_numpy()
+            X_lens = X.groupby(X_w_ix.index).size().to_numpy()
+            # add the first column, encoding length of individual series
+            X_w_lens = np.concatenate([X_lens[:, None], X_pivot], axis=1)
+            return X_w_lens
+
+        # equal length series case
+        if isinstance(X, np.ndarray):
+            X_mtype = "numpy3D"
+        else:
+            X_mtype = "pd-multiindex"
+        return convert(X, from_type=X_mtype, to_type="numpyflat")
+
     def _fit(self, X, y):
         """Fit the model using X as training data and y as target values.
 
@@ -222,22 +301,57 @@ def _fit(self, X, y):
         y : {array-like, sparse matrix}
             Target values of shape = [n]
         """
+        self.n_vars_ = X.shape[1]
+        if self.algorithm == "brute":
+            return self._fit_precomp(X=X, y=y)
+        else:
+            return self._fit_dist(X=X, y=y)
+
+    def _fit_dist(self, X, y):
+        """Fit the model using adapted distance metric."""
+        # sklearn wants the distance callable element-wise,
+        # numpy1D x numpy1D -> float
+        # sktime distance classes are Panel x Panel -> numpy2D
+        # and the numba distances are numpy3D x numpy3D -> numpy2D
+        # so we need to wrap the sktime distances
+        if isinstance(self.distance, str):
+            # numba distances
+            metric = self._one_element_distance_npdist
+        else:
+            # sktime distance classes
+            metric = self._one_element_distance_sktime_dist
+
+        algorithm = self.algorithm
+        if algorithm == "brute_incr":
+            algorithm = "brute"
+
+        self.knn_estimator_ = KNeighborsClassifier(
+            n_neighbors=self.n_neighbors,
+            algorithm=algorithm,
+            metric=metric,
+            leaf_size=self.leaf_size,
+            n_jobs=self.n_jobs,
+            weights=self.weights,
+        )
+
+        X = self._convert_X_to_sklearn(X)
+        self.knn_estimator_.fit(X, y)
+        return self
+
+    def _fit_precomp(self, X, y):
+        """Fit the model using precomputed distance matrix."""
         # store full data as indexed X
         self._X = X
 
         if self.pass_train_distances:
             dist_mat = self._distance(X)
         else:
+            n = self._X_metadata["n_instances"]
             # if we do not want/need to pass train-train distances,
             #   we still need to pass a zeros matrix, this means "do not consider"
             # citing the sklearn KNeighborsClassifier docs on distance matrix input:
             # "X may be a sparse graph, in which case only "nonzero" elements
             #   may be considered neighbors."
-            X_inner_mtype = self.get_tag("X_inner_mtype")
-            _, _, X_meta = check_is_mtype(
-                X, X_inner_mtype, return_metadata=True, msg_return_dict="list"
-            )
-            n = X_meta["n_instances"]
             dist_mat = np.zeros([n, n], dtype="float")
 
         self.knn_estimator_.fit(dist_mat, y)
@@ -294,11 +408,22 @@ def _predict(self, X):
         y : array of shape [n_samples] or [n_samples, n_outputs]
             Class labels for each data sample.
         """
+        if self.algorithm == "brute":
+            return self._predict_precomp(X)
+        else:
+            return self._predict_dist(X)
+
+    def _predict_dist(self, X):
+        """Predict using adapted distance metric."""
+        X = self._convert_X_to_sklearn(X)
+        y_pred = self.knn_estimator_.predict(X)
+        return y_pred
+
+    def _predict_precomp(self, X):
+        """Predict using precomputed distance matrix."""
         # self._X should be the stored _X
         dist_mat = self._distance(X, self._X)
-
         y_pred = self.knn_estimator_.predict(dist_mat)
-
         return y_pred
 
     def _predict_proba(self, X):
@@ -316,11 +441,22 @@ def _predict_proba(self, X):
             The class probabilities of the input samples. Classes are ordered
             by lexicographic order.
         """
+        if self.algorithm == "brute":
+            return self._predict_proba_precomp(X)
+        else:
+            return self._predict_proba_dist(X)
+
+    def _predict_proba_dist(self, X):
+        """Predict (proba) using adapted distance metric."""
+        X = self._convert_X_to_sklearn(X)
+        y_pred = self.knn_estimator_.predict_proba(X)
+        return y_pred
+
+    def _predict_proba_precomp(self, X):
+        """Predict (proba) using precomputed distance matrix."""
         # self._X should be the stored _X
         dist_mat = self._distance(X, self._X)
-
         y_pred = self.knn_estimator_.predict_proba(dist_mat)
-
         return y_pred
 
     @classmethod
@@ -346,15 +482,25 @@ def get_test_params(cls, parameter_set="default"):
             `create_test_instance` uses the first (or only) dictionary in `params`.
         """
         # non-default distance and algorithm
-        params1 = {"distance": "euclidean"}
+        params0 = {"distance": "euclidean"}
 
         # testing distance_params
-        params2 = {"distance": "dtw", "distance_params": {"epsilon": 0.1}}
+        params1 = {"distance": "dtw", "distance_params": {"epsilon": 0.1}}
 
         # testing that callables/classes can be passed
         from sktime.dists_kernels.compose_tab_to_panel import AggrDist
 
         dist = AggrDist.create_test_instance()
-        params3 = {"distance": dist}
-
-        return [params1, params2, params3]
+        params2 = {"distance": dist}
+
+        params3 = {"algorithm": "ball_tree"}
+        # "kd_tree" is not covered: sklearn does not allow it with a custom distance
+        params4 = {
+            "algorithm": "brute_incr",
+            "distance": "dtw",
+            "distance_params": {"epsilon": 0.1},
+        }
+        params5 = {"algorithm": "ball_tree", "distance": dist}
+
+        params = [params0, params1, params2, params3, params4, params5]
+        return params
diff --git a/sktime/utils/profiling.py b/sktime/utils/profiling.py
@@ -25,7 +25,6 @@ def profile_classifier(
 
     Of each experiment, time spent in fit and time spent in predict is measured.
 
-
     Parameters
     ----------
     est : sktime classifier, BaseClassifier descendant, object or class