Skip to content

Commit

Permalink
[ENH] k-nearest neighbors classifier: support for non-brute algorithm…
Browse files Browse the repository at this point in the history
…s and non-precomputed mode to improve memory efficiency (#5937)

This PR adds code to `KNeighborsTimeSeriesClassifier` that passes a
callable to the internal sklearn estimator to avoid out-of-memory (OOM) errors
when precomputing the distance matrix, trading compute efficiency for memory efficiency.

Adds the following algorithms - with test coverage - to
`KNeighborsTimeSeriesClassifier`:

* `ball_tree`
* `brute_incr` - brute force, but distances are not all precomputed

These strategies also support unequal-length time series, by means of an
internal encoding applied in the adapter.

Removes `kd_tree` from the docstring which did not work previously, and
cannot be interfaced, as `sklearn` does not allow this to be called
together with a custom distance.

Potentially fixes #5914, and
fixes #2774
  • Loading branch information
fkiraly committed Feb 18, 2024
1 parent 63a95f2 commit 2442e5a
Show file tree
Hide file tree
Showing 2 changed files with 175 additions and 30 deletions.
204 changes: 175 additions & 29 deletions sktime/classification/distance_based/_time_series_neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,11 @@
from inspect import signature

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

from sktime.classification.base import BaseClassifier
from sktime.datatypes import check_is_mtype
from sktime.datatypes import convert
from sktime.distances import pairwise_distance

# add new distance string codes here
Expand Down Expand Up @@ -66,28 +67,45 @@ class KNeighborsTimeSeriesClassifier(BaseClassifier):
- [callable] : a user-defined function which accepts an
array of distances, and returns an array of the same shape
containing the weights.
algorithm : str, optional. default = 'brute'
search method for neighbours
one of {'auto', 'ball_tree', 'kd_tree', 'brute'}
one of {'ball_tree', 'brute', 'brute_incr'}
* 'brute' precomputes the distance matrix and applies
``sklearn`` ``KNeighborsClassifier`` directly.
This algorithm is not memory efficient as it scales with the size
of the distance matrix, but may be more runtime efficient.
* 'brute_incr' passes the distance to ``sklearn`` ``KNeighborsClassifier``,
with ``algorithm='brute'``. This is useful for large datasets,
for memory efficiency, as the distance is used incrementally,
without precomputation. However, this may be less runtime efficient.
* 'ball_tree' uses a ball tree to find the nearest neighbors,
using ``KNeighborsClassifier`` from ``sklearn``.
May be more runtime and memory efficient on mid-to-large datasets,
however, the distance computation may be slower.
distance : str or callable, optional. default ='dtw'
distance measure between time series
if str, must be one of the following strings:
'euclidean', 'squared', 'dtw', 'ddtw', 'wdtw', 'wddtw',
'lcss', 'edr', 'erp', 'msm', 'twe'
this will substitute a hard-coded distance metric from sktime.distances
If non-class callable, parameters can be passed via distance_params
Example: knn_dtw = KNeighborsTimeSeriesClassifier(
distance='dtw', distance_params={'epsilon':0.1})
if any callable, must be of signature (X: Panel, X2: Panel) -> np.ndarray
output must be mxn array if X is Panel of m Series, X2 of n Series
if distance_mtype is not set, must be able to take
X, X2 which are pd_multiindex and numpy3D mtype
can be pairwise panel transformer inheriting from BasePairwiseTransformerPanel
* if str, must be one of the following strings:
'euclidean', 'squared', 'dtw', 'ddtw', 'wdtw', 'wddtw',
'lcss', 'edr', 'erp', 'msm', 'twe'
this will substitute a hard-coded distance metric from sktime.distances
* If non-class callable, parameters can be passed via distance_params
Example: knn_dtw = KNeighborsTimeSeriesClassifier(
distance='dtw', distance_params={'epsilon':0.1})
* if any callable, must be of signature (X: Panel, X2: Panel) -> np.ndarray
output must be mxn array if X is Panel of m Series, X2 of n Series
if distance_mtype is not set, must be able to take
X, X2 which are pd_multiindex and numpy3D mtype
can be pairwise panel transformer inheriting from BasePairwiseTransformerPanel
distance_params : dict, optional. default = None.
dictionary for distance parameters, in case that distance is a str or callable
distance_mtype : str, or list of str optional. default = None.
mtype that distance expects for X and X2, if a callable
only set this if distance is not BasePairwiseTransformerPanel descendant
only set this if distance is not BasePairwiseTransformerPanel descendant
pass_train_distances : bool, optional, default = False.
Whether distances between training points are computed and passed to sklearn.
Passing is superfluous for algorithm='brute', but may have impact otherwise.
Expand Down Expand Up @@ -212,6 +230,67 @@ def _distance(self, X, X2=None):
else:
return distance(X, **distance_params)

def _one_element_distance_npdist(self, x, y, n_vars=None):
if n_vars is None:
n_vars = self.n_vars_
x = np.reshape(x, (1, n_vars, -1))
y = np.reshape(y, (1, n_vars, -1))
return self._distance(x, y)[0, 0]

def _one_element_distance_sktime_dist(self, x, y, n_vars=None):
if n_vars is None:
n_vars = self.n_vars_
if n_vars == 1:
x = np.reshape(x, (1, n_vars, -1))
y = np.reshape(y, (1, n_vars, -1))
elif self._X_metadata["is_equal_length"]:
x = np.reshape(x, (-1, n_vars))
y = np.reshape(y, (-1, n_vars))
x_ix = pd.MultiIndex.from_product([[0], range(len(x))])
y_ix = pd.MultiIndex.from_product([[0], range(len(y))])
x = pd.DataFrame(x, index=x_ix)
y = pd.DataFrame(y, index=y_ix)
else: # multivariate, unequal length
# in _convert_X_to_sklearn, we have encoded the length as the first column
# this was coerced to float, so we round to avoid rounding errors
x_len = round(x[0])
y_len = round(y[0])
# pd.pivot switches the axes, compared to numpy
x = np.reshape(x[1:], (n_vars, -1)).T
y = np.reshape(y[1:], (n_vars, -1)).T
# cut to length
x = x[:x_len]
y = y[:y_len]
x_ix = pd.MultiIndex.from_product([[0], range(x_len)])
y_ix = pd.MultiIndex.from_product([[0], range(y_len)])
x = pd.DataFrame(x, index=x_ix)
y = pd.DataFrame(y, index=y_ix)
return self._distance(x, y)[0, 0]

def _convert_X_to_sklearn(self, X):
"""Convert X to 2D numpy for sklearn."""
# special treatment for unequal length series
if not self._X_metadata["is_equal_length"]:
# then we know we are dealing with pd-multiindex
# as a trick to deal with unequal length data,
# we flatten encode the length as the first column
X_w_ix = X.reset_index(-1)
X_pivot = X_w_ix.pivot(columns=[X_w_ix.columns[0]])
# fillna since this creates nan but sklearn does not accept these
# the fill value does not matter as the distance ignores it
X_pivot = X_pivot.fillna(0).to_numpy()
X_lens = X.groupby(X_w_ix.index).size().to_numpy()
# add the first column, encoding length of individual series
X_w_lens = np.concatenate([X_lens[:, None], X_pivot], axis=1)
return X_w_lens

# equal length series case
if isinstance(X, np.ndarray):
X_mtype = "numpy3D"
else:
X_mtype = "pd-multiindex"
return convert(X, from_type=X_mtype, to_type="numpyflat")

def _fit(self, X, y):
"""Fit the model using X as training data and y as target values.
Expand All @@ -222,22 +301,57 @@ def _fit(self, X, y):
y : {array-like, sparse matrix}
Target values of shape = [n]
"""
self.n_vars_ = X.shape[1]
if self.algorithm == "brute":
return self._fit_precomp(X=X, y=y)
else:
return self._fit_dist(X=X, y=y)

def _fit_dist(self, X, y):
    """Fit the model using adapted distance metric.

    Builds an sklearn ``KNeighborsClassifier`` with an element-wise metric
    callable, stores it as ``self.knn_estimator_``, and fits it on the 2D
    representation of X returned by ``_convert_X_to_sklearn``.

    Parameters
    ----------
    X : panel data
        Training data, passed through ``_convert_X_to_sklearn``.
    y : array-like
        Target values.

    Returns
    -------
    self
    """
    # sklearn wants the distance callable element-wise,
    #   numpy1D x numpy1D -> float
    # sktime distance classes are Panel x Panel -> numpy2D
    # and the numba distances are numpy3D x numpy3D -> numpy2D
    # so we need to wrap the sktime distances
    metric = (
        self._one_element_distance_npdist  # numba distances
        if isinstance(self.distance, str)
        else self._one_element_distance_sktime_dist  # sktime distance classes
    )

    # "brute_incr" is sklearn's "brute" with a callable metric
    requested = self.algorithm
    sklearn_algorithm = "brute" if requested == "brute_incr" else requested

    self.knn_estimator_ = KNeighborsClassifier(
        n_neighbors=self.n_neighbors,
        algorithm=sklearn_algorithm,
        metric=metric,
        leaf_size=self.leaf_size,
        n_jobs=self.n_jobs,
        weights=self.weights,
    )

    X_flat = self._convert_X_to_sklearn(X)
    self.knn_estimator_.fit(X_flat, y)
    return self

def _fit_precomp(self, X, y):
"""Fit the model using precomputed distance matrix."""
# store full data as indexed X
self._X = X

if self.pass_train_distances:
dist_mat = self._distance(X)
else:
n = self._X_metadata["n_instances"]
# if we do not want/need to pass train-train distances,
# we still need to pass a zeros matrix, this means "do not consider"
# citing the sklearn KNeighborsClassifier docs on distance matrix input:
# "X may be a sparse graph, in which case only "nonzero" elements
# may be considered neighbors."
X_inner_mtype = self.get_tag("X_inner_mtype")
_, _, X_meta = check_is_mtype(
X, X_inner_mtype, return_metadata=True, msg_return_dict="list"
)
n = X_meta["n_instances"]
dist_mat = np.zeros([n, n], dtype="float")

self.knn_estimator_.fit(dist_mat, y)
Expand Down Expand Up @@ -294,11 +408,22 @@ def _predict(self, X):
y : array of shape [n_samples] or [n_samples, n_outputs]
Class labels for each data sample.
"""
if self.algorithm == "brute":
return self._predict_precomp(X)
else:
return self._predict_dist(X)

def _predict_dist(self, X):
"""Predict using adapted distance metric."""
X = self._convert_X_to_sklearn(X)
y_pred = self.knn_estimator_.predict(X)
return y_pred

def _predict_precomp(self, X):
"""Predict using precomputed distance matrix."""
# self._X should be the stored _X
dist_mat = self._distance(X, self._X)

y_pred = self.knn_estimator_.predict(dist_mat)

return y_pred

def _predict_proba(self, X):
Expand All @@ -316,11 +441,22 @@ def _predict_proba(self, X):
The class probabilities of the input samples. Classes are ordered
by lexicographic order.
"""
if self.algorithm == "brute":
return self._predict_proba_precomp(X)
else:
return self._predict_proba_dist(X)

def _predict_proba_dist(self, X):
"""Predict (proba) using adapted distance metric."""
X = self._convert_X_to_sklearn(X)
y_pred = self.knn_estimator_.predict_proba(X)
return y_pred

def _predict_proba_precomp(self, X):
"""Predict (proba) using precomputed distance matrix."""
# self._X should be the stored _X
dist_mat = self._distance(X, self._X)

y_pred = self.knn_estimator_.predict_proba(dist_mat)

return y_pred

@classmethod
Expand All @@ -346,15 +482,25 @@ def get_test_params(cls, parameter_set="default"):
`create_test_instance` uses the first (or only) dictionary in `params`.
"""
# non-default distance and algorithm
params1 = {"distance": "euclidean"}
params0 = {"distance": "euclidean"}

# testing distance_params
params2 = {"distance": "dtw", "distance_params": {"epsilon": 0.1}}
params1 = {"distance": "dtw", "distance_params": {"epsilon": 0.1}}

# testing that callables/classes can be passed
from sktime.dists_kernels.compose_tab_to_panel import AggrDist

dist = AggrDist.create_test_instance()
params3 = {"distance": dist}

return [params1, params2, params3]
params2 = {"distance": dist}

params3 = {"algorithm": "ball_tree"}
# params5 = {"algorithm": "kd_tree", "distance": "euclidean"}
params4 = {
"algorithm": "brute_incr",
"distance": "dtw",
"distance_params": {"epsilon": 0.1},
}
params5 = {"algorithm": "ball_tree", "distance": dist}

params = [params0, params1, params2, params3, params4, params5]
return params
1 change: 0 additions & 1 deletion sktime/utils/profiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ def profile_classifier(
Of each experiment, time spent in fit and time spent in predict is measured.
Parameters
----------
est : sktime classifier, BaseClassifier descendant, object or class
Expand Down

0 comments on commit 2442e5a

Please sign in to comment.