[ENH] k-nearest neighbors regressor: support for non-brute algorithms and non-precomputed mode to improve memory efficiency #6217

Merged
merged 3 commits on Apr 3, 2024
168 changes: 167 additions & 1 deletion sktime/regression/distance_based/_time_series_neighbors.py
@@ -11,8 +11,11 @@
__author__ = ["fkiraly"]
__all__ = ["KNeighborsTimeSeriesRegressor"]

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor

from sktime.datatypes import convert
from sktime.distances import pairwise_distance
from sktime.regression.base import BaseRegressor

@@ -48,7 +51,21 @@ class KNeighborsTimeSeriesRegressor(BaseRegressor):
one of: 'uniform', 'distance', or a callable function
algorithm : str, optional. default = 'brute'
search method for neighbours
one of {'auto', 'ball_tree', 'kd_tree', 'brute'}
one of {'auto', 'ball_tree', 'brute', 'brute_incr'}

* 'brute' precomputes the full distance matrix and applies
``sklearn`` ``KNeighborsRegressor`` to it directly.
This is not memory efficient, since memory scales with the size
of the distance matrix, but it may be more runtime efficient.
* 'brute_incr' passes the distance callable to ``sklearn``
``KNeighborsRegressor`` with ``algorithm='brute'``. Distances are
then computed incrementally rather than precomputed, which is
memory efficient on large datasets but may be less runtime efficient.
* 'ball_tree' uses a ball tree to find the nearest neighbors,
via ``KNeighborsRegressor`` from ``sklearn``. This may be more
runtime and memory efficient on mid-to-large datasets, although
individual distance computations may be slower.
(a usage sketch of all three modes follows at the end of this diff)

distance : str or callable, optional. default = 'dtw'
distance measure between time series
if str, must be one of the following strings:
@@ -172,6 +189,67 @@ def _distance(self, X, X2):
else:
return distance(X, X2)

def _one_element_distance_npdist(self, x, y, n_vars=None):
# element-wise distance, numpy1D x numpy1D -> float, wrapping the numba pairwise distance
if n_vars is None:
n_vars = self.n_vars_
x = np.reshape(x, (1, n_vars, -1))
y = np.reshape(y, (1, n_vars, -1))
return self._distance(x, y)[0, 0]
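
``sklearn`` hands a metric callable two flat 1D rows; the wrapper above reshapes each back into a ``(1, n_vars, n_timepoints)`` panel block before calling the numba distance. A minimal sketch of the round-trip this relies on, assuming row-major (C-order) flattening:

import numpy as np

n_vars, n_timepoints = 2, 5
x_panel = np.arange(10.0).reshape(1, n_vars, n_timepoints)  # one multivariate series

x_flat = x_panel.reshape(-1)                  # the flat row sklearn passes to the metric
x_back = np.reshape(x_flat, (1, n_vars, -1))  # the wrapper's inverse reshape

assert np.array_equal(x_back, x_panel)        # lossless for equal-length series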

def _one_element_distance_sktime_dist(self, x, y, n_vars=None):
# element-wise distance, numpy1D x numpy1D -> float, wrapping a sktime pairwise distance
if n_vars is None:
n_vars = self.n_vars_
if n_vars == 1:
x = np.reshape(x, (1, n_vars, -1))
y = np.reshape(y, (1, n_vars, -1))
elif self._X_metadata["is_equal_length"]:
x = np.reshape(x, (-1, n_vars))
y = np.reshape(y, (-1, n_vars))
x_ix = pd.MultiIndex.from_product([[0], range(len(x))])
y_ix = pd.MultiIndex.from_product([[0], range(len(y))])
x = pd.DataFrame(x, index=x_ix)
y = pd.DataFrame(y, index=y_ix)
else: # multivariate, unequal length
# in _convert_X_to_sklearn, we have encoded the length as the first column
# this was coerced to float, so we round to avoid rounding errors
x_len = round(x[0])
y_len = round(y[0])
# pd.pivot switches the axes, compared to numpy
x = np.reshape(x[1:], (n_vars, -1)).T
y = np.reshape(y[1:], (n_vars, -1)).T
# cut to length
x = x[:x_len]
y = y[:y_len]
x_ix = pd.MultiIndex.from_product([[0], range(x_len)])
y_ix = pd.MultiIndex.from_product([[0], range(y_len)])
x = pd.DataFrame(x, index=x_ix)
y = pd.DataFrame(y, index=y_ix)
return self._distance(x, y)[0, 0]
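
A worked sketch of the multivariate unequal-length branch, decoding one toy row in the format produced by ``_convert_X_to_sklearn`` below (true length in the first entry, variables stacked variable-major, zero padding):

import numpy as np
import pandas as pd

n_vars = 2
# a 2-variable series of true length 3, padded to length 4
row = np.array([3.0,              # encoded length
                1., 2., 3., 0.,   # variable 0, zero-padded
                4., 5., 6., 0.])  # variable 1, zero-padded

x_len = round(row[0])
x = np.reshape(row[1:], (n_vars, -1)).T[:x_len]  # (n_timepoints, n_vars), cut to length
x_ix = pd.MultiIndex.from_product([[0], range(x_len)])
print(pd.DataFrame(x, index=x_ix))  # a one-instance pd-multiindex panel again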

def _convert_X_to_sklearn(self, X):
"""Convert X to 2D numpy for sklearn."""
# special treatment for unequal length series
if not self._X_metadata["is_equal_length"]:
# then we know we are dealing with pd-multiindex
# as a trick to deal with unequal length data,
# we flatten the series and encode the true length as the first column
X_w_ix = X.reset_index(-1)
X_pivot = X_w_ix.pivot(columns=[X_w_ix.columns[0]])
# fillna since this creates nan but sklearn does not accept these
# the fill value does not matter as the distance ignores it
X_pivot = X_pivot.fillna(0).to_numpy()
X_lens = X.groupby(X_w_ix.index).size().to_numpy()
# add the first column, encoding length of individual series
X_w_lens = np.concatenate([X_lens[:, None], X_pivot], axis=1)
return X_w_lens

# equal length series case
if isinstance(X, np.ndarray):
X_mtype = "numpy3D"
else:
X_mtype = "pd-multiindex"
return convert(X, from_type=X_mtype, to_type="numpyflat")
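
A worked sketch of this encoding on a toy univariate panel of two unequal-length series in ``pd-multiindex`` format (column name ``v0`` is illustrative):

import numpy as np
import pandas as pd

ix = pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0), (1, 1), (1, 2)])
X = pd.DataFrame({"v0": [1.0, 2.0, 3.0, 4.0, 5.0]}, index=ix)

X_w_ix = X.reset_index(-1)  # time index becomes the first column
X_pivot = X_w_ix.pivot(columns=[X_w_ix.columns[0]]).fillna(0).to_numpy()
X_lens = X.groupby(X_w_ix.index).size().to_numpy()
print(np.concatenate([X_lens[:, None], X_pivot], axis=1))
# [[2. 1. 2. 0.]     length 2, values 1, 2, zero padding
#  [3. 3. 4. 5.]]    length 3, values 3, 4, 5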

def _fit(self, X, y):
"""Fit the model using X as training data and y as target values.

@@ -181,6 +259,44 @@ def _fit(self, X, y):
y : {array-like, sparse matrix}
Target values of shape = [n_samples]
"""
self.n_vars_ = X.shape[1]
if self.algorithm == "brute":
return self._fit_precomp(X=X, y=y)
else:
return self._fit_dist(X=X, y=y)

def _fit_dist(self, X, y):
"""Fit the model using adapted distance metric."""
# sklearn expects the distance callable element-wise,
# numpy1D x numpy1D -> float
# sktime distance classes are Panel x Panel -> numpy2D
# and the numba distances are numpy3D x numpy3D -> numpy2D
# so we need to wrap the sktime distances
if isinstance(self.distance, str):
# numba distances
metric = self._one_element_distance_npdist
else:
# sktime distance classes
metric = self._one_element_distance_sktime_dist

algorithm = self.algorithm
if algorithm == "brute_incr":
algorithm = "brute"

self.knn_estimator_ = KNeighborsRegressor(
n_neighbors=self.n_neighbors,
algorithm=algorithm,
metric=metric,
leaf_size=self.leaf_size,
n_jobs=self.n_jobs,
)

X = self._convert_X_to_sklearn(X)
self.knn_estimator_.fit(X, y)
return self
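
The underlying ``sklearn`` pattern, sketched standalone with a plain Euclidean stand-in for the wrapped sktime distance:

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

def toy_metric(x, y):
    # stand-in for the element-wise wrapper; sklearn calls it on pairs of flat rows
    return float(np.linalg.norm(x - y))

rng = np.random.default_rng(0)
X_flat, y = rng.normal(size=(10, 50)), rng.normal(size=10)  # 10 flattened series

knn = KNeighborsRegressor(n_neighbors=3, algorithm="brute", metric=toy_metric)
knn.fit(X_flat, y)
print(knn.predict(X_flat[:2]))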

def _fit_precomp(self, X, y):
"""Fit the model using precomputed distance matrix."""
# store full data as indexed X
self._X = X

@@ -240,9 +356,59 @@ def _predict(self, X):
y : array of shape [n_samples] or [n_samples, n_outputs]
Predicted target values for each data sample.
"""
if self.algorithm == "brute":
return self._predict_precomp(X)
else:
return self._predict_dist(X)

def _predict_dist(self, X):
"""Predict using adapted distance metric."""
X = self._convert_X_to_sklearn(X)
y_pred = self.knn_estimator_.predict(X)
return y_pred

def _predict_precomp(self, X):
"""Predict using precomputed distance matrix."""
# self._X is the full training data stored in _fit
dist_mat = self._distance(X, self._X)

y_pred = self.knn_estimator_.predict(dist_mat)

return y_pred
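
``_fit_precomp`` is elided above, but this predict path implies the standard ``sklearn`` ``metric='precomputed'`` pattern; a sketch with a Euclidean stand-in for ``pairwise_distance``:

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

def pairwise(A, B):
    # stand-in for sktime's pairwise_distance: (n, d) x (m, d) -> (n, m)
    return np.linalg.norm(A[:, None, :] - B[None, :, :], axis=-1)

rng = np.random.default_rng(0)
X_train, X_test = rng.normal(size=(10, 50)), rng.normal(size=(3, 50))
y_train = rng.normal(size=10)

knn = KNeighborsRegressor(n_neighbors=3, metric="precomputed")
knn.fit(pairwise(X_train, X_train), y_train)   # square train-vs-train matrix
print(knn.predict(pairwise(X_test, X_train)))  # test-vs-train cross matrix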

@classmethod
def get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator.

Parameters
----------
parameter_set : str, default="default"
Name of the set of test parameters to return, for use in tests. If no
special parameters are defined for a value, will return ``"default"`` set.

Returns
-------
params : dict or list of dict, default={}
Parameters to create testing instances of the class.
Each dict contains parameters to construct an "interesting" test instance, i.e.,
``MyClass(**params)`` or ``MyClass(**params[i])`` creates a valid test
instance.
``create_test_instance`` uses the first (or only) dictionary in ``params``.
"""
param1 = {
"n_neighbors": 1,
"weights": "uniform",
"algorithm": "auto",
"distance": "euclidean",
"distance_params": None,
"n_jobs": None,
}
param2 = {
"n_neighbors": 3,
"weights": "distance",
"algorithm": "ball_tree",
"distance": "dtw",
"distance_params": {"window": 0.5},
"n_jobs": -1,
}
return [param1, param2]
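
Taken together, a usage sketch of the three ``algorithm`` modes on toy ``numpy3D`` data (assuming the regressor's public import path ``sktime.regression.distance_based``):

import numpy as np
from sktime.regression.distance_based import KNeighborsTimeSeriesRegressor

rng = np.random.default_rng(0)
X, y = rng.normal(size=(20, 1, 50)), rng.normal(size=20)  # 20 series, 1 var, 50 points

# "brute" precomputes the 20 x 20 distance matrix; "brute_incr" and
# "ball_tree" compute distances inside sklearn, trading runtime for memory
for algorithm in ["brute", "brute_incr", "ball_tree"]:
    reg = KNeighborsTimeSeriesRegressor(
        n_neighbors=3, algorithm=algorithm, distance="dtw"
    )
    reg.fit(X, y)
    print(algorithm, reg.predict(X[:2]))

# the test parameter sets above also construct valid instances
reg = KNeighborsTimeSeriesRegressor(**KNeighborsTimeSeriesRegressor.get_test_params()[1])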