Skip to content
This repository has been archived by the owner on Jan 13, 2024. It is now read-only.

Commit

Permalink
Implements minkowski distance for knn
Browse files Browse the repository at this point in the history
  • Loading branch information
sdpython committed Aug 30, 2019
1 parent c25113f commit 6c5a8fc
Show file tree
Hide file tree
Showing 4 changed files with 256 additions and 19 deletions.
2 changes: 2 additions & 0 deletions _doc/sphinxdoc/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,13 @@
'json': 'https://docs.python.org/3/library/json.html',
'JSON': 'https://en.wikipedia.org/wiki/JSON',
'lightgbm': 'https://lightgbm.readthedocs.io/en/latest/',
'Minkowski distance': 'https://en.wikipedia.org/wiki/Minkowski_distance',
'ONNX': 'https://onnx.ai/',
'onnx': 'https://github.com/onnx/onnx',
'ONNX Operators': 'https://github.com/onnx/onnx/blob/master/docs/Operators.md',
'ONNX ML Operators': 'https://github.com/onnx/onnx/blob/master/docs/Operators-ml.md',
'onnxconverter_common': 'https://github.com/onnx/onnxmltools/tree/master/onnxutils/onnxconverter_common',
'OnnxOperatorMixin': 'https://github.com/onnx/sklearn-onnx/blob/master/skl2onnx/algebra/onnx_operator_mixin.py#L16',
'onnxruntime': 'https://github.com/microsoft/onnxruntime',
'Python': 'https://www.python.org/',
'Rust': 'https://www.rust-lang.org/',
Expand Down
108 changes: 102 additions & 6 deletions _unittests/ut_onnx_conv/test_onnx_conv_knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,17 @@
from logging import getLogger
import warnings
import numpy
from scipy.spatial.distance import cdist as scipy_cdist
from pyquickhelper.pycode import ExtTestCase
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.algebra.onnx_ops import ( # pylint: disable=E0611
OnnxAdd, OnnxIdentity
)
from mlprodict.onnx_conv import register_converters
from mlprodict.onnx_conv.sklconv.knn import onnx_cdist
from mlprodict.onnxrt import OnnxInference, to_onnx


Expand All @@ -19,15 +25,101 @@ def setUp(self):
logger = getLogger('skl2onnx')
logger.disabled = True

def test_onnx_example_cdist_in_euclidean(self):
    """
    Checks the graphs built by *onnx_cdist* against scipy's ``cdist``,
    first with ``metric='euclidean'``, then with the default metric
    (compared to scipy's ``'sqeuclidean'``).
    """
    def run_graph(dist_node, feeds):
        # Names the final output 'cdist', converts the graph to ONNX
        # and evaluates it with OnnxInference on *feeds*.
        named = OnnxIdentity(dist_node, output_names=['cdist'])
        onx = named.to_onnx(
            inputs=[('input', FloatTensorType([None, None]))],
            outputs=[('cdist', FloatTensorType())])
        return OnnxInference(onx).run({'input': feeds})['cdist']

    # Case 1: euclidean distance between two small 2D sets.
    left = numpy.array([1, 2, 4, 5, 5, 4]).astype(
        numpy.float32).reshape((3, 2))
    right = numpy.array(
        [1.1, 2.1, 4.01, 5.01, 5.001, 4.001, 0, 0]).astype(
            numpy.float32).reshape((4, 2))
    # The graph input is added to itself, hence ``left * 2`` below.
    doubled = OnnxAdd('input', 'input')
    got = run_graph(
        onnx_cdist(doubled, right, dtype=numpy.float32,
                   metric='euclidean'),
        left)
    expected = scipy_cdist(left * 2, right, metric="euclidean")
    self.assertEqualArray(expected, got, decimal=5)

    # Case 2: default metric, the same matrix is used on both sides.
    samples = numpy.array(
        [[6.1, 2.8, 4.7, 1.2],
         [5.7, 3.8, 1.7, 0.3],
         [7.7, 2.6, 6.9, 2.3],
         [6.0, 2.9, 4.5, 1.5],
         [6.8, 2.8, 4.8, 1.4],
         [5.4, 3.4, 1.5, 0.4],
         [5.6, 2.9, 3.6, 1.3],
         [6.9, 3.1, 5.1, 2.3]], dtype=numpy.float32)
    doubled = OnnxAdd('input', 'input')
    got = run_graph(
        onnx_cdist(doubled, samples, dtype=numpy.float32), samples)
    expected = scipy_cdist(samples * 2, samples, metric="sqeuclidean")
    self.assertEqualArray(expected, got, decimal=4)

def test_onnx_example_cdist_in_minkowski(self):
    """
    Checks the graphs built by *onnx_cdist* with ``metric='minkowski'``
    against scipy's ``cdist``, for ``p=1.`` and for ``p=3``.
    """
    def run_graph(dist_node, feeds):
        # Names the final output 'cdist', converts the graph to ONNX
        # and evaluates it with OnnxInference on *feeds*.
        named = OnnxIdentity(dist_node, output_names=['cdist'])
        onx = named.to_onnx(
            inputs=[('input', FloatTensorType([None, None]))],
            outputs=[('cdist', FloatTensorType())])
        return OnnxInference(onx).run({'input': feeds})['cdist']

    # Case 1: p = 1, the graph input is used unchanged.
    left = numpy.array([1, 2, 1, 3, 2, 2, 2, 3]).astype(
        numpy.float32).reshape((4, 2))
    right = numpy.array([[1, 2], [2, 2], [2.1, 2.1], [2, 2]]).astype(
        numpy.float32).reshape((4, 2))
    passthrough = OnnxIdentity('input')
    order = 1.
    got = run_graph(
        onnx_cdist(passthrough, right, dtype=numpy.float32,
                   metric="minkowski", p=order),
        left)
    expected = scipy_cdist(left, right, metric="minkowski", p=order)
    self.assertEqualArray(expected, got, decimal=5)

    # Case 2: p = 3, the graph input is added to itself,
    # hence ``samples * 2`` in the expected value.
    samples = numpy.array(
        [[6.1, 2.8, 4.7, 1.2],
         [5.7, 3.8, 1.7, 0.3],
         [7.7, 2.6, 6.9, 2.3],
         [6.0, 2.9, 4.5, 1.5],
         [6.8, 2.8, 4.8, 1.4],
         [5.4, 3.4, 1.5, 0.4],
         [5.6, 2.9, 3.6, 1.3],
         [6.9, 3.1, 5.1, 2.3]], dtype=numpy.float32)
    doubled = OnnxAdd('input', 'input')
    got = run_graph(
        onnx_cdist(doubled, samples, dtype=numpy.float32,
                   metric="minkowski", p=3),
        samples)
    expected = scipy_cdist(samples * 2, samples, metric="minkowski", p=3)
    self.assertEqualArray(expected, got, decimal=4)

def test_register_converters(self):
    """
    Calls ``register_converters(True)`` with *ResourceWarning* silenced
    and checks that the returned collection has more than two items.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ResourceWarning)
        registered = register_converters(True)
    count = len(registered)
    self.assertGreater(count, 2)

def onnx_test_knn_single_regressor(self, dtype, n_targets=1, debug=False, **kwargs):
def onnx_test_knn_single_regressor(self, dtype, n_targets=1, debug=False,
add_noise=False, **kwargs):
iris = load_iris()
X, y = iris.data, iris.target
if add_noise:
X += numpy.random.randn(X.shape[0], X.shape[1]) * 10
y = y.astype(dtype)
if n_targets != 1:
yn = numpy.empty((y.shape[0], n_targets), dtype=dtype)
Expand Down Expand Up @@ -83,15 +175,19 @@ def test_onnx_test_knn_single_regressor32_k1_target2(self):
def test_onnx_test_knn_single_regressor32_minkowski(self):
    """
    Runs the shared regressor check in float32 with
    ``metric='minkowski'``.
    """
    options = {'metric': 'minkowski'}
    self.onnx_test_knn_single_regressor(numpy.float32, **options)

def test_onnx_test_knn_single_regressor32_minkowski_p1(self):
    """
    Runs the shared regressor check in float32 with
    ``metric='minkowski'``, ``p=1`` and noise added to the features.
    """
    options = {'metric': 'minkowski', 'metric_params': {'p': 1}}
    self.onnx_test_knn_single_regressor(
        numpy.float32, add_noise=True, **options)

def test_onnx_test_knn_single_regressor32_minkowski_p21(self):
    """
    Runs the shared regressor check in float32 with
    ``metric='minkowski'``, ``algorithm='brute'`` and ``p=2.1``.
    """
    options = {
        'metric': 'minkowski',
        'algorithm': 'brute',
        'metric_params': {'p': 2.1},
    }
    self.onnx_test_knn_single_regressor(numpy.float32, **options)

@unittest.skip(reason="not yet implemented")
def test_onnx_test_knn_single_regressor32_distance(self):
    """
    Skipped: would run the shared regressor check in float32
    with ``weights='distance'``.
    """
    options = {'weights': 'distance'}
    self.onnx_test_knn_single_regressor(numpy.float32, **options)

@unittest.skip(reason="not yet implemented")
def test_onnx_test_knn_single_regressor32_minkowski_p3(self):
    """
    Skipped: would run the shared regressor check in float32
    with ``metric='minkowski'`` and ``p=3``.
    """
    options = {'metric': 'minkowski', 'metric_params': {'p': 3}}
    self.onnx_test_knn_single_regressor(numpy.float32, **options)


if __name__ == "__main__":
    # Every test goes through the unittest runner so that setUp is
    # applied; no test is invoked directly on a hand-built instance.
    unittest.main()
156 changes: 144 additions & 12 deletions mlprodict/onnx_conv/sklconv/knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,158 @@
@brief Rewrites some of the converters implemented in
:epkg:`sklearn-onnx`.
"""
from collections import OrderedDict
import numpy
from skl2onnx.algebra.complex_functions import onnx_cdist
from skl2onnx.common.data_types import FloatTensorType, DoubleTensorType
from skl2onnx.algebra.onnx_ops import ( # pylint: disable=E0611
OnnxTopK, OnnxMul, OnnxArrayFeatureExtractor, OnnxReduceMean,
OnnxFlatten, OnnxShape, OnnxReshape,
OnnxConcat, OnnxTranspose
OnnxConcat, OnnxTranspose, OnnxSub,
OnnxIdentity, OnnxReduceSumSquare,
OnnxScan, OnnxSqrt,
OnnxPow, OnnxReduceSum, OnnxAbs
)


def onnx_nearest_neighbors_indices(X, Y, k, metric='euclidean', dtype=None, **kwargs):
def onnx_cdist(X, Y, metric='sqeuclidean', dtype=None, op_version=None, **kwargs):
    """
    Returns the ONNX graph which computes
    ``cdist(X, Y, metric=metric)``.

    :param X: :epkg:`numpy:ndarray` or :epkg:`OnnxOperatorMixin`
    :param Y: :epkg:`numpy:ndarray` or :epkg:`OnnxOperatorMixin`
    :param metric: distance type, one of ``'sqeuclidean'``,
        ``'euclidean'``, ``'minkowski'``, ``'manhattan'``
    :param dtype: *numpy.float32* or *numpy.float64*
    :param op_version: opset version
    :param kwargs: additional parameters given to the last node of
        the graph; with ``metric='minkowski'``, parameter *p* is
        extracted from them first (2 if not specified)
    :return: :epkg:`OnnxOperatorMixin`
    :raises NotImplementedError: if *metric* is not supported
    """
    if metric == 'sqeuclidean':
        return _onnx_cdist_sqeuclidean(
            X, Y, dtype=dtype, op_version=op_version, **kwargs)
    if metric == 'euclidean':
        # Square root of the squared distance; *kwargs* are applied
        # to the final node only.
        res = _onnx_cdist_sqeuclidean(X, Y, dtype=dtype, op_version=op_version)
        return OnnxSqrt(res, op_version=op_version, **kwargs)
    if metric == 'minkowski':
        # A missing *p* used to raise KeyError; it now falls back to 2,
        # the default of _onnx_cdist_minkowski.
        p = kwargs.pop('p', 2)
        res = _onnx_cdist_minkowski(
            X, Y, dtype=dtype, op_version=op_version, p=p)
        # The helper returns sum(|x - y| ** p), the p-th root is
        # applied here.
        return OnnxPow(res, numpy.array([1. / p], dtype=dtype),
                       op_version=op_version, **kwargs)
    if metric == 'manhattan':
        return _onnx_cdist_manhattan(
            X, Y, dtype=dtype, op_version=op_version, **kwargs)
    raise NotImplementedError("metric='{}' is not implemented.".format(
        metric))


def _onnx_cdist_sqeuclidean(X, Y, dtype=None, op_version=None, **kwargs):
    """
    Returns the ONNX graph which computes
    ``cdist(X, Y, metric='sqeuclidean')``.

    :param X: first input of the *Scan* node
    :param Y: second input of the *Scan* node, declared as its
        only scan input
    :param dtype: *numpy.float32* selects float tensors for the loop
        body, any other value (*None* included) selects double tensors
    :param op_version: opset version given to every node
    :param kwargs: additional parameters given to the final
        *Transpose* node
    :return: the *Transpose* node applied to the second *Scan* output
    """
    if dtype == numpy.float32:
        tensor_type = FloatTensorType
    else:
        tensor_type = DoubleTensorType

    # Loop body: 'next_in' is copied to 'next_out' unchanged and the
    # squared norm of ('next_in' - 'next') along axis 1 is exposed
    # as 'scan_out'.
    delta = OnnxSub('next_in', 'next', output_names=['diff'],
                    op_version=op_version)
    carried = OnnxIdentity('next_in', output_names=['next_out'],
                           op_version=op_version)
    sq_norm = OnnxReduceSumSquare(delta, output_names=['norm'], axes=[1],
                                  keepdims=0, op_version=op_version)
    scan_out = OnnxIdentity(sq_norm, output_names=['scan_out'],
                            op_version=op_version)
    carried.set_onnx_name_prefix('cdistsqe')

    body_inputs = OrderedDict()
    body_inputs['next_in'] = tensor_type()
    body_inputs['next'] = tensor_type()
    body_outputs = [('next_out', tensor_type()),
                    ('scan_out', tensor_type())]
    body = carried.to_onnx(body_inputs, outputs=body_outputs,
                           other_outputs=[scan_out], dtype=dtype)

    scan = OnnxScan(X, Y, output_names=['scan0_{idself}', 'scan1_{idself}'],
                    num_scan_inputs=1, body=body.graph,
                    op_version=op_version)
    # Only the second output of the Scan node is used; its two axes
    # are swapped.
    stacked = scan[1]
    return OnnxTranspose(stacked, perm=[1, 0], op_version=op_version,
                         **kwargs)


def _onnx_cdist_minkowski(X, Y, dtype=None, op_version=None, p=2, **kwargs):
    """
    Returns the ONNX graph which computes the sum of ``|x - y| ** p``
    used by the :epkg:`Minkowski distance`. The power ``1 / p`` is
    not applied by this function.

    :param X: first input of the *Scan* node
    :param Y: second input of the *Scan* node, declared as its
        only scan input
    :param dtype: type of the exponent constant; *numpy.float32*
        selects float tensors for the loop body, any other value
        (*None* included) selects double tensors
    :param op_version: opset version given to every node
    :param p: exponent applied to the absolute differences
    :param kwargs: additional parameters given to the final
        *Transpose* node
    :return: the *Transpose* node applied to the second *Scan* output
    """
    if dtype == numpy.float32:
        tensor_type = FloatTensorType
    else:
        tensor_type = DoubleTensorType

    # Loop body: 'next_in' is copied to 'next_out' unchanged and
    # sum(|'next_in' - 'next'| ** p) along axis 1 is exposed as 'scan_out'.
    delta = OnnxSub('next_in', 'next', output_names=['diff'],
                    op_version=op_version)
    carried = OnnxIdentity('next_in', output_names=['next_out'],
                           op_version=op_version)
    abs_delta = OnnxAbs(delta, op_version=op_version)
    exponent = numpy.array([p], dtype=dtype)
    powered = OnnxPow(abs_delta, exponent, op_version=op_version)
    total = OnnxReduceSum(powered, axes=[1], output_names=['norm'],
                          keepdims=0, op_version=op_version)
    scan_out = OnnxIdentity(total, output_names=['scan_out'],
                            op_version=op_version)
    carried.set_onnx_name_prefix('cdistmink')

    body_inputs = OrderedDict()
    body_inputs['next_in'] = tensor_type()
    body_inputs['next'] = tensor_type()
    body_outputs = [('next_out', tensor_type()),
                    ('scan_out', tensor_type())]
    body = carried.to_onnx(body_inputs, outputs=body_outputs,
                           other_outputs=[scan_out], dtype=dtype)

    scan = OnnxScan(X, Y, output_names=['scan0_{idself}', 'scan1_{idself}'],
                    num_scan_inputs=1, body=body.graph,
                    op_version=op_version)
    # Only the second output of the Scan node is used; its two axes
    # are swapped.
    stacked = scan[1]
    return OnnxTranspose(stacked, perm=[1, 0], op_version=op_version,
                         **kwargs)


def _onnx_cdist_manhattan(X, Y, dtype=None, op_version=None, **kwargs):
    """
    Returns the ONNX graph which computes the Manhattan distance,
    the sum of ``|x - y|``.

    :param X: first input of the *Scan* node
    :param Y: second input of the *Scan* node, declared as its
        only scan input
    :param dtype: *numpy.float32* selects float tensors for the loop
        body, any other value (*None* included) selects double tensors
    :param op_version: opset version given to every node
    :param kwargs: additional parameters given to the final
        *Transpose* node
    :return: the *Transpose* node applied to the second *Scan* output
    """
    if dtype == numpy.float32:
        tensor_type = FloatTensorType
    else:
        tensor_type = DoubleTensorType

    # Loop body: 'next_in' is copied to 'next_out' unchanged and
    # sum(|'next_in' - 'next'|) along axis 1 is exposed as 'scan_out'.
    delta = OnnxSub('next_in', 'next', output_names=['diff'],
                    op_version=op_version)
    carried = OnnxIdentity('next_in', output_names=['next_out'],
                           op_version=op_version)
    abs_delta = OnnxAbs(delta, op_version=op_version)
    total = OnnxReduceSum(abs_delta, axes=[1], output_names=['norm'],
                          keepdims=0, op_version=op_version)
    scan_out = OnnxIdentity(total, output_names=['scan_out'],
                            op_version=op_version)
    # NOTE(review): same name prefix as the Minkowski helper,
    # kept unchanged.
    carried.set_onnx_name_prefix('cdistmink')

    body_inputs = OrderedDict()
    body_inputs['next_in'] = tensor_type()
    body_inputs['next'] = tensor_type()
    body_outputs = [('next_out', tensor_type()),
                    ('scan_out', tensor_type())]
    body = carried.to_onnx(body_inputs, outputs=body_outputs,
                           other_outputs=[scan_out], dtype=dtype)

    scan = OnnxScan(X, Y, output_names=['scan0_{idself}', 'scan1_{idself}'],
                    num_scan_inputs=1, body=body.graph,
                    op_version=op_version)
    # Only the second output of the Scan node is used; its two axes
    # are swapped.
    stacked = scan[1]
    return OnnxTranspose(stacked, perm=[1, 0], op_version=op_version,
                         **kwargs)


def onnx_nearest_neighbors_indices(X, Y, k, metric='euclidean', dtype=None,
                                   op_version=None, **kwargs):
    """
    Retrieves the nearest neighbours :epkg:`ONNX`.

    :param X: features or :epkg:`OnnxOperatorMixin`
    :param Y: neighbours or :epkg:`OnnxOperatorMixin`
    :param k: number of neighbours to retrieve
    :param metric: requested metric
    :param dtype: numerical type
    :param op_version: opset version
    :param kwargs: additional parameters, all of them are given to
        the distance graph; parameter *p* is a distance parameter
        and is not given to the *TopK* node
    :return: top indices
    """
    dist = onnx_cdist(X, Y, metric=metric, dtype=dtype,
                      op_version=op_version, **kwargs)
    # TopK is applied to the negated distances.
    neg_dist = OnnxMul(dist, numpy.array([-1], dtype=dtype),
                       op_version=op_version)
    # 'p' only makes sense for the distance; forwarding it would attach
    # it to the TopK node as an attribute.
    topk_kwargs = {name: value for name, value in kwargs.items()
                   if name != 'p'}
    topk = OnnxTopK(neg_dist, numpy.array([k], dtype=numpy.int64),
                    op_version=op_version, **topk_kwargs)
    # The second output of TopK holds the indices.
    return topk[1]


Expand All @@ -50,12 +178,16 @@ def convert_nearest_neighbors_regressor(scope, operator, container):
neighb = op._fit_X.astype(container.dtype)
k = op.n_neighbors
training_labels = op._y
# distance_power = (
# op.p if op.metric == 'minkowski'
# else (2 if op.metric in ('euclidean', 'l2') else 1))
distance_kwargs = {}
if metric == 'minkowski':
if op.p != 2:
distance_kwargs['p'] = op.p
else:
metric = "euclidean"

top_indices = onnx_nearest_neighbors_indices(
X, neighb, k, metric=metric, dtype=dtype, op_version=opv)
X, neighb, k, metric=metric, dtype=dtype,
op_version=opv, **distance_kwargs)
shape = OnnxShape(top_indices, op_version=opv)
flattened = OnnxFlatten(top_indices, op_version=opv)
if ndim > 1:
Expand Down
9 changes: 8 additions & 1 deletion mlprodict/onnxrt/validate/validate_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier, OutputCodeClassifier
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier, ClassifierChain, RegressorChain
from sklearn.neighbors import LocalOutlierFactor
from sklearn.neighbors import LocalOutlierFactor, KNeighborsRegressor
from sklearn.preprocessing import Normalizer
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.svm import SVC, NuSVC
Expand Down Expand Up @@ -87,6 +87,13 @@ def build_custom_scenarios():
'param_grid': {'n_clusters': [2, 3]},
}, ['cluster']),
],
KNeighborsRegressor: [
('default', {'algorithm': 'brute'}),
('kd_tree', {'algorithm': 'kd_tree'}),
('mink', {'algorithm': 'kd_tree',
'distance': "minkowski",
'metric_params': {'p': 2.1}}),
],
LocalOutlierFactor: [
('novelty', {
'novelty': True,
Expand Down

0 comments on commit 6c5a8fc

Please sign in to comment.