# Как быстро посчитать расстояние между парами объектов

$ X $ - n x d матрица 

$ Y $ - m x d матрица

Необходимо найти матрицу $D$, такую что $D[i, j]$ - это расстояние между $i-ой$ строкой матрицы $X$ и $j-ой$ строкой матрицы $Y$ 

$D$ - n x m матрица

$ D[i, j] = dist(X[i, :], Y[j, :]) $ $ \forall i, j$

В качестве расстояния возьмем евклидово.

Пусть $x$ - строка матрицы X, а $y$ - строка матрицы Y, тогда

$dist(x, y) = \sqrt{\sum_{i=0}^d (x_i - y_i)^2}$

In [1]:
import pandas as pd
import numpy as np
from line_profiler import LineProfiler
from sklearn import datasets

%load_ext Cython

In [2]:
!/Users/Alexander/anaconda2/bin/pip install line_profiler



In [3]:
def profile_print(func_to_call, *args):
    profiler = LineProfiler()
    profiler.add_function(func_to_call)
    profiler.runcall(func_to_call, *args)
    profiler.print_stats()

In [4]:
num_features = 100

# we will need only feature representations
X = datasets.make_blobs(777, n_features=num_features, centers=10)[0]
Y = datasets.make_blobs(500, n_features=num_features, centers=30)[0]
X.shape, Y.shape

((777, 100), (500, 100))

### Naive way

In [5]:
def vector_distance(x, y):
    dist = ((x - y) ** 2).sum()
    return np.sqrt(dist)


def naive_mdist(X, Y):
    D = np.zeros((X.shape[0], Y.shape[0]))
    for i, x in enumerate(X):
        for j, y in enumerate(Y):
            D[i, j] = vector_distance(x, y)
    return D

In [6]:
%%timeit
naive_mdist(X, Y)

1 loop, best of 3: 2.52 s per loop


In [7]:
profile_print(naive_mdist, X, Y)

Timer unit: 1e-06 s

Total time: 3.55532 s
File: <ipython-input-5-1f07dfd582d9>
Function: naive_mdist at line 6

Line #      Hits         Time  Per Hit   % Time  Line Contents
     6                                           def naive_mdist(X, Y):
     7         1        252.0    252.0      0.0      D = np.zeros((X.shape[0], Y.shape[0]))
     8       778        665.0      0.9      0.0      for i, x in enumerate(X):
     9    389277     297462.0      0.8      8.4          for j, y in enumerate(Y):
    10    388500    3256939.0      8.4     91.6              D[i, j] = vector_distance(x, y)
    11         1          0.0      0.0      0.0      return D



In [8]:
D_naive = naive_mdist(X, Y)

## Better numpy

### How to write euclidean distance in vector-matrix form ? 

#### Broadcasting
http://scipy.github.io/old-wiki/pages/EricsBroadcastingDoc

In [9]:
broad = X - Y[0]
not_broad = np.array([x - Y[0] for x in X])

print('Shapes:', broad.shape, not_broad.shape)
print('Number of different values:', (broad != not_broad).sum())

('Shapes:', (777, 100), (777, 100))
('Number of different values:', 0)


In [10]:
%%timeit 
X - Y[0]

The slowest run took 7.95 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 88.7 µs per loop


In [11]:
%%timeit
np.array([x - Y[0] for x in X])

1000 loops, best of 3: 1.4 ms per loop


In [12]:
def broadcast_numpy_dist(X, Y):
    D = np.zeros((X.shape[0], Y.shape[0]))
    for j, y in enumerate(Y):
        dist = ((X - y) ** 2).sum(axis=1)
        D[:, j] = np.sqrt(dist)
    return D

In [13]:
D_broadcast = broadcast_numpy_dist(X, Y)
np.abs(D_broadcast - D_naive).sum()

0.0

In [22]:
%%timeit 
broadcast_numpy_dist(X, Y)

10 loops, best of 3: 87.4 ms per loop


In [23]:
profile_print(broadcast_numpy_dist, X, Y)

NameError: global name 'LineProfiler' is not defined

### Pure numpy

$x$, $y$ - одномерные вектора
$$ dist(x, y)^2 = \sum_i(x_i - y_i)^2 = \sum_i(x_i^2 - 2x_i y_i + y_i^2) = \sum_ix_i^2 -2 \sum_ix_i y_i + \sum_iy_i^2$$

$$ numpy :  np.sum(x \cdot \cdot 2 - 2 \cdot x \cdot y - y \cdot \cdot 2) $$

In [24]:
def numpy_dist(X, Y):
    x_dist = (X ** 2).sum(axis=1)
    y_dist = (Y ** 2).sum(axis=1)
    xy_dist = X.dot(Y.T)
    dist = - 2 * xy_dist + y_dist + x_dist.reshape(-1, 1)
    return np.sqrt(dist)

In [25]:
D_numpy = numpy_dist(X, Y)
np.abs(D_numpy - D_naive).sum()

1.7274857100346708e-09

In [26]:
%%timeit 
numpy_dist(X, Y)

100 loops, best of 3: 5.28 ms per loop


In [27]:
profile_print(numpy_dist, X, Y)

NameError: global name 'LineProfiler' is not defined

### Library (maybe the best way)

In [28]:
from scipy.spatial.distance import cdist

In [29]:
D_scipy = cdist(X, Y, metric='euclidean')
np.abs(D_scipy - D_naive).sum()

3.3365523677275633e-09

In [30]:
%%timeit 
cdist(X, Y, metric='euclidean')

10 loops, best of 3: 38.4 ms per loop


In [31]:
profile_print(cdist, X, Y)

NameError: global name 'LineProfiler' is not defined

### Another one

In [32]:
from sklearn.metrics.pairwise import euclidean_distances as sklearn_euclidean_distances

In [33]:
D_sklearn = sklearn_euclidean_distances(X, Y)
np.abs(D_sklearn - D_naive).sum()

2.034283852481167e-09

In [34]:
%%timeit 
sklearn_euclidean_distances(X, Y)

100 loops, best of 3: 4.66 ms per loop


In [35]:
profile_print(sklearn_euclidean_distances, X, Y)

NameError: global name 'LineProfiler' is not defined

### Numba  (оптимизирует код)

In [37]:
from numba import jit

In [38]:
@jit(nopython=True)
def vector_distance(x, y):
    dist = ((x - y) ** 2).sum()
    return np.sqrt(dist)

@jit(nopython=True)
def numba_mdist(X, Y):
    D = np.zeros((X.shape[0], Y.shape[0]))
    for i in range(X.shape[0]):
        for j in range(Y.shape[0]):
            D[i, j] = vector_distance(X[i, :], Y[j, :])
    return D

In [39]:
D_numba = numba_mdist(X, Y)
np.abs(D_numba - D_naive).sum()

3.3365523677275633e-09

In [40]:
%%timeit 
numba_mdist(X, Y) # x20

10 loops, best of 3: 167 ms per loop


In [41]:
# useless
profile_print(numba_mdist, X, Y)

NameError: global name 'LineProfiler' is not defined

### Broadcast Numba

In [42]:
@jit(nopython=True)
def broadcast_numba_dist(X, Y):
    D = np.zeros((X.shape[0], Y.shape[0]))
    for j in range(Y.shape[0]):
        dist = ((X - Y[j, :]) ** 2).sum(axis=1)
        D[:, j] = np.sqrt(dist)
    return D

In [43]:
D_br_numba = broadcast_numba_dist(X, Y)
np.abs(D_br_numba - D_naive).sum()

3.3365523677275633e-09

In [44]:
%%timeit 
broadcast_numba_dist(X, Y)

10 loops, best of 3: 88.4 ms per loop


### Numpy numba 

In [50]:
@jit(nopython=True)
def numpy_numba_dist(X, Y):
    x_dist = (X ** 2).sum(axis=1)
    y_dist = (Y ** 2).sum(axis=1)
#     xy_dist = X.dot(Y.T)
#   !!!!!!!!!!!!!!!!!!!!!!!!!!!
    xy_dist = np.dot(X, Y.T)
    dist = - 2 * xy_dist + y_dist + x_dist.reshape(-1, 1)
    return np.sqrt(dist)

In [51]:
D_np_numba = numpy_numba_dist(X, Y)
np.abs(D_np_numba - D_naive).sum()

2.8202578050695593e-09

In [52]:
%%timeit 
numpy_numba_dist(X, Y)

100 loops, best of 3: 4.03 ms per loop


### Cython

In [53]:
%%cython
import numpy as np
cimport numpy as np

def vector_distance(x, y):
    dist = ((x - y) ** 2).sum()
    return np.sqrt(dist)


def cython_mdist(X, Y):
    D = np.zeros((X.shape[0], Y.shape[0]))
    for i in range(X.shape[0]):
        for j in range(Y.shape[0]):
            D[i, j] = vector_distance(X[i, :], Y[j, :])
    return D

In [54]:
D_cython = cython_mdist(X, Y)
np.abs(D_cython - D_naive).sum()

0.0

In [55]:
%%timeit 
cython_mdist(X, Y)

1 loop, best of 3: 2.4 s per loop


In [56]:
# useless
profile_print(cython_mdist, X, Y)

NameError: global name 'LineProfiler' is not defined

### Cython annotations

In [60]:
%%cython -a
# показывает "опасные" места

import numpy as np
cimport numpy as np

cdef vector_distance(x, y):
    dist = ((x - y) ** 2).sum()
    return np.sqrt(dist)


cdef cython_mdist(X, Y):
    D = np.zeros((X.shape[0], Y.shape[0]))
    for i in range(X.shape[0]):
        for j in range(Y.shape[0]):
            D[i, j] = vector_distance(X[i, :], Y[j, :])
    return D

In [62]:
%%cython -a
import numpy as np
cimport numpy as np

cdef vector_distance(x, y):
    dist = ((x - y) ** 2).sum()
    return np.sqrt(dist)


cdef cython_mdist(np.ndarray[np.float64_t, ndim=2] X, 
                  np.ndarray[np.float64_t, ndim=2] Y):
    cdef np.ndarray[np.float64_t, ndim=2] D;
    D = np.zeros((X.shape[0], Y.shape[0]))
    for i in range(X.shape[0]):
        for j in range(Y.shape[0]):
            D[i, j] = vector_distance(X[i, :], Y[j, :])
    return D

In [63]:
D_cython = cython_mdist(X, Y)
np.abs(D_cython - D_naive).sum()

0.0

In [64]:
%%timeit 
cython_mdist(X, Y)

1 loop, best of 3: 2.46 s per loop


In [65]:
%%cython -a
import numpy as np
cimport numpy as np
cimport cython

from libc.math cimport sqrt


cpdef cython_mdist(np.ndarray[np.float64_t, ndim=2] X, 
                  np.ndarray[np.float64_t, ndim=2] Y):
    cdef np.ndarray[np.float64_t, ndim=2] D;
    D = np.zeros((X.shape[0], Y.shape[0]), dtype=np.float64)
    cdef np.float64_t dist = 0.0;
    for i in range(X.shape[0]):
        for j in range(Y.shape[0]):
            dist = 0.0
            for d in range(X.shape[1]):
                dist += (X[i, d] - Y[j, d]) ** 2
            D[i, j] = sqrt(dist)
    return D

In [66]:
D_cython = cython_mdist(X, Y)
np.abs(D_cython - D_naive).sum()

3.3365523677275633e-09

In [67]:
%%timeit 
cython_mdist(X, Y)

10 loops, best of 3: 82.5 ms per loop


In [69]:
%%cython -a
import numpy as np
cimport numpy as np
cimport cython

from libc.math cimport sqrt


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef cython_mdist(np.ndarray[np.float64_t, ndim=2] X, 
                  np.ndarray[np.float64_t, ndim=2] Y):
    cdef np.ndarray[np.float64_t, ndim=2] D;
    D = np.zeros((X.shape[0], Y.shape[0]), dtype=np.float64)
    cdef np.float64_t dist = 0.0;
    for i in range(X.shape[0]):
        for j in range(Y.shape[0]):
            dist = 0.0
            for d in range(X.shape[1]):
                dist += (X[i, d] - Y[j, d]) ** 2
            D[i, j] = sqrt(dist)
    return D

In [70]:
D_cython = cython_mdist(X, Y)
np.abs(D_cython - D_naive).sum()

3.3365523677275633e-09

In [71]:
%%timeit 
cython_mdist(X, Y)

10 loops, best of 3: 33.6 ms per loop
