In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KDTree as skKNN
from cuml import KNN as cumlKNN
import cudf
import os

import numba.cuda

# Helper Functions

In [2]:
from timeit import default_timer

class Timer(object):
    def __init__(self):
        self._timer = default_timer
    
    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self):
        """Start the timer."""
        self.start = self._timer()

    def stop(self):
        """Stop the timer. Calculate the interval in seconds."""
        self.end = self._timer()
        self.interval = self.end - self.start

In [3]:
import gzip
def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz',source='mortgage'):
#     if os.path.exists(cached) and source=='mortgage':
#         print('use mortgage data')
#         with gzip.open(cached) as f:
#             X = np.load(f)
#         X = X[np.random.randint(0,X.shape[0]-1,nrows),:ncols]
#     else:
    print('use random data')
    X = np.random.random((nrows,ncols)).astype('float32')
    df = pd.DataFrame({'fea%d'%i:X[:,i] for i in range(X.shape[1])}).fillna(0)
    return df

In [4]:
from sklearn.metrics import mean_squared_error
def array_equal(a,b,threshold=1e-2,with_sign=True,metric='mse'):
    a = to_nparray(a)
    b = to_nparray(b)
    
    print(a)
    print(b)
    
    if with_sign == False:
        a,b = np.abs(a),np.abs(b)
    if metric=='mse':
        error = mean_squared_error(a,b)
    else:
        error = np.sum(a!=b)/(a.shape[0]*a.shape[1])
    res = error<threshold
    return res

def to_nparray(x):
    if isinstance(x,np.ndarray) or isinstance(x,pd.DataFrame):
        return np.array(x)
    elif isinstance(x,np.float64):
        return np.array([x])
    elif isinstance(x,cudf.DataFrame) or isinstance(x,cudf.Series):
        return x.to_pandas().values
    elif isinstance(x, numba.cuda.cudadrv.devicearray.DeviceNDArray):
        return np.asarray(x)
    return x    

# Run tests

In [5]:
%%time
nrows = 15
ncols = 2

X = load_data(nrows,ncols)
print('data',X.shape)

use random data
data (15, 2)
CPU times: user 1.44 ms, sys: 937 µs, total: 2.38 ms
Wall time: 2 ms


In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 2 columns):
fea0    15 non-null float32
fea1    15 non-null float32
dtypes: float32(2)
memory usage: 200.0 bytes


In [7]:
n_neighbors = 10

In [8]:
print(str(X))

        fea0      fea1
0   0.940755  0.856856
1   0.476706  0.925938
2   0.735778  0.430330
3   0.812152  0.424700
4   0.684502  0.772031
5   0.926316  0.158021
6   0.017679  0.070085
7   0.351044  0.618492
8   0.219023  0.435588
9   0.586819  0.099291
10  0.137871  0.741323
11  0.440433  0.432177
12  0.608372  0.285098
13  0.637000  0.733376
14  0.146847  0.989862


In [9]:
%%time
knn_sk = skKNN(X)
D_sk,I_sk = knn_sk.query(X, 5)

CPU times: user 1.05 ms, sys: 5 µs, total: 1.05 ms
Wall time: 1.02 ms


In [10]:
print(str(D_sk))

[[0.         0.26992688 0.32789377 0.45088524 0.46916262]
 [0.         0.25054785 0.25858603 0.33213581 0.3359959 ]
 [0.         0.07658132 0.19319607 0.29535102 0.31873834]
 [0.         0.07658132 0.24701223 0.29008768 0.35490796]
 [0.         0.06124295 0.25858603 0.26992688 0.34552677]
 [0.         0.29008768 0.33235061 0.34239906 0.34453975]
 [0.         0.41729051 0.55662489 0.56988857 0.62860851]
 [0.         0.20664876 0.22557434 0.24602948 0.30817013]
 [0.         0.22143674 0.22557434 0.31632205 0.41729051]
 [0.         0.1870529  0.34453975 0.36300933 0.36365033]
 [0.         0.24602948 0.24870108 0.31632205 0.38586587]
 [0.         0.20664876 0.22143674 0.22323878 0.29535102]
 [0.         0.1870529  0.19319607 0.22323878 0.24701223]
 [0.         0.06124295 0.25054785 0.30817013 0.31873834]
 [0.         0.24870108 0.3359959  0.42380675 0.55320388]]


In [11]:
%%time
X = cudf.DataFrame.from_pandas(X)

CPU times: user 361 ms, sys: 169 ms, total: 530 ms
Wall time: 534 ms


In [12]:
print(str(X))

          fea0       fea1
 0   0.9407548 0.85685587
 1  0.47670612  0.9259385
 2   0.7357783 0.43033025
 3   0.8121523 0.42469975
 4  0.68450236  0.7720312
 5  0.92631614 0.15802114
 6 0.017679134 0.07008498
 7  0.35104445   0.618492
 8  0.21902256 0.43558764
 9   0.5868188  0.0992912
[5 more rows]


In [13]:
%%time
knn_cuml = cumlKNN()
knn_cuml.fit(X)

float32
140699418756608
15
15
2
CPU times: user 132 ms, sys: 260 ms, total: 392 ms
Wall time: 390 ms


In [14]:
print(str(X[0:4]))

        fea0       fea1
0  0.9407548 0.85685587
1 0.47670612  0.9259385
2  0.7357783 0.43033025
3  0.8121523 0.42469975


In [15]:
print(str(np.asarray(X[0:4].as_gpu_matrix(order="C"))))

[[0.94075477 0.85685587]
 [0.47670612 0.92593849]
 [0.73577827 0.43033025]
 [0.81215233 0.42469975]]


In [16]:
D_cuml, I_cuml = knn_cuml.query(X,5)

15
int64
[ 0.  4. 13.  3.  1.  1. 13.  4.  7. 14.  2.  3. 12. 11. 13.  3.  2. 12.
  5. 13.  4. 13.  1.  0.  2.  5.  3.  2. 12.  9.  6.  8. 11.  9. 12.  7.
 11.  8. 10. 13.  8. 11.  7. 10.  6.  9. 12.  5.  2. 11. 10.  7. 14.  8.
  1. 11.  7.  8. 12.  2. 12.  9.  2. 11.  3. 13.  4.  1.  7.  2. 14. 10.
  1.  7. 13.]
[-2.38418579e-07  7.28604794e-02  1.07514381e-01  2.03297377e-01
  2.20113397e-01  0.00000000e+00  6.27741814e-02  6.68667555e-02
  1.10314190e-01  1.12893224e-01  0.00000000e+00  5.86473942e-03
  3.73247862e-02  8.72322321e-02  1.01594210e-01  1.19209290e-07
  5.86473942e-03  6.10150099e-02  8.41509104e-02  1.25959635e-01
  0.00000000e+00  3.75080109e-03  6.68667555e-02  7.28604794e-02
  1.19388819e-01  1.19209290e-07  8.41509104e-02  1.10456944e-01
  1.17237091e-01  1.18707716e-01  0.00000000e+00  1.74131349e-01
  3.09831291e-01  3.24773014e-01  3.95148665e-01  0.00000000e+00
  4.27036583e-02  5.08837402e-02  6.05305433e-02  9.49688554e-02
 -2.98023224e-08  4.90342379e-02  5

In [17]:
print(str(I_cuml))

      0    1    2    3    4
 0    0    4   13    3    1
 1    1   13    4    7   14
 2    2    3   12   11   13
 3    3    2   12    5   13
 4    4   13    1    0    2
 5    5    3    2   12    9
 6    6    8   11    9   12
 7    7   11    8   10   13
 8    8   11    7   10    6
 9    9   12    5    2   11
[5 more rows]


In [18]:
I_sk

array([[ 0,  4, 13,  3,  1],
       [ 1, 13,  4,  7, 14],
       [ 2,  3, 12, 11, 13],
       [ 3,  2, 12,  5, 13],
       [ 4, 13,  1,  0,  2],
       [ 5,  3,  2, 12,  9],
       [ 6,  8, 11,  9, 12],
       [ 7, 11,  8, 10, 13],
       [ 8, 11,  7, 10,  6],
       [ 9, 12,  5,  2, 11],
       [10,  7, 14,  8,  1],
       [11,  7,  8, 12,  2],
       [12,  9,  2, 11,  3],
       [13,  4,  1,  7,  2],
       [14, 10,  1,  7, 13]])

In [19]:
passed = array_equal(D_sk,D_cuml)
message = 'compare knn: cuml vs sklearn distances %s'%('equal'if passed else 'NOT equal')
print(message)
passed = array_equal(I_sk,I_cuml)
message = 'compare knn: cuml vs sklearn indexes %s'%('equal'if passed else 'NOT equal')
print(message)

[[0.         0.26992688 0.32789377 0.45088524 0.46916262]
 [0.         0.25054785 0.25858603 0.33213581 0.3359959 ]
 [0.         0.07658132 0.19319607 0.29535102 0.31873834]
 [0.         0.07658132 0.24701223 0.29008768 0.35490796]
 [0.         0.06124295 0.25858603 0.26992688 0.34552677]
 [0.         0.29008768 0.33235061 0.34239906 0.34453975]
 [0.         0.41729051 0.55662489 0.56988857 0.62860851]
 [0.         0.20664876 0.22557434 0.24602948 0.30817013]
 [0.         0.22143674 0.22557434 0.31632205 0.41729051]
 [0.         0.1870529  0.34453975 0.36300933 0.36365033]
 [0.         0.24602948 0.24870108 0.31632205 0.38586587]
 [0.         0.20664876 0.22143674 0.22323878 0.29535102]
 [0.         0.1870529  0.19319607 0.22323878 0.24701223]
 [0.         0.06124295 0.25054785 0.30817013 0.31873834]
 [0.         0.24870108 0.3359959  0.42380675 0.55320388]]
[[-2.38418579e-07  7.28604794e-02  1.07514381e-01  2.03297377e-01
   2.20113397e-01]
 [ 0.00000000e+00  6.27741814e-02  6.6866755

In [20]:
a = np.zeros((30, 2), dtype=np.int64)

In [21]:
def to_cudf(df, col=''):
    # convert pandas dataframe to cudf dataframe
    if isinstance(df,np.ndarray):
        df = pd.DataFrame({'%s_neighbor_%d'%(col, i): df[:, i] for i in range(df.shape[1])})
    pdf = cudf.DataFrame.from_pandas(df)
    return pdf

In [22]:
to_cudf(a)

<cudf.DataFrame ncols=2 nrows=30 >