In [1]:
import numpy as np
import pandas as pd
import cudf
import os

from sklearn.neighbors import NearestNeighbors as skKNN
from cuml.neighbors.nearest_neighbors import NearestNeighbors as cumlKNN

# Helper Functions

In [2]:
import gzip
def load_data(nrows, ncols, cached = 'data/mortgage.npy.g',source='mortgage'):
    """Build the benchmark feature table as a pandas DataFrame.

    When ``cached`` exists on disk and ``source == 'mortgage'``, the
    gzip-compressed NumPy array stored there is loaded, ``nrows`` row indices
    are drawn with ``np.random.randint`` (so rows may repeat) and the first
    ``ncols`` columns are kept. Otherwise a uniformly random float32 matrix of
    shape ``(nrows, ncols)`` is generated.

    Parameters
    ----------
    nrows : int
        Number of rows in the returned frame.
    ncols : int
        Number of leading columns to keep (cached data) or to generate
        (random data).
    cached : str
        Path of the gzip-compressed ``.npy`` file to look for.
    source : str
        Only the value ``'mortgage'`` enables the cached-file branch.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``fea0``, ``fea1``, ...; NaN values are replaced
        by 0.
    """
    # NOTE(review): the default path ends in '.g' -- possibly a typo for
    # '.gz'; confirm against the actual data file name before changing it.
    if os.path.exists(cached) and source == 'mortgage':
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
        # randint's upper bound is exclusive: pass X.shape[0] itself so the
        # last row can be sampled and a one-row array does not raise.
        row_ids = np.random.randint(0, X.shape[0], nrows)
        X = X[row_ids, :ncols]
    else:
        print('use random data')
        X = np.random.random((nrows, ncols)).astype('float32')
    df = pd.DataFrame({'fea%d' % i: X[:, i] for i in range(X.shape[1])}).fillna(0)
    return df

In [20]:
from sklearn.metrics import mean_squared_error
def array_equal(a,b,threshold=1e-3,with_sign=True,metric='mse'):
    """Report whether two array-likes agree within ``threshold``.

    Both inputs are first passed through ``to_nparray``.

    Parameters
    ----------
    a, b : array-like
        Values to compare; anything ``to_nparray`` accepts.
    threshold : float
        Tolerance applied to the chosen metric.
    with_sign : bool
        When false, both inputs are replaced by their absolute values before
        comparing, so sign differences are ignored.
    metric : {'mse', 'abs', 'acc'}
        * ``'mse'`` -- ``mean_squared_error(a, b)`` must be below ``threshold``.
        * ``'abs'`` -- no element may differ by more than ``threshold`` in
          absolute value.
        * ``'acc'`` -- the fraction of elements with ``a != b`` must be below
          ``threshold``.

    Returns
    -------
    bool
        True when the inputs are considered equal under ``metric``.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    a = to_nparray(a)
    b = to_nparray(b)
    if not with_sign:
        a, b = np.abs(a), np.abs(b)
    if metric == 'mse':
        error = mean_squared_error(a, b)
        res = error < threshold
    elif metric == 'abs':
        # Compare magnitudes: a signed difference would let arbitrarily large
        # negative deviations (a much smaller than b) pass unnoticed.
        error = np.abs(a - b)
        res = not np.any(error > threshold)
    elif metric == 'acc':
        # a.size counts every element, so 1-D inputs work as well as 2-D ones.
        error = np.sum(a != b) / a.size
        res = error < threshold
    else:
        # Previously an unknown metric fell through to an unbound `res`.
        raise ValueError(
            "unknown metric %r; expected 'mse', 'abs' or 'acc'" % (metric,))
    return res

def accuracy(a,b, threshold=1e-4):
    """Check that ``a`` and ``b`` disagree on only a small fraction of entries.

    Both inputs are converted with ``to_nparray``. An entry counts as a
    mismatch when the two values differ by more than 1 in absolute value; the
    result is True when the mismatching fraction is below ``threshold``.

    Parameters
    ----------
    a, b : array-like
        Arrays of identical shape (any number of dimensions).
    threshold : float
        Maximum tolerated fraction of mismatching entries (exclusive).

    Returns
    -------
    bool
        True when the mismatch fraction is strictly below ``threshold``.
    """
    a = to_nparray(a)
    b = to_nparray(b)
    c = a - b
    # Use the magnitude of the difference so the check is symmetric in a and
    # b; the signed test ignored every entry where a < b.
    # NOTE(review): the tolerance of 1 is kept from the original code -- confirm
    # it is intended rather than an exact `a != b` comparison.
    mismatches = np.count_nonzero(np.abs(c) > 1)
    # c.size counts all elements, which also supports 1-D inputs.
    mismatch_fraction = mismatches / c.size
    return mismatch_fraction < threshold

def to_nparray(x):
    """Convert the supported containers to a NumPy array.

    * ``numpy.ndarray`` / ``pandas.DataFrame`` -> ``np.array(x)``
    * ``numpy.float64`` scalar -> one-element array
    * ``cudf.DataFrame`` / ``cudf.Series`` -> values of the pandas conversion

    Any other object is returned unchanged.
    """
    if isinstance(x, (np.ndarray, pd.DataFrame)):
        return np.array(x)
    if isinstance(x, np.float64):
        # Wrap the scalar so callers always receive something indexable.
        return np.array([x])
    if isinstance(x, (cudf.DataFrame, cudf.Series)):
        return x.to_pandas().values
    return x

# Run tests

In [4]:
%%time
nrows = 2**15
ncols = 40

X = load_data(nrows,ncols)
print('data',X.shape)

use random data
data (32768, 40)
CPU times: user 34.2 ms, sys: 3.49 ms, total: 37.6 ms
Wall time: 35.9 ms


In [5]:
# Number of neighbours requested from both the scikit-learn and the cuML
# kneighbors() calls below.
n_neighbors = 10

In [6]:
%%time
knn_sk = skKNN(metric = 'sqeuclidean', )
knn_sk.fit(X)
D_sk,I_sk = knn_sk.kneighbors(X,n_neighbors)

CPU times: user 31.5 s, sys: 4.52 s, total: 36 s
Wall time: 36 s


In [7]:
%%time
X = cudf.DataFrame.from_pandas(X)

CPU times: user 361 ms, sys: 71.5 ms, total: 432 ms
Wall time: 430 ms


In [9]:
%%time
knn_cuml = cumlKNN()
knn_cuml.fit(X)
D_cuml,I_cuml = knn_cuml.kneighbors(X,n_neighbors)

CPU times: user 6.75 s, sys: 250 ms, total: 7 s
Wall time: 2.71 s


In [23]:
# Compare the distance matrices returned by scikit-learn and cuML.
# Supported metrics for array_equal: 'acc', 'mse', or 'abs'.
passed = array_equal(D_sk, D_cuml, metric='abs')
verdict = 'equal' if passed else 'NOT equal'
message = 'compare knn: cuml vs sklearn distances %s' % verdict
print(message)

compare knn: cuml vs sklearn distances NOT equal


In [17]:
# Compare the neighbour index matrices; up to 10% of entries may mismatch.
passed = accuracy(I_sk, I_cuml, threshold=1e-1)
verdict = 'equal' if passed else 'NOT equal'
message = 'compare knn: cuml vs sklearn indexes %s' % verdict
print(message)

compare knn: cuml vs sklearn indexes equal
