## KNN
K Nearest Neighbors (KNN) is a supervised algorithm which uses the information of the nearest neighbors of a point to predict its label.
The cuml library has an implementation of the KNN model which allows the user to change the number of neighbors whose labels are to be checked. (The following remarks on the normalize and algorithm parameters describe cuml's linear regression model rather than KNN: the linear regression model can use two different algorithms, eig and svd, and the possible values for fit_intercept and normalize are 'True' or 'False'.) The model accepts only numpy arrays or cudf dataframes as the input. In order to convert your dataset to cudf format please read the cudf documentation at https://rapidsai.github.io/projects/cudf/en/latest/. For additional information on the KNN model please refer to the cuml documentation at https://rapidsai.github.io/projects/cuml/en/latest/.

In [None]:
import numpy as np
import pandas as pd
import cudf
import os

from sklearn.neighbors import NearestNeighbors as skKNN
from cuml.neighbors.nearest_neighbors import NearestNeighbors as cumlKNN

# Helper Functions

In [None]:
# check if the mortgage dataset is present and then extract the data from it, else just create a random dataset for the nearest-neighbors search
import gzip
# change the path of the mortgage dataset if you have saved it in a different directory
def load_data(nrows, ncols, cached='data/mortgage.npy.gz', source='mortgage'):
    """Build a pandas DataFrame with ``nrows`` rows and up to ``ncols`` columns.

    If ``cached`` exists on disk and ``source`` is ``'mortgage'``, the gzipped
    ``.npy`` array stored there is loaded, ``nrows`` rows are drawn from it
    uniformly at random (with replacement) and the first ``ncols`` columns are
    kept. Otherwise a random float32 matrix of shape ``(nrows, ncols)`` is
    generated.

    Parameters
    ----------
    nrows : int
        Number of rows (samples) in the result.
    ncols : int
        Number of columns (features) requested.
    cached : str
        Path of the gzip-compressed ``.npy`` file to read.
    source : str
        The cached file is only used when this equals ``'mortgage'``.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``fea0``, ``fea1``, ...; missing values are
        replaced with 0.
    """
    if os.path.exists(cached) and source == 'mortgage':
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
        # The upper bound of randint is exclusive, so pass the row count
        # itself: every row (including the last) can be drawn, and a
        # single-row file no longer raises "low >= high".
        row_ids = np.random.randint(0, X.shape[0], nrows)
        X = X[row_ids, :ncols]
    else:
        # create a random dataset
        print('use random data')
        X = np.random.random((nrows, ncols)).astype('float32')
    df = pd.DataFrame({'fea%d' % i: X[:, i] for i in range(X.shape[1])}).fillna(0)
    return df

In [None]:
from sklearn.metrics import mean_squared_error
# this function checks if the results obtained from two different methods (sklearn and cuml) are the same
def array_equal(a, b, threshold=1e-3, with_sign=True, metric='mse'):
    """Report whether two results agree within ``threshold``.

    Both inputs are first passed through ``to_nparray``.

    Parameters
    ----------
    a, b : array-like
        The two results to compare.
    threshold : float
        Tolerance applied to the error computed by ``metric``.
    with_sign : bool
        When false, the comparison is done on ``abs(a)`` and ``abs(b)``.
    metric : {'mse', 'abs', 'acc'}
        * ``'mse'`` - mean squared error between ``a`` and ``b`` must be
          below ``threshold``.
        * ``'abs'`` - no element may differ by more than ``threshold`` in
          absolute value.
        * ``'acc'`` - the fraction of elements that differ must be below
          ``threshold``.

    Returns
    -------
    bool
        True when the chosen error is within tolerance.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    a = to_nparray(a)
    b = to_nparray(b)
    if not with_sign:
        a, b = np.abs(a), np.abs(b)
    if metric == 'mse':
        error = mean_squared_error(a, b)
        return error < threshold
    if metric == 'abs':
        # Use the magnitude of the difference so that deviations in either
        # direction (a > b or b > a) are detected.
        error = np.abs(a - b)
        return not np.any(error > threshold)
    if metric == 'acc':
        # a.size works for arrays of any rank, not only 2-D ones.
        error = np.sum(a != b) / a.size
        return error < threshold
    raise ValueError(
        "unknown metric %r; expected 'mse', 'abs' or 'acc'" % (metric,))

# calculate the accuracy 
def accuracy(a, b, threshold=1e-4):
    """Report whether ``a`` and ``b`` disagree on fewer than ``threshold`` of their entries.

    Both inputs are first passed through ``to_nparray``. An entry counts as a
    disagreement when the two values differ by more than 1 in either
    direction, so the result does not depend on argument order.

    Parameters
    ----------
    a, b : array-like
        Arrays of the same shape to compare.
    threshold : float
        Maximum tolerated fraction of disagreeing entries (exclusive).

    Returns
    -------
    bool
        True when the disagreeing fraction is below ``threshold``.
    """
    a = to_nparray(a)
    b = to_nparray(b)
    # Compare magnitudes: a signed test (a - b > 1) would ignore every entry
    # where b is the larger value.
    diff = np.abs(a - b)
    # diff.size covers arrays of any rank, not only 2-D ones.
    mismatch_rate = np.count_nonzero(diff > 1) / diff.size
    return mismatch_rate < threshold

# the function converts a variable from ndarray or dataframe format to numpy array
def to_nparray(x):
    """Convert ``x`` to a numpy array where a conversion is defined.

    * numpy arrays and pandas DataFrames are copied into a new ndarray;
    * a ``np.float64`` scalar becomes a one-element array;
    * cudf DataFrames / Series go through pandas and yield ``.values``;
    * anything else is returned unchanged.
    """
    if isinstance(x, (np.ndarray, pd.DataFrame)):
        return np.array(x)
    if isinstance(x, np.float64):
        return np.array([x])
    if isinstance(x, (cudf.DataFrame, cudf.Series)):
        return x.to_pandas().values
    return x

# Run tests

In [None]:
%%time
# Problem size: nrows samples, each described by ncols features.
nrows, ncols = 2**15, 40

X = load_data(nrows, ncols)
print('data', X.shape)

In [None]:
# the number of nearest neighbors requested from each model's kneighbors() call
n_neighbors = 10

In [None]:
%%time
# Fit scikit-learn's NearestNeighbors (squared Euclidean metric) on the
# dataset, then look up n_neighbors neighbors for every row of that dataset.
knn_sk = skKNN(metric='sqeuclidean').fit(X)
D_sk, I_sk = knn_sk.kneighbors(X, n_neighbors=n_neighbors)

In [None]:
%%time
# convert the pandas dataframe to a cudf dataframe; X is rebound, so from
# here on X refers to the cudf copy of the data
X = cudf.DataFrame.from_pandas(X)

In [None]:
%%time
# use cuml's KNN model (constructed with its default arguments) to fit the dataset
# NOTE(review): the sklearn model is built with metric='sqeuclidean' - confirm
# that cuml's default distance is the same so the distances are comparable
knn_cuml = cumlKNN()
knn_cuml.fit(X)

# query the fitted model with the same data it was fitted on; the result is
# unpacked as the distances (D_cuml) and the indices (I_cuml) of the
# n_neighbors neighbors found for each sample
D_cuml,I_cuml = knn_cuml.kneighbors(X,n_neighbors)

In [None]:
# Compare the distances returned by the sklearn and cuml models using
# array_equal's 'abs' metric ('acc' and 'mse' are the other options).
passed = array_equal(D_sk, D_cuml, metric='abs')
if passed:
    message = 'compare knn: cuml vs sklearn distances %s' % 'equal'
else:
    message = 'compare knn: cuml vs sklearn distances %s' % 'NOT equal'
print(message)

In [None]:
# Compare the neighbor indices returned by the sklearn and cuml models,
# using accuracy() with a threshold of 0.1.
passed = accuracy(I_sk, I_cuml, threshold=1e-1)
if passed:
    message = 'compare knn: cuml vs sklearn indexes %s' % 'equal'
else:
    message = 'compare knn: cuml vs sklearn indexes %s' % 'NOT equal'
print(message)