In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN as skDBSCAN
from cuML import DBSCAN as cumlDBSCAN
import pygdf
import os

# Helper Functions

In [None]:
from timeit import default_timer

class Timer(object):
    """Reusable stopwatch that can also be used as a context manager.

    Usage::

        with Timer() as t:
            do_work()
        print(t.interval)

    Attributes:
        end: Clock reading taken by the most recent ``stop()`` call
            (``None`` until ``stop()`` has been called).
        interval: Seconds elapsed between the most recent ``start()`` and
            ``stop()`` calls (``None`` until ``stop()`` has been called).
    """

    def __init__(self):
        self._timer = default_timer
        # The start reading lives in a private attribute so that it does not
        # shadow the ``start`` method; this keeps the instance reusable.
        self._start_time = None
        self.end = None
        self.interval = None

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        # Returns None, so an exception raised inside the ``with`` body
        # still propagates after the timer has been stopped.
        self.stop()

    def start(self):
        """Start (or restart) the timer."""
        self._start_time = self._timer()

    def stop(self):
        """Stop the timer. Calculate the interval in seconds.

        Raises:
            RuntimeError: if ``start()`` has not been called yet.
        """
        if self._start_time is None:
            raise RuntimeError("Timer.stop() called before Timer.start()")
        self.end = self._timer()
        self.interval = self.end - self._start_time

In [None]:
def load_data(nrows, ncols, cached = 'data/mortgage.npy'):
    """Return an array of sample data with ``nrows`` rows.

    If the file ``cached`` exists it is loaded with ``np.load`` and ``nrows``
    rows are drawn from it uniformly at random (with replacement, so rows may
    repeat), keeping the first ``ncols`` columns. If the cached array has
    fewer than ``ncols`` columns, the slice returns only the columns that
    exist. If the file does not exist, uniform random values from
    ``np.random.rand`` with shape ``(nrows, ncols)`` are returned instead.

    Args:
        nrows: Number of rows to return.
        ncols: Number of leading columns to keep / generate.
        cached: Path of the ``.npy`` file to sample from.

    Returns:
        A numpy array with ``nrows`` rows.
    """
    if os.path.exists(cached):
        X = np.load(cached)
        # randint's upper bound is exclusive: passing X.shape[0] (not
        # X.shape[0] - 1) makes every row, including the last, eligible and
        # also works when the cached file holds a single row.
        row_idx = np.random.randint(0, X.shape[0], nrows)
        return X[row_idx, :ncols]
    return np.random.rand(nrows, ncols)

In [None]:
def np2pygdf(df):
    """Convert a 2-D numpy array into a pygdf DataFrame.

    Each array column becomes one DataFrame column, keyed by its integer
    column index.
    """
    gdf = pygdf.DataFrame()
    n_cols = df.shape[1]
    for col_idx in range(n_cols):
        gdf[col_idx] = df[:, col_idx]
    return gdf

In [None]:
def array_equal(a,b,threshold=1e-4,with_sign=True):
    """Report whether ``a`` and ``b`` agree element-wise within ``threshold``.

    Both inputs are first passed through ``to_nparray``. The comparison is on
    the largest absolute element-wise difference, which must be strictly
    smaller than ``threshold``.

    Args:
        a, b: Values to compare (anything ``to_nparray`` accepts).
        threshold: Exclusive upper bound on the largest absolute difference.
        with_sign: When falsy, compare absolute values so that elements
            differing only in sign count as equal.

    Returns:
        ``True`` if the inputs match within ``threshold``, else ``False``.
        Two empty inputs are considered equal.
    """
    a = to_nparray(a)
    b = to_nparray(b)
    if not with_sign:
        a, b = np.abs(a), np.abs(b)
    diff = np.abs(a - b)
    # np.max raises on a zero-size array; with nothing to differ, the
    # inputs are trivially equal.
    if np.size(diff) == 0:
        return True
    return bool(np.max(diff) < threshold)

def to_nparray(x):
    """Coerce supported inputs to a numpy array.

    * numpy arrays and pandas DataFrames are copied via ``np.array``;
    * a ``np.float64`` scalar becomes a one-element array;
    * pygdf DataFrames / Series go through ``to_pandas().values``;
    * anything else is returned unchanged.
    """
    if isinstance(x, (np.ndarray, pd.DataFrame)):
        return np.array(x)
    if isinstance(x, np.float64):
        return np.array([x])
    if isinstance(x, (pygdf.DataFrame, pygdf.Series)):
        return x.to_pandas().values
    return x

# Run tests

In [None]:
%%time
nrows = 1000
ncols = 128

X = load_data(nrows,ncols)
print('data',X.shape)

In [None]:
# Parameters shared by both DBSCAN runs below; they are passed through
# unchanged as the `eps` and `min_samples` keyword arguments.
min_samples = 2
eps = 3

In [None]:
%%time
clustering_sk = skDBSCAN(eps = eps, min_samples = min_samples)
clustering_sk.fit(X)

In [None]:
%%time
X = np2pygdf(X)

In [None]:
%%time
clustering_cuml = cumlDBSCAN(eps = eps, min_samples = min_samples)
clustering_cuml.fit(X)

In [None]:
# Compare the cluster labels produced by the two implementations.
# NOTE(review): this is an element-wise comparison of label ids, so it
# presumably relies on both libraries numbering clusters identically — confirm.
passed = array_equal(clustering_sk.labels_, clustering_cuml.labels_,
                     threshold=1e-3, with_sign=True)
message = 'compare dbscan: cuml vs sklearn labels_ %s'%('equal'if passed else 'NOT equal')
print(message)