In [None]:
import gzip
import os

import cudf
from cuml import DBSCAN as cumlDBSCAN
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN as skDBSCAN
from sklearn.metrics import mean_squared_error

# Helper Functions

In [None]:
def load_data(nrows, ncols, cached="data/mortgage.npy.gz"):
    """Build an ``nrows`` x ``ncols`` pandas DataFrame of feature values.

    When the gzip-compressed ``.npy`` file at ``cached`` exists, ``nrows`` rows
    are drawn from it uniformly at random (with replacement) and the first
    ``ncols`` columns are kept. Otherwise the frame is filled with uniform
    random values from ``np.random.rand``.

    Parameters
    ----------
    nrows : int
        Number of rows in the returned frame.
    ncols : int
        Number of leading columns to keep. The cached array is sliced with
        ``:ncols``, so a cached array with fewer columns yields fewer columns.
    cached : str
        Path to the gzip-compressed NumPy array to sample from.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``fea0``, ``fea1``, ... in column order.
    """
    if os.path.exists(cached):
        print("Using mortgage data.")
        with gzip.open(cached) as f:
            X = np.load(f)
        # randint's upper bound is exclusive: passing the row count itself makes
        # every row (including the last one) eligible and lets a one-row cache
        # work instead of raising "low >= high".
        row_idx = np.random.randint(0, X.shape[0], nrows)
        X = X[row_idx, :ncols]
    else:
        print("Using random data.")
        X = np.random.rand(nrows, ncols)
    df = pd.DataFrame({"fea%d" % i: X[:, i] for i in range(X.shape[1])})
    return df

In [None]:
def array_equal(a, b, threshold=5e-3, with_sign=True):
    """Report whether two arrays agree to within a mean-squared-error bound.

    Both inputs are first passed through ``to_nparray``. When ``with_sign`` is
    false, the comparison is made on absolute values. The result is true when
    the mean squared error between the two is strictly below ``threshold``.
    """
    lhs, rhs = to_nparray(a), to_nparray(b)
    if not with_sign:
        # Sign-insensitive comparison: drop the sign on both sides.
        lhs = np.abs(lhs)
        rhs = np.abs(rhs)
    return mean_squared_error(lhs, rhs) < threshold

def to_nparray(x):
    """Coerce supported container types to a NumPy array.

    * ``numpy.ndarray`` / ``pandas.DataFrame`` -> ``np.array(x)``
    * ``numpy.float64`` scalar -> one-element array
    * ``cudf.DataFrame`` / ``cudf.Series`` -> values of its pandas conversion
    * anything else is returned unchanged
    """
    if isinstance(x, (np.ndarray, pd.DataFrame)):
        return np.array(x)
    if isinstance(x, np.float64):
        # Wrap the scalar so callers always get something array-shaped.
        return np.array([x])
    if isinstance(x, (cudf.DataFrame, cudf.Series)):
        host_obj = x.to_pandas()
        return host_obj.values
    return x

# Run tests

In [None]:
%%time
# Problem size for the comparison: number of sample rows and feature columns.
nrows = 10000
ncols = 128

# load_data returns a pandas DataFrame (cached mortgage data when the file is
# present on disk, random values otherwise).
X = load_data(nrows,ncols)
print("Data shape:", X.shape)

In [None]:
# DBSCAN hyperparameters; the same values are passed to both the scikit-learn
# and the cuML estimator so their results are comparable.
eps = 3
min_samples = 2

In [None]:
%%time
# Reference run: fit scikit-learn's DBSCAN on X; its labels_ are compared
# against the cuML result at the end of the notebook.
clustering_sk = skDBSCAN(eps=eps, min_samples=min_samples)
clustering_sk.fit(X)

In [None]:
%%time
# Convert the pandas DataFrame into a cuDF DataFrame for the cuML estimator.
# NOTE(review): this rebinds X, so the cell is not idempotent -- re-running an
# earlier cell that reads X after this one will see the cuDF frame instead.
X = cudf.DataFrame.from_pandas(X)

In [None]:
%%time
# Fit cuML's DBSCAN on X using the same eps / min_samples as the scikit-learn run.
clustering_cuml = cumlDBSCAN(eps=eps, min_samples=min_samples)
clustering_cuml.fit(X)

In [None]:
# Compare the cluster labels produced by the two implementations; array_equal
# applies a mean-squared-error threshold rather than exact equality.
passed = array_equal(clustering_sk.labels_, clustering_cuml.labels_)
verdict = "equal" if passed else "NOT equal"
message = "compare dbscan: cuml vs sklearn labels_ %s" % verdict
print(message)