In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN as skDBSCAN
from cuml import DBSCAN as cumlDBSCAN
import cudf
import os

# Helper Functions

In [2]:
from timeit import default_timer

class Timer(object):
    def __init__(self):
        self._timer = default_timer
    
    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self):
        """Start the timer."""
        self.start = self._timer()

    def stop(self):
        """Stop the timer. Calculate the interval in seconds."""
        self.end = self._timer()
        self.interval = self.end - self.start

In [3]:
import gzip
def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz'):
    if os.path.exists(cached):
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
            print(str(len(X)))
        X = X[np.random.randint(0,X.shape[0]-1,nrows),:ncols]
    else:
        print('use random data')
        X = np.random.rand(nrows,ncols)
    df = pd.DataFrame({'fea%d'%i:X[:,i] for i in range(X.shape[1])})
    return df

In [4]:
from sklearn.metrics import mean_squared_error
def array_equal(a,b,threshold=5e-3,with_sign=True):
    a = to_nparray(a)
    b = to_nparray(b)
    if with_sign == False:
        a,b = np.abs(a),np.abs(b)
    res = mean_squared_error(a,b)<threshold
    return res

def to_nparray(x):
    if isinstance(x,np.ndarray) or isinstance(x,pd.DataFrame):
        return np.array(x)
    elif isinstance(x,np.float64):
        return np.array([x])
    elif isinstance(x,cudf.DataFrame) or isinstance(x,cudf.Series):
        return x.to_pandas().values
    return x

# Run tests

In [5]:
%%time
nrows = 50000
ncols = 128

X = load_data(nrows,ncols)
print('data',X.shape)

use mortgage data
800000
data (50000, 128)
CPU times: user 5.82 s, sys: 812 ms, total: 6.63 s
Wall time: 6.63 s


In [6]:
eps = 0.3
min_samples = 2

In [7]:
%%time
clustering_sk = skDBSCAN(eps = eps, min_samples = min_samples)
clustering_sk.fit(X)

CPU times: user 6min 41s, sys: 916 ms, total: 6min 42s
Wall time: 6min 41s


In [8]:
%%time
X = cudf.DataFrame.from_pandas(X)

CPU times: user 7.31 s, sys: 1.17 s, total: 8.48 s
Wall time: 2.08 s


In [None]:
%%time
clustering_cuml = cumlDBSCAN(eps = eps, min_samples = min_samples)
clustering_cuml.fit(X)

In [None]:
skl = clustering_sk.labels_
cuml = clustering_cuml.labels_.to_array()

In [None]:
passed = array_equal(skl, cuml)
message = 'compare dbscan: cuml vs sklearn labels_ %s'%('equal'if passed else 'NOT equal')
print(message)

In [157]:
skl[545]

7

In [158]:
cuml[545]

7

In [159]:
idx = 0

for z in zip(skl, cuml):
    
    if idx % 1000 == 0:
        print(str(z[0]))
        print(str(idx))
    
    if (z[0] != z[1]):
        print(str("NOT EQUAL: " + str(z)))
        print(str(idx))
        
    idx+=1
    
    

0
0
NOT EQUAL: (77, 78)
693
NOT EQUAL: (78, 79)
704
NOT EQUAL: (79, 81)
717
NOT EQUAL: (80, 82)
722
NOT EQUAL: (81, 83)
726
NOT EQUAL: (82, 84)
734
NOT EQUAL: (83, 85)
743
NOT EQUAL: (84, 86)
751
NOT EQUAL: (85, 87)
756
NOT EQUAL: (86, 88)
759
NOT EQUAL: (87, 89)
766
NOT EQUAL: (85, 87)
779
NOT EQUAL: (85, 87)
782
NOT EQUAL: (88, 90)
788
NOT EQUAL: (89, 91)
817
NOT EQUAL: (87, 89)
869
NOT EQUAL: (85, 87)
918
NOT EQUAL: (81, 83)
944
NOT EQUAL: (86, 88)
946
NOT EQUAL: (90, 94)
947
NOT EQUAL: (85, 87)
975
2
1000
NOT EQUAL: (83, 85)
1012
NOT EQUAL: (91, 95)
1020
NOT EQUAL: (78, 79)
1032
NOT EQUAL: (92, 96)
1043
NOT EQUAL: (93, 97)
1045
NOT EQUAL: (93, 97)
1046
NOT EQUAL: (94, 98)
1049
NOT EQUAL: (83, 85)
1080
NOT EQUAL: (88, 90)
1081
NOT EQUAL: (92, 96)
1119
NOT EQUAL: (95, 99)
1146
NOT EQUAL: (87, 89)
1150
NOT EQUAL: (93, 97)
1156
NOT EQUAL: (96, 100)
1177
NOT EQUAL: (88, 90)
1199
NOT EQUAL: (97, 102)
1204
NOT EQUAL: (85, 87)
1216
NOT EQUAL: (98, 103)
1225
NOT EQUAL: (99, 104)
1230
NOT EQ

In [50]:
print(str(zip(clustering_cuml.labels_, clustering_cuml.labels_)))

<zip object at 0x7f2336f41f88>
