In [1]:
from dask_cuda import LocalCUDACluster
# Start a local Dask CUDA cluster in which every worker runs a single thread.
# NOTE(review): presumably one worker is launched per visible GPU — confirm
# against the dask_cuda documentation.
cluster = LocalCUDACluster(threads_per_worker=1)

In [2]:
from dask.distributed import Client, wait
import time

import dask
import dask_cudf
import dask.dataframe as dd

import pandas as pd

import cudf
import numpy as np

import pandas.testing

from dask_cuml import knn as cumlKNN


In [3]:
# Connect a distributed client to the local CUDA cluster.
client = Client(cluster)
# Bare expression: shows the client summary as this cell's output.
client

0,1
Client  Scheduler: tcp://127.0.0.1:37959  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 2  Memory: 50.39 GB


In [4]:
def create_df(f, m, n):
    """Build one partition of random training data as a cudf DataFrame.

    Parameters
    ----------
    f : int
        Partition number. It offsets the row index so that partition ``f``
        is labelled ``f*m`` up to (but not including) ``f*m + m``.
    m : int
        Number of rows to generate.
    n : int
        Number of columns to generate; columns are named ``0 .. n-1``.

    Returns
    -------
    cudf.DataFrame
        ``m`` rows by ``n`` float32 columns of values drawn with
        ``np.random.rand``.
    """
    values = np.random.rand(m, n)

    # Row labels continue from where the previous partition stops.
    first_row = f * m
    row_index = cudf.dataframe.RangeIndex(first_row, first_row + m)

    # One (name, data) pair per column, each converted to float32.
    columns = [(col, values[:, col].astype(np.float32)) for col in range(n)]
    return cudf.DataFrame(columns, index=row_index)

def get_meta(df):
    """Return a zero-row slice of ``df``.

    The result keeps the columns of ``df`` but contains no rows, which makes
    it usable as the ``meta`` argument when assembling a distributed frame.
    """
    no_rows = slice(None, 0)
    return df.iloc[no_rows]

In [5]:
# Worker addresses, taken from the keys of the mapping returned by
# client.has_what(). This is a dict keys view, not a list.
workers = client.has_what().keys()
# Bare expression: display the worker addresses.
workers

dict_keys(['tcp://127.0.0.1:38213', 'tcp://127.0.0.1:39374'])

In [6]:
# Training-set size per GPU/worker: each worker generates its own partition.
train_m = 500000  # rows per partition (passed to create_df as m)
train_n = 1000  # columns per partition (passed to create_df as n)

In [7]:
# Search parameters.
# NOTE(review): search_m is not referenced by any cell visible here; the
# kneighbors() call queries with X_df itself. Confirm whether a query set of
# search_m rows was intended.
search_m = 100
# Passed as the second argument to lr.kneighbors(); presumably the number of
# neighbours returned per query row (the printed results have columns 0..14).
search_k = 15

In [8]:
%%time

# Build one partition per worker (GPU). Each task is restricted to a single
# worker address and receives that worker's position as its partition number.
dfs = [
    client.submit(create_df, part_no, train_m, train_n, workers=[address])
    for part_no, address in enumerate(workers)
]

# Block until every partition has been created.
wait(dfs)

# Zero-row frame describing the partitions' columns, taken from the first one.
meta = client.submit(get_meta, dfs[0]).result()

CPU times: user 1.07 s, sys: 286 ms, total: 1.36 s
Wall time: 20.3 s


In [9]:
# Distributed k-nearest-neighbours estimator from dask_cuml.
# NOTE(review): the name `lr` does not reflect that this is a KNN model; the
# fit and kneighbors cells depend on it, so rename them together if at all.
lr = cumlKNN.KNN()

In [10]:
# Print the string form of the empty schema frame; print() applies str() itself.
print(meta)

Empty DataFrame
Columns: [0, 1, 2, 3, 4, 5, 6, 999]
Index: []


In [11]:
%%time
# Assemble the per-worker partitions into a single dask_cudf DataFrame,
# using `meta` (a zero-row frame) to describe the columns.
X_df = dask_cudf.from_delayed(dfs, meta=meta)

CPU times: user 910 ms, sys: 66.3 ms, total: 977 ms
Wall time: 929 ms


In [12]:
# Show which worker holds each key; here these are the create_df results.
client.who_has()

{'create_df-1be1d19380376830e7b218a9fe128f1e': ('tcp://127.0.0.1:38213',),
 'create_df-5e4acbcb64aba093a6f6056cdb75fa43': ('tcp://127.0.0.1:39374',)}

In [13]:
%%time
# Fit the KNN model on the distributed training frame.
lr.fit(X_df)

Getting cols
Done.
Done compute.
CPU times: user 323 ms, sys: 11.2 ms, total: 334 ms
Wall time: 1.91 s


In [14]:
%%time
# Query the fitted model with the training frame itself (X_df), passing
# search_k as the second argument.
# NOTE(review): I and D presumably hold neighbour indices and distances
# respectively — confirm against the dask_cuml KNN API.
I, D = lr.kneighbors(X_df, search_k)

Running kneighbors()
Done.
Building dfs
Done.
CPU times: user 5.95 s, sys: 922 ms, total: 6.88 s
Wall time: 2min 39s


In [15]:
# Compute I and print its string form; print() applies str() itself.
print(I.compute())

   0       1       2       3       4       5       6 ...      14
0  0  800219  866600  913998  609086  127380  199272 ...   31370
1  1  784421  961660  247878  981250  114660  948108 ...  567954
2  2  578122  935613  907744  924725  897348  376233 ...  433142
3  3  990613   13353  418735  561557   83221  505352 ...   60998
4  4  491433   43195   45794  812829  512616  915536 ...  863804
5  5  501727  337180  934072  761261   52497   97657 ...  679068
6  6  981107  924121  338739  947320  457847  695488 ...  144769
7  7  203916  752554  381931  773029  235201  919365 ...  928036
8  8  170649  570132  713854  597654  670859  628270 ...  889589
9  9  333577  528665  289605  456570  225025  177986 ...  941612
[999990 more rows]
[7 more columns]


In [16]:
# Compute D and print its string form; print() applies str() itself.
print(D.compute())

               0          1          2          3          4          5          6 ...         14
0            0.0  138.22614  139.28516  139.51675  139.53528  139.84683  139.85376 ...  141.16846
1  0.00036621094   141.5008  141.92093  141.92621  143.86847  143.99811  144.11163 ...  144.47433
2  0.00030517578  138.82639  141.06738  141.07892  141.57983  141.75616   142.0841 ...  143.34451
3            0.0  140.02899  140.09644  140.44287  140.50266  140.66104  140.78137 ...  141.77151
4  0.00036621094  140.48593  140.74207  141.01614  142.33026  142.93298  143.24335 ...  144.57098
5            0.0  140.22134  141.62616  141.70609  141.70877   141.8728  141.97922 ...  142.74298
6            0.0  141.31137   141.7381  142.26617   142.2894  142.64737  143.07135 ...  143.95782
7            0.0  138.25073  139.48328  139.74197   140.7066  142.19696  142.41483 ...  143.13177
8  0.00048828125   138.7756  139.12875  139.25522   140.9685  141.16046   141.2074 ...  141.88354
9            0.0  13

In [31]:
# Scratch check of cudf index behaviour: one column 'a' holding four values.
a = cudf.DataFrame([('a', [1, 2, 3, 4])])
# NOTE(review): np.arange(10, 15) yields five labels (10..14) for a four-row
# frame — confirm that the length mismatch is intended.
a = a.set_index(np.arange(10, 15))

In [36]:
# Display the last label of the index assigned by set_index().
a.index[-1]

14