# Dask DataFrame with cuDF joins

This notebook demonstrates joining Dask DataFrames backed by cuDF on an eight-GPU machine.

It makes three points:

1.  Distributed joins on cuDF-backed Dask DataFrames work
2.  The joins are slow because of communication overhead
3.  Workflows that are agnostic to pandas vs. cuDF provide usability gains

## Use a DGX

In [None]:
# `wait` is imported here and used later to time the join.
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

# Start a local cluster with default settings and connect a client to it.
# NOTE(review): per the dask-cuda docs, LocalCUDACluster() launches one worker
# per visible GPU -- confirm this matches the eight-GPU machine described above.
cluster = LocalCUDACluster()
client = Client(cluster)
client  # last expression in the cell: displays the client summary

In [None]:
# Display the scheduler's description of the cluster (useful for checking
# which workers have registered).
client.scheduler_info()

## Create Random Dataset

In [None]:
import dask.array as da
import dask.dataframe as dd

n_rows = 1_000_000_000  # rows in the large (left) table
n_keys = 5_000_000      # join keys are drawn from [0, n_keys)

# Build each column lazily from a random dask array; a 1-D array converted
# with a string `columns` argument becomes a named column.
x_col = da.random.random(n_rows).to_dask_dataframe(columns='x')
id_col = da.random.randint(0, n_keys, size=n_rows).to_dask_dataframe(columns='id')

# Glue the columns together side by side and start materialising the result
# on the cluster.
left = dd.concat([x_col, id_col], axis=1).persist()
left

In [None]:
n_rows = 10_000_000  # rows in the small (right) table

# Same construction as the left table: one random value column plus an `id`
# column drawn from the same key range [0, n_keys).
y_col = da.random.random(n_rows).to_dask_dataframe(columns='y')
key_col = da.random.randint(0, n_keys, size=n_rows).to_dask_dataframe(columns='id')

right = dd.concat([y_col, key_col], axis=1).persist()
right

## Convert data to GPU and persist in device memory

In [None]:
import dask
import cudf

# Convert every pandas partition to a cuDF dataframe (lazy), then persist both
# tables with a single call so the converted data is held in device memory.
gleft, gright = dask.persist(
    left.map_partitions(cudf.from_pandas),
    right.map_partitions(cudf.from_pandas),
)

In [None]:
# Display the summary of the GPU-backed left table.
gleft

In [None]:
# Time a full reduction over column `x` of the GPU-backed left table.
%time gleft.x.sum().compute()

In [None]:
# `format_bytes` lives in `dask.utils`; importing it from `distributed.utils`
# is deprecated and emits a FutureWarning on current releases.
from dask.utils import format_bytes

# Rough size estimate for the left table: rows * 8 bytes per value * 2 columns
# (`x` and `id`). Assumes both columns hold 8-byte values; the index is not
# counted.
format_bytes(len(gleft) * 8 * 2)  # TODO: cudf needs `.memory_usage()` method

In [None]:
# Rough size estimate for the right table: rows * 8 bytes per value * 2 columns
# (`y` and `id`). Assumes 8-byte values; the index is not counted.
format_bytes(len(gright) * 8 * 2)

## Join on the ID column

In [None]:
# Join the two GPU-backed tables on the shared `id` column. No `how` is given,
# so the library default join type applies (inner, per the merge docs).
out = gleft.merge(gright, on=['id'])  # this is lazy
out

In [None]:
# Kick off the join on the cluster and keep the result in worker memory.
out = out.persist()
# Block until every partition of the result is finished, timing the wait;
# this is the measured cost of the join.
%time _ = wait(out)

## Inspect output

In [None]:
# Fetch the first few joined rows and convert them to pandas for display.
out.head().to_pandas()

In [None]:
# `format_bytes` lives in `dask.utils`; importing it from `distributed.utils`
# is deprecated and emits a FutureWarning on current releases.
from dask.utils import format_bytes

# Rough size estimate for the joined table: rows * 8 bytes per value * 3
# columns (`x`, `id`, `y`). Assumes 8-byte values; the index is not counted.
format_bytes(len(out) * 8 * 3)  # TODO: cudf needs `.memory_usage()` method