## Benchmark cuGraph vs DGL on OGBN-Products Data Loading

### Imports

In [1]:
import gc
import rmm                                                                                                                                                                                                 
import torch
import dgl
import numpy as np

#TODO: Enable in torch nightly
# torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator)
        
import cugraph_dgl
from dgl.data import AsNodePredDataset
from dgl.dataloading import DataLoader, NeighborSampler, MultiLayerFullNeighborSampler
from ogb.nodeproppred import DglNodePropPredDataset

  from .autonotebook import tqdm as notebook_tqdm


### Configure single-GPU vs. multi-GPU mode

In [2]:
# Mode switch read by the setup cell below and by the cuGraph graph construction:
# True  -> single-GPU path (RMM pool allocator shared with PyTorch),
# False -> multi-GPU path (local Dask-CUDA cluster + cuGraph comms).
single_gpu = False

In [3]:
if single_gpu:
    # Single-GPU path: switch RMM to a pool allocator (initial size 5e9,
    # maximum size 20e9) and make PyTorch allocate through RMM as well.
    # NOTE(review): the import cell's TODO says change_current_allocator
    # needs a torch nightly build -- confirm it exists in the installed torch.
    rmm.reinitialize(pool_allocator=True, initial_pool_size=5e9, maximum_pool_size=20e9)
    torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator)
else:
    # Multi-GPU path: the Dask dependencies are imported only here, so the
    # single-GPU path never needs them.
    from dask_cuda import LocalCUDACluster
    from dask.distributed import Client
    import cugraph.dask.comms.comms as Comms

    # NOTE(review): the visible devices ('1,2') and the RMM pool size are
    # machine-specific; adjust them to the GPUs available locally.
    cluster = LocalCUDACluster(
        protocol='tcp',
        rmm_pool_size='25GB',
        CUDA_VISIBLE_DEVICES='1,2',
    )
    client = Client(cluster)
    # Initialize cuGraph's Dask comms with p2p enabled.
    Comms.initialize(p2p=True)

2023-01-16 18:58:50,778 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2023-01-16 18:58:50,779 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2023-01-16 18:58:50,875 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2023-01-16 18:58:50,875 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize


## Create Graphs

In [4]:
# Load OGBN-Products and wrap it as a node-prediction dataset.
# NOTE(review): `root` is a machine-specific absolute path; point it at a local
# dataset directory before running elsewhere.
ogb_products = DglNodePropPredDataset("ogbn-products",root='/datasets/vjawa/gnn')
dataset = AsNodePredDataset(ogb_products)

# Self-loops are added to make testing easier: the original note says cuGraph
# fails with "isolated edges" (presumably isolated nodes -- a self-loop gives
# every node at least one edge).
# See comment: https://github.com/rapidsai/cugraph/pull/2997
g = dgl.add_self_loop(dataset[0])

## Create DataLoader

In [10]:
def create_dataloader(dataset, g, device='cuda', fanouts=(20, 20, 20), batch_size=1024*20):
    """Build the training DataLoader used by every benchmark in this notebook.

    Parameters
    ----------
    dataset
        Object exposing ``train_idx`` (a tensor-like with ``.to(device)``);
        only the training seeds are used.
    g
        Graph (or graph storage) to sample from; passed to ``DataLoader`` as is.
    device : str, default 'cuda'
        Device the seed indices are moved to and on which batches are produced.
    fanouts : sequence of int, default (20, 20, 20)
        Neighbors sampled per layer, one entry per layer.
    batch_size : int, default 1024*20
        Number of seed nodes per mini-batch.

    Returns
    -------
    DataLoader
        Loader over the training seeds with shuffling disabled, no dropped
        last batch, no worker processes and UVA disabled.
    """
    # Only the training seeds are needed; the previous version also moved
    # ``val_idx`` to ``device`` without ever using it.
    train_idx = dataset.train_idx.to(device)

    sampler = NeighborSampler(
        list(fanouts),
        # Ask the sampler to prefetch the features/labels for each batch.
        prefetch_node_feats=["feat"],
        prefetch_labels=["label"],
    )

    return DataLoader(
        g,
        train_idx,
        sampler,
        device=device,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        num_workers=0,
        use_uva=False,
    )

## DGL CPU Benchmark

In [11]:
dataloader = create_dataloader(dataset, g, device='cpu')
dataloader_it = iter(dataloader)
# warm up
input_nodes, output_nodes, blocks = next(dataloader_it) 
%timeit -n 5 -r 1 input_nodes, output_nodes, blocks = next(dataloader_it) 
del dataloader
del dataloader_it
torch.cuda.empty_cache()
gc.collect()



2.69 s ± 0 ns per loop (mean ± std. dev. of 1 run, 5 loops each)


0

## DGL GPU Benchmark 

In [12]:
dataloader = create_dataloader(dataset, dataset[0].to('cuda'), device='cuda')
dataloader_it = iter(dataloader)
# warmup
input_nodes, output_nodes, blocks = next(dataloader_it) 
%timeit -n 5 -r 1 input_nodes, output_nodes, blocks = next(dataloader_it) 

del dataloader
del dataloader_it
del input_nodes, output_nodes, blocks
torch.cuda.empty_cache()
gc.collect()

87.4 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 5 loops each)


130

## cuGraph Benchmark  (features on host)

In [13]:
# Build a cuGraph-backed graph storage from the CPU-resident DGL graph `g`.
cugraph_g = cugraph_dgl.cugraph_storage_from_heterograph(g, single_gpu=single_gpu)
# Sanity check for this scenario: the node features must still live on the host.
assert cugraph_g.ndata['feat']['_N'].device.type=='cpu'

dataloader = create_dataloader(dataset, cugraph_g, device='cuda')
dataloader_it = iter(dataloader)
# Warm up: fetch one batch outside the timed region.
input_nodes, output_nodes, blocks = next(dataloader_it) 
# Time fetching one batch: a single run of 5 loops.
%timeit -n 5 -r 1 input_nodes, output_nodes, blocks = next(dataloader_it) 

# Drop the loader and the warm-up batch, then release cached GPU memory.
# NOTE(review): `cugraph_g` itself is not deleted here, so it stays referenced
# until the next cell rebinds the name -- confirm this is intended.
del dataloader
del dataloader_it
del input_nodes, output_nodes, blocks
torch.cuda.empty_cache()
gc.collect()

  [b'\x98\xb1B\x7f\xf0\x7fJ\x81\xa8LH\xa5\xa3o\xb9\x ... =int32), False]
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good


3.17 s ± 0 ns per loop (mean ± std. dev. of 1 run, 5 loops each)


335

## cuGraph Benchmark  (features on device)

In [14]:
# Same as the host-features benchmark, but the DGL graph is moved to the GPU
# first so the storage is built from a device-resident graph.
# NOTE(review): unlike the host-features cell, nothing here asserts where the
# features ended up -- presumably on 'cuda'; verify.
cugraph_g = cugraph_dgl.cugraph_storage_from_heterograph(g.to('cuda'), single_gpu=single_gpu)
dataloader = create_dataloader(dataset, cugraph_g, device='cuda')
dataloader_it = iter(dataloader)
# warmup: fetch one batch outside the timed region
input_nodes, output_nodes, blocks = next(dataloader_it) 
# Time fetching one batch: a single run of 5 loops.
%timeit -n 5 -r 1 input_nodes, output_nodes, blocks = next(dataloader_it) 

# Drop the loader and the warm-up batch, then release cached GPU memory.
del dataloader
del dataloader_it
del input_nodes, output_nodes, blocks
torch.cuda.empty_cache()
gc.collect()

3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 5 loops each)


585