## Benchmark cuGraph vs DGL data loading on homogeneous datasets

### Imports

In [1]:
import gc
import rmm                                                                                                                                                                                                 
import torch
import dgl
import numpy as np

#TODO: Enable in torch nightly
# torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator)
        
import cugraph_dgl
from dgl.data import AsNodePredDataset
from dgl.dataloading import DataLoader, NeighborSampler, MultiLayerFullNeighborSampler
from ogb.nodeproppred import DglNodePropPredDataset

  from .autonotebook import tqdm as notebook_tqdm


### Configure single-GPU or multi-GPU execution

In [2]:
# Execution mode switch: False starts a local multi-GPU Dask-CUDA cluster in the
# setup cell below; True skips the cluster and reinitializes RMM with a pool
# allocator in this process instead. The flag is also passed to
# cugraph_dgl.cugraph_storage_from_heterograph when the cuGraph graph is built.
single_gpu = False

In [3]:
def enable_cudf_spilling():
    """Set cuDF's ``"spill"`` option to ``True`` in the calling process.

    ``cudf`` is imported inside the function body, so the function carries its
    own dependency when it is handed to ``client.run`` as well as when it is
    called directly.
    """
    import cudf

    cudf.set_option("spill", True)

# Set up GPU memory management for the selected mode (see `single_gpu`).
if not single_gpu:
    # Multi-GPU path. These packages are only needed here, hence the local imports.
    from dask_cuda import LocalCUDACluster
    from dask.distributed import Client
    import cugraph.dask.comms.comms as Comms
    # Machine-specific settings: TCP protocol, a 25GB RMM pool, and GPUs 1-7 only.
    # NOTE(review): GPU 0 is left out — presumably reserved for the client
    # process; confirm and adjust for the machine this runs on.
    cluster = LocalCUDACluster(protocol='tcp',rmm_pool_size='25GB', CUDA_VISIBLE_DEVICES='1,2,3,4,5,6,7', jit_unspill=False)
    client = Client(cluster)
    # Enable cuDF spilling through the Dask client first ...
    client.run(enable_cudf_spilling)
    Comms.initialize(p2p=True)
    # ... and then in this notebook process as well.
    enable_cudf_spilling()
else:
    # Single-GPU path: enable spilling locally, then switch RMM to a pool
    # allocator with an initial size of 5e9 and a maximum size of 20e9.
    enable_cudf_spilling()
    rmm.reinitialize(pool_allocator=True, initial_pool_size=5e9, maximum_pool_size=20e9)
    # Left disabled, like the matching line under the TODO in the imports cell.
    #torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator)

2023-01-17 11:40:18,889 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize


## Create DGL Graph

In [4]:
import os

dataset_name = 'ogbn-products'
# The default is the original machine-specific location. Set the
# OGB_DATASET_ROOT environment variable to point at another directory without
# editing the notebook.
dataset_root = os.environ.get('OGB_DATASET_ROOT', '/raid/vjawa/gnn/')
dataset = DglNodePropPredDataset(name=dataset_name, root=dataset_root)

# Split indices; only train_idx is used by the benchmark cells in this notebook.
split_idx = dataset.get_idx_split()
train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

# The dataset yields a (graph, labels) pair; attach the labels as node data so
# the sampler can prefetch them under the "label" key.
g, label = dataset[0]
g.ndata['label'] = label

## Add self-loops to make testing easier: cuGraph currently fails on the
## isolated case described in the PR below.
## NOTE(review): the original note said "isolated edges" — presumably isolated
## nodes; confirm against the PR discussion.
## See comment: https://github.com/rapidsai/cugraph/pull/2997
g = dgl.add_self_loop(g)

## Create DataLoader

In [5]:
def create_dataloader(train_idx, g, device='cuda', fanouts=(20, 20, 20), batch_size=1024 * 20):
    """Build the neighbor-sampling ``DataLoader`` used by the benchmark cells.

    Parameters
    ----------
    train_idx : torch.Tensor or mapping of str -> torch.Tensor
        Seed node IDs. A tensor is moved to ``device`` and cast to
        ``g.idtype``; for a mapping the same is done to every value.
    g : graph object
        Handed to ``DataLoader`` unchanged; must expose an ``idtype`` attribute.
    device : str, default ``'cuda'``
        Device the seed IDs are moved to; also passed to ``DataLoader``.
    fanouts : sequence of int, default ``(20, 20, 20)``
        Per-layer fanouts given to ``NeighborSampler`` (one entry per layer).
    batch_size : int, default ``1024 * 20``
        Number of seed nodes per batch.

    Returns
    -------
    DataLoader
        Configured with ``shuffle=False``, ``drop_last=False``,
        ``num_workers=0`` and ``use_uva=False``, so batches come out in a
        fixed order and are produced in the calling process.
    """
    if isinstance(train_idx, torch.Tensor):
        train_idx = train_idx.to(device).to(g.idtype)
    else:
        train_idx = {k: v.to(device).to(g.idtype) for k, v in train_idx.items()}

    sampler = NeighborSampler(
        list(fanouts),  # always hand the sampler a list, as the original literal did
        prefetch_node_feats=["feat"],
        prefetch_labels=["label"],
    )
    return DataLoader(
        g,
        train_idx,
        sampler,
        device=device,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        num_workers=0,
        use_uva=False,
    )

## DGL CPU Benchmark

In [6]:
# Baseline: DGL sampling with the seed IDs and the loader both on the CPU.
dataloader = create_dataloader(train_idx, g, device='cpu')
dataloader_it = iter(dataloader)
# warm up: pull one batch before timing starts
input_nodes, output_nodes, blocks = next(dataloader_it) 
# Time 5 consecutive batches (1 repeat); every loop advances the same iterator.
%timeit -n 5 -r 1 input_nodes, output_nodes, blocks = next(dataloader_it) 
# Drop the loader and its iterator before the next benchmark cell runs.
del dataloader
del dataloader_it
torch.cuda.empty_cache()
# Last expression of the cell, so its return value is shown as the cell output.
gc.collect()



3.71 s ± 0 ns per loop (mean ± std. dev. of 1 run, 5 loops each)


41

## DGL GPU Benchmark 

In [7]:
# dataloader = create_dataloader(train_idx, dataset[0].to('cuda'), device='cuda')
# dataloader_it = iter(dataloader)
# # warmup
# input_nodes, output_nodes, blocks = next(dataloader_it) 
# %timeit -n 5 -r 1 input_nodes, output_nodes, blocks = next(dataloader_it) 

# del dataloader
# del dataloader_it
# del input_nodes, output_nodes, blocks
# torch.cuda.empty_cache()
# gc.collect()

## cuGraph Benchmark (features on host)

In [8]:
%time cugraph_g = cugraph_dgl.cugraph_storage_from_heterograph(g, single_gpu=single_gpu)
assert cugraph_g.ndata['feat']['_N'].device.type=='cpu'

dataloader = create_dataloader(train_idx, cugraph_g, device='cuda')
dataloader_it = iter(dataloader)
input_nodes, output_nodes, blocks = next(dataloader_it) 
%timeit -n 5 -r 1 input_nodes, output_nodes, blocks = next(dataloader_it) 

# del dataloader
# del dataloader_it
# del input_nodes, output_nodes, blocks
torch.cuda.empty_cache()
gc.collect()

CPU times: user 11.9 s, sys: 7.23 s, total: 19.2 s
Wall time: 18.3 s


  [b'\xad\xd1\xe3\x9c\x96\x83O\xb3\xba1\x86\x94\xb6\ ... =int32), False]
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good


2.42 s ± 0 ns per loop (mean ± std. dev. of 1 run, 5 loops each)


417

## cuGraph Benchmark (features on device)

In [9]:
# cugraph_g = cugraph_dgl.cugraph_storage_from_heterograph(g.to('cuda'), single_gpu=single_gpu)
# dataloader = create_dataloader(train_idx, cugraph_g, device='cuda')
# dataloader_it = iter(dataloader)
# # warmup
# input_nodes, output_nodes, blocks = next(dataloader_it) 
# %timeit -n 5 -r 1 input_nodes, output_nodes, blocks = next(dataloader_it) 

# del dataloader
# del dataloader_it
# del input_nodes, output_nodes, blocks
# torch.cuda.empty_cache()
# gc.collect()