## Benchmark cuGraph vs DGL on OGBN-Products: Data Loading

### Imports

In [1]:
import gc
import os
import rmm
import torch
import dgl
import numpy as np

#TODO: Enable in torch nightly
# torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator)

import cugraph_dgl
from dgl.data import AsNodePredDataset
from dgl.dataloading import DataLoader, NeighborSampler, MultiLayerFullNeighborSampler
from ogb.nodeproppred import DglNodePropPredDataset

  from .autonotebook import tqdm as notebook_tqdm


### Configure single-GPU vs. multi-GPU mode

In [2]:
# Execution-mode switch for the whole notebook:
#   False -> the setup cell starts a LocalCUDACluster with a Dask client,
#   True  -> the setup cell only re-initialises RMM in the current process.
# The same flag is later forwarded to cugraph_storage_from_heterograph().
single_gpu = False

In [3]:
def enable_cudf_spilling():
    """Switch on cuDF's ``spill`` option in whichever process calls this.

    ``cudf`` is imported inside the body rather than at module level; this
    notebook also hands the function to ``client.run(...)``, so presumably the
    local import keeps it self-contained when executed on Dask workers.
    """
    import cudf

    cudf.set_option("spill", True)

# Pick the execution mode. Both modes enable cuDF spilling in this process;
# the multi-GPU mode additionally enables it on every Dask worker.
if single_gpu:
    enable_cudf_spilling()
    # Pooled RMM allocator for the current process.
    rmm.reinitialize(
        pool_allocator=True,
        initial_pool_size=5e9,
        maximum_pool_size=20e9,
    )
    #torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator)
else:
    # Dask / cuGraph-comms imports are only needed for the multi-GPU path.
    from dask_cuda import LocalCUDACluster
    from dask.distributed import Client
    import cugraph.dask.comms.comms as Comms

    cluster = LocalCUDACluster(
        protocol='tcp',
        rmm_pool_size='25GB',
        CUDA_VISIBLE_DEVICES='1,2,3,4,5,6,7',
        jit_unspill=False,
    )
    client = Client(cluster)
    # Enable spilling on the workers first, then bring up cuGraph comms,
    # and finally enable spilling in the client process as well.
    client.run(enable_cudf_spilling)
    Comms.initialize(p2p=True)
    enable_cudf_spilling()

2023-01-17 08:59:41,128 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-qw_it2va', purging


## Create DGL Graph

In [4]:
# Dataset location. The default is the path this benchmark was originally run
# with; set the OGB_DATASET_ROOT environment variable to point the notebook at
# a different download/cache directory without editing the cell.
dataset_name = 'ogbn-products'
dataset_root = os.environ.get('OGB_DATASET_ROOT', '/raid/vjawa/gnn/')
dataset = DglNodePropPredDataset(name=dataset_name, root=dataset_root)

# Train / validation / test node-id splits that ship with the dataset.
split_idx = dataset.get_idx_split()
train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

# dataset[0] unpacks into (graph, labels); the labels are attached as node
# data so the sampler can prefetch them under the "label" key.
g, label = dataset[0]
g.ndata['label'] = label

In [5]:
## Add self-loops to make testing easier: cuGraph currently fails on
## isolated edges (presumably isolated nodes is meant -- TODO confirm).
## See comment: https://github.com/rapidsai/cugraph/pull/2997
##
## Existing self-loops are removed first so that every node ends up with
## exactly one self-loop and re-running this cell does not stack duplicates
## (dgl.add_self_loop adds loops regardless of whether they already exist).
g = dgl.add_self_loop(dgl.remove_self_loop(g))

## Create DataLoader

In [6]:
def create_dataloader(train_idx, g, device='cuda', fanouts=None, batch_size=1024):
    """Build a neighbor-sampling ``DataLoader`` over the given seed nodes.

    Parameters
    ----------
    train_idx
        Seed node ids. Must support ``.to(...)``; it is moved to ``device``
        and then cast to ``g.idtype``.
    g
        Graph object handed to ``DataLoader`` (in this notebook either a DGL
        graph or a cugraph_dgl graph storage). Must expose ``idtype``.
    device
        Device used for the seed ids and passed to ``DataLoader``.
    fanouts
        Per-layer fanouts for ``NeighborSampler``. ``None`` (default) keeps
        the original single-layer setting ``[20]``.
    batch_size
        Number of seed nodes per batch (default ``1024``).

    Returns
    -------
    The configured ``DataLoader`` (no shuffling, last partial batch kept,
    no worker processes, UVA disabled).
    """
    # None sentinel instead of a mutable default argument.
    if fanouts is None:
        fanouts = [20]

    seed_nodes = train_idx.to(device).to(g.idtype)
    sampler = NeighborSampler(
        list(fanouts),
        prefetch_node_feats=["feat"],
        prefetch_labels=["label"],
    )
    return DataLoader(
        g,
        seed_nodes,
        sampler,
        device=device,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        num_workers=0,
        use_uva=False,
    )

## DGL CPU Benchmark

In [7]:
dataloader = create_dataloader(train_idx, g, device='cpu')
dataloader_it = iter(dataloader)
# warm up
input_nodes, output_nodes, blocks = next(dataloader_it) 
%timeit -n 5 -r 1 input_nodes, output_nodes, blocks = next(dataloader_it) 
del dataloader
del dataloader_it
torch.cuda.empty_cache()
gc.collect()



4.93 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 5 loops each)


70

## DGL GPU Benchmark 

In [8]:
# NOTE: this benchmark is currently disabled (kept commented out).
# `dataset[0]` is a (graph, label) tuple, so the graph `g` prepared above is
# what has to be moved to the GPU if this cell is re-enabled.
# dataloader = create_dataloader(train_idx, g.to('cuda'), device='cuda')
# dataloader_it = iter(dataloader)
# # warmup
# input_nodes, output_nodes, blocks = next(dataloader_it)
# %timeit -n 5 -r 1 input_nodes, output_nodes, blocks = next(dataloader_it)

# del dataloader
# del dataloader_it
# del input_nodes, output_nodes, blocks
# torch.cuda.empty_cache()
# gc.collect()

## cuGraph Benchmark (features on host)

In [9]:
#client.has_what()
# Convert the DGL graph into a cugraph_dgl graph storage; %time reports how
# long the conversion itself takes.
%time cugraph_g = cugraph_dgl.cugraph_storage_from_heterograph(g, single_gpu=single_gpu)
# Premise of this section: the 'feat' node features (node type '_N') must be
# held on the CPU, i.e. on the host rather than on the device.
assert cugraph_g.ndata['feat']['_N'].device.type=='cpu'

CPU times: user 11.3 s, sys: 5.44 s, total: 16.8 s
Wall time: 16.3 s


In [10]:
# `client` only exists in multi-GPU mode (single_gpu=False); in single-GPU
# mode there is no Dask cluster to inspect, so the cell shows nothing.
client.has_what() if not single_gpu else None

Worker,Key count,Key list
tcp://127.0.0.1:36623,1,"Expand  ('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 0)"
tcp://127.0.0.1:41013,1,"Expand  ('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 6)"
tcp://127.0.0.1:42541,1,"Expand  ('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 1)"
tcp://127.0.0.1:43349,1,"Expand  ('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 4)"
tcp://127.0.0.1:44181,1,"Expand  ('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 2)"
tcp://127.0.0.1:44459,1,"Expand  ('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 5)"
tcp://127.0.0.1:46533,1,"Expand  ('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 3)"

0
"('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 0)"

0
"('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 6)"

0
"('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 1)"

0
"('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 4)"

0
"('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 2)"

0
"('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 5)"

0
"('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 3)"


In [11]:
# %time cugraph_g = cugraph_dgl.cugraph_storage_from_heterograph(g, single_gpu=single_gpu)
# assert cugraph_g.ndata['feat']['_N'].device.type=='cpu'

# Sample from the cuGraph-backed storage (`cugraph_g`, built in an earlier
# cell) with batches requested on the GPU.
dataloader = create_dataloader(train_idx, cugraph_g, device='cuda')
dataloader_it = iter(dataloader)
# The first batch is timed on its own so that its one-off cost is reported
# separately from the steady-state number below.
%time input_nodes, output_nodes, blocks = next(dataloader_it) 
# Steady state: 5 loops, single run, one further batch per loop.
%timeit -n 5 -r 1 input_nodes, output_nodes, blocks = next(dataloader_it) 

# Drop the loader, its iterator and the last batch, then release cached GPU
# memory and run the garbage collector.
del dataloader
del dataloader_it
del input_nodes, output_nodes, blocks
torch.cuda.empty_cache()
gc.collect()

CPU times: user 2.67 s, sys: 3.13 s, total: 5.8 s
Wall time: 15.8 s
137 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 5 loops each)


610

In [12]:
# `client` only exists in multi-GPU mode (single_gpu=False); in single-GPU
# mode there is no Dask cluster to inspect, so the cell shows nothing.
client.has_what() if not single_gpu else None

Worker,Key count,Key list
tcp://127.0.0.1:36623,2,"Expand  ('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 0)  _make_plc_graph-8525f8e9c60871d7320248fc011defd0"
tcp://127.0.0.1:41013,2,"Expand  ('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 6)  _make_plc_graph-0a2e064d3f0c457bf971604d747a4dce"
tcp://127.0.0.1:42541,3,"Expand  ('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 1)  ('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 2)  _make_plc_graph-88ab626224796b84931f96f3162def04"
tcp://127.0.0.1:43349,1,Expand  _make_plc_graph-45ecd9aaf5be3aa20f5c063b203a3369
tcp://127.0.0.1:44181,2,"Expand  ('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 4)  _make_plc_graph-aef0a1694a27eda94a8b0fc714aa52ca"
tcp://127.0.0.1:44459,2,"Expand  ('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 5)  _make_plc_graph-bd13ca4486ebaaf30e4a658216e372e1"
tcp://127.0.0.1:46533,2,"Expand  ('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 3)  _make_plc_graph-33b6785475bde1e09dd63bd73d1a3e33"

0
"('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 0)"
_make_plc_graph-8525f8e9c60871d7320248fc011defd0

0
"('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 6)"
_make_plc_graph-0a2e064d3f0c457bf971604d747a4dce

0
"('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 1)"
"('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 2)"
_make_plc_graph-88ab626224796b84931f96f3162def04

0
_make_plc_graph-45ecd9aaf5be3aa20f5c063b203a3369

0
"('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 4)"
_make_plc_graph-aef0a1694a27eda94a8b0fc714aa52ca

0
"('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 5)"
_make_plc_graph-bd13ca4486ebaaf30e4a658216e372e1

0
"('from_pandas-96ef48fc1ba5fceb27a2429b9a631fc6', 3)"
_make_plc_graph-33b6785475bde1e09dd63bd73d1a3e33


## cuGraph Benchmark (features on device)

In [13]:
# NOTE(review): this benchmark is currently disabled (kept commented out).
# It mirrors the host-feature benchmark, but builds the cuGraph storage from
# `g.to('cuda')` instead of `g`.
# cugraph_g = cugraph_dgl.cugraph_storage_from_heterograph(g.to('cuda'), single_gpu=single_gpu)
# dataloader = create_dataloader(train_idx, cugraph_g, device='cuda')
# dataloader_it = iter(dataloader)
# # warmup
# input_nodes, output_nodes, blocks = next(dataloader_it) 
# %timeit -n 5 -r 1 input_nodes, output_nodes, blocks = next(dataloader_it) 

# del dataloader
# del dataloader_it
# del input_nodes, output_nodes, blocks
# torch.cuda.empty_cache()
# gc.collect()