# ogbn-papers100M benchmarks

In [1]:
# Benchmark configuration flags read by the cells below.
# single_gpu: when True, the single-GPU setup cell runs and the Dask cluster
#             cells are skipped; when False, the multi-GPU Dask cells run.
single_gpu = False
# use_cugraph: when True, the DGL graph is converted with
#              cugraph_storage_from_heterograph before sampling.
use_cugraph = True

In [2]:
# Single-GPU mode only: pin the process to one device and, when cuGraph is
# in use, switch RMM to managed memory.
if single_gpu:
    import os
    import warnings
    # Expose only the GPU with index 1 to this process.
    os.environ["CUDA_VISIBLE_DEVICES"]="1"
    if use_cugraph:
        # Per the message below, this configuration is known to error out;
        # the multi-GPU path is the recommended one.
        warnings.warn("Error on 1 GPU even with managed memory, Switch on MULTI-GPU")
        import rmm
        # Re-initialize RMM with managed memory enabled.
        rmm.reinitialize(managed_memory=True)

In [3]:
# `time` is used by the sampling benchmark in every mode, so it must be
# imported unconditionally; previously it was only imported in multi-GPU
# mode, which made the benchmark raise NameError when single_gpu was True.
import time

if not single_gpu:
    # Multi-GPU mode only: Dask cluster/client classes and the cuGraph
    # comms module used to start the cluster in the next section.
    from dask_cuda import LocalCUDACluster
    from dask.distributed import Client
    import cugraph.dask.comms.comms as Comms

## Start Dask Cluster

In [4]:
if not single_gpu:
    # Multi-GPU mode: start a local CUDA cluster on GPUs 1-7, attach a
    # client to it, then initialize the cuGraph comms with p2p enabled.
    cluster = LocalCUDACluster(
        CUDA_VISIBLE_DEVICES='1,2,3,4,5,6,7',
        rmm_managed_memory=True,
        enable_tcp_over_ucx=False,
        pre_import=['cugraph','torch'],
    )
    client = Client(cluster)
    Comms.initialize(p2p=True)

2022-12-04 11:01:24,596 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize


# Import DGL-specific libraries

In [5]:
from ogb.nodeproppred import DglNodePropPredDataset
from cugraph_dgl.convert import cugraph_storage_from_heterograph
import cudf

from dgl.data import AsNodePredDataset
import dgl.backend as F
from dgl.dataloading import DataLoader, NeighborSampler
import torch

  from .autonotebook import tqdm as notebook_tqdm


# Create cugraph_dgl GraphStorage

In [6]:
%%time 
# Load ogbn-papers100M through OGB's DGL node-property-prediction loader.
# NOTE(review): `root` is an absolute, machine-specific path; adjust it for
# the environment the notebook runs in.
dataset = DglNodePropPredDataset("ogbn-papers100M", root='/datasets/vjawa/gnn/')

CPU times: user 2.69 s, sys: 45.2 s, total: 47.9 s
Wall time: 45 s


In [7]:
%%time 

# The first dataset item unpacks into the graph and its labels.
gs,labels = dataset[0]
# Presumably casts the graph's index dtype to 32-bit ints; confirm against the DGL docs.
gs = gs.int()
## Drop all node and edge data from the graph to allow scale testing.
## TODO: find a way to run the benchmark without discarding this data.
gs.ndata.clear()
gs.edata.clear()

CPU times: user 4.59 s, sys: 5.35 s, total: 9.94 s
Wall time: 9.07 s


In [8]:
# When cuGraph is enabled, replace the DGL graph with the object returned by
# cugraph_storage_from_heterograph, forwarding the notebook-level single_gpu flag.
if use_cugraph:
    gs = cugraph_storage_from_heterograph(gs, single_gpu=single_gpu)

In [9]:
### Get train IDs
# The parquet file read below was generated once with the following snippet
# (unique '_DST_' values of the graph's edge data):
# train_ids = gs.graphstore.gdata.get_edge_data()['_DST_'].unique()
# if hasattr(train_ids, 'compute'):
#     train_ids=train_ids.compute()
# train_ids.to_frame().to_parquet('ogbn_papers100M_train_nids/train_nids.parquet')
# Read the precomputed '_DST_' column; the path is relative to the working directory.
train_ids = cudf.read_parquet('ogbn_papers100M_train_nids/train_nids.parquet')['_DST_']
# Convert the column's values to a torch tensor and move it to the graph's device.
train_ids = torch.as_tensor(train_ids.values).to(gs.device)

## Benchmark Sampling

In [10]:
def setup_loader(dataloader):
    """Pull a single batch from ``dataloader`` and discard it.

    Used as a warm-up: iteration starts, the first
    ``(input_nodes, output_nodes, blocks)`` item is unpacked, and the
    function returns immediately. An empty loader is a no-op.

    Returns
    -------
    None
    """
    for _input_nodes, _output_nodes, _blocks in dataloader:
        # One batch is all that is needed; stop right away.
        return
    return

def benchmark_dataloader(g, train_ids, batch_size, fanouts=(10, 10), num_seeds=2_000_000):
    """Time one full pass of neighbor sampling over a slice of ``train_ids``.

    Builds a ``NeighborSampler``/``DataLoader`` pair on ``g``, pulls one
    untimed warm-up batch through ``setup_loader``, then measures a complete
    iteration over the loader and prints a one-line summary.

    Parameters
    ----------
    g
        Graph object handed to ``DataLoader``; its type is included in the
        printed summary.
    train_ids
        Sliceable collection of seed node IDs. Only the first ``num_seeds``
        entries are sampled.
    batch_size : int
        Number of seed nodes per batch.
    fanouts : sequence of int, optional
        Per-layer fanout passed to ``NeighborSampler`` (default ``(10, 10)``,
        i.e. two layers with fanout 10 each).
    num_seeds : int, optional
        Upper bound on how many entries of ``train_ids`` are used
        (default 2,000,000).

    Returns
    -------
    None
        The result is reported via ``print`` only.
    """
    sampler = NeighborSampler(
            list(fanouts),  # fanout for [layer-0, layer-1, ...]
            #prefetch_node_feats=["feat"],
            #prefetch_labels=["label"],
        )
    dataloader = DataLoader(
        g,
        train_ids[:num_seeds],
        sampler,
        device='cuda',
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        num_workers=0,
    )

    # Untimed warm-up so one-time start-up work is excluded from the measurement.
    setup_loader(dataloader)
    n_nodes = len(dataloader.indices)

    # perf_counter is monotonic and intended for measuring short durations.
    start = time.perf_counter()
    for _batch in dataloader:
        # Batches are only drained; their contents are not inspected.
        pass
    elapsed = time.perf_counter() - start

    # Report the type of the graph actually passed in, not a notebook global.
    print(f"Completed sampling {n_nodes}  nodes with {batch_size} in  {elapsed} s time on graph type {type(g)}")
    return

In [11]:
# Run the sampling benchmark once per batch size, doubling from 10k to 320k
# seed nodes per batch.
for batch_size in [10_000,20_000,40_000,80_000,160_000,320_000]:
    benchmark_dataloader(gs, train_ids, batch_size=batch_size)

Completed sampling 2000000  nodes with 10000 in  100.52753567695618 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
Completed sampling 2000000  nodes with 20000 in  51.80714750289917 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
Completed sampling 2000000  nodes with 40000 in  31.575586080551147 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
Completed sampling 2000000  nodes with 80000 in  21.818739652633667 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
Completed sampling 2000000  nodes with 160000 in  13.731544017791748 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>


  [b'\xf5&|\xad\xf2\xd1C\x8b\xb8r\x0fWtu/N', '_make_ ... =int32), False]
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good


Completed sampling 2000000  nodes with 320000 in  9.879771947860718 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>


In [6]:
# CPU (plain DGL graph)
# Completed sampling 2000000  nodes with 10000 in  12.739480972290039 s time on graph type <class 'dgl.heterograph.DGLHeteroGraph'>
# Completed sampling 2000000  nodes with 20000 in  11.33974838256836 s time on graph type <class 'dgl.heterograph.DGLHeteroGraph'>
# Completed sampling 2000000  nodes with 40000 in  11.867838621139526 s time on graph type <class 'dgl.heterograph.DGLHeteroGraph'>
# Completed sampling 2000000  nodes with 80000 in  11.385798215866089 s time on graph type <class 'dgl.heterograph.DGLHeteroGraph'>
# Completed sampling 2000000  nodes with 160000 in  12.451390027999878 s time on graph type <class 'dgl.heterograph.DGLHeteroGraph'>
# Completed sampling 2000000  nodes with 320000 in  13.918638706207275 s time on graph type <class 'dgl.heterograph.DGLHeteroGraph'>


# DGL on GPU: fails, so no timings are recorded

# GPU (7 GPU and 1 training client)
# Completed sampling 2000000  nodes with 10000 in  101.24437046051025 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
# Completed sampling 2000000  nodes with 20000 in  58.943265199661255 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
# Completed sampling 2000000  nodes with 40000 in  44.02875232696533 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
# Completed sampling 2000000  nodes with 80000 in  25.86843991279602 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
# Completed sampling 2000000  nodes with 160000 in  16.176891326904297 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
# Completed sampling 2000000  nodes with 320000 in  11.451149225234985 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>

In [12]:
# Print the notebook's working directory; the train-ID parquet path used
# earlier in this notebook is relative to it.
!pwd

/datasets/vjawa/gnn
