# ogbn-papers100M benchmarks

In [1]:
# Notebook-wide configuration: later cells branch on the two flags set here.
# NOTE(review): RAPIDS_NO_INITIALIZE presumably defers RAPIDS' CUDA initialisation
# at import time -- confirm against the cuDF documentation.
%env RAPIDS_NO_INITIALIZE=1
# False -> the Dask multi-GPU cells run; True -> the single-GPU setup cell runs instead.
single_gpu = False
# True -> the DGL graph is converted to cugraph_dgl storage before sampling.
use_cugraph = True
# Used for wall-clock timing inside benchmark_dataloader.
import time

env: RAPIDS_NO_INITIALIZE=1


In [2]:
# Single-GPU path only; the whole cell is a no-op while single_gpu is False.
if single_gpu:
    # NOTE(review): `os` is not referenced in any cell visible in this notebook -- confirm it is still needed.
    import os
    import warnings
    if use_cugraph:
        # Enable when Pytorch+RMM lands
        # Per the warning text, this configuration is known to error on one GPU;
        # the multi-GPU path is the supported one.
        warnings.warn("Error on 1 GPU even with managed memory, Switch on MULTI-GPU")
        import rmm
        # Re-initialise RMM with managed memory requested.
        rmm.reinitialize(managed_memory=True)

In [3]:
# Multi-GPU path only: import the Dask-CUDA cluster, the Dask client and the
# cugraph comms module that the cluster start-up cell uses.
if not single_gpu:
    from dask_cuda import LocalCUDACluster
    from dask.distributed import Client
    import cugraph.dask.comms.comms as Comms

## Start Dask Cluster

In [4]:
if not single_gpu:
    # Multi-GPU path: start a local Dask-CUDA cluster over UCX, connect a client
    # to it and initialise the cugraph comms layer.
    # NOTE(review): the device list and pool size are specific to the machine this
    # was recorded on -- adjust them for other hardware.
    cluster = LocalCUDACluster(
        protocol='ucx',
        rmm_pool_size='25GB',
        CUDA_VISIBLE_DEVICES='1,2,3,4,5,6,7,8,9,10,11,12',
        # jit_unspill=False,
    )
    client = Client(cluster)
    Comms.initialize(p2p=True)
if use_cugraph:
    import rmm
    # Re-initialise RMM in this process with a pool allocator. On the single-GPU
    # path an earlier cell requested managed memory; calling reinitialize without
    # managed_memory would silently drop that request, so carry it through here.
    # With single_gpu False this is identical to reinitialize(pool_allocator=True).
    rmm.reinitialize(pool_allocator=True, managed_memory=single_gpu)

[1671324589.007996] [exp02:82923:0]          parser.c:1989 UCX  WARN  unused environment variable: UCX_MEMTYPE_CACHE (maybe: UCX_MEMTYPE_CACHE?)
[1671324589.502557] [exp02:82923:0]            sock.c:470  UCX  ERROR bind(fd=260 addr=0.0.0.0:53867) failed: Address already in use


2022-12-17 16:49:51,987 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space-10123/worker-096ke4wg', purging
2022-12-17 16:49:51,987 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space-10123/worker-vrj5catg', purging
2022-12-17 16:49:51,988 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space-10123/worker-ts_7mw3u', purging
2022-12-17 16:49:51,988 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space-10123/worker-te7fs10k', purging
2022-12-17 16:49:51,988 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space-10123/worker-7yc7no2h', purging
2022-12-17 16:49:51,988 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space-10123/worker-0an9buyk', purging
2022-12-17 16:49:51,988 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-wo

# Import DGL-specific libraries

In [5]:
from ogb.nodeproppred import DglNodePropPredDataset
from cugraph_dgl.convert import cugraph_storage_from_heterograph
import cudf

from dgl.data import AsNodePredDataset
import dgl.backend as F
from dgl.dataloading import DataLoader, NeighborSampler
import torch

  from .autonotebook import tqdm as notebook_tqdm



# Create cugraph_dgl GraphStorage

In [6]:
%%time 
# Load ogbn-papers100M through OGB's DGL node-property-prediction wrapper.
# NOTE(review): `root` is an absolute, user-specific path; point it at your own
# dataset directory before re-running.
dataset = DglNodePropPredDataset("ogbn-papers100M", root='/raid/vjawa/gnn/')

CPU times: user 6.97 s, sys: 49.1 s, total: 56 s
Wall time: 46.4 s


In [7]:
%%time 

# The first dataset entry is unpacked into the graph and its labels.
gs,labels = dataset[0]
# NOTE(review): presumably casts the graph's index dtype to int32 -- confirm against the DGL docs.
gs = gs.int()
## Clear the graph's node and edge data to allow scale testing.
## TODO: find a way to run the benchmark without clearing this data.
gs.ndata.clear()
gs.edata.clear()

CPU times: user 7.05 s, sys: 8.04 s, total: 15.1 s
Wall time: 12.8 s


In [8]:
%%time
# When use_cugraph is set, replace the DGL graph with the object returned by
# cugraph_storage_from_heterograph; otherwise gs stays the DGL graph from above.
# NOTE: gs is rebound in place, so re-running this cell would feed the already
# converted object back into the converter.
if use_cugraph:
    gs = cugraph_storage_from_heterograph(gs, single_gpu=single_gpu)

CPU times: user 57.9 s, sys: 19.6 s, total: 1min 17s
Wall time: 1min 18s


In [9]:
### Get Train IDS
# The commented-out lines below appear to be how train_nids.parquet was produced
# (the unique '_DST_' values of the graph's edge data) -- NOTE(review): confirm
# before relying on them to regenerate the file.
# train_ids = gs.graphstore.gdata.get_edge_data()['_DST_'].unique()
# if hasattr(train_ids, 'compute'):
#     train_ids=train_ids.compute()
# train_ids.to_frame().to_parquet('train_nids.parquet')
# Reads the '_DST_' column from train_nids.parquet in the current working
# directory; the file must already exist for this cell to run.
train_ids = cudf.read_parquet('train_nids.parquet')['_DST_']
# Wrap the column's values as a torch tensor and move it to the device reported by gs.
train_ids = torch.as_tensor(train_ids.values).to(gs.device)

In [10]:
# Display the number of seed ids loaded above, with thousands separators.
f"{len(train_ids):,}"

'89,566,794'

## Benchmark Sampling

In [15]:
def setup_loader(dataloader):
    """Pull a single batch from ``dataloader`` and discard it.

    The first item is unpacked as ``(input_nodes, output_nodes, blocks)``, so an
    item of any other length raises ``ValueError``. An empty loader is accepted
    and nothing happens. Always returns ``None``.
    """
    batches = iter(dataloader)
    try:
        first_batch = next(batches)
    except StopIteration:
        # Nothing to fetch from an empty loader.
        return
    # Same three-way unpacking a consumer of the loader would perform.
    input_nodes, output_nodes, blocks = first_batch
    return

def benchmark_dataloader(g, train_ids, batch_size, fanouts=(10, 10), max_seeds=2_000_000):
    """Time one full pass of neighbor sampling over ``g`` and print the result.

    Parameters
    ----------
    g
        Graph object handed to ``DataLoader``.
    train_ids
        Sliceable sequence of seed node ids.
    batch_size : int
        Number of seed nodes per batch.
    fanouts : sequence of int, optional
        Per-layer fanout passed to ``NeighborSampler``; defaults to ``(10, 10)``.
    max_seeds : int or None, optional
        Only the first ``max_seeds`` entries of ``train_ids`` are sampled;
        ``None`` uses all of them. Defaults to 2,000,000.

    Returns
    -------
    None
        The timing is reported with ``print`` only.
    """
    sampler = NeighborSampler(
        list(fanouts),  # fanout for [layer-0, layer-1, ...]
        #prefetch_node_feats=["feat"],
        #prefetch_labels=["label"],
    )
    dataloader = DataLoader(
        g,
        train_ids[:max_seeds],
        sampler,
        device='cuda',
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        num_workers=0,
    )

    # Fetch one batch before starting the clock so that work done on the first
    # iteration is not counted in the measurement.
    setup_loader(dataloader)
    n_nodes = len(dataloader.indices)

    # perf_counter is monotonic, so the interval cannot be skewed by wall-clock adjustments.
    # NOTE(review): there is no explicit CUDA synchronisation before the clock is
    # read -- confirm the loader has finished its GPU work when a batch is yielded.
    start = time.perf_counter()
    for input_nodes, output_nodes, blocks in dataloader:
        pass
    elapsed = time.perf_counter() - start

    # Report the type of the graph that was actually benchmarked (the `g`
    # argument), not whatever notebook-level `gs` happens to hold.
    print(f"Completed sampling {n_nodes:,}  nodes with batch_size {batch_size:,} in  {elapsed} s time on graph type {type(g)}")
    return

In [16]:
#dir(gs.graphstore)

In [17]:
# Run the sampling benchmark on the same graph and seed ids for each batch size.
# NOTE(review): 80_000 appears twice, so that configuration is measured twice --
# confirm whether the repeat is intentional.
for batch_size in [10_000, 20_000, 40_000, 80_000, 80_000, 160_000, 320_000]:
    benchmark_dataloader(gs, train_ids, batch_size=batch_size)

Completed sampling 2,000,000  nodes with batch_size 10,000 in  52.02749013900757 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
Completed sampling 2,000,000  nodes with batch_size 20,000 in  25.835906982421875 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
Completed sampling 2,000,000  nodes with batch_size 40,000 in  13.072922468185425 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
Completed sampling 2,000,000  nodes with batch_size 80,000 in  7.277414560317993 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
Completed sampling 2,000,000  nodes with batch_size 80,000 in  6.9900617599487305 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
Completed sampling 2,000,000  nodes with batch_size 160,000 in  3.862619161605835 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
Completed sampling 2,000,000  nodes with batch_size 320,000 in  2.

# DGL CPU
```text
Completed sampling 2,000,000  nodes with 10,000 in  12.739480972290039 s time on graph type <class 'dgl.heterograph.DGLHeteroGraph'>
Completed sampling 2,000,000  nodes with 20,000 in  11.33974838256836 s time on graph type <class 'dgl.heterograph.DGLHeteroGraph'>
Completed sampling 2,000,000  nodes with 40,000 in  11.867838621139526 s time on graph type <class 'dgl.heterograph.DGLHeteroGraph'>
Completed sampling 2,000,000  nodes with 80,000 in  11.385798215866089 s time on graph type <class 'dgl.heterograph.DGLHeteroGraph'>
Completed sampling 2,000,000  nodes with 160,000 in  12.451390027999878 s time on graph type <class 'dgl.heterograph.DGLHeteroGraph'>
Completed sampling 2,000,000  nodes with 320,000 in  13.918638706207275 s time on graph type <class 'dgl.heterograph.DGLHeteroGraph'>
```

# DGL GPU: fails (no timings recorded)

## cugraph_dgl multi-GPU (11 GPUs and 1 training client)
```text
Completed sampling 2,000,000  nodes with batch_size 10,000 in  52.02749013900757 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
Completed sampling 2,000,000  nodes with batch_size 20,000 in  25.835906982421875 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
Completed sampling 2,000,000  nodes with batch_size 40,000 in  13.072922468185425 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
Completed sampling 2,000,000  nodes with batch_size 80,000 in  7.277414560317993 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
Completed sampling 2,000,000  nodes with batch_size 80,000 in  6.9900617599487305 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
Completed sampling 2,000,000  nodes with batch_size 160,000 in  3.862619161605835 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
Completed sampling 2,000,000  nodes with batch_size 320,000 in  2.3878278732299805 s time on graph type <class 'cugraph_dgl.cugraph_storage.CuGraphStorage'>
```

In [19]:
# Show the notebook's working directory; relative paths such as
# 'train_nids.parquet' are resolved against it.
!pwd

/home/nfs/vjawa/dgl/cugraph/benchmarks/cugraph_dgl/sampling/notebooks
