# Testing Notebook for cugraph DGL vs DGL Upstream

In [1]:
import os
# Expose only GPU index 4 to this process. This is set before cudf/rmm/torch are
# imported below so the CUDA libraries see the restricted device list.
os.environ["CUDA_VISIBLE_DEVICES"]="4"
import cudf
import rmm
import torch
from rmm.allocators.torch import rmm_torch_allocator
# Re-initialize RMM, requesting an initial pool size of 15e9 bytes.
# NOTE(review): rmm.reinitialize documents initial_pool_size as applying when
# pool_allocator=True, and pool_allocator is not passed here -- confirm whether a
# pool is actually created by this call.
rmm.reinitialize(initial_pool_size=15e9)
#Switch to async pool in case of memory issues due to fragmentation of the pool
#rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource(initial_pool_size=15e9))
# Route PyTorch's CUDA allocations through the RMM allocator imported above.
torch.cuda.memory.change_current_allocator(rmm_torch_allocator)

In [2]:
# Forwarded as the `single_gpu` argument of
# cugraph_dgl.cugraph_storage_from_heterograph when the cuGraph graph is built.
single_gpu = True

In [3]:
def load_dgl_dataset(dataset_name='ogbn-products', dataset_root='/raid/vjawa/gnn/'):
    """Load an OGB node-property-prediction dataset as a CPU DGL graph.

    Parameters
    ----------
    dataset_name : str
        Name passed to ``DglNodePropPredDataset`` (default ``'ogbn-products'``).
    dataset_root : str
        Directory passed as ``root`` to ``DglNodePropPredDataset``. The default
        is the path that used to be hard-coded, so existing calls are unchanged;
        override it to run the notebook on another machine.

    Returns
    -------
    tuple
        ``(g, train_idx)`` where ``g`` is the dataset's graph with the labels
        stored under ``g.ndata['label']``, self-loops added, and moved to
        ``'cpu'``, and ``train_idx`` is the ``"train"`` entry of the dataset's
        index split.
    """
    # Imported lazily so ogb is only needed when a dataset is actually loaded.
    from ogb.nodeproppred import DglNodePropPredDataset

    dataset = DglNodePropPredDataset(name=dataset_name, root=dataset_root)
    # Only the training split is returned; valid/test splits are not needed here.
    train_idx = dataset.get_idx_split()["train"]
    g, label = dataset[0]
    g.ndata['label'] = label
    g = g.add_self_loop().to('cpu')
    return g, train_idx

# cuGraph DGL DataLoader

In [4]:
import cugraph_dgl
import tempfile

In [5]:
# Delete the directory that the cuGraph-DGL DataLoader is given as its
# sampling_output_dir, so no files from an earlier run are left in it.
!rm -rf "/raid/vjawa/obgn_products_sampling/"

In [6]:
# Load the DGL graph, then wrap it in cuGraph-backed storage.
g, train_idx = load_dgl_dataset()
g = cugraph_dgl.cugraph_storage_from_heterograph(g, single_gpu=single_gpu)

# Sampling configuration: two layers, 25 neighbors each, 1024 seeds per batch.
batch_size = 1024
fanout_vals = [25, 25]
sampler = cugraph_dgl.dataloading.NeighborSampler(fanout_vals)

# Keyword options for the cuGraph-DGL DataLoader, collected in one place.
cugraph_loader_options = dict(
    # Path to save sampling results to; change to the fastest IO path available.
    sampling_output_dir="/raid/vjawa/obgn_products_sampling/",
    # The device argument must be GPU.
    device=torch.device('cuda'),
    # Number of workers must be 0.
    num_workers=0,
    batch_size=batch_size,
    batches_per_partition=50,
    seeds_per_call=50*batch_size,
    drop_last=False,
    shuffle=False,
)

dataloader = cugraph_dgl.dataloading.DataLoader(
    g,
    train_idx.to('cuda'),  # train_nid must be on GPU.
    sampler,
    **cugraph_loader_options,
)

In [7]:
%%timeit
# Iterate the dataloader once, recording per batch how many input and output
# nodes it produced.
batch_stats = {
    batch_id: {'input_nodes': len(input_nodes), 'output_nodes': len(output_nodes)}
    for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader)
}

7.25 s ± 916 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
# Unbind the cuGraph-DGL dataloader and graph names (dataloader first, then g).
del dataloader, g

# Pure DGL DataLoader

In [9]:
from dgl.dataloading import DataLoader, NeighborSampler
import dgl

In [10]:
# Reload the graph; load_dgl_dataset returns it after calling g.to('cpu').
g, train_idx = load_dgl_dataset()

# Same sampling configuration as the cuGraph-DGL run: fanout [25, 25], batch 1024.
batch_size = 1024
fanout_vals = [25, 25]
sampler = dgl.dataloading.MultiLayerNeighborSampler(fanout_vals)

# Keyword options for the upstream DGL DataLoader, collected in one place.
dgl_loader_options = dict(
    device=torch.device('cuda'),  # `device` is set to the GPU
    num_workers=0,                # no worker subprocesses
    use_uva=False,
    batch_size=batch_size,
    drop_last=False,
    shuffle=False,
)

dataloader = dgl.dataloading.DataLoader(
    g,
    train_idx.to(g.device),  # seed node IDs are moved to the graph's own device
    sampler,
    **dgl_loader_options,
)

In [11]:
%%timeit
# Iterate the DGL dataloader once, recording per batch how many input and
# output nodes it produced.
dgl_batch_stats = {
    batch_id: {'input_nodes': len(input_nodes), 'output_nodes': len(output_nodes)}
    for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader)
}

4.22 s ± 345 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
# Unbind the DGL dataloader and graph names (dataloader first, then g).
del dataloader, g