# Testing Notebook: cuGraph DGL DataLoader vs. Upstream DGL DataLoader

In [1]:
import os
# Expose only GPU index 4 to this process. The variable is set before the
# GPU libraries below are imported.
# NOTE(review): the index is machine-specific — adjust for your host.
os.environ["CUDA_VISIBLE_DEVICES"]="4"
import rmm  
import torch
# Optional (currently disabled): re-initialise RMM with a pool allocator and
# make PyTorch allocate CUDA memory through RMM.
#rmm.reinitialize(pool_allocator = True, initial_pool_size = 15e9, maximum_pool_size=25e9)
#torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator)
import cudf

In [2]:
# Forwarded as `single_gpu=` to cugraph_dgl.cugraph_storage_from_heterograph
# when the cuGraph-backed graph store is created.
single_gpu = True

In [3]:
def load_dgl_dataset(dataset_name='ogbn-products', dataset_root='/raid/vjawa/gnn/'):
    """Load an OGB node-property-prediction dataset as a CPU DGL graph.

    Parameters
    ----------
    dataset_name : str
        Name passed to ``DglNodePropPredDataset`` (default ``'ogbn-products'``).
    dataset_root : str
        Directory passed as ``root`` to ``DglNodePropPredDataset``. The default
        is the path that used to be hard-coded in this function, so existing
        calls behave exactly as before; pass another directory to run the
        notebook on a different machine.

    Returns
    -------
    tuple
        ``(g, train_idx)`` where ``g`` is ``dataset[0]``'s graph with the
        labels stored in ``g.ndata['label']``, self-loops added, and moved to
        ``'cpu'``; ``train_idx`` is the ``"train"`` entry of the dataset's
        index split.
    """
    # Imported inside the function so ogb is only needed when this is called.
    from ogb.nodeproppred import DglNodePropPredDataset

    dataset = DglNodePropPredDataset(name=dataset_name, root=dataset_root)
    # Only the training split is returned, so the other splits are not read.
    train_idx = dataset.get_idx_split()["train"]

    g, label = dataset[0]
    g.ndata['label'] = label
    g = g.add_self_loop()
    g = g.to('cpu')
    return g, train_idx

# cuGraph DGL DataLoader

In [4]:
import cugraph_dgl
import tempfile

In [5]:
# Sampling configuration for the cuGraph-DGL run.
fanout_vals = [25, 25]
batch_size = 1024

# Load the DGL graph, then wrap it in a cuGraph-backed graph store.
g, train_idx = load_dgl_dataset()
g = cugraph_dgl.cugraph_storage_from_heterograph(g, single_gpu=single_gpu)

# Scratch directory that the loader saves its sampling results to.
temp_dir = tempfile.TemporaryDirectory()

sampler = cugraph_dgl.dataloading.NeighborSampler(fanout_vals)

# Loader constraints noted by the original author: the seed ids and the
# `device` argument must be on the GPU, and `num_workers` must be 0.
dataloader = cugraph_dgl.dataloading.DataLoader(
    g,
    train_idx.to('cuda'),
    sampler,
    sampling_output_dir=temp_dir.name,
    device=torch.device('cuda'),
    num_workers=0,
    batch_size=batch_size,
    batches_per_partition=20,
    seeds_per_call=50_000,
    drop_last=False,
    shuffle=False,
)

In [6]:
%%time

batch_stats = {}
for batch_id,(input_nodes, output_nodes, blocks) in enumerate(dataloader):
    batch_stats[batch_id]={'input_nodes':len(input_nodes),'output_nodes':len(output_nodes)}

CPU times: user 7.97 s, sys: 8.78 s, total: 16.8 s
Wall time: 18.3 s


In [7]:
# Drop the cuGraph-DGL loader and graph store before the upstream DGL run.
del dataloader
del g
# `temp_dir` stays referenced for the rest of the session, so the sampling
# results saved under it would otherwise remain on disk; remove them now.
temp_dir.cleanup()

# Pure DGL DataLoader

In [10]:
from dgl.dataloading import DataLoader, NeighborSampler
import dgl

In [11]:
# Sampling configuration for the upstream DGL run (same values as the
# cuGraph-DGL run above).
fanout_vals = [25, 25]
batch_size = 1024

# Reload the graph; load_dgl_dataset returns it on the CPU.
g, train_idx = load_dgl_dataset()
sampler = dgl.dataloading.MultiLayerNeighborSampler(fanout_vals)

dataloader = dgl.dataloading.DataLoader(
    g,
    train_idx.to(g.device),  # seed ids are placed on the graph's own device
    sampler,
    device=torch.device('cuda'),  # the loader is given the CUDA device
    num_workers=0,
    use_uva=False,
    batch_size=batch_size,
    drop_last=False,
    shuffle=False,
)

In [12]:
%%time
dgl_batch_stats = {}
for batch_id,(input_nodes, output_nodes, blocks) in enumerate(dataloader):
    dgl_batch_stats[batch_id]={'input_nodes':len(input_nodes),'output_nodes':len(output_nodes)}

CPU times: user 10min 7s, sys: 1min 18s, total: 11min 26s
Wall time: 16.6 s
