# Testing Notebook for cugraph DGL vs DGL Upstream

In [None]:
import os

# Pin this (client) process to a single GPU. This is set before the CUDA
# libraries below are imported so it is already in place when they start up.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import cudf
import rmm

# Pool sizes are byte counts. They are written as integers (the type
# rmm.reinitialize documents) instead of float literals such as 15e9.
rmm.reinitialize(
    pool_allocator=True,
    initial_pool_size=15_000_000_000,
    maximum_pool_size=24_000_000_000,
)

import torch

# Make PyTorch request its CUDA memory through RMM's torch allocator.
# NOTE(review): PyTorch expects this call before any CUDA memory has been
# allocated by torch, so keep it directly after the import.
torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator)

In [None]:
# Switch: keep False to bring up the local Dask-CUDA cluster below,
# set to True to skip cluster creation entirely.
single_gpu = False

if not single_gpu:
    from dask_cuda import LocalCUDACluster
    from dask.distributed import Client
    import cugraph.dask.comms.comms as Comms

    # Cluster workers are restricted to GPUs 2, 3 and 4 and get a 25GB
    # RMM pool setting; communication uses plain TCP.
    cluster = LocalCUDACluster(
        protocol="tcp",
        rmm_pool_size="25GB",
        CUDA_VISIBLE_DEVICES="2,3,4",
    )
    client = Client(cluster)

    # Initialise the cuGraph Dask comms with point-to-point enabled.
    Comms.initialize(p2p=True)

# cuGraph DGL DataLoader

In [2]:
from cugraph_dgl.dataloading import HomogenousBulkSamplerDataset 
import dgl
import cudf

In [5]:
#!ls /raid/vjawa/gnn_sampling/bulksampling_new/rank=0/

In [6]:
%%time
# Directory the dataset reads from (presumably the bulk-sampling output
# written for rank 0 -- verify against the job that produced it).
sampled_file_dir = '/raid/vjawa/gnn_sampling/bulksampling_new/rank=0/'

cugraph_dgl_dataset = HomogenousBulkSamplerDataset(
    num_batches=192,
    total_number_of_nodes=2_449_029,
    edge_dir='in',
)
cugraph_dgl_dataset.set_input_directory(sampled_file_dir)

# batch_size=None turns off the DataLoader's automatic batching, so every
# item produced by the dataset is yielded unchanged.
bs_dataloader = torch.utils.data.DataLoader(
    cugraph_dgl_dataset, batch_size=None, num_workers=0
)

# Per-batch node counts, keyed by batch index.
batch_stats = {}
for batch_id, (input_nodes, output_nodes, blocks) in enumerate(bs_dataloader):
    batch_stats[batch_id] = {
        'input_nodes': len(input_nodes),
        'output_nodes': len(output_nodes),
    }
    # Keep the first batch around for inspection in later cells.
    if batch_id == 0:
        input_nodes_0, output_nodes_0, blocks_0 = input_nodes, output_nodes, blocks

CPU times: user 4.2 s, sys: 2.22 s, total: 6.42 s
Wall time: 6.44 s


In [9]:
# Display the output_nodes captured from the first batch of bs_dataloader.
output_nodes_0

tensor([   0,    1,    2,  ..., 1021, 1022, 1023], device='cuda:0')

# Pure DGL DataLoader

In [8]:
from dgl.dataloading import DataLoader, NeighborSampler, MultiLayerFullNeighborSampler
from ogb.nodeproppred import DglNodePropPredDataset
import dgl

def load_dgl_dataset(dataset_name='ogbn-products', dataset_root='/raid/vjawa/gnn/'):
    """Load an OGB node-property-prediction dataset as a DGL graph.

    Parameters
    ----------
    dataset_name : str
        Dataset name forwarded to ``DglNodePropPredDataset`` as ``name``.
    dataset_root : str
        Directory forwarded to ``DglNodePropPredDataset`` as ``root``.
        Defaults to the location this notebook was originally run against.

    Returns
    -------
    tuple
        ``(g, train_idx)`` where ``g`` is the first item's graph with the
        labels stored under ``g.ndata['label']`` and ``train_idx`` is the
        ``"train"`` entry of the dataset's index split.
    """
    dataset = DglNodePropPredDataset(name=dataset_name, root=dataset_root)
    # Only the training split is returned, so the other splits are not unpacked.
    train_idx = dataset.get_idx_split()["train"]
    g, label = dataset[0]
    g.ndata['label'] = label
    return g, train_idx

In [9]:
g, train_idx = load_dgl_dataset()
g = g.to('cuda')

batch_size = 1024

# The fanout list is reversed for DGL: DGL and cuGraph take the fanouts in
# opposite order because src and dst are swapped between the two.
fanout_vals = [25, 25]
fanout_vals_reverse = fanout_vals[::-1]
sampler = dgl.dataloading.MultiLayerNeighborSampler(fanout_vals_reverse)

dataloader = dgl.dataloading.DataLoader(
    g,
    train_idx.to('cuda'),          # seed node IDs have to live on the GPU
    sampler,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    device=torch.device('cuda'),   # the device argument has to be a GPU
    use_uva=False,
    num_workers=0,                 # worker count has to be 0
)

In [10]:
%%time
# Per-batch node counts for the DGL dataloader, keyed by batch index.
dgl_batch_stats = {}
for batch_id, (input_nodes, output_nodes, blocks) in enumerate(dataloader):
    dgl_batch_stats[batch_id] = {
        'input_nodes': len(input_nodes),
        'output_nodes': len(output_nodes),
    }
    # Keep the first batch around for inspection in later cells.
    if batch_id == 0:
        dgl_input_nodes_0, dgl_output_nodes_0, dgl_blocks_0 = input_nodes, output_nodes, blocks

CPU times: user 7.15 s, sys: 1.29 s, total: 8.43 s
Wall time: 1.5 s


In [11]:
# Display the blocks captured from the first batch of the cuGraph-DGL loader.
# NOTE(review): this cell sits in the pure-DGL section; the DGL counterpart is
# stored in dgl_blocks_0 -- confirm which one was meant to be shown here.
blocks_0

[Block(num_src_nodes=319848, num_dst_nodes=22272, num_edges=665600),
 Block(num_src_nodes=22272, num_dst_nodes=1024, num_edges=25600)]