# Testing Notebook: cuGraph-DGL vs. Upstream DGL

In [1]:
import os
# Restrict CUDA device visibility to index "1". This is set before the CUDA
# libraries below are imported so they pick the setting up on initialisation.
os.environ["CUDA_VISIBLE_DEVICES"]="1"
import cudf
import rmm      
# Re-initialise RMM with a pool allocator and an initial pool size of 15e9 bytes.
rmm.reinitialize(pool_allocator = True, initial_pool_size = 15e9)
import torch
# Make PyTorch use RMM's torch allocator hook for its CUDA allocations.
torch.cuda.memory.change_current_allocator(rmm.rmm_torch_allocator)

In [2]:
# Forwarded as `single_gpu=` to `cugraph_dgl.cugraph_storage_from_heterograph`
# when the cuGraph storage is built from the DGL graph.
single_gpu = True

In [3]:
def load_dgl_dataset(dataset_name='ogbn-products', dataset_root='/raid/vjawa/gnn/'):
    """Load an OGB node-property-prediction dataset as a DGL graph.

    The node labels are attached to the graph as ``g.ndata['label']`` and
    ``add_self_loop()`` is applied to the graph before it is returned.

    Parameters
    ----------
    dataset_name : str
        Name passed to ``DglNodePropPredDataset`` (default ``'ogbn-products'``).
    dataset_root : str
        Directory passed as ``root`` to ``DglNodePropPredDataset``. The
        default keeps the path this notebook was originally written against;
        pass a different location to run on another machine.

    Returns
    -------
    tuple
        ``(g, train_idx)``: the graph returned by ``add_self_loop()`` and the
        ``"train"`` entry of the dataset's index split.
    """
    # Imported here so `ogb` is only needed when a dataset is actually loaded.
    from ogb.nodeproppred import DglNodePropPredDataset

    dataset = DglNodePropPredDataset(name=dataset_name, root=dataset_root)
    # Only the training split is used by this notebook.
    train_idx = dataset.get_idx_split()["train"]
    g, label = dataset[0]
    g.ndata['label'] = label
    g = g.add_self_loop()
    return g, train_idx

# cuGraph DGL DataLoader

In [4]:
import cugraph_dgl
import tempfile

In [5]:
# Settings shared by the sampler and the loader.
batch_size = 1024
fanout_vals = [25, 25]

# Load the graph through DGL, then rebind `g` to the cuGraph-backed storage
# built from it.
g, train_idx = load_dgl_dataset()
g = cugraph_dgl.cugraph_storage_from_heterograph(g, single_gpu=single_gpu)

# Scratch directory that receives the sampling results.
temp_dir = tempfile.TemporaryDirectory()

sampler = cugraph_dgl.dataloading.NeighborSampler(fanout_vals)
dataloader = cugraph_dgl.dataloading.DataLoader(
    g,
    train_idx.to('cuda'),  # train_nid must be on GPU
    sampler,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=0,  # number of workers must be 0
    device=torch.device('cuda'),  # the device argument must be GPU
    batches_per_partition=50,
    sampling_output_dir=temp_dir.name,  # path to save sampling results to
)


In [6]:
%%time

batch_stats = {}
for batch_id,(input_nodes, output_nodes, blocks) in enumerate(dataloader):
    l = len(output_nodes)
    batch_stats[batch_id]={'input_nodes':len(input_nodes),'output_nodes':len(output_nodes)}
    if batch_id==0:
        input_nodes_0, output_nodes_0, blocks_0 = input_nodes, output_nodes, blocks

DGLError: [16:06:25] /opt/dgl/src/runtime/cuda/cuda_device_api.cc:116: Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading: CUDA: out of memory
Stack trace:
  [bt] (0) /datasets/vjawa/miniconda3/envs/cugraph_dev_feb_17/lib/python3.10/site-packages/dgl/libdgl.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x4f) [0x7f935624f1ef]
  [bt] (1) /datasets/vjawa/miniconda3/envs/cugraph_dev_feb_17/lib/python3.10/site-packages/dgl/libdgl.so(dgl::runtime::CUDADeviceAPI::AllocDataSpace(DGLContext, unsigned long, unsigned long, DGLDataType)+0x260) [0x7f9356712710]
  [bt] (2) /datasets/vjawa/miniconda3/envs/cugraph_dev_feb_17/lib/python3.10/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::Empty(std::vector<long, std::allocator<long> >, DGLDataType, DGLContext)+0x177) [0x7f9356580587]
  [bt] (3) /datasets/vjawa/miniconda3/envs/cugraph_dev_feb_17/lib/python3.10/site-packages/dgl/libdgl.so(dgl::aten::NewIdArray(long, DGLContext, unsigned char)+0x6d) [0x7f9356216d9d]
  [bt] (4) /datasets/vjawa/miniconda3/envs/cugraph_dev_feb_17/lib/python3.10/site-packages/dgl/libdgl.so(std::tuple<std::vector<dgl::runtime::NDArray, std::allocator<dgl::runtime::NDArray> >, std::vector<dgl::runtime::NDArray, std::allocator<dgl::runtime::NDArray> > > dgl::transform::cuda::MapEdges<long>(std::shared_ptr<dgl::BaseHeteroGraph>, std::vector<dgl::EdgeArray, std::allocator<dgl::EdgeArray> > const&, dgl::transform::cuda::DeviceNodeMap<long> const&, CUstream_st*)+0x1bd) [0x7f93573c1f9d]
  [bt] (5) /datasets/vjawa/miniconda3/envs/cugraph_dev_feb_17/lib/python3.10/site-packages/dgl/libdgl.so(std::tuple<std::shared_ptr<dgl::BaseHeteroGraph>, std::vector<dgl::runtime::NDArray, std::allocator<dgl::runtime::NDArray> > > dgl::transform::(anonymous namespace)::ToBlockGPU<long>(std::shared_ptr<dgl::BaseHeteroGraph>, std::vector<dgl::runtime::NDArray, std::allocator<dgl::runtime::NDArray> > const&, bool, std::vector<dgl::runtime::NDArray, std::allocator<dgl::runtime::NDArray> >*)+0x13be) [0x7f93573c972e]
  [bt] (6) /datasets/vjawa/miniconda3/envs/cugraph_dev_feb_17/lib/python3.10/site-packages/dgl/libdgl.so(dgl::transform::ToBlockGPU64(std::shared_ptr<dgl::BaseHeteroGraph>, std::vector<dgl::runtime::NDArray, std::allocator<dgl::runtime::NDArray> > const&, bool, std::vector<dgl::runtime::NDArray, std::allocator<dgl::runtime::NDArray> >*)+0x3b) [0x7f93573c508b]
  [bt] (7) /datasets/vjawa/miniconda3/envs/cugraph_dev_feb_17/lib/python3.10/site-packages/dgl/libdgl.so(std::tuple<std::shared_ptr<dgl::BaseHeteroGraph>, std::vector<dgl::runtime::NDArray, std::allocator<dgl::runtime::NDArray> > > dgl::transform::ToBlock<(DGLDeviceType)2, long>(std::shared_ptr<dgl::BaseHeteroGraph>, std::vector<dgl::runtime::NDArray, std::allocator<dgl::runtime::NDArray> > const&, bool, std::vector<dgl::runtime::NDArray, std::allocator<dgl::runtime::NDArray> >*)+0x3a) [0x7f93566b0d6a]
  [bt] (8) /datasets/vjawa/miniconda3/envs/cugraph_dev_feb_17/lib/python3.10/site-packages/dgl/libdgl.so(+0x80d602) [0x7f93566b5602]



In [10]:
#output_nodes_0

# Pure DGL DataLoader

In [8]:
from dgl.dataloading import DataLoader, NeighborSampler
import dgl

In [9]:
# Build the upstream-DGL loader with the same batch size and fanouts used for
# the cuGraph-DGL loader.
g, train_idx = load_dgl_dataset()
batch_size = 1024
fanout_vals = [25, 25]
# `fanout_vals_reverse` was never defined anywhere in this notebook, so the
# original line raised NameError on a fresh kernel; pass `fanout_vals` itself.
sampler = dgl.dataloading.MultiLayerNeighborSampler(fanout_vals)
# NOTE(review): the seed nodes are moved to CUDA while `g` stays on whatever
# device load_dgl_dataset() returned it on, with use_uva=False. Confirm DGL
# accepts this combination; `g = g.to('cuda')` may be required.
dataloader = dgl.dataloading.DataLoader(
    g,
    train_idx.to('cuda'),           # train_nid must be on GPU.
    sampler,
    device=torch.device('cuda'),    # The device argument must be GPU.
    num_workers=0,                  # Number of workers must be 0.
    use_uva=False,
    batch_size=batch_size,
    drop_last=False,
    shuffle=False)

In [10]:
%%time
dgl_batch_stats = {}
for batch_id,(input_nodes, output_nodes, blocks) in enumerate(dataloader):
    l = len(output_nodes)
    dgl_batch_stats[batch_id]={'input_nodes':len(input_nodes),'output_nodes':len(output_nodes)}
    if batch_id==0:
        dgl_input_nodes_0, dgl_output_nodes_0, dgl_blocks_0 = input_nodes, output_nodes, blocks

CPU times: user 7.15 s, sys: 1.29 s, total: 8.43 s
Wall time: 1.5 s


In [11]:
# Show the blocks kept from batch 0 of the cuGraph-DGL loop; the blocks kept
# from the pure-DGL loop are stored separately in `dgl_blocks_0`.
blocks_0

[Block(num_src_nodes=319848, num_dst_nodes=22272, num_edges=665600),
 Block(num_src_nodes=22272, num_dst_nodes=1024, num_edges=25600)]