# DGL UVA Sampling Benchmark 

In [1]:
import torch
import dgl
import pandas as pd
import os

In [2]:
def get_edgelist(scale, edgefactor, dataset_dir = '/datasets/vjawa/gnn_data/'):
    """Read the edge list for one (scale, edgefactor) dataset from parquet.

    The file is looked up at
    ``<dataset_dir>/mg_scale_<scale>_edgefactor_<edgefactor>.parquet``.

    Args:
        scale: Value substituted into the ``mg_scale_`` part of the file name.
        edgefactor: Value substituted into the ``edgefactor_`` part of the
            file name.
        dataset_dir: Directory that contains the parquet files.

    Returns:
        The loaded DataFrame with its index replaced by a fresh default
        index (the index stored in the file is dropped).
    """
    parquet_name = f'mg_scale_{scale}_edgefactor_{edgefactor}.parquet'
    edges = pd.read_parquet(os.path.join(dataset_dir, parquet_name))
    return edges.reset_index(drop=True)
                      
def create_dgl_graph_from_df(df):
    """Build a DGL graph from the ``src`` and ``dst`` columns of ``df``.

    Both columns are converted to torch tensors from their underlying
    ``.values`` arrays. Every edge is inserted reversed: the ``dst`` column
    is passed as the first (source) tensor and ``src`` as the second.

    Args:
        df: DataFrame providing ``src`` and ``dst`` columns.

    Returns:
        The object returned by ``dgl.graph``.
    """
    endpoints = {col: torch.as_tensor(df[col].values) for col in ('src', 'dst')}
    # Reverse edges to match cuGraph behavior
    return dgl.graph(data=(endpoints['dst'], endpoints['src']))


def get_dgl_dataloader(dgl_graph, df, batch_size, fanout, num_seeds=30_000_000):
    """Create a UVA-enabled DGL neighbor-sampling DataLoader.

    Seed nodes are taken from the first ``num_seeds`` entries of
    ``df['dst']`` and moved to the GPU before the loader is built.

    Args:
        dgl_graph: Graph to sample from. It must live on the CPU; this is
            asserted before anything else happens.
        df: DataFrame whose ``dst`` column supplies the seed nodes.
        batch_size: Number of seed nodes per mini-batch.
        fanout: Per-layer fanouts. The list is NOT modified; a reversed copy
            is handed to the sampler.
        num_seeds: Maximum number of leading ``dst`` entries used as seeds.
            ``None`` uses the whole column. Defaults to 30,000,000.

    Returns:
        A ``dgl.dataloading.DataLoader`` configured with ``use_uva=True``,
        ``num_workers=0``, ``shuffle=False`` and ``drop_last=False``.

    Raises:
        AssertionError: If ``dgl_graph`` is not on the CPU.
    """
    assert dgl_graph.device.type == 'cpu', "dgl_graph must be on the CPU"

    # Slice the underlying array rather than the Series so torch receives a
    # plain NumPy array, the same way create_dgl_graph_from_df builds tensors.
    seed_nodes = torch.as_tensor(df['dst'].values[:num_seeds]).to('cuda')

    ### Reverse because dgl sampler samples from destination to source.
    # Work on a reversed copy: reversing in place would flip the caller's
    # list and silently change the fanout order on every repeated call.
    sampler_fanout = list(reversed(fanout))
    sampler = dgl.dataloading.MultiLayerNeighborSampler(sampler_fanout)

    return dgl.dataloading.DataLoader(
        dgl_graph,
        seed_nodes,                       # train_nid must be on GPU.
        sampler,
        device=torch.device('cuda:0'),    # The device argument must be GPU.
        num_workers=0,                    # Number of workers must be 0.
        use_uva=True,
        batch_size=batch_size,
        drop_last=False,
        shuffle=False,
    )


def uva_benchmark(dataloader_it):
    """Pull exactly one mini-batch from ``dataloader_it`` and return it.

    This is the unit of work the benchmark cells time. The fetched item
    must unpack into three values, which are handed back as the tuple
    ``(input_nodes, output_nodes, blocks)``.

    Raises:
        StopIteration: If the iterator has no more items.
    """
    batch = next(dataloader_it)
    input_nodes, output_nodes, blocks = batch
    return (input_nodes, output_nodes, blocks)

### Scale 27, Edge Factor 16 Benchmarks

In [3]:
%%time
df = get_edgelist(27,16)
df = df.astype('int64') # Fails on int32 at  scales>=27
print("{:,}".format(len(df)))
g = create_dgl_graph_from_df(df)

2,147,483,648
CPU times: user 1min 22s, sys: 1min 41s, total: 3min 3s
Wall time: 1min 2s


In [4]:
for batch_size in [100, 500, 1_000, 2_500, 5_000, 10_000, 20_000, 30_000, 40_000, 50_000, 60_000, 70_000, 80_000, 90_000, 100_000]:
    print(f"---"*15+f"Batch_Size:{batch_size}"+"---"*15)
    dataloader = get_dgl_dataloader(g, df, batch_size, [10,25])
    dataloader_it = iter(dataloader)
    input_nodes, output_nodes, blocks  = uva_benchmark(dataloader_it)  
    assert len(output_nodes)==batch_size
    %timeit input_nodes, output_nodes, blocks  = uva_benchmark(dataloader_it) 

---------------------------------------------Batch_Size:100---------------------------------------------
5.8 ms ± 67 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
---------------------------------------------Batch_Size:500---------------------------------------------
9.63 ms ± 130 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
---------------------------------------------Batch_Size:1000---------------------------------------------
14.4 ms ± 234 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
---------------------------------------------Batch_Size:2500---------------------------------------------
24 ms ± 1.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
---------------------------------------------Batch_Size:5000---------------------------------------------
37.7 ms ± 1.39 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
---------------------------------------------Batch_Size:10000---------------------------------------------
62.6 ms ± 1.3 

### Scale 28, Edge Factor 16 Benchmarks

In [3]:
%%time
df = get_edgelist(28,16)
df = df.astype('int64')
print("{:,}".format(len(df)))
g = create_dgl_graph_from_df(df)

4,294,967,296
CPU times: user 3min 6s, sys: 4min 37s, total: 7min 43s
Wall time: 2min 41s


In [4]:
for batch_size in [100, 500, 1_000, 2_500, 5_000, 10_000, 20_000, 30_000, 40_000, 50_000, 60_000, 70_000, 80_000, 90_000, 100_000]:
    print(f"---"*10+f"Batch_Size:{batch_size}"+"--"*10)
    dataloader = get_dgl_dataloader(g, df, batch_size, [10,25])
    dataloader_it = iter(dataloader)
    input_nodes, output_nodes, blocks  = uva_benchmark(dataloader_it)  
    assert len(output_nodes)==batch_size
    %timeit input_nodes, output_nodes, blocks  = uva_benchmark(dataloader_it) 

------------------------------Batch_Size:100--------------------
6.54 ms ± 159 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
------------------------------Batch_Size:500--------------------
11.8 ms ± 90.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
------------------------------Batch_Size:1000--------------------
23 ms ± 1.33 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
------------------------------Batch_Size:2500--------------------
33.5 ms ± 11.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
------------------------------Batch_Size:5000--------------------
62.9 ms ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
------------------------------Batch_Size:10000--------------------
109 ms ± 9.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
------------------------------Batch_Size:20000--------------------
188 ms ± 390 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
------------------------------Batch_Size:300