In [1]:
# Configure RAPIDS through this kernel's own environment.
# `!export VAR=...` runs in a short-lived subshell, so the assignment is lost
# as soon as that shell exits and never reaches the Python process. Setting
# os.environ here, ahead of the imports below, makes the values visible to
# this process (and presumably to any worker processes it starts later --
# TODO confirm against the cluster launcher).
import os

os.environ["RAPIDS_NO_INITIALIZE"] = "1"
os.environ["CUDF_SPILL"] = "1"
os.environ["LIBCUDF_CUFILE_POLICY"] = "OFF"

from cugraph_bulk_sampling import start_dask_client, benchmark_cugraph_bulk_sampling, load_disk_dataset, construct_graph
from cugraph_bulk_sampling import sample_graph
import os

# Setup Cluster

In [2]:
# Comma-separated device indices 0 through 7, handed to the cluster launcher.
dask_worker_devices = ",".join(str(device) for device in range(8))

In [3]:
# Options forwarded unchanged to start_dask_client.
# NOTE(review): rmm_pool_size is presumably a byte count -- confirm.
cluster_options = dict(
    dask_worker_devices=dask_worker_devices,
    jit_unspill=False,
    rmm_pool_size=28e9,
    rmm_async=True,
)
client, cluster = start_dask_client(**cluster_options)


Dask client/cluster created using LocalCUDACluster


# Setup Benchmark

In [4]:
# --- Dataset and output locations ---
# NOTE(review): both roots are machine-specific absolute paths; adjust them
# for the host this notebook runs on.
dataset = 'ogbn_papers100M'
dataset_root = "/datasets/abarghi"
output_root = "/tmp/ramdisk"
dataset_dir, output_path = dataset_root, output_root

# --- Graph loading options ---
reverse_edges = True
add_edge_types = False
replication_factor = 2

# --- Sampling options ---
batch_size = 512
seeds_per_call = 524_288
fanout = [25] * 2
seed = 123
persist = False


In [5]:
# Load the dataset from disk using the benchmark configuration, then unpack
# the five values the loader returns.
(
    dask_edgelist_df,
    dask_label_df,
    node_offsets,
    edge_offsets,
    total_num_nodes,
) = load_disk_dataset(
    dataset,
    dataset_dir=dataset_dir,
    reverse_edges=reverse_edges,
    replication_factor=replication_factor,
    persist=False,
    add_edge_types=add_edge_types,
)

num_input_edges = len(dask_edgelist_df)
print(f"Number of input edges = {num_input_edges:,}")

# Build the graph, then drop this notebook's reference to the edge list.
G = construct_graph(dask_edgelist_df)
del dask_edgelist_df
print('constructed graph')

Loading edge index for edge type paper__cites__paper
Loading node labels for node type paper (offset=0)
Number of input edges = 3,231,371,744
constructed graph


In [6]:
# Total memory usage reported for the graph's edge list.
input_memory = (
    G.edgelist.edgelist_df
    .memory_usage()
    .sum()
    .compute()
)
print(f'input memory: {input_memory}')

# Output layout: <output_path>/<run tag>/samples
run_tag = f'{dataset}[{replication_factor}]_b{batch_size}_f{fanout}'
output_subdir = os.path.join(output_path, run_tag)
output_sample_path = os.path.join(output_subdir, 'samples')
for directory in (output_subdir, output_sample_path):
    os.makedirs(directory, exist_ok=True)

# NOTE(review): 200_000 looks like a per-partition seed budget -- confirm.
batches_per_partition = 200_000 // batch_size



input memory: 51701947904


# Benchmarking Sample Graph

In [7]:
%%timeit -n30 -r1


# Time the bulk-sampling call: -n30 runs the cell body 30 times per repeat,
# -r1 performs a single repeat.
# Every run writes to the same output_sample_path and reuses the same seed.
# NOTE(review): names assigned inside a %%timeit cell (execution_time,
# allocation_counts) are presumably local to the timing run and not available
# to later cells -- confirm before relying on them.
execution_time, allocation_counts = sample_graph(
    G,
    dask_label_df,
    output_sample_path,
    seed=seed,
    batch_size=batch_size,
    seeds_per_call=seeds_per_call,
    batches_per_partition=batches_per_partition,
    fanout=fanout,
    persist=persist,
)


created batches


Key:       ('repartition-split-70ef0a6cdf3b72d08b039f865dffa546', 0)
Function:  boundary_slice
args:      (('min_batch_id', -1), 0, 0, True)
kwargs:    {}
Exception: 'TypeError("object of type \'builtin_function_or_method\' has no len()")'

Key:       ('repartition-split-492dd0cc927669594e29f47a0a355603', 0)
Function:  boundary_slice
args:      (('max_batch_id', -1), 0, 0, True)
kwargs:    {}
Exception: 'TypeError("object of type \'builtin_function_or_method\' has no len()")'

Key:       _call_plc_uniform_neighbor_sample-984f9cdb-17cf-4922-9314-d10733f8db95
Function:  _call_plc_uniform_neighbor_sample
args:      (b'\xa6\xb7\xd8\xb5D\x89N\xe2\x94$\xe1\x9f\xcf\xe0\x93#', <pylibcugraph.graphs.MGGraph object at 0x7f89c578b3f0>, [          _START_  _BATCH_
0             602        0
1             684        0
2            1384        0
3            1525        0
4            2127        0
...           ...      ...
386691  125973990      755
386692  125974005      755
386693  125974008     

{"('repartition-merge-70ef0a6cdf3b72d08b039f865dffa546', 0)": ()}
{"('repartition-merge-492dd0cc927669594e29f47a0a355603', 0)": ()}


type: 'list' object has no attribute 'release'

In [8]:
import cudf
import dask_cudf

# Small scratch series split across 8 partitions.
values = cudf.Series([0, 5, 6, 1, 2, 3, 4, 0, -1])
df = dask_cudf.from_cudf(values, npartitions=8)

In [10]:
# Minimum over the partitioned scratch series `df`.
# NOTE(review): presumably a lazy Dask result until computed or persisted -- confirm.
m = df.min()

In [49]:
# NOTE(review): `dd` is not referenced in any of the visible cells.
import dask.dataframe as dd
# Persist the min() result `m`; the cell displays whatever persist() returns.
m.persist()


numpy.ndarray