In [1]:
import os
import tempfile

import cudf
import cugraph
from cugraph.datasets import email_Eu_core

# Edge list of the email-Eu-core dataset (requested with download=True).
el = email_Eu_core.get_edgelist(download=True)

# Directed graph built from the `src` -> `dst` columns of that edge list.
G = cugraph.Graph(directed=True)
G.from_cudf_edgelist(el, source="src", destination="dst")

# Two starting vertices, both assigned to batch 0.
seed_vertices = [49, 71]
seeds = cudf.DataFrame({
    'start': seed_vertices,
    'batch': [0] * len(seed_vertices),
})

In [2]:
# Run the sampler on the seeds with a fan-out list of [5, 5] (presumably one
# entry per hop). The call is unpacked into three frames: the sampled edges,
# the offsets table and the renumber map.
sampling_results, offsets, renumber_map = cugraph.uniform_neighbor_sample(
    G,
    seeds,
    [5, 5],
    # Sampling behaviour (fixed seed so the outputs below are repeatable).
    random_state=62,
    with_replacement=False,
    deduplicate_sources=True,
    prior_sources_behavior='exclude',
    # Extra information carried along with the samples.
    with_edge_properties=True,
    with_batch_ids=True,
    # Layout of the returned frames.
    renumber=True,
    return_offsets=True,
    use_legacy_names=False,
    include_hop_column=False,
    compression='CSR',
    compress_per_hop=False,
)



In [3]:
# Sampled edges. With compression='CSR' the frame has `minors` and
# `major_offsets` columns and no explicit majors column.
sampling_results

Unnamed: 0,minors,weight,edge_id,edge_type,major_offsets
0,1,,,,0.0
1,2,,,,5.0
2,3,,,,10.0
3,5,,,,15.0
4,8,,,,20.0
5,0,,,,25.0
6,2,,,,30.0
7,4,,,,35.0
8,6,,,,40.0
9,7,,,,43.0


In [4]:
# Offsets table from the sampler: columns `offsets`, `batch_id` and
# `renumber_map_offsets`.
offsets

Unnamed: 0,offsets,batch_id,renumber_map_offsets
0,0,0.0,0.0
1,2,,36.0
2,9,,


In [5]:
# Renumber map: the `map` column is indexed positionally further down to turn
# renumbered ids back into original vertex ids.
renumber_map

Unnamed: 0,map
0,49
1,71
2,83
3,84
4,152
5,297
6,431
7,612
8,643
9,4


In [6]:
# First entry of the `offsets` column; used below as the start of the slice
# taken from `major_offsets`.
offsets['offsets'].iloc[0]

0

In [7]:
import cupy
# Entries 0 and 1 of offsets['offsets'] delimit the first group of majors
# (presumably hop 0 of batch 0 -- TODO confirm). One extra trailing offset is
# kept because n majors need n + 1 offsets to delimit their edge ranges.
group_start = offsets['offsets'].iloc[0]
group_stop = offsets['offsets'].iloc[1]
major_offsets = sampling_results['major_offsets'].iloc[group_start : group_stop + 1]
major_offsets

0     0
1     5
2    10
Name: major_offsets, dtype: int64

In [8]:
# The edges of the selected majors occupy rows [first offset, last offset) of
# the `minors` column; the values are still renumbered ids at this point.
edge_start = major_offsets.iloc[0]
edge_stop = major_offsets.iloc[-1]
minors = sampling_results['minors'].iloc[edge_start:edge_stop]
minors.values

array([1, 2, 3, 5, 8, 0, 2, 4, 6, 7], dtype=int32)

In [9]:
# Expand the offsets into one (renumbered) major id per edge: major i owns
# the edges in [major_offsets[i], major_offsets[i + 1]), so id i is repeated
# diff(major_offsets)[i] times. `cupy` is imported by an earlier cell.
majors = cudf.Series(cupy.arange(len(major_offsets) - 1))
majors = majors.repeat(cupy.diff(major_offsets))
majors.values

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

In [10]:
# Translate the renumbered major ids back to original vertex ids by positional
# lookup in the renumber map. The column is selected with ['map'] because
# attribute access (`renumber_map.map`) resolves to a method on frame classes
# that define one called `map` (pandas >= 2.1 does).
# NOTE: `majors` is rebound here, so this cell is not idempotent -- re-run the
# cell that builds the renumbered `majors` before re-running this one.
majors = renumber_map['map'].iloc[majors]
majors.values

array([49, 49, 49, 49, 49, 71, 71, 71, 71, 71], dtype=int32)

In [11]:
# Translate the renumbered minor ids back to original vertex ids by positional
# lookup in the renumber map. The column is selected with ['map'] because
# attribute access (`renumber_map.map`) resolves to a method on frame classes
# that define one called `map` (pandas >= 2.1 does).
# NOTE: `minors` is rebound here, so this cell is not idempotent -- re-run the
# cell that slices the renumbered `minors` before re-running this one.
minors = renumber_map['map'].iloc[minors]
minors.values

array([ 71,  83,  84, 297, 643,  49,  83, 152, 431, 612], dtype=int32)

In [12]:
# Show the reconstructed edges side by side. `.values` is used so the two
# columns are paired by position rather than by the index labels each series
# carries from its `.iloc` lookup.
edge_columns = {'majors': majors.values, 'minors': minors.values}
cudf.DataFrame(edge_columns)

Unnamed: 0,majors,minors
0,49,71
1,49,83
2,49,84
3,49,297
4,49,643
5,71,49
6,71,83
7,71,152
8,71,431
9,71,612


In [13]:
# Every reconstructed (major, minor) pair must occur exactly once in the
# original edge list. The ids are copied to the host once up front instead of
# being read back one device scalar at a time inside the loop.
assert len(majors) == len(minors), (
    f"majors ({len(majors)}) and minors ({len(minors)}) differ in length"
)
for src_id, dst_id in zip(majors.values_host, minors.values_host):
    n_matches = len(el[(el['src'] == src_id) & (el['dst'] == dst_id)])
    assert n_matches == 1, (
        f"edge ({src_id}, {dst_id}) appears {n_matches} times in the edge list"
    )

In [14]:
from cugraph.gnn.data_loading.bulk_sampler_io import _write_samples_to_parquet_csr

_write_samples_to_parquet_csr(
    sampling_results.copy(deep=True),
    offsets.copy(deep=True),
    renumber_map.copy(deep=True),
    batches_per_partition=1,
    output_path='/home/nfs/abarghi',
    partition_info='sg'
)

results: 0 9


Series([], dtype: int64)

In [15]:
import cudf
cudf.read_parquet('/home/nfs/abarghi/batch=0-0.parquet')

Unnamed: 0,minors,weight,edge_id,edge_type,major_offsets,renumber_map_offsets,map,label_hop_offsets
0,1.0,,,,0.0,0.0,49,0.0
1,2.0,,,,5.0,36.0,71,2.0
2,3.0,,,,10.0,,83,9.0
3,5.0,,,,15.0,,84,
4,8.0,,,,20.0,,152,
5,0.0,,,,25.0,,297,
6,2.0,,,,30.0,,431,
7,4.0,,,,35.0,,612,
8,6.0,,,,40.0,,643,
9,,,,,,,4,
