# FeatureStore Benchmarks

In [1]:
import cudf
import cupy as cp
import numpy as np
from collections import defaultdict
import dask_cudf
from dask import dataframe as dd
from dask_cuda import LocalCUDACluster
from distributed import Client
import dask
import pandas as pd
import time
import torch

  from .autonotebook import tqdm as notebook_tqdm


# Create Dask Server

In [2]:
def setup_cluster(visible_devices='0,1,2,3,4,5,6',
                  rmm_pool_size='20gb',
                  local_directory='/raid/vjawa/dask-dir',
                  protocol='tcp'):
    """Start a local dask-cuda cluster and connect a client to it.

    All arguments are forwarded unchanged to ``LocalCUDACluster``; the
    defaults reproduce the configuration this notebook was recorded with.

    Parameters
    ----------
    visible_devices : str
        Value passed as ``CUDA_VISIBLE_DEVICES``.
    rmm_pool_size : str
        Value passed as ``rmm_pool_size``.
    local_directory : str
        Value passed as ``local_directory``. The default is a
        machine-specific path; override it when running elsewhere.
    protocol : str
        Value passed as ``protocol``.

    Returns
    -------
    tuple
        ``(cluster, client)`` where ``client`` is a ``distributed.Client``
        connected to ``cluster``.
    """
    cluster = LocalCUDACluster(
        protocol=protocol,
        CUDA_VISIBLE_DEVICES=visible_devices,
        rmm_pool_size=rmm_pool_size,
        local_directory=local_directory,
    )
    client = Client(cluster)
    return cluster, client


cluster, client = setup_cluster()

2023-01-12 19:35:23,832 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize


# Feature Storage Class

In [4]:
from cugraph.gnn import FeatureStore

In [5]:
# Directory that holds one feature parquet dataset per node type.
path = '/datasets/MAG240m/MAG240M_gp_etl/node_data/'
# Node type -> location of that type's `<type>_feat.parquet` dataset.
path_d = {
    name: path + f'{name}_feat.parquet'
    for name in ('author', 'institution', 'paper')
}

# Dask Cupy based benchmarks

In [5]:
# Only the first `n_parts` partitions of the paper features are benchmarked.
n_parts = 10

# The author / institution features are left out of this run; uncomment to include them.
# author_df = dask_cudf.read_parquet(path_d['author']).partitions[0:n_parts]
# institution_df = dask_cudf.read_parquet(path_d['institution']).partitions[0:n_parts]
paper_df = dask_cudf.read_parquet(path_d['paper']).partitions[:n_parts]

# Feature store using the 'dask_cupy' backend on the cluster created above.
fs = FeatureStore(backend='dask_cupy', client=client)
# fs.add_feat_from_df(author_df, 'author', 'feat')
# fs.add_feat_from_df(institution_df, 'institution', 'feat')
fs.add_feat_from_df(paper_df, 'paper', 'feat')

# Persist feat_data before any lookup is timed
# (presumably so the timings below exclude loading -- TODO confirm).
fs.persist_feat_data()

In [6]:
batch_size = 1024*20
range_max = len(paper_df)

indices = cp.random.randint(low=0,high=range_max, size=batch_size, dtype=cp.int32)
%timeit fetched_feat = fs.get_data(indices=indices,type_name='paper', feat_name='feat')
fetched_feat = fs.get_data(indices=indices,type_name='paper', feat_name='feat')
assert isinstance(fetched_feat, cp.ndarray)

244 ms ± 5.94 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# Drop the feature store (if this cell is re-run it may already be gone).
try:
    del fs
except NameError:
    pass

# Short pause before inspecting the workers
# (presumably gives them time to release the persisted keys -- TODO confirm).
time.sleep(2)
# Displays which keys each worker still holds.
client.has_what()

Worker,Key count,Key list
tcp://127.0.0.1:33441,0,Expand
tcp://127.0.0.1:39221,0,Expand
tcp://127.0.0.1:40423,0,Expand
tcp://127.0.0.1:40549,0,Expand
tcp://127.0.0.1:40773,0,Expand
tcp://127.0.0.1:42559,0,Expand
tcp://127.0.0.1:44079,0,Expand


# Dask Numpy based benchmarks

In [8]:
# Only the first `n_parts` partitions of the paper features are benchmarked.
# NOTE(review): the dask_cupy run above uses n_parts = 10, so the two timings
# cover different amounts of data -- confirm this is intended.
n_parts = 15

# The author / institution features are left out of this run; uncomment to include them.
# author_df = dask_cudf.read_parquet(path_d['author']).partitions[0:n_parts]
# institution_df = dask_cudf.read_parquet(path_d['institution']).partitions[0:n_parts]
paper_df = dask_cudf.read_parquet(path_d['paper']).partitions[:n_parts]

# Feature store using the 'dask_numpy' backend on the same cluster.
fs = FeatureStore(backend='dask_numpy', client=client)
# fs.add_feat_from_df(author_df, 'author', 'feat')
# fs.add_feat_from_df(institution_df, 'institution', 'feat')
fs.add_feat_from_df(paper_df, 'paper', 'feat')

# Persist feat_data before any lookup is timed
# (presumably so the timings below exclude loading -- TODO confirm).
fs.persist_feat_data()

In [9]:
batch_size = 1024*20
range_max = len(paper_df)

indices = np.random.randint(low=0,high=range_max, size=batch_size, dtype=np.int32)
%timeit fetched_feat = fs.get_data(indices=indices,type_name='paper', feat_name='feat')
fetched_feat = fs.get_data(indices=indices,type_name='paper', feat_name='feat')
assert isinstance(fetched_feat, np.ndarray)

321 ms ± 3.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# Drop the feature store (if this cell is re-run it may already be gone).
try:
    del fs
except NameError:
    pass

# Short pause before inspecting the workers
# (presumably gives them time to release the persisted keys -- TODO confirm).
time.sleep(2)
# Displays which keys each worker still holds.
client.has_what()

Worker,Key count,Key list
tcp://127.0.0.1:33441,0,Expand
tcp://127.0.0.1:39221,0,Expand
tcp://127.0.0.1:40423,0,Expand
tcp://127.0.0.1:40549,0,Expand
tcp://127.0.0.1:40773,0,Expand
tcp://127.0.0.1:42559,0,Expand
tcp://127.0.0.1:44079,0,Expand


# Numpy based benchmarks

In [2]:
# Root directory of the .npy feature dumps and the paper feature file below it.
numpy_path = '/raid/gnn/'
paper_feat_path = numpy_path + 'paper/node_feat.npy'

# Open the array memory-mapped and read-only rather than loading it eagerly.
paper_ar = np.load(
    paper_feat_path,
    mmap_mode='r',
)

In [6]:
# Feature store using the 'numpy' backend; no Dask client is passed here.
fs = FeatureStore(backend='numpy')
# Register the memory-mapped paper array with 'paper' / 'feat', the same pair
# later used as type_name / feat_name in get_data.
# NOTE(review): despite the method name, a NumPy array (not a dataframe) is passed.
fs.add_feat_from_df(paper_ar, 'paper', 'feat')

In [8]:
batch_size = 1024*20
range_max = len(paper_ar)
indices = np.random.randint(low=0,high=range_max, size=batch_size, dtype=np.int32)
%timeit fetched_feat = fs.get_data(indices=indices,type_name='paper', feat_name='feat')
fetched_feat = fs.get_data(indices=indices,type_name='paper', feat_name='feat')
assert isinstance(fetched_feat, np.ndarray)

9.76 ms ± 27.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Torch based benchmarks

In [9]:
%%time
# Same feature file as in the NumPy section, re-opened so this section runs on its own.
numpy_path = '/raid/gnn/'
paper_feat_path = numpy_path + 'paper/node_feat.npy'

# Open the array memory-mapped and read-only rather than loading it eagerly.
paper_ar = np.load(
    paper_feat_path,
    mmap_mode='r',
)

CPU times: user 0 ns, sys: 2.63 ms, total: 2.63 ms
Wall time: 2.17 ms


In [10]:
# Feature store using the 'torch' backend; no Dask client is passed here.
fs = FeatureStore(backend='torch')
# Register the memory-mapped paper array with 'paper' / 'feat', the same pair
# later used as type_name / feat_name in get_data.
# NOTE(review): despite the method name, a NumPy array (not a dataframe) is passed.
fs.add_feat_from_df(paper_ar, 'paper', 'feat')

  return torch.from_numpy(feat_obj)


In [11]:
batch_size = 1024*20
range_max = len(paper_ar)
indices = np.random.randint(low=0,high=range_max, size=batch_size, dtype=np.int32)
%timeit fetched_feat = fs.get_data(indices=indices,type_name='paper', feat_name='feat')
fetched_feat = fs.get_data(indices=indices,type_name='paper', feat_name='feat')
assert torch.is_tensor(fetched_feat)

719 µs ± 40 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
