[FEA] New WholeGraph Feature Store for PyG (#4432)
Reimplements the WG feature store for PyG using the `FeatureStore` interface.
Merge after #4384 

Closes rapidsai/wholegraph#47
Closes #4399
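
For context, a minimal usage sketch of the new store (illustrative, not part of this diff; "paper" and "x" are placeholder names, and it assumes torch.distributed and the WholeGraph communicators are already initialized on every worker):

import torch
from cugraph_pyg.data import WholeFeatureStore

store = WholeFeatureStore()  # defaults: memory_type="distributed", location="cpu"

# Each worker registers only its own slice of the global feature tensor;
# the store all-gathers the per-rank row counts and assembles a single
# distributed WholeMemory tensor instead of replicating the data.
rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()
local_features = torch.tensor_split(torch.randn(1000, 64), world_size)[rank]
store["paper", "x", None] = local_features

# Reads gather rows from WholeMemory, including rows owned by other workers.
x = store["paper", "x", torch.tensor([0, 5, 42])]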

Authors:
  - Alex Barghi (https://github.com/alexbarghi-nv)
  - Seunghwa Kang (https://github.com/seunghwak)
  - Tingyu Wang (https://github.com/tingyu66)
  - Ralph Liu (https://github.com/nv-rliu)

Approvers:
  - Tingyu Wang (https://github.com/tingyu66)
  - Vibhu Jawa (https://github.com/VibhuJawa)
  - Brad Rees (https://github.com/BradReesWork)
  - Ray Douglass (https://github.com/raydouglass)

URL: #4432
alexbarghi-nv committed May 30, 2024
1 parent 797a036 commit 1667f7a
Showing 11 changed files with 855 additions and 121 deletions.
2 changes: 1 addition & 1 deletion ci/build_wheel.sh
@@ -40,7 +40,7 @@ if ! rapids-is-release-build; then
    alpha_spec=',>=0.0.0a0'
fi

for dep in rmm cudf cugraph raft-dask pylibcugraph pylibcugraphops pylibraft ucx-py; do
for dep in rmm cudf cugraph raft-dask pylibcugraph pylibcugraphops pylibwholegraph pylibraft ucx-py; do
    sed -r -i "s/${dep}==(.*)\"/${dep}${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
done

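For illustration (assumed values, not part of the diff): with PACKAGE_CUDA_SUFFIX=-cu12 and the nightly alpha_spec above, the sed rewrites each pinned dependency in the wheel's pyproject.toml, e.g.

# before
"pylibwholegraph==24.6.*"
# after
"pylibwholegraph-cu12==24.6.*,>=0.0.0a0"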
30 changes: 27 additions & 3 deletions dependencies.yaml
@@ -20,6 +20,7 @@ files:
      - depends_on_pylibraft
      - depends_on_raft_dask
      - depends_on_pylibcugraphops
      - depends_on_pylibwholegraph
      - depends_on_cupy
      - python_run_cugraph
      - python_run_nx_cugraph
@@ -60,6 +61,7 @@ files:
    includes:
      - cuda_version
      - depends_on_cudf
      - depends_on_pylibwholegraph
      - py_version
      - test_python_common
      - test_python_cugraph
@@ -98,6 +100,7 @@
    includes:
      - test_python_common
      - test_python_cugraph
      - depends_on_pylibwholegraph
  py_build_pylibcugraph:
    output: pyproject
    pyproject_dir: python/pylibcugraph
@@ -175,6 +178,7 @@ files:
      key: test
    includes:
      - test_python_common
      - depends_on_pylibwholegraph
  py_build_cugraph_pyg:
    output: pyproject
    pyproject_dir: python/cugraph-pyg
@@ -198,6 +202,7 @@ files:
      key: test
    includes:
      - test_python_common
      - depends_on_pylibwholegraph
  py_build_cugraph_equivariant:
    output: pyproject
    pyproject_dir: python/cugraph-equivariant
@@ -535,9 +540,7 @@ dependencies:
          - *numpy
          - python-louvain
          - scikit-learn>=0.23.1
      - output_types: [conda]
        packages:
          - pylibwholegraph==24.6.*

  test_python_pylibcugraph:
    common:
      - output_types: [conda, pyproject]
@@ -568,6 +571,27 @@ dependencies:
          - tensordict>=0.1.2
          - pyg>=2.5,<2.6

  depends_on_pylibwholegraph:
    common:
      - output_types: conda
        packages:
          - &pylibwholegraph_conda pylibwholegraph==24.6.*
      - output_types: requirements
        packages:
          # pip recognizes the index as a global option for the requirements.txt file
          - --extra-index-url=https://pypi.nvidia.com
          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
    specific:
      - output_types: [requirements, pyproject]
        matrices:
          - matrix: {cuda: "12.*"}
            packages:
              - pylibwholegraph-cu12==24.6.*
          - matrix: {cuda: "11.*"}
            packages:
              - pylibwholegraph-cu11==24.6.*
          - {matrix: null, packages: [*pylibwholegraph_conda]}

  depends_on_rmm:
    common:
      - output_types: conda
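
To make the fallback matrix above concrete (an assumed, illustrative rendering; the actual files come from rapids-dependency-file-generator): a CUDA 12 requirements output would contain

--extra-index-url=https://pypi.nvidia.com
--extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
pylibwholegraph-cu12==24.6.*

while conda outputs, and the null-matrix fallback, reuse the unsuffixed *pylibwholegraph_conda anchor (pylibwholegraph==24.6.*).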
1 change: 1 addition & 0 deletions docs/cugraph/source/api_docs/cugraph-pyg/cugraph_pyg.rst
@@ -20,6 +20,7 @@ Feature Storage
   :toctree: ../api/cugraph-pyg/

   cugraph_pyg.data.feature_store.TensorDictFeatureStore
   cugraph_pyg.data.feature_store.WholeFeatureStore

Data Loaders
------------
1 change: 1 addition & 0 deletions python/cugraph-dgl/pyproject.toml
@@ -32,6 +32,7 @@ dependencies = [
[project.optional-dependencies]
test = [
"pandas",
"pylibwholegraph==24.6.*",
"pytest",
"pytest-benchmark",
"pytest-cov",
5 changes: 4 additions & 1 deletion python/cugraph-pyg/cugraph_pyg/data/__init__.py
@@ -15,7 +15,10 @@

from cugraph_pyg.data.dask_graph_store import DaskGraphStore
from cugraph_pyg.data.graph_store import GraphStore
from cugraph_pyg.data.feature_store import TensorDictFeatureStore
from cugraph_pyg.data.feature_store import (
    TensorDictFeatureStore,
    WholeFeatureStore,
)


def CuGraphStore(*args, **kwargs):
147 changes: 147 additions & 0 deletions python/cugraph-pyg/cugraph_pyg/data/feature_store.py
@@ -20,6 +20,7 @@
torch = import_optional("torch")
torch_geometric = import_optional("torch_geometric")
tensordict = import_optional("tensordict")
wgth = import_optional("pylibwholegraph.torch")


class TensorDictFeatureStore(
@@ -127,3 +128,149 @@ def get_all_tensor_attrs(
                )

        return attrs


class WholeFeatureStore(
    object
    if isinstance(torch_geometric, MissingModule)
    else torch_geometric.data.FeatureStore
):
    """
    A basic implementation of the PyG FeatureStore interface that stores
    feature data in WholeGraph WholeMemory. This type of feature store is
    distributed and avoids data replication across workers. Data should be
    sliced before being passed into this feature store; that is, each worker
    should hold only its own partition.
    """

    def __init__(self, memory_type="distributed", location="cpu"):
        """
        Parameters
        ----------
        memory_type: str (optional, default='distributed')
            The memory type of this store.
        location: str (optional, default='cpu')
            The location ('cpu' or 'cuda') where data is stored.
        """
        super().__init__()

        self.__features = {}

        self.__wg_comm = wgth.get_local_node_communicator()
        self.__wg_type = memory_type
        self.__wg_location = location

    def _put_tensor(
        self,
        tensor: "torch_geometric.typing.FeatureTensorType",
        attr: "torch_geometric.data.feature_store.TensorAttr",
    ) -> bool:
        wg_comm_obj = self.__wg_comm

        if attr.is_set("index"):
            if (attr.group_name, attr.attr_name) in self.__features:
                raise NotImplementedError(
                    "Updating an embedding from an index"
                    " is not supported by WholeGraph."
                )
            else:
                warnings.warn(
                    "Ignoring index parameter "
                    f"(attribute does not exist for group {attr.group_name})"
                )

        if len(tensor.shape) > 2:
            raise ValueError("Only 1-D or 2-D tensors are supported by WholeGraph.")

        rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()

        ld = torch.tensor(tensor.shape[0], device="cuda", dtype=torch.int64)
        sizes = torch.empty((world_size,), device="cuda", dtype=torch.int64)
        torch.distributed.all_gather_into_tensor(sizes, ld)

        sizes = sizes.cpu()
        ld = sizes.sum()

        td = -1 if len(tensor.shape) == 1 else tensor.shape[1]
        global_shape = [
            int(ld),
            td if td > 0 else 1,
        ]

        if td < 0:
            tensor = tensor.reshape((tensor.shape[0], 1))

        wg_embedding = wgth.create_wholememory_tensor(
            wg_comm_obj,
            self.__wg_type,
            self.__wg_location,
            global_shape,
            tensor.dtype,
            [global_shape[1], 1],
        )

        offset = sizes[:rank].sum() if rank > 0 else 0

        wg_embedding.scatter(
            tensor.clone(memory_format=torch.contiguous_format).cuda(),
            torch.arange(
                offset, offset + tensor.shape[0], dtype=torch.int64, device="cuda"
            ).contiguous(),
        )

        wg_comm_obj.barrier()

        self.__features[attr.group_name, attr.attr_name] = (wg_embedding, td)
        return True

    def _get_tensor(
        self, attr: "torch_geometric.data.feature_store.TensorAttr"
    ) -> Optional["torch_geometric.typing.FeatureTensorType"]:
        if (attr.group_name, attr.attr_name) not in self.__features:
            return None

        emb, td = self.__features[attr.group_name, attr.attr_name]

        if attr.index is None or (not attr.is_set("index")):
            attr.index = torch.arange(emb.shape[0], dtype=torch.int64)

        attr.index = attr.index.cuda()
        t = emb.gather(
            attr.index,
            force_dtype=emb.dtype,
        )

        if td < 0:
            t = t.reshape((t.shape[0],))

        return t

    def _remove_tensor(
        self, attr: "torch_geometric.data.feature_store.TensorAttr"
    ) -> bool:
        if (attr.group_name, attr.attr_name) not in self.__features:
            return False

        del self.__features[attr.group_name, attr.attr_name]
        return True

    def _get_tensor_size(
        self, attr: "torch_geometric.data.feature_store.TensorAttr"
    ) -> Tuple:
        # Stored values are (embedding, td) pairs; report the embedding's shape.
        return tuple(self.__features[attr.group_name, attr.attr_name][0].shape)

    def get_all_tensor_attrs(
        self,
    ) -> List["torch_geometric.data.feature_store.TensorAttr"]:
        attrs = []
        for (group_name, attr_name) in self.__features.keys():
            attrs.append(
                torch_geometric.data.feature_store.TensorAttr(
                    group_name=group_name,
                    attr_name=attr_name,
                )
            )

        return attrs
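
A note on the partitioning arithmetic in _put_tensor above: each rank contributes tensor.shape[0] rows, the per-rank row counts are all-gathered, and rank r scatters its rows at global offset sum(sizes[:r]). A standalone sketch with hypothetical row counts:

import torch

# Hypothetical per-rank row counts, as produced by all_gather_into_tensor.
sizes = torch.tensor([100, 120, 90, 110])  # rows held by ranks 0..3
rank = 2
offset = sizes[:rank].sum() if rank > 0 else 0  # 100 + 120 = 220
rows = torch.arange(offset, offset + sizes[rank])  # global rows 220..309
# wg_embedding.scatter(local_tensor, rows) writes this rank's partition into
# the shared WholeMemory tensor; emb.gather(index) is the inverse lookup.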
(Remaining changed files not shown.)
