Merge pull request #3 from skojaku/add-torch-version

residual2vec with a stochastic gradient descent algorithm for embedding large networks
skojaku committed Nov 23, 2021
2 parents ccf6523 + 3fb66a3, commit 708c75a
Showing 13 changed files with 854 additions and 39 deletions.
README.md: 2 changes (1 addition, 1 deletion)
@@ -5,7 +5,7 @@ This repository contains the code for

- S. Kojaku, J. Yoon, I. Constantino, and Y.-Y. Ahn, Residual2Vec: Debiasing graph embedding using random graphs. NeurIPS (2021). [link will be added when available]

- Preprint (arXiv): https://arxiv.org/abs/2110.07654
- [Preprint (arXiv)](https://arxiv.org/abs/2110.07654)

- BibTex entry:
```latex
libs/residual2vec/README.md: 72 changes (69 additions, 3 deletions)
@@ -6,7 +6,7 @@ residual2vec is an algorithm to embed networks to a vector space while controlli

- S. Kojaku, J. Yoon, I. Constantino, and Y.-Y. Ahn, Residual2Vec: Debiasing graph embedding using random graphs. NeurIPS (2021). [link will be added when available]

- Preprint (arXiv): [link to arXiv]
- [Preprint (arXiv)](https://arxiv.org/abs/2110.07654)

- BibTex entry:
```latex
@@ -34,19 +34,26 @@ This code is tested in Python 3.7 and 3.8 and depends on the following packages:

```
- numpy==1.20.3
- numpy==1.19.0
- scipy==1.7.1
- scikit-learn==1.0
- faiss-cpu==1.7.0
- numba==0.50.0
- torch==1.10.0
- tqdm==4.48.2
```


## Example

residual2vec has two versions: one optimized with matrix factorization, and the other with a stochastic gradient descent algorithm.

The matrix-factorization version is the one used in the original paper and runs faster than the SGD version for networks of up to 100k nodes.

```python
import residual2vec as rv

model = rv.residual2vec(window_length = 10, group_membership = None)
model = rv.residual2vec_matrix_factorization(window_length = 10, group_membership = None)
model.fit(G)
emb = model.transform(dim = 64)
# or equivalently emb = model.fit(G).transform(dim = 64)
@@ -56,3 +63,62 @@ emb = model.transform(dim = 64)
- `group_membership`: an array of node labels. Used to remove the structural bias correlated with the node labels.
- `dim`: Dimension of the embedding
- `emb`: 2D numpy array of shape (`N`, `dim`), where `N` is the number of nodes. The `i`th row in the array (i.e., `emb[i, :]`) represents the embedding vector of the `i`th node in the given adjacency matrix `G`.
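
As a concrete illustration of the parameters above, here is a minimal end-to-end sketch using the matrix-factorization version. The karate-club network and the `networkx` conversion are assumptions made for this example; only the `residual2vec_matrix_factorization` API shown above comes from this README.

```python
import networkx as nx
import residual2vec as rv

# Example input (an assumption for this sketch): the Zachary karate club
# network as a scipy sparse adjacency matrix with N = 34 nodes.
G = nx.adjacency_matrix(nx.karate_club_graph())

model = rv.residual2vec_matrix_factorization(window_length = 10, group_membership = None)
emb = model.fit(G).transform(dim = 64)

print(emb.shape)  # (34, 64): one 64-dimensional embedding vector per node
```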


A limitation of the matrix-factorization-based implementation is that it is memory demanding, especially for dense or large networks.
The other version circumvents this problem by using a stochastic gradient descent (SGD) algorithm, which
incrementally updates the embedding with small chunks of data instead of deriving the whole embedding in one go.

```python
import residual2vec as rv

noise_sampler = rv.ConfigModelNodeSampler() # sampler for the negative sampling

model = rv.residual2vec_sgd(noise_sampler, window_length = 10)
model.fit(G)
emb = model.transform(dim = 64)
# or equivalently emb = model.fit(G).transform(dim = 64)
```

`residual2vec_sgd` takes an additional argument, `noise_sampler`, a class that samples context nodes for a given center node.
Several samplers are implemented in this package:
- `ErdosRenyiNodeSampler`: Sampler based on the Erdős-Rényi random graph (i.e., samples context nodes uniformly at random)
- `ConfigModelNodeSampler`: Sampler based on the configuration model (i.e., samples context nodes with probability proportional to their degree)
- `SBMNodeSampler`: Sampler based on the stochastic block model (i.e., samples context nodes according to a stochastic block model defined by a given group membership)

The `SBMNodeSampler` is useful for removing the bias arising from group structure in networks (i.e., structure correlated with a discrete label of nodes):

```python
import residual2vec as rv

group_membership = [0,0,0,0,1,1,1,1]
noise_sampler = rv.SBMNodeSampler(window_length = 10, group_membership=group_membership) # sampler for the negative sampling

model = rv.residual2vec_sgd(noise_sampler, window_length = 10)
model.fit(G)
emb = model.transform(dim = 64)
# or equivalently emb = model.fit(G).transform(dim = 64)
```

An added bonus of the SGD-based approach is that it offers a way to customize the noise distribution, which is useful for removing a particular bias from the embedding.
To do so, implement a class that inherits from `rv.NodeSampler`:

```python
import residual2vec as rv
class CustomNodeSampler(rv.NodeSampler):
    def fit(self, A):
        # Fit the sampler
        # :param A: adjacency matrix
        # :type A: scipy.csr_matrix
        pass

    def sampling(self, center_node, n_samples):
        # Sample context nodes from the graph for center nodes
        # :param center_node: ID of center node
        # :type center_node: int
        # :param n_samples: number of samples per center node
        # :type n_samples: int
        pass
```
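
For illustration only, here is a minimal sketch of such a custom sampler that draws context nodes uniformly at random, similar in spirit to `ErdosRenyiNodeSampler`. This class is not part of the package; the array-valued `center_nodes` argument and the `(n_center_nodes, n_samples)` return shape are assumptions modeled on `SBMNodeSampler.sampling` in `node_samplers.py`.

```python
import numpy as np
import residual2vec as rv


class UniformNodeSampler(rv.NodeSampler):
    """Toy sampler that draws context nodes uniformly at random (illustration only)."""

    def fit(self, A):
        # A is a scipy.sparse.csr_matrix adjacency matrix; only the node count is needed here.
        self.n_nodes = A.shape[0]

    def sampling(self, center_nodes, n_samples):
        # Return one row of n_samples context nodes per center node, mirroring
        # the return shape used by SBMNodeSampler.sampling in node_samplers.py.
        center_nodes = np.asarray(center_nodes).reshape(-1)
        return np.random.randint(
            0, self.n_nodes, size=(center_nodes.size, n_samples)
        ).astype(np.int64)
```

Such a sampler would then be passed to the model in the same way as the built-in ones, e.g., `model = rv.residual2vec_sgd(UniformNodeSampler(), window_length = 10)`.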

See `residual2vec/node_samplers.py` for examples.
libs/residual2vec/requirements.txt: 3 changes (3 additions, 0 deletions)
@@ -2,3 +2,6 @@ faiss-cpu
numpy
scikit-learn
scipy
numba
torch
tqdm
libs/residual2vec/residual2vec/__init__.py: 2 changes (2 additions, 0 deletions)
@@ -1 +1,3 @@
from residual2vec.node_samplers import *
from residual2vec.residual2vec import *
from residual2vec.residual2vec_sgd import *
libs/residual2vec/residual2vec/node_samplers.py: 114 changes (114 additions, 0 deletions)
@@ -0,0 +1,114 @@
"""Graph module to store a network and generate random walks from it."""
import numpy as np
from scipy import sparse

from residual2vec import utils


class NodeSampler:
def fit(self, A):
"""Fit the sampler.
:param A: adjacency matrix
:type A: scipy.csr_matrix
:raises NotImplementedError: if the subclass does not implement this method
"""
raise NotImplementedError

def sampling(self, center_node, n_samples):
"""Sample context nodes from the graph for center nodes.
:param center_node: ID of center node
:type center_node: int
:param n_samples: number of samples per center node
:type n_samples: int
:raises NotImplementedError: if the subclass does not implement this method
"""
raise NotImplementedError


class SBMNodeSampler(NodeSampler):
"""Node Sampler based on the stochatic block model."""

def __init__(
self, window_length=10, group_membership=None, dcsbm=True,
):
"""Node Sampler based on the stochatic block model.
:param window_length: length of the context window, defaults to 10
:type window_length: int, optional
:param group_membership: group membership of nodes, defaults to None
:type group_membership: np.ndarray, optional
:param dcsbm: Set dcsbm=True to take into account the degree of nodes, defaults to True
:type dcsbm: bool, optional
"""
if group_membership is None:
self.group_membership = None
else:
self.group_membership = np.unique(group_membership, return_inverse=True)[
1
] # reindex
self.window_length = window_length
self.dcsbm = dcsbm

def fit(self, A):
"""Initialize the dcSBM sampler."""

# Initialize the parameters
self.n_nodes = A.shape[0]

# Initialize the group membership
if self.group_membership is None:
self.group_membership = np.zeros(self.n_nodes, dtype=np.int64)
self.node2group = utils.to_member_matrix(self.group_membership)
else:
self.node2group = utils.to_member_matrix(self.group_membership)

indeg = np.array(A.sum(axis=0)).reshape(-1)
Lambda = (self.node2group.T @ A @ self.node2group).toarray()
Din = np.array(np.sum(Lambda, axis=0)).reshape(-1)
Nin = np.array(self.node2group.sum(axis=0)).reshape(-1)
Psbm = np.einsum(
"ij,i->ij", Lambda, 1 / np.maximum(1, np.array(np.sum(Lambda, axis=1)))
)
Psbm_pow = utils.matrix_sum_power(Psbm, self.window_length) / self.window_length

if self.dcsbm:
self.block2node = (
sparse.diags(1 / np.maximum(1, Din))
@ sparse.csr_matrix(self.node2group.T)
@ sparse.diags(indeg)
)
else:
self.block2node = sparse.diags(1 / np.maximum(1, Nin)) @ sparse.csr_matrix(
self.node2group.T
)

# From block to block
self.block2block = sparse.csr_matrix(Psbm_pow)
self.block2block.data = utils._csr_row_cumsum(
self.block2block.indptr, self.block2block.data
)

# From block to node
self.block2node.data = utils._csr_row_cumsum(
self.block2node.indptr, self.block2node.data
)

def sampling(self, center_nodes, n_samples):
_center_nodes = np.repeat(center_nodes, n_samples)
block_ids = utils.csr_sampling(
self.group_membership[_center_nodes], self.block2block
)
context = utils.csr_sampling(block_ids, self.block2node)
return context.astype(np.int64).reshape((-1, n_samples))


class ConfigModelNodeSampler(SBMNodeSampler):
def __init__(self):
super(ConfigModelNodeSampler, self).__init__(window_length=1, dcsbm=True)


class ErdosRenyiNodeSampler(SBMNodeSampler):
def __init__(self):
super(ErdosRenyiNodeSampler, self).__init__(window_length=1, dcsbm=False)
