Implementation of DeepWalk Algorithm to generate Node Embeddings.

In [13]:
import networkx as nx
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
%matplotlib inline

In [162]:
dataset = "/home/student/Vansh/GraphGAN/CA-GrQc Dataset/CA-GrQc_train.txt"

In [163]:
df = pd.read_csv(dataset,
                sep = '\t',
                names = ["NodeIDfrom", "NodeIDto"],
                )
df

Unnamed: 0,NodeIDfrom,NodeIDto
0,4095,546
1,3213,2059
2,1882,3269
3,4897,2661
4,2621,1718
...,...,...
13041,830,5103
13042,1842,3805
13043,3468,1758
13044,4889,3784


5241

In [167]:
#create the graph networkx object from the above dataframe

G = nx.from_pandas_edgelist(df = df,
                             source = "NodeIDfrom",
                             target = "NodeIDto",
                             create_using=nx.Graph())
len(G)

5119

In [168]:
def partition_num(num, workers):
    if num % workers == 0:
        return [num // workers] * workers
    else:
        return [num // workers] * workers + [num % workers]

In [169]:
import itertools
import math
import random

import pandas as pd
from joblib import Parallel, delayed


class RandomWalker:
    def __init__(self, G, p=1, q=1, use_rejection_sampling=False):
        """
        :param G:
        :param p: Return parameter,controls the likelihood of immediately revisiting a node in the walk.
        :param q: In-out parameter,allows the search to differentiate between “inward” and “outward” nodes
        :param use_rejection_sampling: Whether to use the rejection sampling strategy in node2vec.
        """
        self.G = G
        self.p = p
        self.q = q
        self.use_rejection_sampling = use_rejection_sampling

    def deepwalk_walk(self, walk_length, start_node):

        walk = [start_node]

        while len(walk) < walk_length:
            cur = walk[-1]
            cur_nbrs = list(self.G.neighbors(cur))
            if len(cur_nbrs) > 0:
                walk.append(random.choice(cur_nbrs))
            else:
                break
        return walk
    
    def simulate_walks(self, num_walks, walk_length, workers=1, verbose=0):

        G = self.G

        nodes = list(G.nodes())

        results = Parallel(n_jobs=workers, verbose=verbose, )(
            delayed(self._simulate_walks)(nodes, num, walk_length) for num in
            partition_num(num_walks, workers))

        walks = list(itertools.chain(*results))

        return walks

    def _simulate_walks(self, nodes, num_walks, walk_length, ):
        walks = []
        for _ in range(num_walks):
            random.shuffle(nodes)
            for v in nodes:
                if self.p == 1 and self.q == 1:
                    walks.append(self.deepwalk_walk(
                        walk_length=walk_length, start_node=v))
        return walks

In [170]:


class DeepWalk:
    def __init__(self, graph, walk_length, num_walks, workers=1):

        self.graph = graph
        self.w2v_model = None
        self._embeddings = {}

        self.walker = RandomWalker(
            graph, p=1, q=1, )
        self.sentences = self.walker.simulate_walks(
            num_walks=num_walks, walk_length=walk_length, workers=workers, verbose=1)

    def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):

        kwargs["sentences"] = self.sentences
        kwargs["min_count"] = kwargs.get("min_count", 0)
        kwargs["vector_size"] = embed_size
        kwargs["sg"] = 1  # skip gram
        kwargs["hs"] = 1  # deepwalk use Hierarchical Softmax
        kwargs["workers"] = workers
        kwargs["window"] = window_size
        kwargs["epochs"] = iter

        print("Learning embedding vectors...")
        model = gensim.models.Word2Vec(**kwargs)
        print("Learning embedding vectors done!")

        self.w2v_model = model
        return model

In [291]:
deepwalk = DeepWalk(G, walk_length=10, num_walks=10)
sentences = deepwalk.sentences

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s finished


In [292]:
import gensim
model = gensim.models.Word2Vec(sentences=sentences,
                 vector_size=50,
                 epochs=3,
                 window = 5,
                 compute_loss=True)

In [293]:
embeddings = []
for i in G.nodes():
    embeddings.append(model.wv[i]) #/np.linalg.norm(model.wv[i]))
embeddings = np.array(embeddings)
embeddings[0], np.max(embeddings)

(array([ 0.36336377,  0.21993694, -0.54842895,  0.14171971, -0.76210135,
        -0.40528682, -0.07366546,  0.28831422, -0.6714414 ,  0.21538393,
        -0.05992492,  0.05475014, -0.43771958,  0.4424864 , -0.11950689,
         0.01076721,  0.1552783 , -0.44117147,  0.36395305, -0.20703627,
        -0.12031038,  0.0050295 ,  0.37687257, -0.3541179 ,  0.23381917,
        -0.1045076 , -0.20237887,  0.03758082, -0.42053217,  0.11995432,
         0.35173428, -0.19082142, -0.01299398,  0.40715382,  0.08225609,
         0.55181956,  0.13224381,  0.5249842 ,  0.23315273, -0.5763551 ,
        -0.1488933 ,  0.25717974,  0.14607301,  0.22010419,  0.9429984 ,
         0.22557394,  0.01911211, -1.0640045 , -0.22505623,  0.21754292],
       dtype=float32),
 4.228395)

In [290]:
import os
embedding_filename = r"/home/student/Vansh/GraphGAN/Pre-Train Embeddings/deepwalk_embeddings.emb"
index = np.array(G.nodes()).reshape(-1, 1)
embedding_matrix = np.hstack([index, embeddings])
embedding_list = embedding_matrix.tolist()
embedding_str = [str(int(emb[0])) + " " + " ".join([str(x) for x in emb[1:]]) + "\n"
                  for emb in embedding_list]
with open(embedding_filename, "w+") as f:
    lines = [str(5119) + "\t" + str(50) + "\n"] + embedding_str
    f.writelines(lines)
