# DEEPWALK IMPLEMENTATION

In [10]:
PATH = '/Users/silviaarellanogarcia/Documents/MSc MACHINE LEARNING/Advanced Machine Learning/Project/Datasets/BlogCatalog-dataset'
SAVE_PATH = '/Users/silviaarellanogarcia/Documents/MSc MACHINE LEARNING/Advanced Machine Learning/Project/embeddings_deepwalk'

In [2]:
!pip install umap-learn
!pip install networkx
!pip install gensim
!pip install seaborn matplotlib pandas
!pip install scikit-learn
!pip install scikit-multilearn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgr

In [3]:
import networkx as nx
import random
from gensim.models import Word2Vec # Needed for Word2Vec
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.model_selection import KFold
from sklearn.multiclass import OneVsRestClassifier

from multiprocessing import cpu_count

from concurrent.futures import ProcessPoolExecutor
from six import string_types
from gensim.models.word2vec import Vocab

from collections import defaultdict
from scipy.sparse import coo_matrix
from sklearn.utils import shuffle as skshuffle

In [29]:
# HYPERPARAMETERS
window_size = 5
embedding_size = 128
walks_per_vertex = 10 ## 10 --> Value used in Khosla's comparative paper
walk_length = 40 ## 40 --> Value used in Khosla's comparative paper

### IMPLEMENTATION OF THE DEEPWALK METHOD.

In [5]:
def Random_Walk(G, vi, t):
  '''
  Inputs:
    G: Graph
    vi: initial vertex of the random walk
    t: walk length (the walks could have different length according to the paper)
  Output:
    Wvi: sequence of vertices visited in the random walk (starting from vi).
  '''
  Wvi = []
  Wvi.append(str(vi)) # The initial vertex is always visited
  last_visited = vi

  for i in range(t):
    neighbors_last_vi = list(G.neighbors(last_visited))
    last_visited = random.choice(neighbors_last_vi)
    Wvi.append(str(last_visited))

  return Wvi

In [6]:
# Optimization method
class Skipgram(Word2Vec):
    """A subclass to allow more customization of the Word2Vec internals."""

    def __init__(self, **kwargs):

        self.vocabulary_counts = None

        kwargs["min_count"] = kwargs.get("min_count", 0)
        kwargs["workers"] = kwargs.get("workers", cpu_count())
        kwargs["vector_size"] = 128
        kwargs["sentences"] = kwargs.get("sentences", None)
        kwargs["window"] = kwargs.get("window", 10)
        kwargs["sg"] = 1
        kwargs["hs"] = 1

        super(Skipgram, self).__init__(**kwargs)

In [17]:
def DeepWalk(G, w, d, gamma, t):
  '''
  Inputs:
    G: Graph with vertices V and edges E
    w: window size
    d: embedding size
    gamma: walks per vertex
    t: walk length
  Outputs:
    phi: matrix of vertex representations
  '''
  vertices_in_G = list(G.nodes)
  Wvi = [] # Array where each element is a list of arrays containing the walks that start from that vertex.

  for i in range(0, gamma):
    random.shuffle(vertices_in_G) # This variable receives the name O in the paper
    for vi in vertices_in_G:
      Wvi.append(Random_Walk(G, vi, t))

  print("Finish random walk")

  phi_model = Skipgram(sentences=Wvi, window=w, min_count=0, trim_rule=None)

  return phi_model

In [None]:
# Subroutine to label a node with MAX-VOTE --> NOT NECESSARY. The original paper uses this as a baseline
def Max_Vote(G, node, k, L):
  '''
  node: node to label --> v in the paper
  neighbors: neighbors of the node to label --> N(v) in the paper
  k: Number of labels that will be assigned to the vertex
  L: total number of possible labels

  Output:
    k_lab: most frequent k labels
  '''
  freq_labels = np.zeros(L)
  k_lab = np.full(k , -1) # Set to -1 all the chosen labels
  neighbors = list(G.neighbors(node))

  for n in neighbors:
    labels_neighbor = G.nodes[n]['group'] # Is it necessary to check if the neighbor is empty, or it won't have any effect?
    print(labels_neighbor)
    for l in labels_neighbor:
      freq_labels[l] += 1

  used_labels = np.count_nonzero(freq_labels)
  for i in range(min(k, used_labels)):
    k_lab[i] = np.argmax(freq_labels)
    freq_labels[k_lab[i]] = -1 # We mark it as used, but at the same time we differentiate this label and the ones that haven't appeared.

  if(k > used_labels): # In case we have to assign more classes than the ones of our neighbors, we choose randomly
    zero_indices = np.where(freq_labels == 0)[0]
    random_zero_indices = np.random.choice(zero_indices, size=(k - used_labels), replace=False)
    k_lab[used_labels:k] = random_zero_indices

  return k_lab

In [8]:
# Some methods need to have all the labels inside an array. Depending on the case, that array has to be composed with arrays/sets containing the labels assigned to each node

def get_set_labels(G):
    '''
    Input:
        G: Graph
    Output:
        labels: List of sets containing labels. Each set corresponds to the groups of id = index + 1
    '''

    labels = []

    for n in G.nodes:
        l = set(G.nodes[n].get('group_belonging', []))  # Ensure an empty list is converted to an empty set
        labels.append(l)

    return labels

def get_array_labels(G):
  '''
  Input:
    G: Graph
  Output:
    labels: Array with labels. Each position of the array will correspond to the groups of id = index + 1
  '''

  labels = []

  for n in G.nodes:
      l = G.nodes[n].get('group_belonging')
      labels.append(l)
  return labels

### Running DeepWalk in the YouTube Dataset

In [12]:
# Load the data

edges_path = PATH + '/data/edges.csv'
nodes_path = PATH + '/data/nodes.csv'
groups_path = PATH + '/data/groups.csv'
group_edges_path = PATH + '/data/group-edges.csv'

In [13]:
nodes_id = pd.read_csv(nodes_path, header=None, names=['id'])
groups_id = pd.read_csv(groups_path, header=None, names=['group'])
edges = pd.read_csv(edges_path, header=None, names=['id_1', 'id_2'])
user_group_membership = pd.read_csv(group_edges_path, header=None, names=['id', 'group'])

In [14]:
# Create a graph
G_YT = nx.Graph()

# Add nodes to the graph
G_YT.add_nodes_from(nodes_id['id'])

# Add edges to the graph
G_YT.add_edges_from(edges[['id_1', 'id_2']].values)

In [15]:
# Create a dictionary to store groups for each ID
group_dict = {}

# Populate the group_dict
for _, row in user_group_membership.iterrows():
    user_id = row['id']
    group_id = row['group']

    # Check if the user_id is already in the dictionary
    if user_id in group_dict:
        group_dict[user_id].append(group_id)
    else:
        group_dict[user_id] = [group_id]

# Add group labels to the nodes
for user_id, groups in group_dict.items():
    nx.set_node_attributes(G_YT, {user_id: groups}, 'group_belonging')

# Print basic graph information
print("Number of nodes:", G_YT.number_of_nodes())
print("Number of edges:", G_YT.number_of_edges())


Number of nodes: 10312
Number of edges: 333983


In [30]:
# Graph: G_YT
phi_model_YT = DeepWalk(G_YT, window_size, embedding_size, walks_per_vertex, walk_length) # Phi represents the learned embedding matrix.

Finish random walk


In [None]:
# Save the model in Word2Vec format
phi_model_YT.wv.save_word2vec_format(SAVE_PATH + '/model.embedding')

In [40]:
# Generate the embedding vectors
phi_vectors_YT = phi_model_YT.wv.vectors
print(phi_vectors_YT)

[[ 3.9029792e-02 -7.4509203e-02  1.9705916e-02 ... -2.2880552e-02
  -1.1057488e-01  8.4063724e-02]
 [ 1.4815228e-01 -4.0864438e-02 -1.5822094e-02 ... -1.7346462e-02
  -1.2010539e-01 -9.8569125e-02]
 [ 1.1528076e-01 -1.6438906e-01  2.1793266e-01 ...  1.4944337e-01
   7.8751571e-02 -5.7726976e-02]
 ...
 [ 1.5521358e-02 -2.7485590e-02  2.1701198e-02 ... -6.0926676e-03
  -2.3639269e-01 -1.6306630e-01]
 [-1.0369864e-01 -3.0257350e-01  2.3209524e-01 ...  4.2347886e-02
  -8.6039029e-02  2.4708234e-02]
 [-1.8214239e-04 -8.4668294e-02  1.7950159e-02 ...  4.3078694e-02
  -2.1720786e-01  9.6658140e-02]]


In [None]:
# Save the vectors in a numpy file
np.save(SAVE_PATH + 'phi_vectors_YT.npy', phi_vectors_YT)

In [None]:
# LOAD PHI VECTORS YT FROM FILE
phi_vectors_YT = np.load(SAVE_PATH + '/phi_vectors_YT.npy')

In [41]:
# To improve the access to the adjacency matrix, we pass the info to a dictionary, where all the nodes have a entry, and the values aare the neighbours
def adj_dictionary(x):
    G = defaultdict(lambda: set())
    cx = x.tocoo() # Return a COOrdinate representation of this matrix
    for i,j,v in zip(cx.row, cx.col, cx.data):
        G[i].add(j)
    return {str(k): [str(x) for x in v] for k, v in G.items()}

In [42]:
class TopKRanker(OneVsRestClassifier):
    def predict(self, X, top_k_list):
        assert X.shape[0] == len(top_k_list)
        probs = np.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = []
        for i, k in enumerate(top_k_list):
            probs_ = probs[i, :]
            labels = self.classes_[probs_.argsort()[-k:]].tolist()
            all_labels.append(labels)
        return all_labels

In [43]:
def get_array_str_labels(G):
  '''
  Input:
    G: Graph
  Output:
    labels: Array with labels. Each position of the array will correspond to the groups of id = index + 1
  '''

  labels = []

  for n in G.nodes:
      l = G.nodes[n].get('group_belonging')
      labels.append(str(l))
  return labels

In [44]:
# Method used when the dataset doesn't have all the nodes labeled and we need to evaluate
def filter_phi(phi, labels_set):
    '''
    Input:
      phi: Embedding matrix
      labels_array: Array containing the

    Output:
      phi_new: Phi Embedding matrix containing only the rows of the labeled nodes
    '''
    label_np = np.array(labels_set) # Convert the set to a NumPy array

    indices_containing_labels = [bool(node_set) for node_set in label_np] # List with True for those indices that correspond to nodes with labels, and False for those which don't have any

    # Filter nodes with labels
    filtered_labels = label_np[indices_containing_labels] # Numpy array containing the labels of those nodes that are labeled

    # Get the indices to keep in the original order
    indices_to_keep = np.where(indices_containing_labels)[0] # Id's of the indices (id_nodes - 1) that have a label

    # Filter the embedding matrix
    filtered_phi = phi[indices_to_keep, :]

    return filtered_phi, filtered_labels

### EVALUATION

In [45]:
labels_sets = get_set_labels(G_YT)
new_phi, new_labels = filter_phi(phi_vectors_YT, labels_sets)

features_matrix = np.asarray([phi_model_YT.wv[str(node+1)] for node in range(len(G_YT))])

In [46]:
adj_G_YT = nx.adjacency_matrix(G_YT, nodelist=None, dtype=None, weight='weight') # Adjacency matrix: Square matrix of N x N size used to represent the connections between the edges of a graph.
adj_graph = adj_dictionary(adj_G_YT)
num_labels = len(groups_id)
mlb = MultiLabelBinarizer(classes=range(1, len(groups_id) + 1))

labels_bin = mlb.fit_transform(new_labels)
labels_bin = np.array(labels_bin)

shuffles = []
for x in range(10): # Decide how many shuffles I want to make
    shuffles.append(skshuffle(features_matrix, labels_bin)) # The sklearn shuffle shuffles arrays or sparse matrices in a consistent way.


all_results = defaultdict(list)

training_percents = [0.1, 0.5, 0.9]

for train_percent in training_percents:
    for shuf in shuffles:
        X, y = shuf ## X corresponds to the phi_vectors and y to the labels.

        training_size = int(train_percent * len(X))

        X_train = X[:training_size, :]
        y_train_ = y[:training_size]

        y_train = [[] for x in range(len(y_train_))]

        y_train_sparse = coo_matrix(y_train_)
        cy =  y_train_sparse.tocoo()

        for i, j in zip(cy.row, cy.col):
            y_train[i].append(j)

        assert sum(len(l) for l in y_train) == np.count_nonzero(y_train_)

        X_test = X[training_size:, :]
        y_test_ = y[training_size:]

        y_test = [[] for _ in range(len(y_test_))]

        y_test_sparse = coo_matrix(y_test_)
        cy =  y_test_sparse.tocoo()
        for i, j in zip(cy.row, cy.col):
            y_test[i].append(j)

        clf = TopKRanker(LogisticRegression()) # creates an instance of TopKRanker with a LogisticRegression model as the base classifier.
        clf.fit(X_train, y_train_)

        # find out how many labels should be predicted
        top_k_list = [len(l) for l in y_test]
        preds = clf.predict(X_test, top_k_list)

        results = {}
        averages = ["micro", "macro"]
        for average in averages:
            results[average] = f1_score(mlb.fit_transform(y_test), mlb.fit_transform(preds), average=average)

        all_results[train_percent].append(results)


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

In [39]:
print ('Results, using embeddings of dimensionality', X.shape[1])
print ('-------------------')
for train_percent in sorted(all_results.keys()):
    print ('Train percent:', train_percent)
for index, result in enumerate(all_results[train_percent]):
    print ('Shuffle #%d:   ' % (index + 1), result)
avg_score = defaultdict(float)
for score_dict in all_results[train_percent]:
    for metric, score in score_dict.items():
        avg_score[metric] += score
for metric in avg_score:
    avg_score[metric] /= len(all_results[train_percent])
print ('Average score:', dict(avg_score))
print ('-------------------')

Results, using embeddings of dimensionality 128
-------------------
Train percent: 0.1
Train percent: 0.5
Train percent: 0.9
Shuffle #1:    {'micro': 0.41722086104305217, 'macro': 0.28335589277764167}
Shuffle #2:    {'micro': 0.41140819964349373, 'macro': 0.29400088828594595}
Shuffle #3:    {'micro': 0.39352170916609236, 'macro': 0.24641517902967136}
Shuffle #4:    {'micro': 0.4069767441860465, 'macro': 0.26810620578097144}
Shuffle #5:    {'micro': 0.39710942876806604, 'macro': 0.27551900188635675}
Shuffle #6:    {'micro': 0.3849698688408366, 'macro': 0.2335600328613171}
Shuffle #7:    {'micro': 0.3978571428571429, 'macro': 0.27831425271751586}
Shuffle #8:    {'micro': 0.417910447761194, 'macro': 0.24580706933634586}
Shuffle #9:    {'micro': 0.40310624779385806, 'macro': 0.2812712832148978}
Shuffle #10:    {'micro': 0.42633451957295376, 'macro': 0.30187036638798037}
Average score: {'micro': 0.40564151696327355, 'macro': 0.27082201722786436}
-------------------
