In [None]:
import numpy as np
import networkx as nx
from collections import defaultdict
import scipy.io
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from scipy.sparse.linalg import svds
from copy import deepcopy
from collections import Counter
from tqdm import tqdm
import random
from scipy.special import softmax

In [None]:
import os
# Work from the project's code directory so that the relative '../data/...'
# paths used later in this notebook resolve. The absolute path is
# machine-specific (presumably a Google Drive mount in Colab -- confirm before
# running anywhere else).
os.chdir('/content/drive/My Drive/wstm/code')
# IPython shell escape: print the working directory to confirm the chdir.
!pwd

/content/drive/My Drive/wstm/code


In [None]:
# Hyper-parameters for walk generation and word2vec training.
# They are read as globals by the cell that calls generate_model_*.
p=0.50 # Return parameter: a step back to the previous node is scored 1/p
q=0.25 # In-out parameter: every other candidate step is scored 1/q
walks=20 # Number of walks started from each node
length=30 # Target number of nodes in each walk
d=64 # Dimension of output embeddings
window=10 # Window size for word2vec
workers=4 # Worker count handed to word2vec (the walk generators in this notebook do not take it)


In [None]:
def compute_transition_probs(G, probs, p, q):
    """
    Build the second-order transition table used by the biased random walk.

    For every ordered pair of adjacent nodes (s, c), each neighbor of c gets an
    unnormalized score: 1/p when that neighbor is s itself (the walker steps
    back to where it came from) and 1/q for any other neighbor. The scores are
    then divided by their sum so they form a probability distribution.
    No separate score is assigned to neighbors of c that are also adjacent to
    s; they are scored 1/q like every other non-returning step.

    Inputs:
    G: graph object exposing nodes() and neighbors(node).
    probs: nested dictionary used as the starting point of the result. It is
           deep-copied and never modified. It may be empty: missing per-node
           entries are created on demand.
    p: the return parameter
    q: the in-out parameter

    Return:
    transition_probs: nested dictionary keyed by str(node) where
    transition_probs[str(s)][str(c)] is a list of floats, one per neighbor of
    c, in the same order as list(G.neighbors(c)). Entry i is the probability of
    moving from c to its i-th neighbor given that the walker arrived from s.
    """
    transition_probs = deepcopy(probs)
    for s in G.nodes():
        # setdefault keeps this working when `probs` is a plain dict that was
        # not pre-seeded with one entry per node.
        per_source = transition_probs.setdefault(str(s), {})
        for c in G.neighbors(s):
            scores = np.array(
                [1 / p if nxt == s else 1 / q for nxt in G.neighbors(c)],
                dtype=float,
            )
            # Normalize once per (s, c) pair instead of re-summing per element.
            per_source[str(c)] = (scores / scores.sum()).tolist()
    return transition_probs

In [None]:
def metapath2vec_fairwalk(G, transition_probs, walks_per_node, length, ratings_as_weight, fairwalk_strategy_flag):
    """
    Simulate random walks whose step rule depends on the kind of node the
    walker is standing on.

    Step rules:
    - From a user node the next node is drawn from its neighbors, either with
      probabilities softmax(edge 'weight') when ratings_as_weight is truthy or
      uniformly otherwise.
    - From a movie node the next node is drawn uniformly from its neighbors.
      When fairwalk_strategy_flag is truthy the candidates are first narrowed
      to one gender group ('M' or 'F', read from the 'gender' node attribute)
      picked by a fair coin toss, falling back to the other group when the
      picked one is empty. Candidates whose gender is neither 'M' nor 'F' are
      dropped by that narrowing.

    Inputs:
    G: graph exposing nodes(data=True), nodes[n]['gender'], neighbors(n) and
       G[u][v]['weight']. Node names must be strings: a start node counts as a
       user when its name starts with 'user', and a later node counts as a
       movie when its name starts with 'movie'.
    transition_probs: not read by this walker.
    walks_per_node: number of walks started from every node.
    length: target number of nodes per walk. A walk always holds at least its
            start node and one neighbor, and stops early at a dead end.
    ratings_as_weight: see step rules.
    fairwalk_strategy_flag: see step rules.

    Return:
    walks: list of walks, each a list of node names converted with str(),
    shuffled with np.random.shuffle. Nodes without neighbors (users as well as
    movies) contribute no walks.
    """

    def fairwalk_equitable_selection(G, nbrs_x):
        # Coin toss between the two gender groups; use the other group when
        # the tossed one has no members.
        men = [x for x in nbrs_x if G.nodes[x]['gender'] == 'M']
        women = [x for x in nbrs_x if G.nodes[x]['gender'] == 'F']
        if random.random() >= 0.5:
            preferred, fallback = women, men
        else:
            preferred, fallback = men, women
        return preferred if len(preferred) > 0 else fallback

    def next_node(current, from_user):
        # One walk step out of `current`; None signals a dead end.
        candidates = list(G.neighbors(current))
        if len(candidates) == 0:
            return None
        if from_user:
            if ratings_as_weight:
                rating_probs = softmax(np.array([G[current][nbr]['weight'] for nbr in candidates]))
                return np.random.choice(candidates, p=rating_probs)
            return np.random.choice(candidates)
        if fairwalk_strategy_flag:
            candidates = fairwalk_equitable_selection(G, candidates)
        return np.random.choice(candidates)

    walks = []
    for start, _attrs in tqdm(G.nodes(data=True)):
        for _ in range(walks_per_node):
            first = next_node(start, str.startswith(start, 'user'))
            if first is None:
                # Isolated node: nothing to walk to, so skip all of its walks.
                break
            this_walk = [start, first]
            while len(this_walk) < length:
                tail = this_walk[-1]
                following = next_node(tail, not str.startswith(tail, 'movie'))
                if following is None:
                    break
                this_walk.append(following)
            walks.append(this_walk)
    # np.random.choice hands back numpy string scalars; word2vec wants str.
    walks = [[str(j) for j in walk] for walk in walks]
    np.random.shuffle(walks)
    return walks

In [None]:
def metapath2vec_node2vec_fairwalk(G, transition_probs, walks_per_node, length, ratings_as_weight, fairwalk_strategy_flag):
    """
    Simulate random walks that combine second-order transition probabilities
    with optional rating weighting and gender-balanced candidate selection.

    First step (no previous node yet):
    - From a user node: softmax(edge 'weight') over its neighbors when
      ratings_as_weight is truthy, uniform otherwise.
    - From any other node: uniform over its neighbors, optionally narrowed to
      one gender group first (see below).

    Later steps, with s the previous node, c the current node and
    t = transition_probs[str(s)][str(c)]:
    - c is a movie: candidates are optionally narrowed to one gender group and
      the matching entries of t are kept; the next node is drawn with
      probabilities softmax(kept entries).
    - c is not a movie and ratings_as_weight is truthy: probabilities are
      softmax(t * edge 'weight').
    - c is not a movie and ratings_as_weight is falsy: t is used directly
      (renormalized to guard against rounding).

    Gender narrowing (fairwalk_strategy_flag truthy): a fair coin toss picks
    the 'M' or the 'F' group (read from the 'gender' node attribute), falling
    back to the other group when the picked one is empty. Candidates whose
    gender is neither value are dropped.

    Inputs:
    G: graph exposing nodes(data=True), nodes[n]['gender'], neighbors(n) and
       G[u][v]['weight']. Node names must be strings; kinds are recognised
       with startswith('user') / startswith('movie').
    transition_probs: nested dict where transition_probs[str(s)][str(c)] is a
       sequence aligned with list(G.neighbors(c)).
    walks_per_node: number of walks started from every node.
    length: target number of nodes per walk. A walk always holds at least its
            start node and one neighbor, and stops early at a dead end.
    ratings_as_weight, fairwalk_strategy_flag: see above.

    Return:
    walks: list of walks, each a list of node names converted with str(),
    shuffled with np.random.shuffle. Nodes without neighbors contribute no
    walks.

    Raises:
    TypeError: when a step beyond the second node is needed and
    transition_probs is None.
    """

    def fairwalk_equitable_selection(G, nbrs_x):
        # Returns both the kept positions and the kept nodes so the caller can
        # pick the matching transition probabilities.
        men_indices = [i for i, x in enumerate(nbrs_x) if G.nodes[x]['gender'] == 'M']
        women_indices = [i for i, x in enumerate(nbrs_x) if G.nodes[x]['gender'] == 'F']
        if random.random() >= 0.5:
            preferred, fallback = women_indices, men_indices
        else:
            preferred, fallback = men_indices, women_indices
        indices = preferred if len(preferred) > 0 else fallback
        return indices, [nbrs_x[i] for i in indices]

    def first_step(start):
        # Choose the second node of a walk; None signals an isolated start.
        nbrs_s = list(G.neighbors(start))
        if len(nbrs_s) == 0:
            return None
        if str.startswith(start, 'user'):
            if ratings_as_weight:
                rating_probs = softmax(np.array([G[start][nbr]['weight'] for nbr in nbrs_s]))
                return np.random.choice(nbrs_s, p=rating_probs)
            return np.random.choice(nbrs_s)
        if fairwalk_strategy_flag:
            _, nbrs_s = fairwalk_equitable_selection(G, nbrs_s)
        # The draw happens whether or not the candidates were narrowed.
        return np.random.choice(nbrs_s)

    def select_d(G, this_walk, ratings_as_weight, transition_probs):
        this_s, this_c = this_walk[-2], this_walk[-1]
        nbrs_c = list(G.neighbors(this_c))
        if len(nbrs_c) == 0:
            return None
        if transition_probs is None:
            raise TypeError(
                "transition_probs is required by metapath2vec_node2vec_fairwalk; "
                "build it with compute_transition_probs"
            )
        # Aligned position-by-position with nbrs_c.
        edge_probs = transition_probs[str(this_s)][str(this_c)]

        if str.startswith(this_c, 'movie'):
            if fairwalk_strategy_flag:
                indices, nbrs_c = fairwalk_equitable_selection(G, nbrs_c)
                probs = [edge_probs[i] for i in indices]
            else:
                # No narrowing: every neighbor keeps its own probability.
                probs = edge_probs
            return np.random.choice(nbrs_c, p=softmax(np.array(probs)))

        if ratings_as_weight:
            ratings = np.array([G[this_c][nbr]['weight'] for nbr in nbrs_c])
            return np.random.choice(nbrs_c, p=softmax(np.multiply(edge_probs, ratings)))

        # Already a distribution over nbrs_c; renormalize so that rounding
        # error can never trip np.random.choice's sum-to-one check.
        plain = np.asarray(edge_probs, dtype=float)
        return np.random.choice(nbrs_c, p=plain / plain.sum())

    walks = []
    for start, _attrs in tqdm(G.nodes(data=True)):
        for _ in range(walks_per_node):
            c = first_step(start)
            if c is None:
                # Isolated node: nothing to walk to, so skip all of its walks.
                break
            this_walk = [start, c]
            while len(this_walk) < length:
                d = select_d(G, this_walk, ratings_as_weight, transition_probs)
                if d is None:
                    break
                this_walk.append(d)
            walks.append(this_walk)
    # np.random.choice hands back numpy string scalars; word2vec wants str.
    walks = [[str(j) for j in walk] for walk in walks]
    np.random.shuffle(walks)
    return walks

In [None]:
def generate_embeddings(walks, dimensions, window_size, num_workers, p, q):
    """
    Train a skip-gram word2vec model on the simulated walks and return it
    together with its keyed vectors.
    Please refer to https://radimrehurek.com/gensim/models/word2vec.html for more information about word2vec.

    walks: Simulated random walks (each walk is a list of node-name strings)
    dimensions: Output dimension of node embeddings
    window_size: Window size for word2vec
    num_workers: Number of workers passed to word2vec
    p: Return parameter (accepted for signature compatibility; not used here)
    q: In out parameter (accepted for signature compatibility; not used here)

    Return:
    model: the learned word2vec model
    embeddings: model.wv, the embeddings of all nodes seen in the walks
    """
    import inspect

    # gensim 4.0 renamed the `size` keyword to `vector_size`; pick whichever
    # name the installed Word2Vec constructor actually declares.
    init_params = inspect.signature(Word2Vec.__init__).parameters
    dim_keyword = "vector_size" if "vector_size" in init_params else "size"

    model = Word2Vec(
        walks,
        window=window_size,
        min_count=0,  # keep every node, even ones that appear only once
        sg=1,         # skip-gram
        workers=num_workers,
        seed=0,
        **{dim_keyword: dimensions},
    )
    embeddings = model.wv

    return model, embeddings

In [None]:
def generate_model_metapath2vec_fairwalk(G, probs, p, q, walks, length, workers, d, window, trans_probs, ratings_as_weight, fairwalk_strategy_flag):
    """
    Run the metapath2vec + fairwalk pipeline: optional transition table,
    walk simulation, then word2vec training.

    Inputs:
    G: graph to walk on
    probs: nested dictionary handed to compute_transition_probs
    p: Return parameter
    q: In out parameter
    walks: Number of walks started from each node
    length: Length of each walk
    workers: Number of workers handed to generate_embeddings
    d: Output dimension of node embeddings
    window: Window size for word2vec
    trans_probs: when truthy, compute_transition_probs is called and its
                 result is passed to the walker; otherwise None is passed
    ratings_as_weight: forwarded to metapath2vec_fairwalk
    fairwalk_strategy_flag: forwarded to metapath2vec_fairwalk

    Steps:
    1. Call compute_transition_probs (only when trans_probs is truthy).
    2. Call metapath2vec_fairwalk.
    3. Call generate_embeddings.

    Return:
    walk: simulated walks from each node
    model: word2vec model
    embeddings: embeddings generated from the model
    """
    table = None
    if trans_probs:
        print("\nBegin computing transition probabilities")
        table = compute_transition_probs(G, probs, p, q)
        print("\nComputed transition probabilities")

    simulated = metapath2vec_fairwalk(
        G, table, walks, length, ratings_as_weight, fairwalk_strategy_flag
    )
    print("\nGenerated walks")

    trained, vectors = generate_embeddings(simulated, d, window, workers, p, q)
    return simulated, trained, vectors

In [None]:
def generate_model_metapath2vec_node2vec_fairwalk(G, probs, p, q, walks, length, workers, d, window, trans_probs, ratings_as_weight, fairwalk_strategy_flag):
    """
    Run the metapath2vec + node2vec + fairwalk pipeline: optional transition
    table, walk simulation, then word2vec training.

    Inputs:
    G: graph to walk on
    probs: nested dictionary handed to compute_transition_probs
    p: Return parameter
    q: In out parameter
    walks: Number of walks started from each node
    length: Length of each walk
    workers: Number of workers handed to generate_embeddings
    d: Output dimension of node embeddings
    window: Window size for word2vec
    trans_probs: when truthy, compute_transition_probs is called and its
                 result is passed to the walker; otherwise None is passed.
                 NOTE(review): the walker in this notebook indexes the table
                 on every step past the second node, so a falsy value only
                 works for walks of at most two nodes.
    ratings_as_weight: forwarded to metapath2vec_node2vec_fairwalk
    fairwalk_strategy_flag: forwarded to metapath2vec_node2vec_fairwalk

    Steps:
    1. Call compute_transition_probs (only when trans_probs is truthy).
    2. Call metapath2vec_node2vec_fairwalk.
    3. Call generate_embeddings.

    Return:
    walk: simulated walks from each node
    model: word2vec model
    embeddings: embeddings generated from the model
    """
    table = None
    if trans_probs:
        print("\nBegin computing transition probabilities")
        table = compute_transition_probs(G, probs, p, q)
        print("\nComputed transition probabilities")

    simulated = metapath2vec_node2vec_fairwalk(
        G, table, walks, length, ratings_as_weight, fairwalk_strategy_flag
    )
    print("\nGenerated walks")

    trained, vectors = generate_embeddings(simulated, d, window, workers, p, q)
    return simulated, trained, vectors

In [None]:
# Load the training graph. The file is a pickle: unpickling executes code
# embedded in the file, so only load graph files you trust.
if hasattr(nx, "read_gpickle"):
    G = nx.read_gpickle("../data/train_graph.gpickle")
else:
    # nx.read_gpickle was removed in NetworkX 3.0; the replacement is a plain
    # pickle.load on the same file.
    import pickle
    with open("../data/train_graph.gpickle", "rb") as graph_file:
        G = pickle.load(graph_file)

# Seed the transition table with one empty entry per node, keyed by str(node).
probs = defaultdict(dict)
for node in G.nodes():
    probs[str(node)] = dict()

In [None]:
# Full pipeline run; the three switches are spelled out by name so the call
# documents which variant is being trained.
walk, model, embeddings = generate_model_metapath2vec_node2vec_fairwalk(
    G, probs, p, q, walks, length, workers, d, window,
    trans_probs=True,
    ratings_as_weight=True,
    fairwalk_strategy_flag=True,
)

In [None]:
# Sanity check: list the nodes whose embeddings are closest to 'user1'.
print(model.wv.most_similar('user1')) # Output node names are always strings

# Save the embeddings (keyed vectors) in word2vec text format for later use.
# NOTE(review): the vectors go to a file ending in `.model` while the full
# model below goes to `.embeddings`; the suffixes look swapped. Check whatever
# loads these files before renaming.
# NOTE(review): the training call in this notebook passes
# ratings_as_weight=True, yet the output directory is named
# `..._without_ratings_as_weight` -- confirm this is the intended location.
model.wv.save_word2vec_format('../data/metapath2vec_node2vec_without_ratings_as_weight/train_metapath2vec_node2vec_without_ratings_as_weight.model')

# Save the complete word2vec model (gensim's native format) for later use.
model.save('../data/metapath2vec_node2vec_without_ratings_as_weight/train_metapath2vec_node2vec_without_ratings_as_weight.embeddings')

[('movie104', 0.6251944303512573), ('movie247', 0.5665442943572998), ('user561', 0.5582504272460938), ('user778', 0.5413044691085815), ('user566', 0.5282551050186157), ('user453', 0.5131206512451172), ('user330', 0.5012733936309814), ('movie102', 0.5004125833511353), ('user254', 0.49272146821022034), ('user23', 0.4919901192188263)]
