In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from node2vec import Node2Vec
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from ipynb.fs.full.test_train_split import mask_test_edges
from scipy import sparse
import collections

In [2]:
# Load the Foursquare edge list. The columns used below are user_id, user2 and venue_id.
# NOTE(review): the preview shows leftover "Unnamed: 0" index column(s), presumably
# written by an earlier to_csv call without index=False - confirm and drop if unused.
foursquare_edgelist = pd.read_csv("../datasets/foursquare_edgelist.csv")

In [3]:
# Preview the first rows to check the column layout.
foursquare_edgelist.head()

Unnamed: 0.1,Unnamed: 0,user_id,venue_id,user2
0,0,123141,4840fe6bf964a52030501fe3,134107
1,1,123141,49f76cc2f964a5209d6c1fe3,134107
2,2,123141,4da7cf1981541df437af6cf7,134107
3,3,123141,4da7cf1981541df437af6cf7,134107
4,4,123141,4ad8795ff964a520a21121e3,134107


In [4]:
# (rows, columns) of the loaded edge list.
foursquare_edgelist.shape

(196270, 4)

In [5]:
# Build an undirected graph in which every row of the edge list contributes
# two edges: user_id - user2 and user_id - venue_id.
net = nx.Graph()

# Walk the three columns in lockstep instead of calling .iloc[i][col] four
# times per row (each such call materialises a whole row Series).
# The per-row order (user_id, user2, venue_id) is kept on purpose: the order in
# which nodes are first added is the order node_index_map enumerates them in.
for user, other_user, venue in zip(
    foursquare_edgelist['user_id'].tolist(),
    foursquare_edgelist['user2'].tolist(),
    foursquare_edgelist['venue_id'].tolist(),
):
    net.add_edge(user, other_user)
    net.add_edge(user, venue)

In [6]:
# Map each node label to its position in net.nodes(); this position is meant to
# be the node's row/column index in the adjacency matrix.
node_index_map = {label: position for position, label in enumerate(net.nodes())}

In [7]:
# Display the whole label -> index mapping (one entry per graph node, so the output is large).
node_index_map

{123141: 0,
 134107: 1,
 '4840fe6bf964a52030501fe3': 2,
 '49f76cc2f964a5209d6c1fe3': 3,
 '4da7cf1981541df437af6cf7': 4,
 '4ad8795ff964a520a21121e3': 5,
 '40f1d480f964a5206a0a1fe3': 6,
 '4858e403f964a520bf501fe3': 7,
 '4ca6342076d3a093b22fff6a': 8,
 '3fd66200f964a52005e71ee3': 9,
 '4569264af964a520de3d1fe3': 10,
 '3fd66200f964a5200ce41ee3': 11,
 '4197f180f964a520111e1fe3': 12,
 '4e498e05aeb7de71b38c15ff': 13,
 '45b9eeecf964a520db411fe3': 14,
 '3fd66200f964a520f6e41ee3': 15,
 '4aca9125f964a52060c220e3': 16,
 '4c7e5ec4d598a093dd83c562': 17,
 '4b53cfe9f964a52025ac27e3': 18,
 '4d10f5c038bb6ea8969bc7aa': 19,
 '4c2af1fa8ef52d7f95a930ba': 20,
 '4afefbd9f964a5207a3222e3': 21,
 '4cf406ab7bf3b60ca2176e7f': 22,
 '4e66ceadaeb7e985aa7b65a4': 23,
 '4fd49307e4b0b5691947492c': 24,
 '4c1573e1a1010f47841d4e18': 25,
 '50ad97c2e4b06b4737016a40': 26,
 '4d766d8f46a8b60cb0ee372a': 27,
 '4dcdba448877851243e6347f': 28,
 '4c53be6b06901b8d4da0a34a': 29,
 '50f6a5f0e4b017ec5540b9b5': 30,
 '50227b55e4b079961002339f'

In [13]:
# Seed NumPy's global RNG so the train-test split is consistent between notebooks.
np.random.seed(0)
adj_sparse = nx.adjacency_matrix(net)

# Perform the train / validation / test split of the edges.
# NOTE(review): test_frac and val_frac are presumably the fractions of edges held
# out - confirm in test_train_split. The cells that follow reassign adj_train and
# all six edge arrays from files on disk.
(
    adj_train,
    train_edges,
    train_edges_false,
    val_edges,
    val_edges_false,
    test_edges,
    test_edges_false,
) = mask_test_edges(adj_sparse, test_frac=0.3, val_frac=0.1)

In [8]:
# Load the training adjacency matrix from disk. This replaces the adj_train
# value produced by the mask_test_edges cell.
adj_train = sparse.load_npz("adj_train_matrix.npz")

In [9]:
# Read the saved positive and negative ("false") edge lists for each split as
# integer arrays. The files are loaded in the order listed.
(
    train_edges,
    train_edges_false,
    val_edges,
    val_edges_false,
    test_edges,
    test_edges_false,
) = (
    np.loadtxt(edge_file, dtype=int)
    for edge_file in (
        'train_edges.txt',
        'train_edges_false.txt',
        'val_edges.txt',
        'val_edges_false.txt',
        'test_edges.txt',
        'test_edges_false.txt',
    )
)

In [10]:
# Rebuild a NetworkX graph from the training adjacency matrix.
# nx.from_scipy_sparse_matrix was deprecated in NetworkX 2.7 and removed in 3.0
# in favour of nx.from_scipy_sparse_array, so prefer the new name and fall back
# to the old one only when the installed NetworkX predates it.
_from_scipy_sparse = getattr(nx, "from_scipy_sparse_array", None) or nx.from_scipy_sparse_matrix
g_train = _from_scipy_sparse(adj_train)

In [11]:
# Set up node2vec on the training graph. This constructor call is what emits
# the "Computing transition probabilities" progress bar.
# NOTE(review): p and q are node2vec's walk-bias hyperparameters - see the
# node2vec package docs for their exact semantics.
node2vec = Node2Vec(
    g_train,
    dimensions=128,
    walk_length=80,
    num_walks=10,
    workers=4,
    p=0.5,
    q=6,
)

Computing transition probabilities: 100%|██████████| 19638/19638 [00:15<00:00, 1267.32it/s]


In [12]:
# Train the embedding model. NOTE(review): window and min_count are presumably
# forwarded to gensim's Word2Vec - confirm against the node2vec docs.
model = node2vec.fit(window=10, min_count=1)

In [13]:
# Keep a handle on the trained vectors. They are looked up below by the node
# index rendered as a string.
emb_mappings = model.wv

In [14]:
# Persist the trained vectors to "user-friend.emb" in word2vec format.
model.wv.save_word2vec_format("user-friend.emb")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [15]:
# Node embeddings matrix: row i holds the embedding of node index i and the
# columns are the embedding features. Vectors are looked up by str(i).
emb_list = [
    emb_mappings[str(node_index)] for node_index in range(adj_train.shape[0])
]
emb_matrix = np.vstack(emb_list)

In [40]:
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``.

    The naive formula overflows in ``exp(-x)`` for large negative inputs
    (already around -89 for float32 values). This version only ever
    exponentiates non-positive numbers, so it never overflows.

    Args:
        x: A scalar or array-like of real numbers.

    Returns:
        A float64 scalar for scalar input, otherwise a float64 array with the
        same shape as ``x``.
    """
    values = np.asarray(x, dtype=float)
    # exp(-|x|) always lies in (0, 1], so it cannot overflow.
    decay = np.exp(-np.abs(values))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x). Both are the same function.
    result = np.where(values >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with an empty tuple turns a 0-d array back into a scalar and
    # leaves real arrays unchanged.
    return result[()]

# Plain dot-product score: entry (i, j) is the inner product of the embeddings
# of nodes i and j. This is a dense n x n matrix.
score_matrix = emb_matrix @ emb_matrix.T

In [56]:
# Rank candidate neighbours of a node by their dot-product score.
def get_top_10(node1, k=20):
    """Return the highest-scoring candidate neighbours of ``node1``.

    Scores are read from row ``node1`` of the module-level ``score_matrix``.
    Candidates are the node indices ``0 .. adj_train.shape[0] - 1`` except
    ``node1`` itself.

    Despite the function's name, the default is the top 20, the value that was
    previously hard-coded. Pass ``k`` to request a different number.

    Args:
        node1: Integer index of the query node.
        k: Maximum number of candidates to return. Defaults to 20.

    Returns:
        A list of ``(node_index, score)`` tuples sorted by descending score,
        with ties in ascending node-index order. It has at most ``k`` entries.
    """
    if k <= 0:
        return []

    candidates = np.arange(adj_train.shape[0])
    candidates = candidates[candidates != node1]
    if candidates.size == 0:
        return []

    candidate_scores = score_matrix[node1, candidates]
    # A stable sort on the negated scores gives descending order while keeping
    # tied candidates in ascending index order.
    ranking = np.argsort(-candidate_scores, kind="stable")[:k]
    return [(int(candidates[pos]), candidate_scores[pos]) for pos in ranking]

In [57]:
# For every distinct source node of a test edge, store its ranked candidates.
test_dict = dict()
for source_node in test_edges[:, 0]:
    # The ranking depends only on the source node, so compute it once per node
    # rather than once per test edge. Keys keep first-appearance order.
    if source_node not in test_dict:
        test_dict[source_node] = get_top_10(source_node)

In [58]:
# Display every stored ranking (one entry per distinct test-edge source node, so the output is large).
test_dict

{1149: [(1140, 201.37892),
  (6393, 193.6422),
  (2893, 192.0033),
  (1094, 187.37386),
  (9800, 168.69128),
  (9801, 160.20155),
  (11439, 150.19937),
  (3569, 117.65865),
  (8068, 117.110115),
  (13125, 110.51524),
  (5355, 106.35595),
  (5421, 105.9891),
  (15203, 104.53857),
  (17787, 102.85478),
  (5681, 102.30528),
  (5137, 96.84227),
  (19088, 93.163704),
  (14095, 92.824745),
  (1481, 88.39919),
  (8222, 87.67916)],
 1827: [(9340, 124.42871),
  (16614, 121.72435),
  (14832, 115.96633),
  (12616, 93.0668),
  (799, 76.07945),
  (14830, 60.56566),
  (913, 57.39752),
  (10064, 55.80507),
  (16135, 53.943592),
  (14636, 53.391758),
  (5432, 52.95473),
  (13151, 52.23626),
  (3275, 52.147163),
  (5393, 51.916553),
  (10138, 51.331253),
  (16025, 50.84234),
  (343, 50.47235),
  (8703, 49.32274),
  (11916, 48.891975),
  (17233, 48.28029)],
 10396: [(10395, 77.56599),
  (1146, 70.1356),
  (16579, 63.37452),
  (17261, 60.564846),
  (18982, 60.197266),
  (1140, 58.57641),
  (17583, 55.919

In [72]:
# Pick one held-out edge (the third row of test_edges) to inspect by hand.
test_edges[2]

array([10396, 10397])

In [81]:
# Embedding vector of node index 10396, the first endpoint of test_edges[2].
emb_matrix[10396]

array([ 5.8264345e-02,  1.8340473e-01, -3.9503109e-01,  2.6105434e-01,
        7.2334195e-03,  1.2273674e-01, -3.5223842e-01, -1.5517251e-01,
       -5.5637079e-01, -1.7844819e-01, -1.2463984e+00,  7.0134348e-01,
       -3.4039044e-01, -2.9870248e-01, -1.0725394e+00,  1.3001788e-01,
       -5.1519138e-01,  9.4932836e-01,  1.9179912e-01,  7.7677511e-02,
        2.2646073e-01,  8.5062712e-01,  9.7957909e-01,  7.6432586e-01,
       -8.5255790e-01, -5.6634945e-01,  4.9060285e-01, -1.5301597e-01,
       -1.3033669e-01,  8.4512316e-02,  2.2304310e-01, -5.8061975e-01,
        3.4925878e-01, -2.6849127e-01, -5.8276427e-01,  8.3755451e-01,
        1.2084806e+00,  8.8203140e-02, -2.6801247e-03,  1.4855574e-01,
        2.8689390e-01, -4.8024145e-01,  2.3253955e-01,  2.3564002e-01,
        3.5126030e-01,  2.9942191e-01,  6.9113553e-01,  5.6223863e-01,
        1.0955040e+00, -1.1883137e+00, -4.5812249e-01,  8.1050962e-02,
       -8.6384249e-01,  5.1958483e-01,  9.3383670e-02, -5.9030968e-01,
      

In [74]:
# Ranked (node_index, score) candidates stored for node index 10396.
test_dict[10396]

[(10395, 77.56599),
 (1146, 70.1356),
 (16579, 63.37452),
 (17261, 60.564846),
 (18982, 60.197266),
 (1140, 58.57641),
 (17583, 55.91986),
 (19088, 55.60359),
 (11339, 54.9985),
 (18914, 54.884743),
 (17755, 52.63194),
 (10983, 52.587963),
 (15501, 52.147278),
 (18259, 51.71701),
 (18103, 50.691113),
 (1775, 50.31117),
 (740, 49.61548),
 (10635, 49.180023),
 (16581, 48.32595),
 (9127, 47.94442)]

In [75]:
# Sigmoid of the dot-product score of the held-out edge test_edges[2]. Index
# the score matrix with the edge's own (source, target) pair instead of
# repeating the target index as a hard-coded literal.
sigmoid(score_matrix[tuple(test_edges[2])])

0.9999999999999969

In [80]:
# Reverse lookup: find the original node label whose index is 10396. It pairs
# the mapping's keys and values by position and scans the values linearly.
list(node_index_map.keys())[list(node_index_map.values()).index(10396)]

1386757

In [76]:
# Edges incident to node 1386757 in the full graph `net`. 1386757 is the label
# the reverse lookup displayed for node index 10396.
actual_edges = list(net.edges(1386757))
actual_edges

[(1386757, 537615), (1386757, 1446783), (1386757, '4ad507bff964a520660121e3')]

In [77]:
# Translate the ranked node indices for node index 10396 back to their
# original labels. Build the index -> label map once instead of rebuilding and
# linearly scanning the key and value lists for every prediction.
index_node_map = {index: label for label, index in node_index_map.items()}
pred_edges = [index_node_map[node_index] for node_index, _score in test_dict[10396]]

In [78]:
# Show the predicted neighbour labels on one line, for comparison with actual_edges.
print(pred_edges)

[537615, '49b79f54f964a5202c531fe3', 63228, 44945, 247883, 92546, 685451, 184816, 68594, 301362, 90591, 495236, 41551, 125467, 1247613, 488752, '4e70a36d922e8e01baadf45c', 265550, 99162, 257202]
