In [1]:
!pip uninstall -y numpy
!python -m pip install numpy==1.20.0
!pip install csrgraph==0.1.26
!pip uninstall -y torch
!pip install torch==1.8.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
!pip install --no-index --no-cache-dir torch-scatter  torch-sparse -f https://pytorch-geometric.com/whl/torch-1.8.1+cu101.html
!pip install --no-cache-dir torch-cluster -f https://pytorch-geometric.com/whl/torch-1.8.1+cu101.html
!pip install --no-cache-dir torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.8.1+cu101.html
!pip install --no-cache-dir torch-geometric
!pip install wandb

Found existing installation: numpy 1.20.0
Uninstalling numpy-1.20.0:
  Successfully uninstalled numpy-1.20.0
Collecting numpy==1.20.0
  Using cached numpy-1.20.0-cp37-cp37m-manylinux2010_x86_64.whl (15.3 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
stellargraph 1.2.1 requires tensorflow>=2.1.0, which is not installed.
torchvision 0.7.0+cu101 requires torch==1.6.0, but you have torch 1.8.1+cu101 which is incompatible.
tensorflow-gpu 2.3.0 requires numpy<1.19.0,>=1.16.0, but you have numpy 1.20.0 which is incompatible.[0m
Successfully installed numpy-1.20.0
Found existing installation: torch 1.8.1+cu101
Uninstalling torch-1.8.1+cu101:
  Successfully uninstalled torch-1.8.1+cu101
Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.8.1+cu101
  Using cached https://download.pytorc

In [1]:
from tqdm import tqdm
from collections import Counter

import pandas as pd
import numpy as np

import networkx as nx
import csrgraph as cg

from gensim.models import Word2Vec

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import ModuleList, Embedding
from torch.autograd import Variable

import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv
from torch_geometric.nn import SAGEConv
from torch_geometric.nn import PNAConv, BatchNorm, global_add_pool
from torch_geometric.data import Data
from torch_geometric.utils import negative_sampling
from torch_geometric.utils import degree
from torch_geometric.utils import erdos_renyi_graph, to_networkx, from_networkx
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.data import HeteroData

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score



General data:

In [None]:
authors_edges_papers_general = pd.read_csv("../processed_data/SSORC_CS_2010_2021_authors_edges_papers_indices.csv", index_col = 0, \
                                   converters={"papers_indices": lambda x: x.strip("[]").replace("'","").split(", ")})
authors_edges_general = pd.read_csv("../processed_data/SSORC_CS_2010_2021_authors_edge_list.csv", index_col = 0)

In [3]:
papers_features_general = pd.read_csv("../processed_data/SSORC_CS_2010_2021_papers_features_vectorized_compressed_32.csv", index_col = 0)
authors_features_general = pd.read_csv("../processed_data/SSORC_CS_2010_2021_authors_features.csv", index_col = 0)

In [4]:
aev = authors_edges_general.values
edge_to_index = {(aev[i][0], aev[i][1]):i for i in tqdm(range(len(aev)))}
index_to_edge = {i:(aev[i][0], aev[i][1]) for i in tqdm(range(len(aev)))}

100%|██████████| 30796749/30796749 [00:35<00:00, 860725.36it/s]
100%|██████████| 30796749/30796749 [00:24<00:00, 1233261.66it/s]


Local data:

In [5]:
dataset_name = 'SSORC_CS_10_21_419_2055_primus' #'SSORC_CS_10_21_3340_16830_115907_primus'

In [6]:
authors_edges_papers = pd.read_csv("../datasets/" + dataset_name + "/" + dataset_name + "_" + "authors_edges_papers_indices.csv", index_col = 0,\
                                   converters={"papers_indices": lambda x: x.strip("[]").replace("'","").split(", ")})

In [7]:
authors_graph = nx.read_edgelist("../datasets/" + dataset_name + "/" + dataset_name + "_" + "authors.edgelist", create_using = nx.DiGraph)

In [8]:
citation_graph = nx.read_edgelist("../datasets/" + dataset_name + "/" + dataset_name + "_" + "papers.edgelist", create_using = nx.DiGraph)

_________________________________________________________


### Network aggregation

In [13]:
sAe = list(authors_graph.edges)
sAe = [(int(sAe[i][0]), int(sAe[i][1])) for i in range(len(sAe))]

In [14]:
authors_edges_papers_sub_2 = [authors_edges_papers["papers_indices"][edge_to_index[sAe[i]]] for i in tqdm(range(len(sAe)))]
authors_edges_papers_sub_2_edges = [sAe[i] for i in tqdm(range(len(sAe)))]
authors_edges_papers_sub_flat_2 = [int(item) for subarray in authors_edges_papers_sub_2 for item in subarray]
unique_papers_2 = list(set(authors_edges_papers_sub_flat_2))

100%|██████████| 1753/1753 [00:00<00:00, 80785.54it/s]
100%|██████████| 1753/1753 [00:00<00:00, 3793918.94it/s]


In [151]:
unique_nodes = list(authors_graph.nodes())
id_global_2_id_local_authors = {unique_nodes[i]:i for i in range(len(unique_nodes))}
id_local_2_id_global_authors = {i:unique_nodes[i] for i in range(len(unique_nodes))}

In [152]:
unique_papers = list(citation_graph.nodes())
id_global_2_id_local_papers = {unique_papers[i]:(i+len(unique_nodes)) for i in range(len(unique_papers))}
id_local_2_id_global_papers = {(i+len(unique_nodes)):unique_papers[i] for i in range(len(unique_papers))}

In [72]:
authors_edges = list(authors_graph.edges())

author_to_paper_edges = []
authors_papers = {}
for i in range(len(authors_edges_papers_sub_2)):
    node_a_1, node_a_2 = authors_edges[i]
    for j in range(len(authors_edges_papers_sub_2[i])):
        if node_a_1 not in authors_papers:
            authors_papers[node_a_1] = [authors_edges_papers_sub_2[i][j]]
        else:
            authors_papers[node_a_1].append(authors_edges_papers_sub_2[i][j])
        if node_a_2 not in authors_papers:
            authors_papers[node_a_2] = [authors_edges_papers_sub_2[i][j]]
        else:
            authors_papers[node_a_2].append(authors_edges_papers_sub_2[i][j])
        author_to_paper_edges.append([id_global_2_id_local_authors[node_a_1], \
                                      id_global_2_id_local_papers[authors_edges_papers_sub_2[i][j]]])
        author_to_paper_edges.append([id_global_2_id_local_authors[node_a_2], \
                                      id_global_2_id_local_papers[authors_edges_papers_sub_2[i][j]]])

In [199]:
authors_authors_edges = list(authors_graph.edges())

author_to_author_edges = []
for i in range(len(authors_authors_edges )):
    author_to_author_edges.append([id_global_2_id_local_authors[authors_authors_edges[i][0]],id_global_2_id_local_authors[authors_authors_edges[i][1]]])
    author_to_author_edges.append([id_global_2_id_local_authors[authors_authors_edges[i][1]], id_global_2_id_local_authors[authors_authors_edges[i][0]]])

In [196]:
citation_edges = list(citation_graph.edges())

paper_to_paper_edges = []
for i in range(len(citation_edges)):
    paper_to_paper_edges.append([id_global_2_id_local_papers[citation_edges[i][0]],id_global_2_id_local_papers[citation_edges[i][1]]])

In [203]:
aev = author_to_author_edges + paper_to_paper_edges + author_to_paper_edges
edges_list_t = [(aev[i][0], aev[i][1]) for i in tqdm(range(len(aev)))]
merged_graph = nx.DiGraph((x, y) for (x, y) in tqdm(Counter(edges_list_t)))    

100%|██████████| 17870/17870 [00:00<00:00, 2463831.32it/s]
100%|██████████| 12998/12998 [00:00<00:00, 677581.92it/s]


In [148]:
m_graph = cg.csrgraph(merged_graph, threads = 96)
m_node_names = m_graph.names

walks = m_graph.random_walks(walklen=3, # length of the walks
                epochs=1, # howm any times to start a walk from each node
                start_nodes=None, # the starting node. It is either a list (e.g., [2,3]) or None. If None it does it on all nodes and returns epochs*G.number_of_nodes() walks
                return_weight=1.,
                neighbor_weight=1.)

In [38]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

class callback(CallbackAny2Vec):
    '''Callback to print loss after each epoch.'''

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        print('Loss after epoch {}: {}'.format(self.epoch, loss))
        self.epoch += 1

In [42]:
walks = walks.tolist()

In [44]:
model = Word2Vec(walks, vector_size=128, window=5, min_count=1, epochs = 10000,
                 compute_loss=True, callbacks=[callback()])

Loss after epoch 0: 36317.98046875
Loss after epoch 1: 72754.84375
Loss after epoch 2: 108962.015625
Loss after epoch 3: 145296.90625
Loss after epoch 4: 181714.671875
Loss after epoch 5: 218047.25
Loss after epoch 6: 254305.140625
Loss after epoch 7: 291109.84375
Loss after epoch 8: 328014.1875
Loss after epoch 9: 364971.53125
Loss after epoch 10: 401848.53125
Loss after epoch 11: 438631.59375
Loss after epoch 12: 475048.25
Loss after epoch 13: 510909.1875
Loss after epoch 14: 545908.3125
Loss after epoch 15: 579838.0625
Loss after epoch 16: 612125.375
Loss after epoch 17: 642387.0625
Loss after epoch 18: 670621.8125
Loss after epoch 19: 697121.0625
Loss after epoch 20: 722199.875
Loss after epoch 21: 746270.5
Loss after epoch 22: 769573.3125
Loss after epoch 23: 792263.0
Loss after epoch 24: 814371.4375
Loss after epoch 25: 835979.4375
Loss after epoch 26: 857171.3125
Loss after epoch 27: 877957.5
Loss after epoch 28: 898371.1875
Loss after epoch 29: 918335.25
Loss after epoch 30: 93

In [50]:
author_vectors = model.wv

In [51]:
embeddings = []
for i in range(len(author_vectors)):
    embeddings.append(author_vectors[i])

In [52]:
len(unique_nodes)

419

In [55]:
pd.DataFrame(np.array(embeddings[0:len(unique_nodes)])).to_csv("pmne_na_embeddings_1.csv")

### Results aggregation

In [60]:
c_graph = cg.csrgraph(citation_graph, threads = 96)
c_node_names = c_graph.names

c_walks = c_graph.random_walks(walklen=3, # length of the walks
                epochs=1, # howm any times to start a walk from each node
                start_nodes=None, # the starting node. It is either a list (e.g., [2,3]) or None. If None it does it on all nodes and returns epochs*G.number_of_nodes() walks
                return_weight=1.,
                neighbor_weight=1.)

In [61]:
c_walks = c_walks.tolist()

In [66]:
c_model = Word2Vec(c_walks, vector_size=128, window=5, min_count=1, epochs = 10000,
                 compute_loss=True, callbacks=[callback()])

Loss after epoch 0: 25352.380859375
Loss after epoch 1: 50832.75390625
Loss after epoch 2: 76128.6015625
Loss after epoch 3: 101417.3046875
Loss after epoch 4: 126763.90625
Loss after epoch 5: 152135.671875
Loss after epoch 6: 177467.28125
Loss after epoch 7: 202699.515625
Loss after epoch 8: 228032.484375
Loss after epoch 9: 253234.96875
Loss after epoch 10: 278834.375
Loss after epoch 11: 304550.3125
Loss after epoch 12: 329915.09375
Loss after epoch 13: 355066.4375
Loss after epoch 14: 379635.6875
Loss after epoch 15: 403303.3125
Loss after epoch 16: 425854.5
Loss after epoch 17: 446883.125
Loss after epoch 18: 466479.59375
Loss after epoch 19: 484915.46875
Loss after epoch 20: 502355.9375
Loss after epoch 21: 519043.375
Loss after epoch 22: 535201.8125
Loss after epoch 23: 550902.0
Loss after epoch 24: 566286.0625
Loss after epoch 25: 581344.625
Loss after epoch 26: 596119.0
Loss after epoch 27: 610697.625
Loss after epoch 28: 625060.625
Loss after epoch 29: 639219.125
Loss after e

In [62]:
a_graph = cg.csrgraph(authors_graph, threads = 96)
a_node_names = a_graph.names

a_walks = a_graph.random_walks(walklen=3, # length of the walks
                epochs=1, # howm any times to start a walk from each node
                start_nodes=None, # the starting node. It is either a list (e.g., [2,3]) or None. If None it does it on all nodes and returns epochs*G.number_of_nodes() walks
                return_weight=1.,
                neighbor_weight=1.)

In [63]:
a_walks = a_walks.tolist()

In [65]:
a_model = Word2Vec(a_walks, vector_size=128, window=5, min_count=1, epochs = 10000,
                 compute_loss=True, callbacks=[callback()])

Loss after epoch 0: 4404.2958984375
Loss after epoch 1: 8860.9580078125
Loss after epoch 2: 13202.6884765625
Loss after epoch 3: 17582.236328125
Loss after epoch 4: 21907.82421875
Loss after epoch 5: 26303.591796875
Loss after epoch 6: 30775.158203125
Loss after epoch 7: 35184.890625
Loss after epoch 8: 39538.984375
Loss after epoch 9: 43937.0703125
Loss after epoch 10: 48374.46875
Loss after epoch 11: 52784.33203125
Loss after epoch 12: 57180.30859375
Loss after epoch 13: 61573.3515625
Loss after epoch 14: 65994.3671875
Loss after epoch 15: 70359.1953125
Loss after epoch 16: 74746.7421875
Loss after epoch 17: 79194.9296875
Loss after epoch 18: 83510.6171875
Loss after epoch 19: 87977.6796875
Loss after epoch 20: 92413.7421875
Loss after epoch 21: 96769.9921875
Loss after epoch 22: 101157.9453125
Loss after epoch 23: 105481.765625
Loss after epoch 24: 109889.40625
Loss after epoch 25: 114164.5625
Loss after epoch 26: 118428.5078125
Loss after epoch 27: 122715.4609375
Loss after epoch 2

In [110]:
author_vectors = a_model.wv
embeddings_a = []
for i in range(len(author_vectors)):
    embeddings_a.append(author_vectors[i].tolist())

In [111]:
papers_vectors = c_model.wv
embeddings_c = []
for i in range(len(papers_vectors)):
    embeddings_c.append(papers_vectors[i].tolist())

In [112]:
g_ids = list(c_node_names)
global_2_local_c = {g_ids[i]:i for i in range(len(g_ids))}  

In [123]:
extended_embeddings = []
for i in range(len(authors_papers)):
    author = str(a_node_names[i])
    papers = authors_papers[author]
    extensions = []
    for j in range(len(papers)):
        local_id = global_2_local_c[papers[j]]
        extensions.append(embeddings_c[local_id])
    extensions = np.array(extensions).sum(axis=0).tolist()
    extended_embeddings.append(embeddings_a[i] + extensions)

In [124]:
pd.DataFrame(np.array(extended_embeddings)).to_csv("pmne_ra_embeddings_1.csv")

### Layer Co-analysis

In [332]:
p = q = r = 0.5 # as in paper

def generate_walk(node, walk_len = 3):
    prev_node = -1
    walk = []
    for i in range(walk_len):
        walk.append(node)
        adj_edges = merged_graph.edges(node)
        prob_distr_unn, nodes = [], []
        for edge in adj_edges:
            nodes.append(edge[1])
            if node in id_local_2_id_global_papers and edge[1] in id_local_2_id_global_authors \
            or node in id_local_2_id_global_authors and edge[1] in id_local_2_id_global_papers:  
                if edge[1] == prev_node:
                    prob_distr_unn.append((1 - r)/p)
                else:
                    if prev_node != -1:
                        if edge[1] in np.array(list(merged_graph.edges(prev_node)))[:, 1]:
                            prob_distr_unn.append(1)
                        else:
                            prob_distr_unn.append((1 - r)/q)
                    else:
                        prob_distr_unn.append((1 - r)/q)
            else:
                if edge[1] == prev_node:
                    prob_distr_unn.append(r/p)
                else:
                    if prev_node != -1:
                        if edge[1] in np.array(list(merged_graph.edges(prev_node)))[:, 1]:
                            prob_distr_unn.append(1)
                        else:
                            prob_distr_unn.append(r/q)
                    else:
                        prob_distr_unn.append((1 - r)/q)

        prob_distr = [prob_distr_unn[i]/sum(prob_distr_unn) for i in range(len(prob_distr_unn))]
        prev_node = node
        if len(nodes) == 0:
            return walk
        node = choice(nodes, 1, p=prob_distr)[0]
    return walk

In [325]:
epochs = 10
mg_nodes = list(merged_graph.nodes())
walks = []
for i in tqdm(range(epochs)):
    for node in mg_nodes:
        walks.append(generate_walk(node, 10))

100%|██████████| 10/10 [00:07<00:00,  1.39it/s]


In [329]:
co_model = Word2Vec(walks, vector_size=128, window=5, min_count=1, epochs = 1000,
                 compute_loss=True, callbacks=[callback()])

Loss after epoch 0: 121311.53125
Loss after epoch 1: 223251.453125
Loss after epoch 2: 297396.84375
Loss after epoch 3: 363834.5625
Loss after epoch 4: 421940.5625
Loss after epoch 5: 469778.0625
Loss after epoch 6: 509923.75
Loss after epoch 7: 544093.375
Loss after epoch 8: 564101.125
Loss after epoch 9: 590841.1875
Loss after epoch 10: 610880.5625
Loss after epoch 11: 633688.1875
Loss after epoch 12: 655002.25
Loss after epoch 13: 674969.625
Loss after epoch 14: 694418.5
Loss after epoch 15: 712963.8125
Loss after epoch 16: 730848.25
Loss after epoch 17: 748365.0625
Loss after epoch 18: 765423.5
Loss after epoch 19: 781940.6875
Loss after epoch 20: 795477.5
Loss after epoch 21: 808617.6875
Loss after epoch 22: 824260.625
Loss after epoch 23: 839619.75
Loss after epoch 24: 854786.625
Loss after epoch 25: 869755.0
Loss after epoch 26: 884661.9375
Loss after epoch 27: 899270.875
Loss after epoch 28: 913799.75
Loss after epoch 29: 928001.9375
Loss after epoch 30: 942067.125
Loss after e

In [330]:
nodes_vectors = co_model.wv
embeddings_co = []
for i in range(len(nodes_vectors)):
    embeddings_co.append(nodes_vectors[i].tolist())

In [331]:
pd.DataFrame(np.array(embeddings_co[0:len(unique_nodes)])).to_csv("pmne_co_embeddings_1.csv")