In [69]:
import random
from collections import Counter

from tqdm import tqdm

import pandas as pd
import numpy as np

import networkx as nx

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.autograd import Variable

import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv
from torch_geometric.nn import SAGEConv
from torch_geometric.data import Data
from torch_geometric.utils import negative_sampling
from torch_geometric.utils import erdos_renyi_graph, to_networkx, from_networkx
from torch_geometric.transforms import RandomLinkSplit

from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, roc_auc_score, accuracy_score

### Notebook navigation

[Data load](#data_load)

[Data processing](#data_processing)

[Simple link prediction model](#model)

### Data load
<a id='data_load'></a>

In [3]:
# Load the general author edge list; the first CSV column becomes the index.
# NOTE(review): this frame is not referenced by any other cell visible in this
# part of the notebook — confirm it is still needed.
authors_edges_general = pd.read_csv("processed_data/SSORC_CS_2010_2021_authors_edge_list.csv", index_col = 0)

  mask |= (ar1 == a)


In [4]:
def _parse_papers_indices(raw):
    """Parse a stringified list such as "['a', 'b']" into a list of strings.

    Surrounding square brackets and all single quotes are removed and the
    remainder is split on ", ". Elements are always returned as strings.

    Args:
        raw: The raw cell text handed over by ``pd.read_csv``.

    Returns:
        The list of parsed items; an empty list for "[]" or an empty cell.
    """
    inner = raw.strip("[]")
    if not inner:
        # "".split(", ") yields [''], which would show up as one phantom,
        # empty paper index instead of "no papers".
        return []
    return inner.replace("'", "").split(", ")


# Load the per-edge paper indices; the "papers_indices" column is stored as a
# stringified list and is parsed into a real list while reading.
authors_edges_papers_general = pd.read_csv(
    "processed_data/SSORC_CS_2010_2021_authors_edges_papers_indices.csv",
    index_col=0,
    converters={"papers_indices": _parse_papers_indices},
)

In [5]:
# Load the general paper feature table (first CSV column becomes the index).
# Rows of this table are selected later to build `papers_node_features`.
# NOTE(review): the file name suggests 32-dimensional compressed vectors — confirm.
authors_papers_features_general = pd.read_csv("processed_data/SSORC_CS_2010_2021_papers_features_vectorized_compressed_32.csv", index_col = 0)

In [6]:
# Dataset identifier; used below as both the sub-directory of "datasets/" and
# the common file-name prefix of the edge/node CSV files.
dataset = "SSORC_CS_10_21_22306_115907_primus"

In [7]:
# All four tables live under the same "datasets/<dataset>/<dataset>" prefix and
# use their first CSV column as the index.
dataset_prefix = "datasets/" + dataset + "/" + dataset

authors_edges = pd.read_csv(dataset_prefix + "_authors_edge_list.csv", index_col=0)
authors_nodes = pd.read_csv(dataset_prefix + "_authors_nodes.csv", index_col=0)
papers_edges = pd.read_csv(dataset_prefix + "_papers_edge_list.csv", index_col=0)
papers_nodes = pd.read_csv(dataset_prefix + "_papers_nodes.csv", index_col=0)

### Data processing
<a id='data_processing'></a>

In [8]:
# Select the feature rows for this dataset's papers, using the values of
# papers_nodes["node_id"] as integer row positions (`iloc`) into the general table.
# NOTE(review): the node-attribute loop further down looks rows up by *label*
# (`.loc[[node]]`); the two only agree if the feature table's index labels equal
# its row positions — confirm.
papers_node_features = authors_papers_features_general.iloc[papers_nodes["node_id"], :]

In [9]:
# Turn the first two columns of every row of the paper edge table into a
# (source, target) pair.
aev = papers_edges.values
edges_list_t = [(row[0], row[1]) for row in tqdm(aev)]

# Counter is used here only to collapse repeated pairs; its keys are then fed
# lazily into a directed graph.
unique_edges = Counter(edges_list_t)
citation_graph = nx.DiGraph(edge for edge in tqdm(unique_edges))

100%|██████████| 521901/521901 [00:00<00:00, 1320379.00it/s]
100%|██████████| 521901/521901 [00:00<00:00, 1029780.42it/s]
100%|██████████| 521901/521901 [00:01<00:00, 395576.91it/s]


In [10]:
# Store each node's feature vector on the graph under the 'x' attribute.
for node in tqdm(citation_graph.nodes):
    # `.loc[[node]]` returns a frame; take its first row as the node's features.
    node_features = papers_node_features.loc[[node]].values[0]
    citation_graph.nodes[node]['x'] = list(node_features)
    

100%|██████████| 115907/115907 [00:18<00:00, 6396.77it/s]


In [11]:
# Convert the NetworkX citation graph (including the per-node 'x' attribute set
# above) into a PyTorch Geometric data object.
data = from_networkx(citation_graph)

In [12]:
# Display the summary of the converted object (tensor names and shapes).
data

Data(x=[115907, 32], edge_index=[2, 521901])

In [15]:
# Edge-level train/val/test split transform. The graph is treated as directed;
# every other option is left at the library default.
transform = RandomLinkSplit(is_undirected=False)

In [16]:
# The split is random, so seed every RNG it may draw from before applying it.
# Python, NumPy and PyTorch are all seeded because which generator backs the
# negative sampling depends on the installed PyG version. When cells are run
# top to bottom this also fixes the model's weight initialisation further down.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

train_data, val_data, test_data = transform(data)

### Simple link prediction model
<a id='model'></a>

In [43]:
class Net(torch.nn.Module):
    """Two-layer GCN encoder that produces node embeddings.

    Args:
        in_channels: Size of each input node feature vector (default 32).
        hidden_channels: Output size of the first convolution (default 128).
        out_channels: Size of the returned node embeddings (default 64).
    """

    def __init__(self, in_channels=32, hidden_channels=128, out_channels=64):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x=None, edge_index=None):
        """Compute node embeddings.

        Args:
            x: Node feature matrix. Defaults to the notebook-level
                ``train_data.x`` so that ``model()`` keeps working.
            edge_index: Edges used for message passing. Defaults to the
                notebook-level ``train_data.edge_index``.

        Returns:
            The output of the second convolution, one row per node.
        """
        # Fall back to the global training split only when the caller did not
        # supply a graph; the lookup happens at call time, so it sees the
        # current (e.g. device-moved) `train_data`.
        if x is None:
            x = train_data.x
        if edge_index is None:
            edge_index = train_data.edge_index

        # node embeddings construction
        hidden = self.conv1(x, edge_index).relu()
        return self.conv2(hidden, edge_index)

In [96]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Display the selected device.
device

device(type='cuda')

In [101]:
# Instantiate the model and move it, together with all three splits, onto the
# selected device.
model = Net().to(device)
train_data = train_data.to(device)
val_data = val_data.to(device)
test_data = test_data.to(device)

# Adam over all model parameters with a fixed learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [102]:
def train():
    """Run one full-batch optimisation step on the training split.

    Calls ``model()`` to obtain node embeddings, scores every pair in
    ``train_data.edge_label_index`` by the dot product of its two endpoint
    embeddings, and minimises binary cross-entropy against
    ``train_data.edge_label``.

    Returns:
        The scalar training loss as a tensor detached from the autograd graph.
    """
    model.train()
    optimizer.zero_grad()

    z = model()
    src, dst = train_data.edge_label_index
    link_logits = (z[src] * z[dst]).sum(dim=-1)  # dot product
    loss = F.binary_cross_entropy_with_logits(link_logits, train_data.edge_label)

    loss.backward()
    optimizer.step()

    # Detach so callers that keep the value around do not also keep the
    # autograd graph of this step alive.
    return loss.detach()

@torch.no_grad()
def test():
    """Evaluate the current model on the validation and test splits.

    Node embeddings are computed once by calling ``model()`` with no
    arguments; each labelled pair is scored by the sigmoid of the dot product
    of its endpoint embeddings.

    Returns:
        ``[val_roc_auc, test_roc_auc]``.
    """
    model.eval()
    z = model()

    def _split_auc(split):
        # ROC AUC of the dot-product link scores for one split.
        src, dst = split.edge_label_index
        scores = (z[src] * z[dst]).sum(dim=-1)  # dot product
        return roc_auc_score(split.edge_label.cpu(), scores.sigmoid().cpu())

    return [_split_auc(split) for split in (val_data, test_data)]

In [103]:
# Full-batch training: one optimisation step per epoch, reporting validation
# and test ROC AUC on every 100th epoch (starting with epoch 0).
epochs = 1000
for i in range(epochs):
    train_loss = train()
    if i % 100 != 0:
        continue
    metrics = test()
    print("Loss:", float(train_loss), "Val ROC AUC:", metrics[0], "Test ROC AUC:", metrics[1])

Loss: 0.6335689425468445 Val ROC AUC: 0.8505025176076874 Test ROC AUC: 0.8490941032168304
Loss: 0.5926809310913086 Val ROC AUC: 0.8903955727449643 Test ROC AUC: 0.8886420504273429
Loss: 0.5264583826065063 Val ROC AUC: 0.9136736409894999 Test ROC AUC: 0.9116997942687413
Loss: 0.49002188444137573 Val ROC AUC: 0.9167409251008178 Test ROC AUC: 0.9151879533126581
Loss: 0.4737541675567627 Val ROC AUC: 0.9268957836087657 Test ROC AUC: 0.9253539349457912
Loss: 0.4624984562397003 Val ROC AUC: 0.9342709171585935 Test ROC AUC: 0.9327097880509485
Loss: 0.4548185169696808 Val ROC AUC: 0.9388555830225324 Test ROC AUC: 0.9373728798477243
Loss: 0.4490970969200134 Val ROC AUC: 0.9419713107379807 Test ROC AUC: 0.9405718301564496
Loss: 0.44466134905815125 Val ROC AUC: 0.9440818148979653 Test ROC AUC: 0.9427475215105124
Loss: 0.4411512017250061 Val ROC AUC: 0.9456004963073411 Test ROC AUC: 0.9442823092558214
