In [1]:
# Packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import random
import community as comm
import time
from gensim.models import Word2Vec

from sklearn.cluster import KMeans
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

import warnings
warnings.filterwarnings('ignore')

import torch 
import torchvision 
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import SAGEConv
from torch_geometric.nn import GATConv
from torch_geometric.utils import to_networkx
import torchvision
import torchvision.transforms as transforms

%run ./helper.ipynb

In [2]:
# Load the PubMed dataset through torch_geometric's Planetoid loader.
# `dataset` is passed to the embedding and node-classification helpers in later cells,
# and `dataset[0]` is what gets converted to a NetworkX graph.
# NOTE(review): presumably root='PubMed' is the local download/cache directory — confirm against the Planetoid docs.

from torch_geometric.datasets import Planetoid

dataset = Planetoid(root='PubMed', name='PubMed')

In [3]:
# Result table for the node-embedding stage: one column per GNN model and one row
# per recorded metric (accuracy, wall-clock time, CPU time). All cells start empty.

node_emb = pd.DataFrame(
    index=["Accuracy", "Execution Time (sec)", "CPU Execution Time (sec)"],
    columns=['GCN', 'GraphSage', 'GAT'],
)

In [4]:
# Produce node embeddings with each GNN model (GCN, GraphSage, GAT).
# gnn_embed is called with the dataset and the position of the model in the list
# below (0, 1, 2); it returns the embeddings together with an accuracy value.
# NOTE(review): gnn_embed is not defined in this notebook — presumably it comes from helper.ipynb.

node_dict = {}

for i, j in enumerate(['GCN', 'GraphSage', 'GAT']):

    print(f'Generating Node Embeddings for {j}...')

    # Wall-clock and CPU start marks, taken right before the call being measured
    wall_start = time.time()
    cpu_start = time.process_time()
    embeddings, acc = gnn_embed(dataset, i)
    node_dict[j] = embeddings
    wall_elapsed = time.time() - wall_start
    cpu_elapsed = time.process_time() - cpu_start

    print(f'Node Embeddings for {j} are generated!')

    # Record the three metrics in this model's column of node_emb
    for metric, value in (("Accuracy", acc),
                          ("Execution Time (sec)", wall_elapsed),
                          ("CPU Execution Time (sec)", cpu_elapsed)):
        node_emb.loc[metric, j] = value

Generating Node Embeddings for GCN...
Epoch: 0, Train Accuracy: 0.5833
Epoch: 1, Train Accuracy: 0.6667
Epoch: 2, Train Accuracy: 0.8500
Epoch: 3, Train Accuracy: 0.8333
Epoch: 4, Train Accuracy: 0.8833
Epoch: 5, Train Accuracy: 0.9000
Epoch: 6, Train Accuracy: 0.9000
Epoch: 7, Train Accuracy: 0.8833
Epoch: 8, Train Accuracy: 0.9167
Epoch: 9, Train Accuracy: 0.9000
Epoch: 10, Train Accuracy: 0.9000
Epoch: 11, Train Accuracy: 0.9000
Epoch: 12, Train Accuracy: 0.9167
Epoch: 13, Train Accuracy: 0.9167
Epoch: 14, Train Accuracy: 0.9000
Epoch: 15, Train Accuracy: 0.9333
Epoch: 16, Train Accuracy: 0.9333
Epoch: 17, Train Accuracy: 0.9167
Epoch: 18, Train Accuracy: 0.9333
Epoch: 19, Train Accuracy: 0.9500
Epoch: 20, Train Accuracy: 0.9500
Epoch: 21, Train Accuracy: 0.9500
Epoch: 22, Train Accuracy: 0.9500
Epoch: 23, Train Accuracy: 0.9333
Epoch: 24, Train Accuracy: 0.9500
Epoch: 25, Train Accuracy: 0.9500
Epoch: 26, Train Accuracy: 0.9500
Epoch: 27, Train Accuracy: 0.9500
Epoch: 28, Train Acc

In [5]:
# Display the node-embedding metrics table
# (accuracy, wall-clock time and CPU time for each GNN model)

node_emb

Unnamed: 0,GCN,GraphSage,GAT
Accuracy,0.767612,0.735406,0.74489
Execution Time (sec),7.453904,15.921325,21.635907
CPU Execution Time (sec),13.328125,27.015625,28.4375


In [6]:
# Result table for the link-prediction stage: one column per GNN model and one row
# per recorded metric (accuracy, wall-clock time, CPU time). All cells start empty.

link_pred = pd.DataFrame(
    index=["Accuracy", "Execution Time (sec)", "CPU Execution Time (sec)"],
    columns=['GCN', 'GraphSage', 'GAT'],
)

In [7]:
# Run link prediction once per GNN model using that model's node embeddings.
# For every model the per-edge probabilities and the matching edge list are kept,
# and accuracy / wall-clock time / CPU time are written to link_pred.

probs, vlists = {}, {}

for i in ['GCN', 'GraphSage', 'GAT']:

    # A fresh undirected NetworkX graph is built for every model, outside the timed region.
    # NOTE(review): rebuilt per iteration in case makeLinkPrediction modifies its graph argument — confirm.
    G = to_networkx(dataset[0])
    G1 = G.to_undirected()

    print(f'Getting Link Prediction probabilities of edges along with edge list for {i}...')
    wall_start = time.time()
    cpu_start = time.process_time()
    prob, vlist, acc = makeLinkPrediction(G1, node_dict[i])
    wall_elapsed = time.time() - wall_start
    cpu_elapsed = time.process_time() - cpu_start

    probs[i], vlists[i] = prob, vlist

    print(f'Edge Probabilities and the corresponding Edge list for {i} are generated!\n')

    # Record the three metrics in this model's column of link_pred
    for metric, value in (("Accuracy", acc),
                          ("Execution Time (sec)", wall_elapsed),
                          ("CPU Execution Time (sec)", cpu_elapsed)):
        link_pred.loc[metric, i] = value

Getting Link Prediction probabilities of edges along with edge list for GCN...
Link Prediction Accuracy: 0.8191457362009325
Edge Probabilities and the corresponding Edge list for GCN are generated!

Getting Link Prediction probabilities of edges along with edge list for GraphSage...
Link Prediction Accuracy: 0.8657692886148293
Edge Probabilities and the corresponding Edge list for GraphSage are generated!

Getting Link Prediction probabilities of edges along with edge list for GAT...
Link Prediction Accuracy: 0.8708076402466536
Edge Probabilities and the corresponding Edge list for GAT are generated!



In [8]:
# Display the link-prediction metrics table
# (accuracy, wall-clock time and CPU time for each GNN model)

link_pred

Unnamed: 0,GCN,GraphSage,GAT
Accuracy,0.819146,0.865769,0.870808
Execution Time (sec),2764.818487,2768.979584,2651.446158
CPU Execution Time (sec),2353.234375,2564.1875,2522.78125


In [9]:
# Updated probabilities for Heuristic 2: for every GNN model, replace each
# link-prediction probability p by its complement 1 - p (order of entries unchanged).

probs_upd = {
    k: [1 - edge_prob for edge_prob in probs[k]]
    for k in ['GCN', 'GraphSage', 'GAT']
}

In [10]:
# Base graph for the sparsification experiments: the dataset graph converted to
# NetworkX (G) and then to an undirected graph (OG).
# The sparsification loop only modifies copies (OG.copy()), so OG stays the
# untouched reference graph.

G = to_networkx(dataset[0])
OG = G.to_undirected()

In [11]:
# Timing table for sparsification with the Random heuristic.
# Columns are the sparsification fractions; rows hold the wall-clock time per GNN
# model followed by the matching "(CPU)" rows for CPU time. All cells start empty.

random_cp_time = pd.DataFrame(
    index=[
        'GCN', 'GraphSage', 'GAT',
        'GCN (CPU)', 'GraphSage (CPU)', 'GAT (CPU)',
    ],
    columns=[0, 0.05, 0.1, 0.2, 0.25, 0.3, 0.33, 0.4, 0.5, 0.6],
)

In [None]:
# Sparsify the graph with the Random heuristic (last argument 0 of sparsify_graph)
# for every sparsification fraction p and every GNN model g.
# Result layout: random_sparsified_dict[p][g] = {'Graph': ..., 'Mask': ...}

random_sparsified_dict = {}

for p in [0, 0.05, 0.1, 0.2, 0.25, 0.3, 0.33, 0.4, 0.5, 0.6]:

    # One (initially empty) result slot per model for this fraction
    random_sparsified_dict.setdefault(p, {'GCN': {}, 'GraphSage': {}, 'GAT': {}})

    for g in ['GCN', 'GraphSage', 'GAT']:

        result = random_sparsified_dict[p][g]
        g_new = g + ' (CPU)'

        if p == 0:
            # Baseline: unsparsified copy of the graph, all-False mask, zero cost
            result['Graph'] = OG.copy()
            result['Mask'] = [False] * OG.number_of_edges()
            random_cp_time.loc[g, p] = 0
            random_cp_time.loc[g_new, p] = 0
            continue

        # Time the copy + sparsification (wall-clock and CPU)
        wall_start = time.time()
        cpu_start = time.process_time()
        G = OG.copy()
        G_new, mask = sparsify_graph(G, list(G.edges()), probs_upd[g], p, 0)
        wall_elapsed = time.time() - wall_start
        cpu_elapsed = time.process_time() - cpu_start

        # Report the edge counts before and after sparsification
        print(f"Model: {g}, Sparsified %: {p}, Before Sparsification (edges): {len(OG.edges)}, After Sparsification (edges): {len(G_new.edges)}")

        random_cp_time.loc[g, p] = wall_elapsed
        random_cp_time.loc[g_new, p] = cpu_elapsed

        result['Graph'] = G_new
        result['Mask'] = mask

    print("\n")



Model: GCN, Sparsified %: 0.05, Before Sparsification (edges): 44324, After Sparsification (edges): 42108
Model: GraphSage, Sparsified %: 0.05, Before Sparsification (edges): 44324, After Sparsification (edges): 42108
Model: GAT, Sparsified %: 0.05, Before Sparsification (edges): 44324, After Sparsification (edges): 42108


Model: GCN, Sparsified %: 0.1, Before Sparsification (edges): 44324, After Sparsification (edges): 39892
Model: GraphSage, Sparsified %: 0.1, Before Sparsification (edges): 44324, After Sparsification (edges): 39892
Model: GAT, Sparsified %: 0.1, Before Sparsification (edges): 44324, After Sparsification (edges): 39892


Model: GCN, Sparsified %: 0.2, Before Sparsification (edges): 44324, After Sparsification (edges): 35460
Model: GraphSage, Sparsified %: 0.2, Before Sparsification (edges): 44324, After Sparsification (edges): 35460
Model: GAT, Sparsified %: 0.2, Before Sparsification (edges): 44324, After Sparsification (edges): 35460


Model: GCN, Sparsified %: 

In [None]:
# Display the sparsification times (wall-clock and CPU) measured for the Random heuristic

random_cp_time

In [None]:
# Plot the sparsification times of the Random heuristic against the sparsification
# fraction: wall-clock time on the left panel, CPU time on the right panel.

# plt.subplots creates its own figure, so no separate plt.figure() call is made
# (it would only leave an extra, empty figure in the output).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,10))

for model in ['GCN', 'GraphSage', 'GAT']:
    ax1.plot(random_cp_time.columns, random_cp_time.loc[model, :], label=model)
    ax2.plot(random_cp_time.columns, random_cp_time.loc[model + ' (CPU)', :], label=model)

ax1.set_title("Sparsification time with Random Heuristic")
ax2.set_title("Sparsification CPU time with Random Heuristic")

for ax in (ax1, ax2):
    ax.set_xlabel("Sparsified fraction")
    ax.set_ylabel("Time (sec)")
    ax.legend()

# Render explicitly so the cell does not echo the repr of the last call
plt.show()


In [None]:
# Table for the embedding + node-classification stage on the Random-heuristic graphs.
# Columns are the sparsification fractions; rows hold, per GNN model, the wall-clock
# time, the CPU time ("(CPU)") and the accuracy ("(Acc)"). All cells start empty.

random_time_acc = pd.DataFrame(
    index=[
        'GCN', 'GraphSage', 'GAT',
        'GCN (CPU)', 'GraphSage (CPU)', 'GAT (CPU)',
        'GCN (Acc)', 'GraphSage (Acc)', 'GAT (Acc)',
    ],
    columns=[0, 0.05, 0.1, 0.2, 0.25, 0.3, 0.33, 0.4, 0.5, 0.6],
)

# For every Random-heuristic graph: generate DeepWalk walks, build embeddings from
# them and measure the node-classification accuracy. The whole pipeline is timed
# (wall-clock and CPU) and the three values are stored in random_time_acc.

for p in [0, 0.05, 0.1, 0.2, 0.25, 0.3, 0.33, 0.4, 0.5, 0.6]:
    for g in ['GCN', 'GraphSage', 'GAT']:

        print(f"Configuration - Model:{g}, Sparsified %: {p*100}")

        wall_start = time.time()
        cpu_start = time.process_time()

        G = random_sparsified_dict[p][g]['Graph']
        walks_deepwalk = deepwalk_walks(G, walk_length=10, num_walks=80)
        embeddings_deepwalk = get_embedding(G, walks_deepwalk)
        acc = node_classification(embeddings_deepwalk, G, dataset)

        wall_elapsed = time.time() - wall_start
        cpu_elapsed = time.process_time() - cpu_start

        # Row label -> value for this (model, fraction) configuration
        for row, value in ((g, wall_elapsed),
                           (g + ' (CPU)', cpu_elapsed),
                           (g + ' (Acc)', acc)):
            random_time_acc.loc[row, p] = value

        print("\n")
        

In [None]:
# Display the computation times and node-classification accuracies obtained on the
# graphs sparsified with the Random heuristic

random_time_acc

In [None]:
# Plot the time taken by embedding generation + node classification on the
# Random-heuristic graphs: wall-clock time on the left panel, CPU time on the right.

# plt.subplots creates its own figure, so no separate plt.figure() call is made
# (it would only leave an extra, empty figure in the output).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,10))

for model in ['GCN', 'GraphSage', 'GAT']:
    ax1.plot(random_time_acc.columns, random_time_acc.loc[model, :], label=model)
    ax2.plot(random_time_acc.columns, random_time_acc.loc[model + ' (CPU)', :], label=model)

ax1.set_title("Computation times of different GNN models with Random Heuristic")
ax2.set_title("CPU computation times of different GNN models with Random Heuristic")

for ax in (ax1, ax2):
    ax.set_xlabel("Sparsified fraction")
    ax.set_ylabel("Time (sec)")
    ax.legend()

# Render explicitly so the cell does not echo the repr of the last call
plt.show()

In [None]:
# Plot the node-classification accuracy on the Random-heuristic graphs against the
# sparsification fraction, one line per GNN model.

plt.figure(figsize=(20,10))

for model in ['GCN', 'GraphSage', 'GAT']:
    plt.plot(random_time_acc.columns, random_time_acc.loc[model + ' (Acc)', :], label=model)

plt.xlabel("Sparsified fraction")
plt.ylabel("Accuracy")
plt.legend()
# Title added so the figure stands on its own, matching the Random-Link Prediction accuracy plot
plt.title("Accuracy of different GNN models with Random Heuristic")
plt.show()

In [None]:
# Rebuild the base graph (G -> undirected OG) before the Random-Link Prediction
# heuristic experiments.
# NOTE(review): these are the same two statements as the earlier base-copy cell; the
# cells in between only work on OG.copy(), so this rebuild looks redundant — confirm.

G = to_networkx(dataset[0])
OG = G.to_undirected()


In [None]:
# Timing table for sparsification with the Random-Link Prediction heuristic.
# Columns are the sparsification fractions; rows hold the wall-clock time per GNN
# model followed by the matching "(CPU)" rows for CPU time. All cells start empty.

heuristic_cp_time = pd.DataFrame(
    index=[
        'GCN', 'GraphSage', 'GAT',
        'GCN (CPU)', 'GraphSage (CPU)', 'GAT (CPU)',
    ],
    columns=[0, 0.05, 0.1, 0.2, 0.25, 0.3, 0.33, 0.4, 0.5, 0.6],
)


# Sparsify the graph with the Random-Link Prediction heuristic (last argument 1 of
# sparsify_graph) for every sparsification fraction p and every GNN model g.
# Result layout: heuristic_sparsified_dict[p][g] = {'Graph': ..., 'Mask': ...}

heuristic_sparsified_dict = {}

for p in [0, 0.05, 0.1, 0.2, 0.25, 0.3, 0.33, 0.4, 0.5, 0.6]:

    # One (initially empty) result slot per model for this fraction
    heuristic_sparsified_dict.setdefault(p, {'GCN': {}, 'GraphSage': {}, 'GAT': {}})

    for g in ['GCN', 'GraphSage', 'GAT']:

        result = heuristic_sparsified_dict[p][g]
        g_new = g + ' (CPU)'

        if p == 0:
            # Baseline: unsparsified copy of the graph, all-False mask, zero cost
            result['Graph'] = OG.copy()
            result['Mask'] = [False] * OG.number_of_edges()
            heuristic_cp_time.loc[g, p] = 0
            heuristic_cp_time.loc[g_new, p] = 0
            continue

        # Time the copy + sparsification (wall-clock and CPU)
        wall_start = time.time()
        cpu_start = time.process_time()
        G = OG.copy()
        G_new, mask = sparsify_graph(G, list(G.edges()), probs_upd[g], p, 1)
        wall_elapsed = time.time() - wall_start
        cpu_elapsed = time.process_time() - cpu_start

        # Report the edge counts before and after sparsification
        print(f"Model: {g}, Sparsified %: {p}, Before Sparsification (edges): {len(OG.edges)}, After Sparsification (edges): {len(G_new.edges)}")

        heuristic_cp_time.loc[g, p] = wall_elapsed
        heuristic_cp_time.loc[g_new, p] = cpu_elapsed

        result['Graph'] = G_new
        result['Mask'] = mask

    print("\n")

In [None]:
# Display the sparsification times (wall-clock and CPU) measured for the
# Random-Link Prediction heuristic

heuristic_cp_time

In [None]:
# Plot the sparsification times of the Random-Link Prediction heuristic against the
# sparsification fraction: wall-clock time on the left panel, CPU time on the right.

# plt.subplots creates its own figure, so no separate plt.figure() call is made
# (it would only leave an extra, empty figure in the output).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,10))

for model in ['GCN', 'GraphSage', 'GAT']:
    ax1.plot(heuristic_cp_time.columns, heuristic_cp_time.loc[model, :], label=model)
    ax2.plot(heuristic_cp_time.columns, heuristic_cp_time.loc[model + ' (CPU)', :], label=model)

ax1.set_title("Sparsification time with Random-Link Prediction Heuristic")
ax2.set_title("Sparsification CPU time with Random-Link Prediction Heuristic")

for ax in (ax1, ax2):
    ax.set_xlabel("Sparsified fraction")
    ax.set_ylabel("Time (sec)")
    ax.legend()

# Render explicitly so the cell does not echo the repr of the last call
plt.show()

In [None]:
# Table for the embedding + node-classification stage on the Random-Link Prediction
# heuristic graphs. Columns are the sparsification fractions; rows hold, per GNN model,
# the wall-clock time, the CPU time ("(CPU)") and the accuracy ("(Acc)").

heuristic_time_acc = pd.DataFrame(
    index=[
        'GCN', 'GraphSage', 'GAT',
        'GCN (CPU)', 'GraphSage (CPU)', 'GAT (CPU)',
        'GCN (Acc)', 'GraphSage (Acc)', 'GAT (Acc)',
    ],
    columns=[0, 0.05, 0.1, 0.2, 0.25, 0.3, 0.33, 0.4, 0.5, 0.6],
)

# For every Random-Link Prediction heuristic graph: generate DeepWalk walks, build
# embeddings from them and measure the node-classification accuracy. The whole
# pipeline is timed (wall-clock and CPU) and the values go into heuristic_time_acc.

for p in [0, 0.05, 0.1, 0.2, 0.25, 0.3, 0.33, 0.4, 0.5, 0.6]:
    for g in ['GCN', 'GraphSage', 'GAT']:

        print(f"Configuration - Model:{g}, Sparsified %: {p*100}")

        wall_start = time.time()
        cpu_start = time.process_time()

        G = heuristic_sparsified_dict[p][g]['Graph']
        walks_deepwalk = deepwalk_walks(G, walk_length=10, num_walks=80)
        embeddings_deepwalk = get_embedding(G, walks_deepwalk)
        acc = node_classification(embeddings_deepwalk, G, dataset)

        wall_elapsed = time.time() - wall_start
        cpu_elapsed = time.process_time() - cpu_start

        # Row label -> value for this (model, fraction) configuration
        for row, value in ((g, wall_elapsed),
                           (g + ' (CPU)', cpu_elapsed),
                           (g + ' (Acc)', acc)):
            heuristic_time_acc.loc[row, p] = value

        print("\n")
        
        

In [None]:
# Display the computation times and node-classification accuracies obtained on the
# graphs sparsified with the Random-Link Prediction heuristic

heuristic_time_acc

In [None]:
# Plot the time taken by embedding generation + node classification on the
# Random-Link Prediction heuristic graphs: wall-clock time left, CPU time right.

# plt.subplots creates its own figure, so no separate plt.figure() call is made
# (it would only leave an extra, empty figure in the output).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,10))

for model in ['GCN', 'GraphSage', 'GAT']:
    ax1.plot(heuristic_time_acc.columns, heuristic_time_acc.loc[model, :], label=model)
    ax2.plot(heuristic_time_acc.columns, heuristic_time_acc.loc[model + ' (CPU)', :], label=model)

ax1.set_title("Computation times of different GNN models with Random-Link Prediction Heuristic")
ax2.set_title("CPU computation times of different GNN models with Random-Link Prediction Heuristic")

for ax in (ax1, ax2):
    ax.set_xlabel("Sparsified fraction")
    ax.set_ylabel("Time (sec)")
    ax.legend()

plt.show()

In [None]:
# Plot the node-classification accuracy on the Random-Link Prediction heuristic
# graphs against the sparsification fraction, one line per GNN model.

plt.figure(figsize=(20,10))

for model in ['GCN', 'GraphSage', 'GAT']:
    accuracy_row = heuristic_time_acc.loc[model + ' (Acc)', :]
    plt.plot(heuristic_time_acc.columns, accuracy_row, label=model)

plt.legend()
plt.title("Accuracy of different GNN models with Random-Link Prediction Heuristic")
plt.show()

In [None]:
# Compare the embedding + node-classification wall-clock times of both heuristics
# side by side: Random heuristic on the left, Random-Link Prediction on the right.
# Both panels share the same fixed y-range so they can be compared directly.

# plt.subplots creates its own figure, so no separate plt.figure() call is made
# (it would only leave an extra, empty figure in the output).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,10))

for model in ['GCN', 'GraphSage', 'GAT']:
    ax1.plot(random_time_acc.columns, random_time_acc.loc[model, :], label=model)
    ax2.plot(heuristic_time_acc.columns, heuristic_time_acc.loc[model, :], label=model)

ax1.set_title("Computation times of different GNN models with Random Heuristic")
ax2.set_title("Computation times of different GNN models with Random-Link Prediction Heuristic")

for ax in (ax1, ax2):
    ax.legend()
    ax.set_ylim([30, 80])
    ax.set_xlabel("Sparsified fraction")
    ax.set_ylabel("Time (sec)")

plt.show()

In [None]:
# Compare the node-classification accuracies of both heuristics side by side:
# Random heuristic on the left, Random-Link Prediction heuristic on the right.
# Both panels use the same fixed y-range.

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,6))

panels = (
    (ax1, random_time_acc, "Accuracy of different GNN models with Random Heuristic"),
    (ax2, heuristic_time_acc, "Accuracy of different GNN models with Random-Link Prediction Heuristic"),
)

for axis, frame, panel_title in panels:
    # One accuracy curve per GNN model, read from the "(Acc)" rows
    for model in ['GCN', 'GraphSage', 'GAT']:
        axis.plot(frame.columns, frame.loc[model + ' (Acc)', :], label=model)
    axis.legend()
    axis.set_ylim([0.5, 0.9])
    axis.set_title(panel_title)

plt.show()