In [20]:
from rdflib import URIRef, Literal, Namespace
import networkx as nx
import pickle

with open('graph.pkl', 'rb') as f:
    g = pickle.load(f)


# Define the ARXIV namespace
ARXIV = Namespace("http://arxiv.org/")

### Pagerank

In [21]:
# Function to convert RDF graph to NetworkX graph
def rdf_to_nx(g):
    nx_graph = nx.MultiDiGraph()
    for s, p, o in g:
        if isinstance(o, Literal):
            continue  # skip literals
        if p == ARXIV.cites:
            nx_graph.add_edge(s, o)
    return nx_graph

# Convert RDF graph to NetworkX graph
nx_graph = rdf_to_nx(g)

# Compute PageRank
pagerank_scores = nx.pagerank(nx_graph)

# Get the paper IDs with highest PageRank scores
top_papers = sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True)

# Print the paper IDs with highest PageRank scores
for paper, score in top_papers[:5]:
    print(f'Paper ID: {paper}, PageRank Score: {score}')

Paper ID: http://arxiv.org/1605.02688, PageRank Score: 0.0005306937790931206
Paper ID: http://arxiv.org/1011.0352, PageRank Score: 0.0004946854805135191
Paper ID: http://arxiv.org/1412.6980, PageRank Score: 0.0004745626324922857
Paper ID: http://arxiv.org/quant-ph/9705052, PageRank Score: 0.0004000020722544107
Paper ID: http://arxiv.org/1105.4464, PageRank Score: 0.0003678372466247786


### HIT Score

In [22]:
# Compute HITS scores
hub_scores, authority_scores = nx.hits(nx_graph)

# Get the nodes with the highest authority scores
top_authorities = sorted(authority_scores.items(), key=lambda x: x[1], reverse=True)

# Print the nodes with the highest authority scores
for node, score in top_authorities[:5]:
    print(f'Node: {node}, Authority Score: {score}')

print()

top_hubs = sorted(hub_scores.items(), key=lambda x: x[1], reverse=True)
for node, score in top_hubs[:5]:
    print(f'Node: {node}, Hub Score: {score}')

Node: http://arxiv.org/1207.7235, Authority Score: 0.001044848186215161
Node: http://arxiv.org/1207.7214, Authority Score: 0.0010448481862151608
Node: http://arxiv.org/1201.4330, Authority Score: 0.0010203597688444158
Node: http://arxiv.org/1711.09572, Authority Score: 0.0009996509720124243
Node: http://arxiv.org/1812.01491, Authority Score: 0.0009996509720124243

Node: http://arxiv.org/2009.00516, Hub Score: 0.8604648613325917
Node: http://arxiv.org/2008.06494, Hub Score: 0.03426114842935146
Node: http://arxiv.org/1805.00736, Hub Score: 0.019742846036246896
Node: http://arxiv.org/2007.08542, Hub Score: 0.018594643479388797
Node: http://arxiv.org/1802.09886, Hub Score: 0.01853516844900223


In [30]:
from rdflib.extras.external_graph_libs import rdflib_to_networkx_digraph

# Convert RDF graph to NetworkX graph
G = rdflib_to_networkx_digraph(g)

# Get all nodes which are papers (URIs that start with http://arxiv.org/)
paper_nodes = [n for n in G.nodes() if str(n).startswith("http://arxiv.org/")]

# Create a subgraph of G that includes only paper nodes and the edges between them
G_paper_subgraph = G.subgraph(paper_nodes)

# Remove parallel edges, if any, by converting DiGraph to a simple Graph 
G_paper_simple = nx.Graph(G_paper_subgraph)

# Calculate eigenvector centrality
centrality = nx.eigenvector_centrality_numpy(G_paper_simple)

# Print the top 5 papers by eigenvector centrality
sorted_centrality = sorted(centrality.items(), key=lambda x: x[1], reverse=True)

for paper_id, centrality_score in sorted_centrality[:5]:
    print(f'Paper ID: {paper_id}, Eigenvector Centrality Score: {centrality_score}')

Paper ID: http://arxiv.org/2009.00516, Eigenvector Centrality Score: 0.6946376053239086
Paper ID: http://arxiv.org/Paper, Eigenvector Centrality Score: 0.12768624963298275
Paper ID: http://arxiv.org/2008.06494, Eigenvector Centrality Score: 0.05400320088322981
Paper ID: http://arxiv.org/1805.00736, Eigenvector Centrality Score: 0.043124126025963445
Paper ID: http://arxiv.org/2012.07714, Eigenvector Centrality Score: 0.03892029018082457
