### Importing packages and Loading the knowledge graph

In [12]:
import numpy as np
import pickle
import networkx as nx
from rdflib import Literal, Namespace
from rdflib.extras.external_graph_libs import rdflib_to_networkx_digraph
from sklearn.preprocessing import MinMaxScaler

with open('graph.pkl', 'rb') as f:
    g = pickle.load(f)


# Define the ARXIV namespace
ARXIV = Namespace("http://arxiv.org/")

### Pagerank
PageRank is an algorithm used by Google Search to rank websites in their search engine results, based on the concept that more important websites are likely to receive more links from other sites.

In [13]:
# Function to convert RDF graph to NetworkX graph
def rdf_to_nx(g):
    nx_graph = nx.MultiDiGraph()
    for s, p, o in g:
        if isinstance(o, Literal):
            continue  # skip literals
        if p == ARXIV.cites:
            nx_graph.add_edge(s, o)
    return nx_graph

# Convert RDF graph to NetworkX graph
nx_graph = rdf_to_nx(g)

# Compute PageRank
pagerank_scores = nx.pagerank(nx_graph)

# Get the paper IDs with highest PageRank scores
top_papers = sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True)

# Print the paper IDs with highest PageRank scores
for paper, score in top_papers[:5]:
    paper = paper.replace("http://arxiv.org/", "")
    print(f'Paper ID: {paper}, PageRank Score: {score}')

Paper ID: 1605.02688, PageRank Score: 0.0005306937790931206
Paper ID: 1011.0352, PageRank Score: 0.0004946854805135191
Paper ID: 1412.6980, PageRank Score: 0.0004745626324922857
Paper ID: quant-ph/9705052, PageRank Score: 0.0004000020722544107
Paper ID: 1105.4464, PageRank Score: 0.0003678372466247786


### HIT Score
HITS (Hyperlink-Induced Topic Search) is a link analysis algorithm that assigns two scores for every page, 'authority', which estimates the value of the content of the page, and 'hub', which estimates the value of its links to other pages.

In [17]:
# Compute HITS scores
hub_scores, authority_scores = nx.hits(nx_graph)

# Get the nodes with the highest authority scores
top_authorities = sorted(authority_scores.items(), key=lambda x: x[1], reverse=True)

# Print the nodes with the highest authority scores
for node, score in top_authorities[:5]:
    node = node.replace("http://arxiv.org/", "")
    print(f'Node: {node}, Authority Score: {score}')

print()

top_hubs = sorted(hub_scores.items(), key=lambda x: x[1], reverse=True)
for node, score in top_hubs[:5]:
    node = node.replace("http://arxiv.org/", "")
    print(f'Node: {node}, Hub Score: {score}')

Node: 1207.7214, Authority Score: 0.0010448481862151613
Node: 1207.7235, Authority Score: 0.001044848186215161
Node: 1201.4330, Authority Score: 0.001020359768844416
Node: 1910.06275, Authority Score: 0.000999650972012424
Node: 1712.09737, Authority Score: 0.000999650972012424

Node: 2009.00516, Hub Score: 0.8604648613325913
Node: 2008.06494, Hub Score: 0.03426114842935145
Node: 1805.00736, Hub Score: 0.01974284603624696
Node: 2007.08542, Hub Score: 0.0185946434793888
Node: 1802.09886, Hub Score: 0.018535168449002274


### Eigenvector Centrality Score
Eigenvector Centrality Score is a measure used in network analysis that assigns relative scores to all nodes in the network based on the principle that connections to high-scoring nodes contribute more to the score of the node in question than equal connections to low-scoring nodes.

In [15]:
# Convert RDF graph to NetworkX graph
G = rdflib_to_networkx_digraph(g)

# Get all nodes which are papers (URIs that start with http://arxiv.org/)
paper_nodes = [n for n in G.nodes() if str(n).startswith("http://arxiv.org/")]

# Create a subgraph of G that includes only paper nodes and the edges between them
G_paper_subgraph = G.subgraph(paper_nodes)

# Remove parallel edges, if any, by converting DiGraph to a simple Graph 
G_paper_simple = nx.Graph(G_paper_subgraph)

# Calculate eigenvector centrality
centrality = nx.eigenvector_centrality_numpy(G_paper_simple)

# Print the top 5 papers by eigenvector centrality
sorted_centrality = sorted(centrality.items(), key=lambda x: x[1], reverse=True)

for paper_id, centrality_score in sorted_centrality[:5]:
    paper_id = paper_id.replace("http://arxiv.org/", "")
    print(f'Paper ID: {paper_id}, Eigenvector Centrality Score: {centrality_score}')

Paper ID: 2009.00516, Eigenvector Centrality Score: 0.6946376053239093
Paper ID: Paper, Eigenvector Centrality Score: 0.12768624963298328
Paper ID: 2008.06494, Eigenvector Centrality Score: 0.05400320088322996
Paper ID: 1805.00736, Eigenvector Centrality Score: 0.043124126025963494
Paper ID: 2012.07714, Eigenvector Centrality Score: 0.03892029018082468


### Normalization of  PageRank scores, Hub scores, and Eigenvector Centrality for each paper.

In [16]:
# Normalizing the scores using MinMaxScaler
scaler = MinMaxScaler()

pagerank_scores = {k: v for k, v in sorted(pagerank_scores.items(), key=lambda item: item[1])}
hub_scores = {k: v for k, v in sorted(hub_scores.items(), key=lambda item: item[1])}
centrality = {k: v for k, v in sorted(centrality.items(), key=lambda item: item[1])}

scores = [pagerank_scores, hub_scores, centrality]
normalized_scores = []

for score in scores:
    # Reshape the scores to fit the scaler
    data = np.array(list(score.values())).reshape(-1, 1)
    # Fit the scaler and transform the data
    normalized = scaler.fit_transform(data)
    # Map the normalized scores back to the paper ids
    normalized_score = {k: v for k, v in zip(score.keys(), normalized)}
    normalized_scores.append(normalized_score)

# Calculate the final scores by averaging the normalized scores
final_scores = {}
weights = [1/3, 1/3, 1/3]  # weights for each score

for paper_id in pagerank_scores.keys():
    final_scores[paper_id] = sum(normalized_scores[i][paper_id]*weights[i] for i in range(3))

# Sort the final scores and print the top 5 papers
top_papers_final = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)

for paper, score in top_papers_final[:5]:
    paper = paper.replace("http://arxiv.org/", "")
    print(f'Paper ID: {paper}, Final Score: {score}')


Paper ID: 2009.00516, Final Score: [0.66838086]
Paper ID: 1605.02688, Final Score: [0.33351089]
Paper ID: 1011.0352, Final Score: [0.29804897]
Paper ID: 1412.6980, Final Score: [0.27853691]
Paper ID: quant-ph/9705052, Final Score: [0.20516886]
