In [1]:
from rdflib import URIRef, Literal, Namespace
import networkx as nx
import pickle

# Load the pickled graph from 'graph.pkl' in the current working directory.
# NOTE(security): unpickling can execute arbitrary code, so only load a
# graph.pkl that you produced yourself or otherwise trust.
with open('graph.pkl', 'rb') as f:
    g = pickle.load(f)


# Namespace from which the arXiv terms used in the cells below are derived
# (e.g. ARXIV.cites, ARXIV.discipline).
ARXIV = Namespace("http://arxiv.org/")

### PageRank

In [5]:
def rdf_to_nx(g):
    """Build a directed citation graph from the triples in ``g``.

    Only triples whose predicate equals ``ARXIV.cites`` and whose object is
    not a ``Literal`` are kept; each one becomes a directed edge from the
    triple's subject to its object.

    Parameters
    ----------
    g : iterable of (subject, predicate, object) triples.

    Returns
    -------
    nx.MultiDiGraph
        Graph containing one edge per retained triple.
    """
    citation_graph = nx.MultiDiGraph()
    citation_graph.add_edges_from(
        (subject, obj)
        for subject, predicate, obj in g
        if not isinstance(obj, Literal) and predicate == ARXIV.cites
    )
    return citation_graph

# Build the citation graph from the loaded triples
nx_graph = rdf_to_nx(g)

# PageRank score for every node of the citation graph
pagerank_scores = nx.pagerank(nx_graph)

# (paper, score) pairs ordered from the highest score to the lowest;
# ties keep the order in which the nodes appear in pagerank_scores
ranked_ids = sorted(pagerank_scores, key=pagerank_scores.get, reverse=True)
top_papers = [(paper_id, pagerank_scores[paper_id]) for paper_id in ranked_ids]

# Show the five best-ranked papers
for paper, score in top_papers[:5]:
    print(f'Paper ID: {paper}, PageRank Score: {score}')


Paper ID: http://arxiv.org/1605.02688, PageRank Score: 0.0005306937790931204
Paper ID: http://arxiv.org/1011.0352, PageRank Score: 0.0004946854805135189
Paper ID: http://arxiv.org/1412.6980, PageRank Score: 0.00047456263249228553
Paper ID: http://arxiv.org/quant-ph/9705052, PageRank Score: 0.00040000207225441046
Paper ID: http://arxiv.org/1105.4464, PageRank Score: 0.00036783724662477836


### Instance Counting

In [12]:
# Tally how many ARXIV.discipline triples carry each discipline value
discipline_counts = {}

for _subject, predicate, obj in g:
    if predicate != ARXIV.discipline:
        continue
    label = str(obj)
    discipline_counts[label] = discipline_counts.get(label, 0) + 1

# List every discipline, most frequent first
ranked_disciplines = sorted(
    discipline_counts.items(), key=lambda item: item[1], reverse=True
)
for discipline, count in ranked_disciplines:
    print(f'Discipline: {discipline}, Paper Count: {count}')


Discipline: Physics, Paper Count: 445
Discipline: Computer Science, Paper Count: 163
Discipline: Mathematics, Paper Count: 106
Discipline: Statistics, Paper Count: 43
Discipline: Quantitative Biology, Paper Count: 18
Discipline: Electrical Engineering and Systems Science, Paper Count: 16
Discipline: Quantitative Finance, Paper Count: 4


In [13]:
# Tally how many times each author name appears across ARXIV.Author values.
author_counts = {}

for s, p, o in g:
    if p != ARXIV.Author:
        continue
    # Assumes each ARXIV.Author value is one comma-separated list of names
    # (TODO confirm: names that themselves contain a comma would be split).
    for author in str(o).split(','):
        author = author.strip()  # Remove leading/trailing whitespace
        if not author:
            # str.split(',') yields '' for an empty value, a doubled comma
            # or a trailing comma; without this guard those blanks would be
            # counted as an author named ''.
            continue
        author_counts[author] = author_counts.get(author, 0) + 1

# Print the authors with the most papers
for author, count in sorted(author_counts.items(), key=lambda x: x[1], reverse=True):
    print(f'Author: {author}, Paper Count: {count}')


Author: Sansit Patnaik, Paper Count: 7
Author: Sai Sidhardh, Paper Count: 6
Author: Kishor D. Kucche, Paper Count: 6
Author: ATLAS Collaboration, Paper Count: 5
Author: Fabio Semperlotti, Paper Count: 5
Author: Siham Aouissi, Paper Count: 5
Author: Markus Sch\"oberl, Paper Count: 4
Author: Narsireddy Anugu, Paper Count: 4
Author: Earl T. Campbell, Paper Count: 4
Author: Niranjan Saikumar, Paper Count: 4
Author: Ashwini D. Mali, Paper Count: 4
Author: Mohamed Talbi, Paper Count: 4
Author: Snehasish Bhattacharjee, Paper Count: 4
Author: Xin Wang, Paper Count: 4
Author: Zhi-Gang Wang, Paper Count: 4
Author: Shihao Song, Paper Count: 4
Author: Anup Das, Paper Count: 4
Author: Nagarajan Kandasamy, Paper Count: 4
Author: Shrikant Utagi, Paper Count: 4
Author: Haofu Liao, Paper Count: 3
Author: Conrad Gst\"ottner, Paper Count: 3
Author: Bernd Kolar, Paper Count: 3
Author: Stefan A. Maier, Paper Count: 3
Author: Riccardo Sapienza, Paper Count: 3
Author: Wolfgang Kilian, Paper Count: 3
Author: 

In [14]:
# Tally how many ARXIV.citeid triples carry each cite id
citeid_counts = {}

for _subject, predicate, obj in g:
    if predicate != ARXIV.citeid:
        continue
    key = str(obj)
    citeid_counts.setdefault(key, 0)
    citeid_counts[key] += 1

# List every cite id, most frequent first
ranked_citeids = sorted(
    citeid_counts.items(), key=lambda item: item[1], reverse=True
)
for citeid, count in ranked_citeids:
    print(f'CiteID: {citeid}, Count: {count}')


CiteID: 6c46679a13c5ef429dd8b9ebdf066e921c444c21, Count: 44
CiteID: 736fa9c7fecd3722d7c6a85110de64ac36eb8c7f, Count: 43
CiteID: e9a6763ea828d5ce6485c8e9ac1f3a06ce4b6865, Count: 41
CiteID: 774fe0f5525e7ef0c701b8ec5db81c91ce8298f0, Count: 39
CiteID: 7f668ae326cbb60a6fdbaab2a9e17ae797e50ef9, Count: 37
CiteID: ef65888f058826d3dfb350a279e3c9cb77fe13db, Count: 35
CiteID: cb697c534090a46990932d30dbf4986f48d1825e, Count: 34
CiteID: cd5ca7e5090d2f88ba3f20e8bde54ca3663767ca, Count: 34
CiteID: c2b8a4cb27d063b0fb14a62022e7b482ccbca8df, Count: 32
CiteID: b2127104a07ba8239fa18e05b8f58b758d07efb4, Count: 32
CiteID: d4d1949aaf00eb8e1a55a82189d71900bd413271, Count: 31
CiteID: da5a7f39c2fc7945d9cd50a176c9c66e3fe0a086, Count: 29
CiteID: 31105e93e48ea143e1bb8b0fdb52d16301208b8f, Count: 28
CiteID: 7c681412671f1fec58a717e656007fca5cff52df, Count: 28
CiteID: 2326a3b9b3d8013b701ea81bb750e88212d22f19, Count: 27
CiteID: 59c4acaa4aeb4575e9a5588811aaaee36176bfe6, Count: 27
CiteID: 3268987508240bd43b0e3095808513d0

In [18]:
# Keyword to look for (replace with your own); lower-cased so the search
# is case-insensitive
keyword = "machine learning"
keyword = keyword.lower()

# Predicates whose object text is searched
text_predicates = [ARXIV.title, ARXIV.abstract, ARXIV.body]

# Total number of occurrences of the keyword, summed over the lower-cased
# text of every matching triple
keyword_count = sum(
    str(obj).lower().count(keyword)
    for _subject, predicate, obj in g
    if predicate in text_predicates
)

print(f'The keyword "{keyword}" appears {keyword_count} times.')


The keyword "machine learning" appears 859 times.


In [22]:
from collections import Counter
from nltk.corpus import stopwords
import re
import nltk

# Get the list of English stop words. The stopwords corpus is not bundled
# with NLTK, so on a fresh environment the lookup raises LookupError; fetch
# the corpus once in that case and retry.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Add specific words to the stop words list
stop_words.update(['also', 'ref', 'cite', 'one', 'two', '1', 'e', 'using', 'figure', 'given'])

# Predicates whose object text is tokenized (built once, outside the loop)
text_predicates = [ARXIV.title, ARXIV.abstract, ARXIV.body]

# Running tally of every non-stop-word token
word_counter = Counter()

# Iterate over each statement in the graph
for s, p, o in g:
    if p not in text_predicates:
        continue
    # Lower-case the text, tokenize it into words with a regular expression,
    # drop the stop words, and add the remainder to the tally
    words = re.findall(r'\b\w+\b', str(o).lower())
    word_counter.update(word for word in words if word not in stop_words)

# Find the most common words
common_words = word_counter.most_common(10)

for word, count in common_words:
    print(f'Word: {word}, Count: {count}')



Word: formula, Count: 331853
Word: model, Count: 12048
Word: state, Count: 10932
Word: time, Count: 9631
Word: data, Count: 9564
Word: quantum, Count: 9484
Word: system, Count: 8434
Word: order, Count: 8034
Word: case, Count: 7961
Word: number, Count: 7951


### HITS Scores

In [23]:
# Run HITS on the citation graph; nx.hits returns the hub scores first and
# the authority scores second
hub_scores, authority_scores = nx.hits(nx_graph)

# (node, score) pairs ordered by descending authority score
top_authorities = list(authority_scores.items())
top_authorities.sort(key=lambda pair: pair[1], reverse=True)

# Show the five strongest authorities
for node, score in top_authorities[:5]:
    print(f'Node: {node}, Authority Score: {score}')

print()

# Same ranking and display for the hub scores
top_hubs = list(hub_scores.items())
top_hubs.sort(key=lambda pair: pair[1], reverse=True)
for node, score in top_hubs[:5]:
    print(f'Node: {node}, Hub Score: {score}')

Node: http://arxiv.org/1207.7214, Authority Score: 0.0010448481862151621
Node: http://arxiv.org/1207.7235, Authority Score: 0.001044848186215162
Node: http://arxiv.org/1201.4330, Authority Score: 0.0010203597688444156
Node: http://arxiv.org/1907.13234, Authority Score: 0.0009996509720124245
Node: http://arxiv.org/1608.01902, Authority Score: 0.0009996509720124245

Node: http://arxiv.org/2009.00516, Hub Score: 0.8604648613325919
Node: http://arxiv.org/2008.06494, Hub Score: 0.03426114842935147
Node: http://arxiv.org/1805.00736, Hub Score: 0.01974284603624697
Node: http://arxiv.org/2007.08542, Hub Score: 0.018594643479388804
Node: http://arxiv.org/1802.09886, Hub Score: 0.018535168449002236
