In [1]:
import json
from rdflib import Graph, Literal, BNode, Namespace, RDF, URIRef
import holoviews as hv
import networkx as nx

### Graph Creation

In [2]:
# Module-level RDF graph that create_rdf_graph() below fills with triples
g = Graph()

def create_rdf_graph(data, graph=None):
    """Add the RDF triples describing one paper record to a graph.

    Parameters
    ----------
    data : dict
        One parsed record. The keys read are ``paper_id``, ``discipline``,
        ``metadata`` (``title``, ``update_date``, ``authors``), ``abstract``
        (``text``), ``body_text`` (iterable of dicts; ``text`` is optional),
        ``bib_entries`` (dict of dicts with ``bib_entry_raw`` and an optional
        ``contained_arXiv_ids`` list of ``{'id', 'text'}`` dicts) and
        ``ref_entries`` (dict of dicts with ``type`` and an optional
        ``caption``).
    graph : rdflib.Graph, optional
        Graph that receives the triples. When omitted, the module-level
        graph ``g`` is used, which is what existing callers rely on.

    Returns
    -------
    None
        The graph is modified in place.

    Raises
    ------
    KeyError
        If one of the required keys listed above is missing from ``data``.
    """
    # An empty rdflib Graph is falsy (it defines __len__), so test against
    # None explicitly instead of writing ``graph or g``.
    target = g if graph is None else graph

    ARXIV = Namespace("http://arxiv.org/")

    # Create a URI for this paper
    paper = URIRef(ARXIV + data['paper_id'])
    metadata = data['metadata']

    # Paper-level triples
    target.add((paper, RDF.type, ARXIV.Paper))
    target.add((paper, ARXIV.title, Literal(metadata['title'])))
    target.add((paper, ARXIV.date, Literal(metadata['update_date'])))
    target.add((paper, ARXIV.discipline, Literal(data['discipline'])))
    target.add((paper, ARXIV.abstract, Literal(data['abstract']['text'])))
    # The authors value is stored as one literal, exactly as found in the record
    target.add((paper, ARXIV.Author, Literal(metadata['authors'])))

    # One ARXIV.body triple per body section that carries text
    for section in data['body_text']:
        if 'text' in section:
            target.add((paper, ARXIV.body, Literal(section['text'])))

    # NOTE(review): the BibEntry / RefEntry blank nodes created below are never
    # linked to `paper`, so a query cannot tell which paper an entry belongs
    # to - confirm whether a linking triple is wanted.

    # Bibliography entries; the dictionary keys are not used
    for bib_entry in data['bib_entries'].values():
        bib_entry_bnode = BNode()
        target.add((bib_entry_bnode, RDF.type, ARXIV.BibEntry))
        target.add((bib_entry_bnode, ARXIV.bib_entry_raw, Literal(bib_entry['bib_entry_raw'])))
        # Every arXiv id found in the entry also yields a paper-level `cites` edge
        for arxiv_id in bib_entry.get('contained_arXiv_ids', ()):
            linked_paper = URIRef(ARXIV + arxiv_id['id'])
            target.add((bib_entry_bnode, ARXIV.contained_arXiv_id, linked_paper))
            target.add((paper, ARXIV.cites, linked_paper))
            target.add((bib_entry_bnode, ARXIV.contained_arXiv_text, Literal(arxiv_id['text'])))

    # Reference entries (figures, tables, ...); the dictionary keys are not used
    for ref_entry in data['ref_entries'].values():
        ref_type = ref_entry['type']
        ref_entry_bnode = BNode()
        target.add((ref_entry_bnode, RDF.type, ARXIV.RefEntry))
        target.add((ref_entry_bnode, ARXIV.ref_type, Literal(ref_type)))
        # Only figure and table captions are stored, under separate predicates
        if 'caption' in ref_entry:
            if ref_type == 'figure':
                target.add((ref_entry_bnode, ARXIV.captionfigure, Literal(ref_entry['caption'])))
            elif ref_type == 'table':
                target.add((ref_entry_bnode, ARXIV.captiontable, Literal(ref_entry['caption'])))

# Open your jsonl file: one JSON record per line.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform default.
with open('dataset.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing newline), which json.loads would reject
        if not line.strip():
            continue
        data = json.loads(line)
        create_rdf_graph(data)

### Save the graph to use it in other files

In [3]:
import pickle

# Serialize the populated graph to graph.pkl so it can be loaded elsewhere
with open('graph.pkl', 'wb') as pickle_file:
    pickle.dump(g, pickle_file)

### Visualization of the subgraphs obtained using SPARQL querying

In [4]:
# Load the HoloViews Bokeh plotting extension used by the plots below
hv.extension('bokeh')

# Convert SPARQL results to networkx graph
def sparql_to_networkx_graph(results):
    """Build an undirected networkx graph from query result rows.

    Each row contributes one edge between the string forms of its first and
    second columns; any further columns are ignored.

    Parameters
    ----------
    results : iterable
        Rows that can be indexed by position (at least two columns each).

    Returns
    -------
    networkx.Graph
        Graph whose nodes are the stringified column values.
    """
    edge_graph = nx.Graph()
    edge_graph.add_edges_from((str(row[0]), str(row[1])) for row in results)
    return edge_graph

# Convert networkx graph to holoviews graph
def networkx_to_holoviews_graph(G):
    """Wrap a networkx graph in a HoloViews Graph, positioned with networkx's spring layout.

    Parameters
    ----------
    G : networkx.Graph
        Graph to convert.

    Returns
    -------
    holoviews.Graph
        The element produced by ``hv.Graph.from_networkx``.
    """
    return hv.Graph.from_networkx(G, nx.layout.spring_layout)

In [5]:
# Define the SPARQL query: sample up to 500 (subject, predicate, object) triples
query = """
    SELECT ?subject ?predicate ?object
    WHERE {
        ?subject ?predicate ?object .
    }
    LIMIT 500
"""

# Execute the query
sub = g.query(query)

# Rows here have three columns, so the graph is built directly: the edge joins
# subject and object, and the predicate is kept as an edge attribute. (The
# two-column helper sparql_to_networkx_graph would link subject to predicate
# and drop the object.) The helper functions and the Bokeh extension set up in
# the previous cell are reused instead of being declared a second time.
G1 = nx.Graph()
for subject, predicate, obj in sub:
    G1.add_edge(str(subject), str(obj), predicate=str(predicate))

hv_g1 = networkx_to_holoviews_graph(G1)

# Plot with holoviews
hv_g1.opts(padding=0.1, width=400, height=400, xaxis=None, yaxis=None)


In [6]:
# Select the URI and title of every paper whose discipline is "Statistics".

q1 = '''
    PREFIX ARXIV: <http://arxiv.org/>
    SELECT ?paper ?title
    WHERE {
        ?paper a ARXIV:Paper ;
               ARXIV:title ?title ;
               ARXIV:discipline "Statistics" .
    }
'''

# Run the query against the knowledge graph
subgraph1 = g.query(q1)

# Each (paper, title) row becomes one edge of the plotted graph
G1 = sparql_to_networkx_graph(subgraph1)
hv_g1 = networkx_to_holoviews_graph(G1)

# Render with holoviews (the last expression is the cell output)
hv_g1.opts(
    padding=0.1,
    width=400,
    height=400,
    xaxis=None,
    yaxis=None,
)


In [7]:
# Find all papers on a specific topic by keyword in the title.
# Useful for a researcher collecting the papers on one topic; the keyword
# searched here is "Quantum" (CONTAINS is a case-sensitive substring match).

q2 = '''
PREFIX arxiv: <http://arxiv.org/>

SELECT ?paper ?title 
WHERE {
  ?paper a arxiv:Paper.
  ?paper arxiv:title ?title.
  FILTER (CONTAINS(?title, "Quantum"))
}
'''

# Run the query against the knowledge graph
subgraph2 = g.query(q2)

# Each (paper, title) row becomes one edge of the plotted graph
G2 = sparql_to_networkx_graph(subgraph2)
hv_g2 = networkx_to_holoviews_graph(G2)

# Render with holoviews (the last expression is the cell output)
hv_g2.opts(
    padding=0.1,
    width=400,
    height=400,
    xaxis=None,
    yaxis=None,
)

In [8]:
# Retrieve all papers written by a particular author.
# Useful for exploring everything published by one author, here "Sansit Patnaik"
# (matched as a substring of the paper's author literal).

q3 = '''
PREFIX arxiv: <http://arxiv.org/>

SELECT ?paper ?title
WHERE {
  ?paper a arxiv:Paper.
  ?paper arxiv:Author ?author.
  ?paper arxiv:title ?title.
  FILTER (CONTAINS(?author, "Sansit Patnaik"))
}

'''

# Run the query against the knowledge graph
subgraph3 = g.query(q3)

# Each (paper, title) row becomes one edge of the plotted graph
G3 = sparql_to_networkx_graph(subgraph3)
hv_g3 = networkx_to_holoviews_graph(G3)

# Render with holoviews (the last expression is the cell output)
hv_g3.opts(
    padding=0.1,
    width=400,
    height=400,
    xaxis=None,
    yaxis=None,
)

In [9]:
# Find all papers citing a specific paper.
# Helpful for gauging the impact of one work, here the paper with arXiv id "1807.06209".

q4 = '''
PREFIX arxiv: <http://arxiv.org/>

SELECT DISTINCT ?paper ?title
WHERE {
  ?paper a arxiv:Paper.
  ?paper arxiv:title ?title.
  ?paper arxiv:cites ?linked.
  FILTER (?linked = <http://arxiv.org/1807.06209>)
}
'''

# Run the query against the knowledge graph
subgraph4 = g.query(q4)

# Each (paper, title) row becomes one edge of the plotted graph
G4 = sparql_to_networkx_graph(subgraph4)
hv_g4 = networkx_to_holoviews_graph(G4)

# Render with holoviews (the last expression is the cell output)
hv_g4.opts(
    padding=0.1,
    width=400,
    height=400,
    xaxis=None,
    yaxis=None,
)

In [10]:
# Find the papers whose abstract mentions a specific topic or concept ("ML" in this case).
# The match is a plain substring test on the abstract text.

q5 = '''
    PREFIX ARXIV: <http://arxiv.org/>
    SELECT ?paper ?title
    WHERE {
        ?paper a ARXIV:Paper ;
               ARXIV:title ?title ;
               ARXIV:abstract ?abstract .
        FILTER(contains(?abstract, "ML"))
    }
'''

# Run the query against the knowledge graph
subgraph5 = g.query(q5)

# Graphs for subgraph5: each (paper, title) row becomes one edge
G5 = sparql_to_networkx_graph(subgraph5)
hv_g5 = networkx_to_holoviews_graph(G5)

# Render with holoviews (the last expression is the cell output)
hv_g5.opts(
    padding=0.1,
    width=400,
    height=400,
    xaxis=None,
    yaxis=None,
)

In [11]:
# Get every citation link (paper -> cited paper) in the knowledge graph

q6 = """
    SELECT ?paper ?cited_paper WHERE {
        ?paper <http://arxiv.org/cites> ?cited_paper.
    }
"""

# Run the query against the knowledge graph
subgraph6 = g.query(q6)

# Graphs for subgraph6: each (paper, cited_paper) row becomes one edge
G6 = sparql_to_networkx_graph(subgraph6)
hv_g6 = networkx_to_holoviews_graph(G6)

# Render with holoviews (the last expression is the cell output)
hv_g6.opts(
    padding=0.1,
    width=400,
    height=400,
    xaxis=None,
    yaxis=None,
)
