# Graphing the Provenance of a Paper

I want to visually represent the history of a field leading up to a given paper.

I think a cool way to do this would be as a force-directed graph (in which the connections between nodes are springs in a physics simulation), where each node (that is, each paper) has its position along the x-axis fixed to a timeline.

The first order of business: get the data from the Semantic Scholar API.

In [922]:
import time
from datetime import datetime

import numpy as np
import pandas as pd
import requests

In [1195]:
# Graph API endpoint for a single paper. It requests the paper's own metadata
# plus the same metadata for each of its references.
_S2_PAPER_URL = "https://api.semanticscholar.org/graph/v1/paper/%s?fields=title,year,publicationDate,journal,authors,references.title,references.publicationDate,references.year,references.journal,references.authors"

# Rate-limit handling: how long to pause after an HTTP 429 and how many times
# to retry before giving up.
_RATE_LIMIT_WAIT_SECONDS = 300
_RATE_LIMIT_MAX_RETRIES = 3


def _fetch_paper(paper_id, wait_message):
    """Download one paper record; return its parsed JSON, or None on HTTP 404."""
    url = _S2_PAPER_URL % paper_id
    response = requests.get(url)
    for _ in range(_RATE_LIMIT_MAX_RETRIES):
        if response.status_code != 429:
            break
        print(wait_message)
        time.sleep(_RATE_LIMIT_WAIT_SECONDS)
        response = requests.get(url)
    if response.status_code == 404:
        return None
    # Any other failure (including a 429 that outlasted the retries) is raised
    # here instead of being parsed as if it were a paper record.
    response.raise_for_status()
    return response.json()


def _node_record(paper):
    """Build the node entry [title, publication datetime, year, journal, authors]."""
    return [paper['title'],
            datetime.strptime(paper['publicationDate'], '%Y-%m-%d'),
            paper['year'],
            paper['journal'],
            paper['authors']]


def _complete_references(paper):
    """Return the paper's references as a dataframe, dropping rows with any missing field."""
    return pd.DataFrame(paper.get('references') or []).dropna()


def get_provenance(seed_paper, *, fetch=None):
    """Given seed paper, generate nodes (dict) and edge list (dataframe) for its parents and grandparents.

    Parameters
    ----------
    seed_paper : str
        Identifier of the seed paper, in any form the paper endpoint accepts.
    fetch : callable, optional
        ``fetch(paper_id)`` must return a paper record shaped like the API's
        JSON response (a dict with 'paperId', 'title', 'publicationDate',
        'year', 'journal', 'authors' and 'references'), or None when the paper
        is unknown. Defaults to downloading from the SemanticScholar API;
        supply your own to read from a cache or to run offline.

    Returns
    -------
    nodes : dict
        Maps paperId to [title, publication datetime, year, journal, authors]
        for the seed paper, its references, and their references.
    edges : pandas.DataFrame
        One row per citation, with columns "referencing" and "referenced" and
        a fresh 0..n-1 index. The seed paper's own citations come first.

    Raises
    ------
    KeyError
        If the seed paper cannot be found.
    requests.HTTPError
        If the default fetcher gets a non-404 error response, or is still
        rate-limited after the retries.

    Notes
    -----
    References with any missing metadata field are left out of both nodes and
    edges. The seed paper itself is not filtered, so a seed record without a
    publication date fails when the date is parsed.
    """
    if fetch is None:
        seed = _fetch_paper(seed_paper, "Waiting 5 Minutes for access to the API...")
    else:
        seed = fetch(seed_paper)
    if seed is None:
        raise KeyError("Seed paper %s was not found" % seed_paper)

    seed_id = seed['paperId']
    nodes = {seed_id: _node_record(seed)}
    edge_rows = []

    # Parents: the seed paper's fully populated references.
    for _, parent in _complete_references(seed).iterrows():
        nodes[parent['paperId']] = _node_record(parent)
        edge_rows.append((seed_id, parent['paperId']))

    # Grandparents: fetch each parent and record its references. The parent ids
    # are copied out first because edge_rows grows inside the loop.
    parent_ids = [referenced for _, referenced in edge_rows]
    for parent_id in parent_ids:
        if fetch is None:
            parent = _fetch_paper(parent_id, "Waiting 5 Minutes for access to the SemanticScholar API...")
        else:
            parent = fetch(parent_id)
        if parent is None:
            continue
        # The parent's own metadata is already in nodes; only its references are new.
        for _, grandparent in _complete_references(parent).iterrows():
            nodes[grandparent['paperId']] = _node_record(grandparent)
            edge_rows.append((parent['paperId'], grandparent['paperId']))

    # Building from rows keeps both columns present even when there are no edges.
    edges = pd.DataFrame(edge_rows, columns=["referencing", "referenced"])
    return nodes, edges

Now I'll use my function to get the provenance of my paper of choice!

In [1194]:
# Collect the seed paper (identified here by what appears to be its DOI), its
# references, and their references. This makes one API request per reference.
nodes, edges = get_provenance('10.5334/joc.257')

In [1196]:
# The seed paper is the "referencing" entry of the first edge row;
# an empty edge list has no seed and nothing to filter.
seed_id = edges.iloc[0, 0] if len(edges) else None

# list of direct references
direct_refs = list(edges.loc[edges['referencing'] == seed_id, 'referenced'])

# filter out grandparent papers that were only referenced once or twice (otherwise the graph will be really cluttered)
citation_counts = edges['referenced'].value_counts()
referenced_enough = {paper_id: True for paper_id in citation_counts.index[citation_counts > 2]}

# Keep an edge when its target is a direct reference or is cited more than
# twice. One boolean mask replaces dropping rows one at a time.
kept_targets = set(referenced_enough) | set(direct_refs)
keep = edges['referenced'].isin(kept_targets)
for paper_id in edges.loc[~keep, 'referenced']:
    print("Dropped referenced: "+paper_id)
edges = edges.loc[keep]

Dropped referenced: aedad0c48f45ad9b8ed0ededa752e164bacddb0c
Dropped referenced: 038b4c5e2fc06f06058905342a832ab4d09edb8b
Dropped referenced: 03cbfd8cea1b11fdbc7f6bbac0859520b1f71ee3
Dropped referenced: 7c324b84e37987d1da71e0a75e9519c9f946ec5e
Dropped referenced: 45c783115f49045250185056a6684f23fc3afe72
Dropped referenced: ca80892f2e5c05f894b6663010752540bab5f22b
Dropped referenced: 87a775279bdd43847ca692d4258e306052250088
Dropped referenced: 4a0709039e83440e3e792de640712f552da1279a
Dropped referenced: d1e0d17f6b9ddd86ce480682339c41afb5e2bfdf
Dropped referenced: 2309e509e82bf9dba72f1208ba2b81116e0d87db
Dropped referenced: ccb47b50c280bbec3fe841dcf4476d905bf2241e
Dropped referenced: 54c35a6b1b87c631cb76ba2d8a762df888f4d442
Dropped referenced: 4a529dd70012b8527ab8da53f2e4a1d8b9dd7da3
Dropped referenced: fcef282918801d04e2d2a2fba3804f19942fb6a3
Dropped referenced: 0e60bcfd05f7ac3f2d55bb78183bd2e3e2ee4bd8
Dropped referenced: ca0d27a1ecb3976cf57256af91d7543b2b3b61a5
Dropped referenced: baaa

Now that we have the data, let's format it as a network with appropriate node attributes.

In [1199]:
import networkx as nx
import textwrap

# Build an undirected graph with one edge per citation.
G = nx.from_pandas_edgelist(edges,
                            source = 'referencing',
                            target = 'referenced',
                            create_using = nx.Graph())

# Each entry of `nodes` is [title, publication datetime, year, journal, authors].


def _author_credit(authors):
    """Format an author list as 'Surname', 'Surname & Surname', or 'Surname et al.'."""
    if len(authors) == 0:
        return "Unknown Authors"
    first = authors[0]['name'].split()[-1]
    if len(authors) == 1:
        return first
    if len(authors) == 2:
        return first+" & "+authors[1]['name'].split()[-1]
    return first+" et al."


# Label nodes by name: "<authors>, <year>" on one line, then the title wrapped
# at 25 characters. The journal is not part of the label, so it is not read.
node_label = {
    paper_id: _author_credit(record[4])+", "+str(round(record[2]))+"\n"+textwrap.fill(record[0], 25)
    for paper_id, record in nodes.items()
}

nx.set_node_attributes(G, node_label, 'label')

# Define node level as publication date, in 365-day years since 1970-01-01.
# Subtracting datetimes avoids datetime.timestamp(), which interprets naive
# datetimes in the local timezone and can fail for pre-1970 dates on some
# platforms.
epoch = datetime(1970, 1, 1)
node_level = {
    paper_id: (record[1] - epoch).total_seconds()/31536000
    for paper_id, record in nodes.items()
}

nx.set_node_attributes(G, node_level, 'level')

# Vary node size by number of citations (except source node)
seed_id = edges.iloc[0,0]
node_citations = (edges.loc[:,'referenced'].value_counts()*5).to_dict()
node_citations[seed_id] = 50
nx.set_node_attributes(G, node_citations, 'size')

# Vary node color by seed/parent/grandparent
direct_ref_ids = set(direct_refs)
node_color = dict()
for paper_id in nodes:
    if paper_id == seed_id:
        node_color[paper_id] = "#A5243D"
    elif paper_id in direct_ref_ids:
        node_color[paper_id] = "#B48291"
    else:
        node_color[paper_id] = "#AFAAB9"

nx.set_node_attributes(G, node_color, 'color')

Now time to visualize!

In [1205]:
from pyvis.network import Network

# Create the pyvis network. The heading describes the colour scheme and the
# filter applied earlier: second-order references are kept only when they are
# cited more than twice.
net = Network(bgcolor = '#222222', 
              font_color = 'white', 
              layout = True, 
              directed = False,
              heading = "<b>Provenance of Kessler &amp; Oberauer, 2018</b><br>The dark red node is the seed paper, pink nodes are its direct references, and grey nodes are references of references. All second-order references cited fewer than three times within this network have been removed to avoid clutter.")

# Copy nodes, edges and their label/level/size/color attributes from the networkx graph.
net.from_nx(G)

# Set appropriate options
# The string is handed to pyvis as-is: a left-to-right hierarchical layout
# (node levels come from the 'level' attribute) with the
# hierarchicalRepulsion physics solver.
net.set_options("""
const options = {
  "nodes": {
    "borderWidthSelected": 3,
    "opacity": 0.8,
    "font": {
      "size": 12
    },
    "size": null
  },
  "edges": {
    "color": {
      "opacity": 0.5
    },
    "hoverWidth": 5,
    "scaling": {
      "max": 25
    },
    "selectionWidth": 5,
    "selfReferenceSize": null,
    "selfReference": {
      "angle": 0.7853981633974483
    },
    "smooth": false,
    "width": 5
  },
  "layout": {
    "hierarchical": {
      "enabled": true,
      "levelSeparation": 50,
      "direction": "LR"
    }
  },
  "interaction": {
    "hover": true
  },
  "physics": {
    "hierarchicalRepulsion": {
      "centralGravity": 1.55,
      "springLength": 150,
      "springConstant": 0.2,
      "nodeDistance": 200,
      "avoidOverlap": null
    },
    "minVelocity": 0.75,
    "solver": "hierarchicalRepulsion"
  }
}
""")

# Write the interactive graph to an HTML file.
# NOTE(review): some pyvis releases appear to default show() to notebook mode,
# which needs Network(notebook=True) -- confirm against the installed version.
net.show('name.html')