# Build Network Data from database

###  Imports and setup

In [1]:
import pandas as pd
import numpy as np
import datetime
from tqdm.notebook import tqdm

In [2]:
import networkx as nx

In [3]:
import requests
from requests.exceptions import HTTPError

import json, sys

In [4]:
import itertools

In [5]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

In [6]:
import pymongo
from pymongo import MongoClient
db_client = MongoClient('172.19.31.5', 27017)
db = db_client['inspirehep']   # use db mongo
articles = db['cern']        # use collection

##  Clean and Prepare Data

### Fetch docs from database

In [7]:
cursor = articles.find()

convert db cursor to pandas dataframe (this might take a while...)

In [8]:
df = pd.DataFrame(list(cursor))

The number of documents is

In [9]:
df.shape

(67031, 6)

and they look like

In [10]:
df.sample(10)

Unnamed: 0,_id,citations_ids,created,links,metadata,updated
5321,1729547,,2019-04-15T00:00:00+00:00,{'citations': 'https://inspirehep.net/api/lite...,"{'citation_count_without_self_citations': 11, ...",2021-01-09T17:04:31.132996+00:00
22230,1769799,,2019-12-11T00:00:00+00:00,{'bibtex': 'https://inspirehep.net/api/literat...,"{'citation_count_without_self_citations': 1, '...",2020-11-27T03:59:57.994804+00:00
42235,1467838,[],2016-06-07T00:00:00+00:00,{'bibtex': 'https://inspirehep.net/api/literat...,"{'citation_count_without_self_citations': 0, '...",2021-01-06T18:57:30.759053+00:00
14344,1694007,"[1769945, 1705328, 1800660, 1714713, 1748796, ...",2018-09-14T00:00:00+00:00,{'bibtex': 'https://inspirehep.net/api/literat...,"{'collaborations': [{'value': 'ATLAS'}], 'cita...",2020-11-27T02:56:48.596468+00:00
39076,900828,,2011-05-22T00:00:00+00:00,{'citations': 'https://inspirehep.net/api/lite...,"{'citation_count_without_self_citations': 2, '...",2020-11-26T20:23:26.224020+00:00
15794,1702422,,2018-11-08T00:00:00+00:00,{'bibtex': 'https://inspirehep.net/api/literat...,"{'citation_count_without_self_citations': 17, ...",2021-01-09T19:04:26.793668+00:00
8974,1281063,,2014-02-13T00:00:00+00:00,{'bibtex': 'https://inspirehep.net/api/literat...,"{'citation_count_without_self_citations': 17, ...",2020-11-26T22:52:31.379904+00:00
54599,1364718,"[1764575, 1686740, 1497401, 1393059, 1749069, ...",2015-04-27T00:00:00+00:00,{'bibtex': 'https://inspirehep.net/api/literat...,"{'citation_count_without_self_citations': 48, ...",2021-01-09T12:08:30.616275+00:00
43040,1764201,,2019-11-12T00:00:00+00:00,{'bibtex': 'https://inspirehep.net/api/literat...,"{'citation_count_without_self_citations': 0, '...",2021-01-07T01:03:38.200305+00:00
27039,1089854,,2012-02-21T00:00:00+00:00,{'citations': 'https://inspirehep.net/api/lite...,"{'citation_count_without_self_citations': 25, ...",2020-11-26T21:04:55.717528+00:00


notice that the "numerical" values are of type string...

### Docs with no citations

Now we extract from the dataframe those rows with missing values

In [11]:
has_NaN = df.isnull().any(axis=1)

In [12]:
df[has_NaN].sample(5)

Unnamed: 0,_id,citations_ids,created,links,metadata,updated
16925,1265470,,2013-11-22T00:00:00+00:00,{'citations': 'https://inspirehep.net/api/lite...,"{'citation_count_without_self_citations': 62, ...",2020-11-26T22:42:50.664315+00:00
33787,920977,,2011-07-28T00:00:00+00:00,{'citations': 'https://inspirehep.net/api/lite...,"{'citation_count_without_self_citations': 2, '...",2021-01-07T05:33:05.796318+00:00
30237,1611290,,2017-07-25T00:00:00+00:00,{'bibtex': 'https://inspirehep.net/api/literat...,"{'citation_count_without_self_citations': 13, ...",2020-11-27T01:58:18.979559+00:00
60192,1429613,,2016-03-17T00:00:00+00:00,{'bibtex': 'https://inspirehep.net/api/literat...,"{'citation_count_without_self_citations': 0, '...",2021-01-16T10:29:44.455734+00:00
39594,1625356,,2017-09-26T00:00:00+00:00,{'bibtex': 'https://inspirehep.net/api/literat...,"{'citation_count_without_self_citations': 0, '...",2020-11-27T02:06:25.826075+00:00


This `NaN` values are related to the articles that are citing those in the collaboration. To double-check this lets run a count

In [13]:
def check_collab(metadata_dict):
    if 'collaborations' in metadata_dict.keys():
        return np.any([(collab['value'] in ['ATLAS','CMS','ALICE','LHCb']) for collab in metadata_dict['collaborations']])
    else:
        return 'Not in Collab'

df['metadata'].map( 
    check_collab
).value_counts()

Not in Collab    46610
True             16595
False             3826
Name: metadata, dtype: int64

here 
- `True` means that the doc is in any of the collaborations (ATLAS, CMS, ALICE, LHCb), 
- `False` that the doc is in a collaboration other than the ones listed above,
- `Not in Collab`... is quite self explanatory ;)

Lets replace `NaN` values with an empty list

In [14]:
df['citations_ids'] = df['citations_ids'].fillna(0).apply(lambda value: [] if value==0 else value)

### Unpack values of interest

Unpack values of interest fro the metadata column and add them to a new column

#####  title

In [15]:
df['title'] = df['metadata'].map( 
    lambda metadata_dict: metadata_dict['titles'][0]['title'],
    na_action='ignore'
)

check for missing values

In [16]:
df['title'].isnull().any()

False

#####  document type

(this is a list because a document can have several document types)

In [17]:
df['document_type'] = df['metadata'].map( 
    lambda metadata_dict: metadata_dict['document_type'],
    na_action='ignore'
)

#####  journal info

First we want to know what are the available keys nested in `publication_info` 

In [18]:
def extract_keys(metadata_dict: dict):
    if ('publication_info' in metadata_dict.keys()): 
        key_list = [publication.keys() for publication in metadata_dict['publication_info']]
        return key_list[0]
    else: 
        return []

journal_info_keys = df['metadata'].map( 
    extract_keys
)

In [19]:
journal_info_keys.sample(10)

10308             (cnum, conference_record, parent_record)
30326    (artid, journal_record, page_start, year, jour...
9924     (artid, journal_record, journal_volume, page_s...
260                              (conference_record, cnum)
38122    (artid, journal_record, year, journal_issue, j...
65918    (artid, journal_record, year, parent_record, c...
23983    (artid, journal_issue, page_start, year, journ...
39114                                                   []
21708                                                   []
20071         (artid, journal_title, year, journal_volume)
Name: metadata, dtype: object

with the possible keys being

In [20]:
np.unique(list(itertools.chain.from_iterable(journal_info_keys)))

array(['artid', 'cnum', 'conf_acronym', 'conference_record',
       'journal_issue', 'journal_record', 'journal_title',
       'journal_volume', 'material', 'page_end', 'page_start',
       'parent_isbn', 'parent_record', 'parent_report_number',
       'pubinfo_freetext', 'year'], dtype='<U20')

Lets then just check some specific examples to get the feeling...

In [21]:
journal_info_keys[5]

dict_keys(['conference_record', 'cnum'])

In [22]:
df['metadata'][5]

{'arxiv_eprints': [{'categories': ['nucl-ex', 'hep-ex'],
   'value': '2101.03857'}],
 'author_count': 1,
 'authors': [{'bai': 'V.Feuillard.1', 'full_name': 'Feuillard, Victor'}],
 'citation_count': 0,
 'citation_count_without_self_citations': 0,
 'collaborations': [{'value': 'ALICE'}],
 'control_number': 1840193,
 'document_type': ['conference paper'],
 'inspire_categories': [{'source': 'arxiv', 'term': 'Experiment-Nucl'},
  {'source': 'arxiv', 'term': 'Experiment-HEP'}],
 'preprint_date': '2021-01-11',
 'publication_info': [{'cnum': 'C20-07-07',
   'conference_record': {'ref': 'https://inspirehep.net/api/conferences/1778269'}}],
 'referenced_authors_bais': ['A.Abrahantes.Quintana.1',
  'A.Abramyan.2',
  'A.A.Capon.2',
  'A.Adare.1',
  'A.Adler.2',
  'A.Agostinelli.1',
  'A.Ahmad.Masoodi.2',
  'A.A.Ivanov.1',
  'A.Alfonso.Albero.1',
  'A.Alkin.1',
  'A.Andronic.3',
  'A.Anzo.1',
  'A.A.P.Suaide.1',
  'A.Arefiev.1',
  'A.Arend.1',
  'A.Argentieri.1',
  'A.Asryan.1',
  'A.Augustinus.1',


In [23]:
journal_info_keys[6]

dict_keys(['artid', 'journal_title', 'journal_issue', 'year', 'journal_volume'])

In [24]:
df['metadata'][6]

{'arxiv_eprints': [{'categories': ['physics.gen-ph'], 'value': '1503.00620'}],
 'author_count': 3,
 'authors': [{'affiliations': [{'record': {'ref': 'https://inspirehep.net/api/institutions/903784'},
     'value': 'Denmark, Tech. U.'}],
   'bai': 'O.L.Trinhammer.1',
   'full_name': 'Trinhammer, Ole L.'},
  {'affiliations': [{'record': {'ref': 'https://inspirehep.net/api/institutions/903784'},
     'value': 'Denmark, Tech. U.'}],
   'bai': 'H.G.Bohr.2',
   'full_name': 'Bohr, Henrik G.'},
  {'affiliations': [{'record': {'ref': 'https://inspirehep.net/api/institutions/903784'},
     'value': 'Denmark, Tech. U.'}],
   'bai': 'M.S.Jensen.1',
   'full_name': 'Jensen, Mogens Stibius'}],
 'citation_count': 9,
 'citation_count_without_self_citations': 1,
 'control_number': 1347055,
 'document_type': ['article'],
 'inspire_categories': [{'term': 'General Physics'}],
 'preprint_date': '2014-12-07',
 'publication_info': [{'artid': '1550078',
   'journal_issue': '14',
   'journal_title': 'Int.J.Mo

In [25]:
journal_info_keys[67004]

dict_keys(['year', 'parent_record', 'conference_record', 'journal_volume', 'page_start', 'journal_title', 'page_end', 'cnum'])

So, apparently conference recods/papers appear with the keys `cnum` and articles published in journals have the `journal_title` key. 

NOTE: I decide then to only keep the jornal name IF the paper is published on a journal. Those that are conference papers/conference records are discarded. I do so because we are interested in articles that are published in journals.

In [26]:
def extract_journal(metadata_dict: dict):
    if ('publication_info' in metadata_dict.keys()): 
         return list(np.unique([publication_info['journal_title'] for publication_info in metadata_dict['publication_info'] if ('journal_title' in publication_info.keys())]))
    else: 
        return []

journal_name = df['metadata'].map( 
    extract_journal
)

...and finally assign to column in dataframe. Notice that the result is a column in which each row contains a list with the name of the journals.

In [27]:
df['journal'] = journal_name

##### Category

Extract INSPIRE category of each document

In [28]:
def extract_category(metadata_dict: dict):
    if ('inspire_categories' in metadata_dict.keys()): 
         return [category['term'] for category in metadata_dict['inspire_categories']]
    else: 
        return []

categories = df['metadata'].map( 
    extract_category
)

In [29]:
categories.sample(10)

55952                                    [Instrumentation]
22233                                     [Experiment-HEP]
16914      [Experiment-HEP, Instrumentation, Astrophysics]
61787                                     [Experiment-HEP]
65677                                    [Instrumentation]
65332    [Phenomenology-HEP, Theory-HEP, Phenomenology-...
24995                                       [Astrophysics]
55057                         [Computing, Instrumentation]
49363                         [Phenomenology-HEP, Lattice]
10443                           [Phenomenology-HEP, Other]
Name: metadata, dtype: object

In [32]:
df['categories'] = categories

##### Date

To normalize the date (dates are in different formats) transform the `created` colum into a datetime object and then back to string

In [37]:
df['created'] = pd.to_datetime(df['created']).map(lambda date: date.strftime("%Y-%m-%d"))

---

Lets look at the end result of data cleaning and preparation

In [38]:
df.sample(10)

Unnamed: 0,_id,citations_ids,created,links,metadata,updated,title,document_type,journal,categories
53113,1444354,[],2016-04-07,{'bibtex': 'https://inspirehep.net/api/literat...,"{'citation_count_without_self_citations': 0, '...",2021-01-07T03:57:30.683080+00:00,Inclusive production of $H\rightarrow b\bar{b}...,[article],[EPL],[Phenomenology-HEP]
45857,809498,[],2009-07-03,{'citations': 'https://inspirehep.net/api/lite...,"{'citation_count_without_self_citations': 2, '...",2021-01-07T08:55:16.890119+00:00,Study of ATLAS sensitivity to asymmetries in s...,[thesis],[],[Experiment-HEP]
37942,1328513,[],2014-11-18,{'citations': 'https://inspirehep.net/api/lite...,"{'citation_count_without_self_citations': 79, ...",2021-01-09T11:39:19.440685+00:00,Introduction to parton-shower event generators,[conference paper],[],[Phenomenology-HEP]
13620,1294485,[],2014-05-06,{'citations': 'https://inspirehep.net/api/lite...,"{'citation_count_without_self_citations': 0, '...",2020-11-26T23:01:01.226993+00:00,Quarkonium results at the LHC,[conference paper],[Nuovo Cim.C],[Experiment-HEP]
26875,1692797,[],2018-09-06,{'bibtex': 'https://inspirehep.net/api/literat...,"{'citation_count_without_self_citations': 0, '...",2021-01-07T02:10:06.642220+00:00,Studying flavor-changing neutral tqZ couplings...,[article],[Mod.Phys.Lett.A],"[Phenomenology-HEP, Phenomenology-HEP]"
19765,1514166,[],2017-02-20,{'citations': 'https://inspirehep.net/api/lite...,"{'citation_count_without_self_citations': 6, '...",2020-11-27T01:31:00.332102+00:00,The Full Event Interpretation for Belle II,[thesis],[],[Experiment-HEP]
42964,1469256,[],2016-06-15,{'bibtex': 'https://inspirehep.net/api/literat...,"{'citation_count_without_self_citations': 11, ...",2021-01-06T18:42:29.553351+00:00,$h\to\mu\tau$ and muon g-2 in the alignment li...,[article],[Nucl.Phys.B],[Phenomenology-HEP]
28645,1611681,"[1749017, 1768528, 1726238, 1664648, 1713226, ...",2017-07-27,{'bibtex': 'https://inspirehep.net/api/literat...,"{'citation_count_without_self_citations': 6, '...",2021-01-07T17:12:28.405614+00:00,Observation of $D^0$ meson decays to $\pi^+\pi...,[article],[Phys.Rev.Lett.],[Experiment-HEP]
53865,701785,[],2006-01-06,{'citations': 'https://inspirehep.net/api/lite...,"{'citation_count_without_self_citations': 3, '...",2020-11-26T18:01:06.991410+00:00,Recent advancements in the development of radi...,[conference paper],[Nucl.Instrum.Meth.A],[Instrumentation]
6015,1834020,[],2020-12-01,{'citations': 'https://inspirehep.net/api/lite...,"{'citation_count_without_self_citations': 0, '...",2021-01-07T02:45:02.644891+00:00,Collider constraints on dark mediators,[article],[],[Phenomenology-HEP]


## Build Network

Now that the data is ready we build the network.

In [53]:
G = nx.DiGraph()

### Nodes

First lets drop the unnecessary colums

In [40]:
df = df.drop(columns=['links','metadata','updated'])

and then build the nodes: a list of tuples where each element is a node. Each tuple (a.k.a. node) contains on its first element the node `id` and the second element is a dict with additional data.

To acchieve this lets get the `_id` column and convert the rest of the columns into a list of dicts. After, they can be zipped into the required list.

In [54]:
nodes = zip(df['_id'], df.drop(columns=['_id']).to_dict('records'))

(notice `zip` returns an iterable and not a list)

Add nodes to the network

In [42]:
G.add_nodes_from(nodes)

and export them to a file as well

In [43]:
df.to_csv('nodes.csv', index=False)

###  Edges

Now we build the edges

In [None]:
edges = []
#iterate over dataframe rows (aka each article)
for idx in tqdm(range(df.shape[0])):
    # and then over every citation
    for citation_id in df.iloc[idx]['citations_ids']:
        edge = { 
            'article_id': df.iloc[idx]['_id'],
            'article_title': df.iloc[idx]['title'],
            'citation_id': citation_id,
            'citation_title': df[df['_id'] == str(citation_id)]['title'].values[0]    # find the citation title on the dataframe
        }
        edges.append(edge)
        
        # add edge to the networkx graph
        G.add_edge(edge['citation_id'], edge['article_id'], 
                   citation_title=edge['citation_title'], 
                   article_title=edge['article_title'] 
                  )
        
edges_df = pd.DataFrame(edges)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=67031.0), HTML(value='')))

In [45]:
edges_df.sample(10)

Unnamed: 0,article_id,article_title,citation_id,citation_title
9860,1749379,Search for supersymmetry in proton-proton coll...,1757727,Axiogenesis
90225,1093733,Measurement of the azimuthal anisotropy for ch...,1114983,Study on initial geometry fluctuations via par...
109986,1082456,Measurement of the charge asymmetry in top-qua...,1207587,Data driving the top quark forward--backward a...
7094,1286320,First combination of Tevatron and LHC measurem...,1286587,Is Higgs inflation ruled out?
188880,1426744,Search for electroweak production of charginos...,1464830,Electroweakino pair production at the LHC: NLO...
36241,1696607,Combined measurements of Higgs boson couplings...,1735333,Search for lepton flavour violating decays of ...
37803,1691634,Observation of $H \rightarrow b\bar{b}$ decays...,1815813,Evidence for Higgs boson decay to a pair of muons
32374,1219960,Improved luminosity determination in pp collis...,1254228,Search for dark matter in events with a hadron...
212266,1394580,Transverse momentum dependence of D-meson prod...,1490655,Effective field theory approach to open heavy ...
36327,1696607,Combined measurements of Higgs boson couplings...,1809953,Measurement of the properties of Higgs boson p...


In [46]:
edges_df.to_csv('edges.csv', index=False)

#####  Export the data to gexf and json

In [None]:
from networkx.readwrite import json_graph
from bson import json_util

In [48]:
data_json = json_graph.node_link_data(G)
with open('network.json', 'w') as outfile:
    json.dump(data_json,outfile, default=json_util.default)

In [None]:
nx.write_gexf(G, "network.gexf")