## GFIF papers

In [1]:
import pandas as pd

Make a GFIF query

In [2]:
# Build the INSPIRE search expression and load the matching records as a DataFrame.
# The URL parameters (rg=250, of=recjson) are passed through exactly as written.
q = '(fc p or fc e) and aff antioquia u. and collection:published'
q = '+'.join(q.split(' '))  # every single space becomes '+' for use in the URL
df = pd.read_json('https://inspirehep.net/search?p=find+{}&rg=250&of=recjson'.format(q))

In [3]:
df.shape  # (number of rows, number of columns) of the query result

(195, 40)

Explore one row (a published paper) returned by the previous query

In [54]:
# Select the record(s) whose title contains the given text and show them as a nested dict.
(
    df.loc[
        df['title'].astype(str).str.contains('Singlet-Doublet Dirac Dark Matter and Neutrino Masses')
    ]
    .reset_index(drop=True)
    .to_dict()
)

{'FIXME_OAI': {0: {'id': 'oai:inspirehep.net:1741056',
   'set': ['INSPIRE:HEP', 'ForSCOAP3']}},
 'abbreviated_title': {0: nan},
 'abstract': {0: [{'number': 'APS',
    'summary': 'We examine an extension of the Standard Model that addresses the dark matter puzzle and generates Dirac neutrino masses through the radiative seesaw mechanism. The new field content includes a scalar field that plays an important role in setting the relic abundance of dark matter. We analyze the phenomenology in the light of direct, indirect, and collider searches of dark matter. In this framework, the dark matter candidate is a Dirac particle that is a mixture of new singlet-doublet fields with mass mχ10≲1.1\u2009\u2009TeV. We find that the allowed parameter space of this model is broader than the well-known Majorana dark matter scenario.'},
   {'number': 'arXiv',
    'summary': 'We examine an extension of the Standard Model that addresses the dark matter puzzle and generates Dirac neutrinos masses through 

In [26]:
# Preview a few columns for the first three records.
df[['number_of_citations', 'number_of_authors', 'reference', 'recid']].head(3)

Unnamed: 0,number_of_citations,number_of_authors,reference,recid
0,1,2302,"[{'authors': 'Albajar, C.', 'title': 'Phys.Let...",1764794
1,3,2318,"[{'authors': 'P. Ramond', 'title': 'Phys.Rev.,...",1757579
2,1,3,"[{'authors': 'M. Tanabashi', 'title': 'Phys.Re...",1755250


Example: get citations for some recid

In [28]:
# Take the record id of the first query result and fetch the records that refer to it.
# HTTPS is used here to match the main query above (the original used plain HTTP).
r = df.loc[0, 'recid']
p = pd.read_json(
    'https://inspirehep.net/search?ln=en&p=refersto:recid:{}&of=recjson'.format(r))

Filter published papers (query already filtered!)

In [31]:
# Show the bibliographic columns of the citing records.
p.loc[:, ['title', 'authors', 'publication_info']]

Unnamed: 0,title,authors,publication_info
0,{'title': 'Beauty 2019 -- Conference summary'},"[{'last_name': 'Koppenburg', 'full_name': 'Kop...",{'cnum': 'C19-09-30.1'}


In [4]:
# Keep only the records that carry publication info; renumber the rows from 0.
dfp = df.loc[df['publication_info'].notna()].reset_index(drop=True)

Get the journal info from the Crossref API

In [7]:
# Cache of journal metadata, keyed by the abbreviated journal title.
# It is bound as the default `journal` argument of get_journal_info, which fills it in place.
journal={}

In [8]:
import time
def get_journal_info(row,journal=journal,abrv_journal='publication_info'
                                        ,abrv_journal_title='title'
                                        ,doi='doi'
                                        ,container_title='container_title'
                                        ,issn='ISSN'):
    """
    Fill the `journal` cache with Crossref metadata for the journal of one paper.

    The abbreviated journal title is read from ``row[abrv_journal][abrv_journal_title]``
    and used as the cache key. If that journal has no cached metadata yet and the
    row has a usable DOI, the Crossref ``works`` endpoint is queried for that DOI
    and the first 'container-title' and first 'ISSN' of the response are stored
    under the keys `container_title` and `issn`.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``DataFrame.apply(axis=1)``)
        Must support ``.get``.
    journal : dict
        Cache that is updated in place. The default is the module-level
        ``journal`` dict, bound when the function is defined.
    abrv_journal, abrv_journal_title, doi : str
        Names of the fields read from `row`.
    container_title, issn : str
        Keys written into ``journal[<abbreviated title>]``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed Crossref response when a request was attempted (an empty
        DataFrame if the request failed); None when no request was made.
    """
    # Function-scope import so the cell does not depend on another cell's imports.
    from urllib.parse import quote

    # Abbreviated journal title; anything that is not a non-empty string is
    # treated as "no journal" (it could not serve as a cache key anyway).
    pub_info = row.get(abrv_journal)
    j = pub_info.get(abrv_journal_title) if isinstance(pub_info, dict) else None
    if not isinstance(j, str) or not j:
        j = None
    if j and not journal.get(j):
        journal[j] = {}

    # The DOI field may hold a single string, a list of strings, or a missing
    # value such as NaN (which is truthy, so it must be checked by type).
    strdoi = row.get(doi)
    if isinstance(strdoi, (list, tuple)):
        strdoi = strdoi[0] if strdoi else None
    if not isinstance(strdoi, str) or not strdoi.strip():
        strdoi = None

    # Nothing to do when the journal is already cached or there is no DOI to query.
    if journal.get(j) or strdoi is None:
        return None

    try:
        di = pd.read_json(
            'https://api.crossref.org/works/{}'.format(quote(strdoi.strip())))
    except Exception:
        # Best effort: a failed lookup leaves the cache entry empty so that a
        # later row of the same journal can try again. KeyboardInterrupt is
        # deliberately not caught, so a long crawl can still be stopped.
        di = pd.DataFrame()

    # The work's metadata lives in the 'message' column, indexed by field name.
    # Reading it by name avoids depending on the column order of the frame.
    message = di['message'] if 'message' in di.columns else None
    if j in journal and message is not None:
        for cache_key, crossref_field in ((container_title, 'container-title'),
                                          (issn, 'ISSN')):
            if journal[j].get(cache_key):
                continue
            values = message.get(crossref_field)
            if isinstance(values, (list, tuple)) and len(values) > 0:
                journal[j][cache_key] = values[0]

    # Pause between consecutive requests to the API.
    time.sleep(1)
    return di

In [9]:
# Run the journal lookup once per paper (row-wise); the `journal` cache is filled as a side effect.
di = dfp.apply(get_journal_info, axis='columns')

In [10]:
journal  # display the journal metadata cache collected so far

{'Eur.Phys.J.': {'ISSN': '1434-6044',
  'container_title': 'The European Physical Journal C'},
 'Europhys.Lett.': {'ISSN': '0295-5075',
  'container_title': 'Europhysics Letters (EPL)'},
 'Int.J.Mod.Phys.': {'ISSN': '0217-751X',
  'container_title': 'International Journal of Modern Physics A'},
 'J.Phys.': {'ISSN': '0954-3899',
  'container_title': 'Journal of Physics G: Nuclear and Particle Physics'},
 'JCAP': {'ISSN': '1475-7516',
  'container_title': 'Journal of Cosmology and Astroparticle Physics'},
 'JHEP': {'ISSN': '1029-8479',
  'container_title': 'Journal of High Energy Physics'},
 'JINST': {'ISSN': '1748-0221',
  'container_title': 'Journal of Instrumentation'},
 'Mod.Phys.Lett.': {'ISSN': '0217-7323',
  'container_title': 'Modern Physics Letters A'},
 'Nucl.Phys.': {'ISSN': '0550-3213', 'container_title': 'Nuclear Physics B'},
 'Phys.Lett.': {'ISSN': '0370-2693', 'container_title': 'Physics Letters B'},
 'Phys.Rept.': {'ISSN': '0370-1573', 'container_title': 'Physics Reports'

Normalize data

In [11]:
# Flatten the title field to a plain value: take the first entry when it is a
# list, then take its 'title' key when it is a dict; anything else is kept as is.
dfp['title_TeX'] = (
    dfp['title']
    .apply(lambda entry: entry if not isinstance(entry, list) else entry[0])
    .apply(lambda entry: entry if not isinstance(entry, dict) else entry.get('title'))
)

In [16]:
# Keep a single DOI per paper: the first one when the field is a list.
dfp['DOI'] = dfp['doi'].apply(lambda value: value if not isinstance(value, list) else value[0])

Add Journal metadata

In [13]:
def _pub_year(pub_info):
    """Return the publication year as an int, or 0 when it is missing or not a number."""
    year = pub_info.get('year') if isinstance(pub_info, dict) else None
    try:
        return int(year)
    except (TypeError, ValueError, OverflowError):
        return 0


def _journal_field(pub_info, field):
    """Return `field` from the cached metadata of the paper's journal, or None.

    None is returned when the publication info is not a dict, has no string
    'title', or that title has no entry in the `journal` cache (instead of
    raising AttributeError on the missing entry).
    """
    if not isinstance(pub_info, dict):
        return None
    title = pub_info.get('title')
    if not isinstance(title, str):
        return None
    return (journal.get(title) or {}).get(field)


# Year 0 marks "unknown", so those rows drop out of any later `Year > N` filter.
dfp['Year'] = dfp['publication_info'].apply(_pub_year)

dfp['Journal_name'] = dfp['publication_info'].apply(
    lambda d: _journal_field(d, 'container_title'))
dfp['ISSN'] = dfp['publication_info'].apply(
    lambda d: _journal_field(d, 'ISSN'))

Extract author metadata for affiliation UdeA

In [14]:
# For every paper, list "first_name last_name" for each author whose
# affiliation (converted to text) mentions 'Antioquia U.'.
dfp['udea_authors'] = dfp['authors'].apply(
    lambda author_list: [
        '{} {}'.format(author.get('first_name'), author.get('last_name'))
        for author in author_list
        if 'Antioquia U.' in str(author.get('affiliation'))
    ]
)
# First matching author of each paper.
dfp['first_udea_author'] = dfp['udea_authors'].str.get(0)

Prepare "Sostenibilidad" output

In [17]:
# Columns that make up the report, in output order.
sost = dfp.loc[:, ['title_TeX', 'Journal_name', 'first_udea_author', 'Year', 'ISSN', 'DOI']]

In [18]:
# Export only the papers published after 2017 to a spreadsheet (without the index column).
sost.loc[sost['Year'] > 2017].to_excel('sost.xlsx', index=False)

In [None]:
%%bash
# NOTE(review): `-m` expects a commit message argument; as written here the
# command has none and git will reject it. Supply one, e.g. -am "message".
git commit -am