<a href="https://colab.research.google.com/github/restrepo/inspire/blob/master/newinspire.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## New inspire hep
Use the API of [inspirehep.net](https://inspirehep.net/) to get the CMS articles with UDEA authors.

> In order to avoid overwhelming the server, we enforce rate limits per IP address: every IP address is allowed 50 requests, then at most 2 requests per second. If you exceed those limits, you will receive a response with HTTP status code 429 and a x-retry-in header telling you how long to wait before retrying.

In [1]:
import pandas as pd
import time

# Search formula for CMS articles with UDEA authors:
q='(cn cms and aff antioquia u.) or aff antioquia u.'
size=10 # must be low for articles from collaborations
page=1
def query(q,size,page):
    return 'https://inspirehep.net/api/literature?q={}&size={}&page={}&doc_type=published'.format(
            q.replace(' ','+'),size,page)
df=pd.read_json( query(q,size,page) )
ih=pd.DataFrame( [d.get('metadata') for d in df.loc['hits'].get('hits') ] )
total=df.loc['total'].get('hits')
for page in range(2,total//size+2):
    time.sleep(1)
    df=pd.read_json( query(q,size,page) )
    #Get the real hep json → https://inspirehep.net/schemas/records/hep.json
    ih=ih.append( pd.DataFrame( [d.get('metadata') for d in df.loc['hits'].get('hits') ] ),
                 sort=False  ).reset_index(drop=True)
del(df)
print('Number of articles → {}'.format(ih.shape[0]))

Number of articles → 327


Extract UDEA authors from each article

In [2]:
ih['authors_udea']=ih['authors'].apply(lambda l: [  [d.get('full_name') for dd in d.get('affiliations')  if dd.get('value')=='Antioquia U.'  ]  
                                  for d in l if d.get('affiliations')  ]  
             ).apply(lambda l: [ll[0] for ll in l if ll]).apply('; '.join )

Simplify main columns

In [21]:
ih['doi']=ih['dois'].apply(lambda l: [d.get('value') for d in l  if d.get('value')] if isinstance(l,list) else ['']).str[0]
ih['title']=ih['titles'].apply(lambda l:  [d.get('title') for d in l if d.get('title')]).str[0]
ih['journal']=ih['publication_info'].apply(  lambda l:  [d.get('journal_title') for d in l 
                                                         if d.get('journal_title')]).str[0]
ih['year']=ih['publication_info'].apply(  lambda l:  [d.get('year') for d in l 
                                                         if d.get('year')]).str[0]

Export to excel

In [28]:
ih.shape

(327, 51)

In [29]:
ih.year

0      2017
1      2014
2      2014
3      2019
4      2019
       ... 
322    1998
323    1980
324    2008
325    2019
326    2013
Name: year, Length: 327, dtype: int64

In [51]:
pb=ih[(ih['doi']!='') & (ih['year']>2016) & (ih['year']<2020)].sort_values('year',ascending=False).reset_index(drop=True)

In [52]:
pb['author_udea']=pb['authors_udea'].str.split(';').str[0]

In [42]:
pb[['title', 'authors_udea', 'doi','journal','author_count','citation_count', 'citation_count_without_self_citations']
   ].to_excel('data/cms.xlsx',index=False)

In [61]:
pb['journal']=pb['journal'].str.replace('^Phys.Rev.Lett.','Physical Review Letters'
            ).str.replace('^Phys.Rev.D','Physical Review D'
            ).str.replace('^Phys.Lett.B','Pysics Letters B'
            ).str.replace('^JHEP','Journl of High Energy Physics'
            ).str.replace('^JCAP','Journal of Cosmology and Astrpoarticle Physics'
            ).str.replace('^Eur.Phys.J.C','European Journal of Physics C'
            ).str.replace('^J.Phys.G','Journal of Physics G'
                         )

In [62]:
pb.journal.unique()

array(['Physical Review Letters', 'Physical Review D', 'Pysics Letters B',
       'Journl of High Energy Physics',
       'Journal of Cosmology and Astrpoarticle Physics',
       'European Journal of Physics C', 'Class.Quant.Grav.',
       'Mod.Phys.Lett.A', 'Int.J.Mod.Phys.A', 'Int.J.Mod.Phys.D', 'JINST',
       'Journal of Physics G'], dtype=object)

In [64]:
pb[['title','author_udea','journal','year','authors_udea']].to_excel('data/gfif.xlsx',index=False)

In [6]:
!mkdir -p data

In [7]:
ih[['title', 'authors_udea', 'doi','journal','author_count','citation_count', 'citation_count_without_self_citations']
   ].to_excel('data/cms.xlsx',index=False)

Check excel file:

In [8]:
pd.read_excel('data/cms.xlsx')[:2]

Unnamed: 0,title,authors_udea,doi,journal,author_count,citation_count,citation_count_without_self_citations
0,Search for a heavy resonance decaying to a top...,"Ruiz Alvarez, José David",10.1007/JHEP09(2017)053,JHEP,2230,15,4
1,Search for New Physics in Events with Same-Sig...,"Ruiz Alvarez, José David",10.1007/JHEP01(2014)163,JHEP,2210,181,107
