# Get INSPIRE-HEP info for authors of a set of countries:
## from Latin America:

In [1]:
import pandas as pd
import requests
import time
import json

In [2]:
#From https://github.com/inspirehep/rest-api-doc#rate-limiting
#every IP address is allowed 15 requests in a 5s window.
collaborations=['STAR', 'MicroBooNE', 'CLAS', 'DUNE', 'NA62', 'PHENIX', 'BESIII', 'DES', 'Super-Kamiokande', 'LIGO Scientific', 'KM3NeT', 'HAWC', 
                'Pierre Auger', 'IceCube', 'CTA Observatory', 'Telescope Array', 'MAGIC', 'SuperCDMS', 'CUPID', 'CUORE', 'KATRIN', 'REDTOP', 'KAGRA', 
                'Muon Collider', 'Mu2e-II', 'ILC International Development Team', 'Daya Bay', 'Virgo', 'NEMO-3', 'ATLAS', 'Euclid', 'LHCb', 'CMS', 
                'ANTARES', 'GERDA', 'HADES', 'n_TOF', 'PANDA', 'Jefferson Lab Hall A', 'LUX-ZEPLIN', 'Belle', 'LHAASO', 'H.E.S.S.', 'JUNO', 'AMS', 
                'XENON', 'SHiP', 'ALICE', 'NA61/SHINE', 'DAMPE', 'H1', 'NOvA', 'CTA LST Project', 'OPERA', 'COMPASS', 'GlueX', 'LEGEND', 
                'Event Horizon Telescope', 'MPD', 'LiteBIRD', 'Double Chooz', 'Fermi-LAT', 'nEXO', 'SDSS-IV', 'NEXT', 'CREX', 'BaBar', 
                'CDF', 'DarkSide', 'EXO-200', 'Tibet ASγ', 'SPT-3G', 'Mice', 'CTA Consortium', 'SNO+', 'CSNS Back-n', 'WA105', 'NA48/2', 
                'Muon g-2', 'DEAP', 'T2K', 'Charm', 'HESS', 'ICARUS', 'Hess', 'CTA', 'Mu3e', 'Planck', 'Hyper-Kamiokande', 'MOONS Consortium', 
                'HEP Software Foundation', 'LSPE', 'CMB-S4', 'TianQin', 'QUBIC', 'KLF', 'Particle Data Group', 'DARWIN', 'LZ', 'Insight-HXMT Team', 
                'WASA-at-COSY', 'BOREXINO', 'TOTEM', 'SBND', 'RD42', 'SPTpol', 'IAXO', 'aLIGO', 'LIGO', 'LHeC', 'D0', 'ACT', 'LHC Reinterpretation Forum', 
                'ILD Concept Group', 'RD53', 'Insight-HXMT', 'CPEDM', 'ILD', 'LArIAT', 'Rd42', 'BDX', 'Jefferson Lab E97-110', 'AEDGE', 'AMEGO', 'Simons Observatory', 
                'LUX', 'LISA Pathfinder', 'MSE Science Team', 'CALICE', 'ZEUS', 'E97-110', 'PREX', 'SPD proto', 'AdvLIGO', 'HERMES-SP']
sleep=0.4

Scheme: dicts and lists are the same than in INSPIRE-HEP
```python
{'author_id':str,
 'name':dict, #of author
 'LA_institution_id':str,
 'external_system_identifiers':list, #of institution
 'LA_country':str,
 'LA_institution':str,
 'email_addresses':list, #of author
 'positions':list, #of author
 'ids':list, #of author
 'primary_arxiv_category', list # literature
 'Fields of Study': list # in ['Particle physics','Cosmology','Astrophysics','Mathematical physics']+collaborations
}
```

Sort by Publication Year to get the last affiliations

In [3]:
%%writefile inspirehep.py
import time
import requests
import json

global db,db,LA_countries,fields_of_study,sep,sleep
LA_countries=('Argentina', 'Bolivia', 'Brazil', 'Chile', 'Colombia', 'Cuba', #5
              'Costa Rica', 'Ecuador', 'El Salvador', 'Guatemala', 'Honduras', #10
              'Mexico', 'Nicaragua', 'Panama', 'Paraguay', 'Peru', #15
              'Dominican Republic','Uruguay','Venezuela') #18
fields_of_study={'Particle physics','Cosmology','Astrophysics','Mathematical physics'}
sep='arxiv.org/abs/'
#From https://github.com/inspirehep/rest-api-doc#rate-limiting
#every IP address is allowed 15 requests in a 5s window.
sleep=0.4

def get_index(db,index='author_id'):
    i=0
    d_index={}
    for d in db:
        d_index[ d.get(index) ]=i
        i=i+1
    return d_index

def get_work(db,a,idsLA,idsAU,db_index,FoS,PAC):
    """
    Use author dictionary from INSPIRE-HEP literature API 'authors list, to get author info:
    * author_id
    * url_author
    and update 'Fields of Study' and 'primary_arxiv_category' if author alread in db
    """
    author_id=None
    url_author=None        
    try:
        url_author=a.get('record').get('$ref')
    except:
        return True,author_id,url_author,FoS,PAC #next author (Not author Profile)
    if isinstance(url_author,str):
        author_id=url_author.split('/')[-1]
    else:
        return True,author_id,url_author,FoS,PAC #next author
    if author_id in idsAU:
        try:
            FoS=list(set(db[db_index[author_id]].get('Fields of Study')).union(FoS))
            try: #update db
                db[db_index[author_id]]['Fields of Study']=FoS
            except:
                pass
        except:
            pass
        try:
            PAC=list(set(db[db_index[author_id]].get('primary_arxiv_category')).union(PAC))
            try: #update db
                db[db_index[author_id]]['primary_arxiv_category']=PAC
            except:
                pass                
        except:
            pass

        #print('DEBUG:already there')
        return True,author_id,url_author,FoS,PAC #next author
    return False,author_id,url_author,FoS,PAC
    
def get_institutions(db,i,idsLA,idsAU,db_index,FoS,sleep):
    """
    * Use affiliation dictionary from affilitions list of author dictionary to get:
     `inst_name`
     `inst_url`
    * Use INSPIRE-HEP insitution API to get: 
      `country`
      'external_system_identifiers'
    """
    CONTINUE=True
    country=None
    esi=None
    inst_id=None
    inst_name=None
    try:
        inst_name=i.get('value')
        inst_url=i.get('record').get('$ref')
    except:
        #Next institution
        return CONTINUE,inst_id,inst_name,country,esi
    inst_id=inst_url.split('/')[-1]

    if inst_id in idsLA:
        #print('DEBUG: Institution already there')
        country=[ d.get('LA_country') for d in db 
                  if d.get('LA_institution_id')==inst_id ][0]                
        esi=[ d.get('external_system_identifiers') for d in db 
                  if d.get('LA_institution_id')==inst_id ][0]
        CONTINUE=False
        return CONTINUE,inst_id,inst_name,country,esi        
    else:
        #new institution
        time.sleep(sleep)
        ri=requests.get(inst_url)
        if ri.status_code==200:
            di=ri.json()
        else:
            return CONTINUE,inst_id,inst_name,country,esi

        try:
            country=di.get('metadata').get('addresses')[0].get('country')
        except:
            country=None
        if country not in LA_countries:
            return CONTINUE,inst_id,inst_name,country,esi
        else:
            #Get insitution metada from inspire
            esi=di.get('metadata').get('external_system_identifiers')
            if not esi:
                esi=[]
            CONTINUE=False
            return CONTINUE,inst_id,inst_name,country,esi
        
def get_author(url_author):
    """
    Get author profile from INSPIRE-HEP author API
    """
    if isinstance(url_author,str):
        time.sleep(sleep)
        epa=requests.get(url_author)

    if epa.status_code==200:
        epa=epa.json()

    positions=epa.get('metadata').get('positions')
    if not positions:
        positions=[]

    email_addresses=epa.get('metadata').get('email_addresses')

    if not email_addresses:
        email_addresses=[]

    name=epa.get('metadata').get('name')
    if not name:
        name={}

    aids=epa.get('metadata').get('ids')
    if not aids:
        aids=[]
    return positions,email_addresses,name,aids            

Overwriting inspirehep.py


In [4]:
from inspirehep import *

In [5]:
db=[]

### INSPIRE-HEP APIs:
For each DOI or arXiv identifier:
* (1) From literature API→ get authors info
    * `if` author info not already in db `for` each author:
        * (2) institution API → get institutions info
            * `if` institution is from Latin America `for` each one of the author's institutions:
                * (3) author API → get author profile
  * `else` autor already in db: update `'Field of Study'` and `'primary_arXiv_category'`

In [6]:
LA_countries[10:11]

('Honduras',)

In [7]:
ii=0
len_db_old=0
store=1000
for c in LA_countries[10:11]:
    url=f'https://inspirehep.net/api/institutions?q={c}'
    r=requests.get(url)
    time.sleep(sleep)

    if r.status_code!=200 or not isinstance(r.json(),dict):
        print(f'WARNING: {c} failed')
        continue

    #Institutions of each country
    for l in r.json().get('hits').get('hits'):
        #============Prepare Output files===========
        if ii>0 and ii%store==0 and len(db)!=len_db_old:
            len_db_old=len(db)
            print('')
            print(f'{ii} → db_LA.json updated with size: {len(db)}')
            f=open('db_LA.json','w')
            json.dump(db,f)
            f.close()    
        #==================================
        #UPDATE LISTS
        if db:
            idsLA=[d.get('LA_institution_id') for d in db]
            idsAU=[d.get('author_id') for d in db]
            db_index=get_index(db)
        else:
            idsLA=[]
            idsAU=[]
            db_index={}
        
        if l.get('metadata').get('number_of_papers')==0:
            continue
        aff=l.get('metadata').get('legacy_ICN').replace(' ','+')
        urlaff=f'https://inspirehep.net/api/literature?size=250&page=1&q=aff+{aff}+and+ac+1->+10'
        rl=requests.get( urlaff  )
        time.sleep(sleep)
        
        if rl.status_code!=200 or not isinstance(rl.json(),dict):
            print(f'WARNING: {c}→{aff} failed')
            continue
        if not rl.json().get('hits').get('hits'):
            continue
            
        for ll in rl.json().get('hits').get('hits'):
            ii=ii+1
            print(ii,end='\r')
            lit=ll.get('metadata')

            la=lit.get('authors')
            FoS=[]
            try:
                PAC=lit['primary_arxiv_category']
            except:
                PAC=[]
            for a in la:
                CONTINUE,author_id,url_author,FoS,PAC=get_work(db,a,idsLA,idsAU,db_index,FoS,PAC)
                if CONTINUE:
                    continue #Next author
                #affiliations... → LA required
                li=a.get('affiliations')
                if not li:
                    continue #Next author
                for i in li:
                    #(2) institutions API
                    CONTINUE,inst_id,inst_name,country,esi=get_institutions(db,i,idsLA,idsAU,db_index,FoS,sleep)
                    if CONTINUE: #Not LA institutiion
                        continue #next institution
                    #(3) author API
                    positions,email_addresses,name,aids=get_author(url_author)
                    db.append(
                        {'author_id':author_id,
                         'name':name,
                         'LA_institution_id':inst_id,
                         'external_system_identifiers':esi,
                         'LA_country':country,
                         'LA_institution':inst_name,
                         'email_addresses':email_addresses,
                         'positions':positions,
                         'ids':aids,
                         'primary_arxiv_category':PAC,
                         'Fields of Study': FoS # in ['Particle physics','Cosmology','Astrophysics','Mathematical physics']
                        }
                    )
            #raise Exception('stop')

https://inspirehep.net/api/literature?size=250&page=1&q=aff+Honduras+U.+and+ac+1->+10
https://inspirehep.net/api/literature?size=250&page=1&q=aff+Zamorano,+San+Antonio+de+Oriente+and+ac+1->+10


In [9]:
pd.DataFrame(db)

Unnamed: 0,author_id,name,LA_institution_id,external_system_identifiers,LA_country,LA_institution,email_addresses,positions,ids,primary_arxiv_category,Fields of Study
0,1262260,"{'value': 'Cruz Torres, Melissa Maria', 'name_...",904629,"[{'value': 'INST-26174', 'schema': 'SPIRES'}]",Honduras,Honduras U.,[{'value': 'melissa.maria.cruz.torres@cern.ch'...,[{'record': {'$ref': 'https://inspirehep.net/a...,"[{'value': '0000-0003-2607-131X', 'schema': 'O...",[],[]
1,1010084,"{'value': 'Ferreira, Erasmo', 'preferred_name'...",903159,"[{'value': 'grid.4839.6', 'schema': 'GRID'}, {...",Brazil,"Rio de Janeiro, Pont. U. Catol.","[{'value': 'erasmo@if.ufrj.br', 'current': True}]","[{'rank': 'SENIOR', 'record': {'$ref': 'https:...","[{'value': '0000-0002-3126-2500', 'schema': 'O...",[],[]
2,1010084,"{'value': 'Ferreira, Erasmo', 'preferred_name'...",903159,"[{'value': 'grid.4839.6', 'schema': 'GRID'}, {...",Brazil,"Rio de Janeiro, Pont. U. Catol.","[{'value': 'erasmo@if.ufrj.br', 'current': True}]","[{'rank': 'SENIOR', 'record': {'$ref': 'https:...","[{'value': '0000-0002-3126-2500', 'schema': 'O...",[],[]
3,1074631,"{'value': 'Agón, César Alfonso', 'name_variant...",904287,"[{'value': 'grid.9486.3', 'schema': 'GRID'}, {...",Mexico,"Mexico U., ICN","[{'value': 'cesar.agon@stonybrook.edu', 'curre...","[{'rank': 'POSTDOC', 'record': {'$ref': 'https...","[{'value': '0000-0002-8903-7074', 'schema': 'O...",[hep-th],[]
4,1007297,"{'value': 'Guijosa, Alberto', 'preferred_name'...",904287,"[{'value': 'grid.9486.3', 'schema': 'GRID'}, {...",Mexico,"Mexico U., ICN","[{'value': 'alberto@nucleares.unam.mx', 'curre...","[{'rank': 'SENIOR', 'record': {'$ref': 'https:...","[{'value': '0000-0003-2754-8394', 'schema': 'O...",[hep-th],[]
5,1273983,"{'value': 'Larios-López, Bryan O.', 'name_vari...",904629,"[{'value': 'INST-26174', 'schema': 'SPIRES'}]",Honduras,Honduras U.,[],"[{'rank': 'SENIOR', 'record': {'$ref': 'https:...","[{'value': '0000-0001-5368-5566', 'schema': 'O...",[hep-th],[]
6,1835921,"{'value': 'Flores-Alfonso, Daniel', 'name_vari...",905319,"[{'value': 'grid.7220.7', 'schema': 'GRID'}, {...",Mexico,"Mexico City U., Iztapalapa","[{'value': 'daniel.flores@cinvestav.mx', 'curr...","[{'rank': 'POSTDOC', 'record': {'$ref': 'https...","[{'value': 'D.Flores.Alfonso.1', 'schema': 'IN...",[gr-qc],[]
7,1273983,"{'value': 'Larios-López, Bryan O.', 'name_vari...",904629,"[{'value': 'INST-26174', 'schema': 'SPIRES'}]",Honduras,Honduras U.,[],"[{'rank': 'SENIOR', 'record': {'$ref': 'https:...","[{'value': '0000-0001-5368-5566', 'schema': 'O...",[gr-qc],[]
8,1273983,"{'value': 'Larios-López, Bryan O.', 'name_vari...",910964,"[{'value': 'grid.440446.6', 'schema': 'GRID'},...",Mexico,Chiapas Autonoma U.,[],"[{'rank': 'SENIOR', 'record': {'$ref': 'https:...","[{'value': '0000-0001-5368-5566', 'schema': 'O...",[gr-qc],[]
9,1065381,"{'value': 'Avilez, Ana A.', 'name_variants': [...",908048,"[{'value': 'grid.411659.e', 'schema': 'GRID'},...",Mexico,"Puebla U., Inst. Fis.",[],"[{'rank': 'PHD', 'record': {'$ref': 'https://i...","[{'value': 'A.Avilez.1', 'schema': 'INSPIRE BA...",[hep-th],[]


## Search for collaborations
* https://inspirehep.net/literature?sort=mostrecent&q=ac%3E10
* Search athors from db and get check if institution is froma LA

### [collaborations.ipynb](./collaborations.ipynb)

## Other countries
https://inspirehep.net/institutions?sort=mostrecent&size=25&page=1&q=Dominican+Republic

## Load database and simplify fields

In [358]:
import pandas as pd

In [359]:
df=pd.read_json('data/db_LA.json',dtype={'author_id':str, 'LA_institution_id':str})

In [360]:
df.shape

(3105, 11)

In [361]:
df=df[df['email_addresses'].apply(len)>0].reset_index(drop=True)

In [362]:
df.shape

(1960, 11)

In [363]:
df['author_id'].drop_duplicates().shape

(1669,)

In [364]:
df['full_name']=df['name'].str.get('value')

In [365]:
def get_email(l):
    current=False
    for d in l:
        if d.get('current'):
            email=d.get('value')
            current=True
    if l and not current:
        email=l[0].get('value')
    return email

In [366]:
df['email']=df['email_addresses'].apply(get_email)

In [367]:
df['current_position']=df['positions'].apply(lambda l: [d.get('rank') for d in l if d.get('rank') and d.get('current')]).str[0].fillna('Unknown')

In [368]:
df.columns

Index(['author_id', 'name', 'LA_institution_id', 'external_system_identifiers',
       'LA_country', 'LA_institution', 'email_addresses', 'positions', 'ids',
       'primary_arxiv_category', 'Fields of Study', 'full_name', 'email',
       'current_position'],
      dtype='object')

In [369]:
def get_institution(row):
    instituion=''
    l=[d.get('institution') for d in row.get('positions') if d.get('rank') and d.get('current')]
    if l:
        instituion=l[0]
    else:
        instituion=row.get('LA_institution')
    return instituion

In [370]:
df['current_institution']=df.apply(get_institution, axis='columns')

In [371]:
df['ORCID']=df['ids'].apply(lambda l: [d.get('value') for d in l if d.get('schema')=='ORCID']).str[0].fillna('')

In [372]:
df['Fields_of_Study']=df['Fields of Study'].apply(lambda l: '; '.join(l))

In [373]:
df['primary_arXiv_category']=df['primary_arxiv_category'].apply(lambda l: '; '.join(l))

In [374]:
pd.set_option('display.max_rows', 50)

In [378]:
#df[:1]

In [379]:
dfx=df[['author_id', 'full_name', 'email','LA_institution_id', 
       'LA_country', 'LA_institution', 'current_institution',  'current_position', 'ORCID',
       'Fields_of_Study','primary_arXiv_category']]
dfx.to_csv('data/db_LA.csv',index=False)
dfx.to_excel('data/db_LA.xlsx',index=False)

In [None]:
from IPython import display

In [380]:
display.JSON(dfx[dfx.full_name.str.contains('Rosenfeld')].iloc[-1].to_dict())

<IPython.core.display.JSON object>

In [381]:
set(collaborations).intersection(df['Fields of Study'].apply(pd.Series).stack().unique())

{'ACT',
 'AEDGE',
 'ALICE',
 'AMS',
 'ANTARES',
 'ATLAS',
 'AdvLIGO',
 'BDX',
 'CDF',
 'CMS',
 'COMPASS',
 'CTA',
 'CTA Consortium',
 'CTA LST Project',
 'CTA Observatory',
 'CUPID',
 'D0',
 'DEAP',
 'DES',
 'DUNE',
 'DarkSide',
 'Daya Bay',
 'Double Chooz',
 'Euclid',
 'Event Horizon Telescope',
 'Fermi-LAT',
 'GlueX',
 'H1',
 'HADES',
 'HAWC',
 'HERMES-SP',
 'HESS',
 'Hess',
 'Hyper-Kamiokande',
 'IAXO',
 'ILC International Development Team',
 'IceCube',
 'JUNO',
 'KAGRA',
 'KLF',
 'KM3NeT',
 'LArIAT',
 'LHC Reinterpretation Forum',
 'LHCb',
 'LHeC',
 'LSPE',
 'LUX-ZEPLIN',
 'LiteBIRD',
 'MAGIC',
 'MOONS Consortium',
 'MPD',
 'MSE Science Team',
 'Muon Collider',
 'NA62',
 'NEXT',
 'NOvA',
 'PHENIX',
 'Particle Data Group',
 'Pierre Auger',
 'QUBIC',
 'RD42',
 'REDTOP',
 'SBND',
 'SDSS-IV',
 'SHiP',
 'SNO+',
 'SPT-3G',
 'STAR',
 'Simons Observatory',
 'Super-Kamiokande',
 'T2K',
 'TOTEM',
 'Virgo',
 'WA105',
 'XENON'}

In [382]:
set(collaborations).intersection(df[df['LA_country']=='Colombia']['Fields of Study'].apply(pd.Series).stack().unique())

{'ATLAS',
 'CMS',
 'D0',
 'DUNE',
 'LHCb',
 'LHeC',
 'LSPE',
 'NEXT',
 'NOvA',
 'Pierre Auger',
 'TOTEM'}

In [383]:
set(collaborations).intersection(df[df['LA_country']=='Ecuador']['Fields of Study'].apply(pd.Series).stack().unique())

{'CMS', 'CTA Observatory', 'D0', 'TOTEM'}

In [385]:
set(collaborations).intersection(df[df['LA_country']=='Peru']['Fields of Study'].apply(pd.Series).stack().unique())

{'ALICE', 'DUNE', 'Simons Observatory'}