# Get INSPIRE-HEP info for authors of HEP articles in https://lens.org
## from Latin America:

https://link.lens.org/CuptXEnD9Hk

[Fields of Study](https://docs.openalex.org/about-the-data/concept)

* Level 1: Physics
  * Level 2: Particle physics
  * Level 2: Cosmology
  * Level 2: Astrophysics
  * Level 2: Mathematical physics

Countries with non-zero counts of articles in `LA_countries`

In [1]:
import pandas as pd
import requests
import time
import json

In [2]:
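#LA → 17 countries
#Argentina, Bolivia, Brazil, Chile, Colombia, Costa Rica, Ecuador, El Salvador, Guatemala, Honduras, Mexico, Nicaragua, Panama, Paraguay, Peru, the Dominican Republic and Uruguay.
#NOTE: the `LA_countries` tuple used in this notebook (countries with non-zero article counts) also has 17 entries, but it includes Venezuela and Cuba and does not include Honduras or the Dominican Republic.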

In [3]:
# Countries treated as Latin American (LA) in this analysis
LA_countries=(
    'Brazil','Mexico','Colombia','Chile','Argentina','Ecuador','Guatemala','Peru',
    'Venezuela','Uruguay','Bolivia','Costa Rica','Paraguay','Nicaragua','Panama',
    'El Salvador','Cuba',
)
# Lens "Fields of Study" labels kept for each article
fields_of_study={
    'Particle physics',
    'Cosmology',
    'Astrophysics',
    'Mathematical physics',
}
# Text that precedes the arXiv identifier inside a source URL
sep='arxiv.org/abs/'
# Pause in seconds before each INSPIRE-HEP request.
# From https://github.com/inspirehep/rest-api-doc#rate-limiting
# every IP address is allowed 15 requests in a 5s window.
sleep=0.4

In [4]:
# FULL selects which Lens export is read: 'LA_HEP.csv' when True,
# otherwise the compressed 'data/LA_HEP_not_cn.csv.gz'
FULL=False
lens_csv='LA_HEP.csv' if FULL else 'data/LA_HEP_not_cn.csv.gz'
df=pd.read_csv(lens_csv)

In [5]:
df.columns

Index(['Lens ID', 'Title', 'Date Published', 'Publication Year',
       'Publication Type', 'Source Title', 'ISSNs', 'Publisher',
       'Source Country', 'Author/s', 'Abstract', 'Volume', 'Issue Number',
       'Start Page', 'End Page', 'Fields of Study', 'Keywords', 'MeSH Terms',
       'Chemicals', 'Funding', 'Source URLs', 'External URL', 'PMID', 'DOI',
       'Microsoft Academic ID', 'PMCID', 'Citing Patents Count', 'References',
       'Citing Works Count', 'author_count'],
      dtype='object')

In [6]:
# Number of authors per article: 'Author/s' is a '; '-separated string.
# `.str.len()` yields NaN (instead of raising, as `len` would) for a missing
# author list; such articles are counted as having 0 authors.
df['author_count']=df['Author/s'].str.split('; ').str.len().fillna(0).astype(int)

Avoid large file requests. Analyse collaborations separately

In [7]:
# Keep only articles with at most 10 authors; larger author lists
# (collaborations) are analysed separately
df=df.loc[df['author_count'].le(10)].reset_index(drop=True)

In [8]:
df.shape

(17508, 30)

In [9]:
df[df['Fields of Study'].str.contains('Physics')].shape

(17508, 30)

In [10]:
# Articles whose 'Fields of Study' mention 'Particle physics'
PP=len(df[df['Fields of Study'].str.contains('Particle physics')])
PP

3758

In [11]:
# Articles whose 'Fields of Study' mention 'Cosmology'
PC=len(df[df['Fields of Study'].str.contains('Cosmology')])
PC

829

In [12]:
# Articles whose 'Fields of Study' mention 'Astrophysics'
PA=len(df[df['Fields of Study'].str.contains('Astrophysics')])
PA

8809

In [13]:
# Articles whose 'Fields of Study' mention 'Mathematical physics'
PM=len(df[df['Fields of Study'].str.contains('Mathematical physics')])
PM

5076

In [14]:
PP+PA+PC+PM

18472

In [15]:
# Number of articles that have a DOI
ndois=len(df.DOI.dropna())
ndois

15146

In [16]:
#nd[nd['Source URLs'].str.contains('arxiv.org')]['Source URLs'].iloc[0]

In [17]:
# Articles without a DOI that still have at least one source URL
without_doi=df[df.DOI.isna()]
nd=without_doi.dropna(subset=['Source URLs']).reset_index(drop=True)
# ... and, among them, how many have a URL matching 'arxiv.org'
narxivs=len(nd[nd['Source URLs'].str.contains('arxiv.org')])
narxivs

1496

Time required in hours

In [18]:
# Estimated total waiting time in seconds, shown below in hours
requests_per_record=3    # literature, institution and author API calls
seconds_per_request=0.4  # same value as the `sleep` pause between requests
s=(ndois+narxivs)*requests_per_record*seconds_per_request
s/60/60

5.5473333333333334

Schema: dicts and lists have the same structure as in INSPIRE-HEP
```python
{'author_id':str,
 'name':dict, #of author
 'LA_institution_id':str,
 'external_system_identifiers':list, #of institution
 'LA_country':str,
 'LA_institution':str,
 'email_addresses':list, #of author
 'positions':list, #of author
 'ids':list, #of author
 'primary_arxiv_category':list, #of literature
 'Fields of Study':list #in ['Particle physics','Cosmology','Astrophysics','Mathematical physics']
}
```

Sort by Publication Year to get the last affiliations

In [19]:
# Most recent publication year first, so that the latest affiliations are found first
df=(
    df.sort_values(by='Publication Year',ascending=False)
      .reset_index(drop=True)
)

In [20]:
df[:1]

Unnamed: 0,Lens ID,Title,Date Published,Publication Year,Publication Type,Source Title,ISSNs,Publisher,Source Country,Author/s,...,Source URLs,External URL,PMID,DOI,Microsoft Academic ID,PMCID,Citing Patents Count,References,Citing Works Count,author_count
0,025-688-286-323-660,Possible Relationship of Meteor Disintegration...,,2022,journal article,Advances in Space Research,02731177; 18791948,Elsevier BV,United Kingdom,A. A. Pimenta; Paulo Batista; Vania Fatima And...,...,https://www.sciencedirect.com/science/article/...,http://dx.doi.org/10.1016/j.asr.2021.11.028,,10.1016/j.asr.2021.11.028,3216041575,,0,001-562-150-814-802; 006-975-738-231-584; 007-...,0,5


In [21]:
%%writefile inspirehep.py
import time
import requests
import json

# Countries accepted as Latin American (LA) when classifying institutions
LA_countries=(
    'Brazil','Mexico','Colombia','Chile','Argentina','Ecuador','Guatemala','Peru',
    'Venezuela','Uruguay','Bolivia','Costa Rica','Paraguay','Nicaragua','Panama',
    'El Salvador','Cuba',
)
# Lens "Fields of Study" labels kept for each article
fields_of_study={
    'Particle physics',
    'Cosmology',
    'Astrophysics',
    'Mathematical physics',
}
# Text that precedes the arXiv identifier inside a source URL
sep='arxiv.org/abs/'
# Pause in seconds before each INSPIRE-HEP request.
# From https://github.com/inspirehep/rest-api-doc#rate-limiting
# every IP address is allowed 15 requests in a 5s window.
sleep=0.4

def get_index(db,index='author_id'):
    """
    Build a lookup from the value stored under key `index` in each record of
    `db` to the position of that record in `db`.

    Records lacking the key are registered under None; when several records
    share the same value, the position of the last one is kept.
    """
    return {record.get(index): position for position,record in enumerate(db)}

def _merge_author_field(entry,key,values):
    """
    Union `values` into the list stored under `key` of the db record `entry`.

    Returns the merged list (duplicates removed, arbitrary order) after storing
    it in `entry`. If `entry` holds nothing that can be turned into a set under
    `key`, `entry` is left untouched and `values` is returned unchanged.
    """
    try:
        merged=list(set(entry.get(key)).union(values))
    except (AttributeError,TypeError):
        return values
    entry[key]=merged
    return merged

def get_work(db,a,idsLA,idsAU,db_index,FoS,PAC):
    """
    Use an author dictionary from the 'authors' list of the INSPIRE-HEP
    literature API to get the author info:
    * author_id
    * url_author
    and, if the author is already in db, merge FoS and PAC into the
    'Fields of Study' and 'primary_arxiv_category' of the author's db record
    (the record located through db_index).

    Parameters:
    * db: list of author records (dicts), updated in place
    * a: author dictionary of the literature record
    * idsLA: not used here
    * idsAU: author ids already stored in db
    * db_index: author_id → position in db (see get_index)
    * FoS, PAC: fields of study and arXiv categories of the current article

    Returns (CONTINUE,author_id,url_author,FoS,PAC). CONTINUE is True when the
    caller must skip to the next author: either there is no author profile
    reference, or the author is already in db. In the latter case the returned
    FoS and PAC are the merged lists.
    """
    author_id=None
    url_author=None
    #Authors without an INSPIRE profile have no usable 'record' → '$ref'
    record=a.get('record') if isinstance(a,dict) else None
    if isinstance(record,dict):
        url_author=record.get('$ref')
    if not isinstance(url_author,str):
        return True,author_id,url_author,FoS,PAC #next author (Not author Profile)
    author_id=url_author.split('/')[-1]
    if author_id not in idsAU:
        return False,author_id,url_author,FoS,PAC
    #Author already in db: only the listed lookup failures are tolerated,
    #anything else (e.g. KeyboardInterrupt) propagates to the caller
    try:
        entry=db[db_index[author_id]]
    except (KeyError,IndexError,TypeError):
        entry=None
    if entry is not None:
        FoS=_merge_author_field(entry,'Fields of Study',FoS)
        PAC=_merge_author_field(entry,'primary_arxiv_category',PAC)
    return True,author_id,url_author,FoS,PAC #next author
    
def get_institutions(db,i,idsLA,idsAU,db_index,FoS,sleep):
    """
    * Use an affiliation dictionary from the affiliations list of an author
      dictionary to get:
      `inst_name`
      `inst_url`
    * Use the INSPIRE-HEP institution API (or an institution already stored in
      db) to get:
      `country`
      'external_system_identifiers'

    Parameters:
    * db: list of author records (dicts)
    * i: affiliation dictionary
    * idsLA: ids of the LA institutions already stored in db
    * idsAU, db_index, FoS: not used here
    * sleep: pause in seconds before the institution API request

    Returns (CONTINUE,inst_id,inst_name,country,esi). CONTINUE is True when the
    caller must skip to the next institution: no institution record, failed
    request, or country not in LA_countries.
    """
    CONTINUE=True
    country=None
    esi=None
    inst_id=None
    inst_name=None
    inst_url=None
    if isinstance(i,dict):
        inst_name=i.get('value')
        record=i.get('record')
        if isinstance(record,dict):
            inst_url=record.get('$ref')
    if not isinstance(inst_url,str):
        #Next institution (no institution record to follow)
        return CONTINUE,inst_id,inst_name,country,esi
    inst_id=inst_url.split('/')[-1]

    if inst_id in idsLA:
        #Institution already there: reuse the first stored record (single scan)
        cached=next((d for d in db if d.get('LA_institution_id')==inst_id),None)
        if cached is not None:
            CONTINUE=False
            return (CONTINUE,inst_id,inst_name,
                    cached.get('LA_country'),cached.get('external_system_identifiers'))
        #idsLA and db disagree → fall through and ask the API

    #new institution
    time.sleep(sleep)
    try:
        #timeout: a stalled connection must not block the whole harvest
        ri=requests.get(inst_url,timeout=60)
    except requests.RequestException:
        return CONTINUE,inst_id,inst_name,country,esi
    if ri.status_code!=200:
        return CONTINUE,inst_id,inst_name,country,esi
    try:
        di=ri.json()
    except ValueError: #body is not valid JSON
        return CONTINUE,inst_id,inst_name,country,esi

    metadata=di.get('metadata') if isinstance(di,dict) else None
    if not isinstance(metadata,dict):
        return CONTINUE,inst_id,inst_name,country,esi
    try:
        country=metadata.get('addresses')[0].get('country')
    except (TypeError,IndexError,AttributeError): #no usable address
        country=None
    if country not in LA_countries:
        return CONTINUE,inst_id,inst_name,country,esi
    #Get institution metadata from inspire
    esi=metadata.get('external_system_identifiers') or []
    CONTINUE=False
    return CONTINUE,inst_id,inst_name,country,esi
        
def get_author(url_author):
    """
    Get author profile from INSPIRE-HEP author API

    Waits `sleep` seconds and requests `url_author`; from the 'metadata' of the
    JSON answer returns the tuple
    (positions,email_addresses,name,aids)
    where missing or empty fields are replaced by [] ({} for name).

    If `url_author` is not a string, the request fails, the status code is not
    200 or the body is not a JSON object, all four values are returned empty
    instead of raising.
    """
    metadata={}
    if isinstance(url_author,str):
        time.sleep(sleep)
        try:
            #timeout: a stalled connection must not block the whole harvest
            epa=requests.get(url_author,timeout=60)
            if epa.status_code==200:
                profile=epa.json()
                if isinstance(profile,dict) and isinstance(profile.get('metadata'),dict):
                    metadata=profile.get('metadata')
        except (requests.RequestException,ValueError): #network error or invalid JSON
            metadata={}

    positions=metadata.get('positions') or []
    email_addresses=metadata.get('email_addresses') or []
    name=metadata.get('name') or {}
    aids=metadata.get('ids') or []
    return positions,email_addresses,name,aids

Overwriting inspirehep.py


In [22]:
from inspirehep import *

In [26]:
db=[]

In [29]:
#Position in `df` at which the harvesting loop below starts (it iterates over df[istart:])
#NOTE(review): `db` is set to [] just before this cell; if a run is resumed with
#istart>0, presumably the previously saved db_LA.json should be loaded into `db`
#first, otherwise the next checkpoint overwrites it — confirm.
istart=356#6308

### INSPIRE-HEP APIs:
For each DOI or arXiv identifier:
* (1) From literature API→ get authors info
    * `if` author info not already in db `for` each author:
        * (2) institution API → get institutions info
            * `if` institution is from Latin America `for` each one of the author's institutions:
                * (3) author API → get author profile
  * `else` author already in db: update `'Fields of Study'` and `'primary_arxiv_category'`

In [None]:
len_db_old=0
store=1000 #number of rows between two checkpoints of db_LA.json
lit_timeout=60 #seconds before giving up on a literature API request

def _save_db(db,path='db_LA.json'):
    """Dump `db` as JSON into `path`; the `with` block closes the file even if the dump fails."""
    with open(path,'w') as f:
        json.dump(db,f)

for ii,ir in df[istart:].iterrows():
    print(ii,end='\r')
    ii=ii+1
    #=================================
    #CHECKPOINT: only when db has grown since the last one
    if ii>0 and ii%store==0 and len(db)!=len_db_old:
        len_db_old=len(db)
        print('')
        print(f'{ii} → db_LA.json updated with size: {len(db)}')
        _save_db(db)
    #==================================
    #UPDATE LISTS (all empty while db is empty)
    idsLA=[d.get('LA_institution_id') for d in db]
    idsAU=[d.get('author_id') for d in db]
    db_index=get_index(db)
    #Query by DOI when available, otherwise by arXiv id taken from the source URLs
    if isinstance(ir.DOI,str):
        q=[ 'doi',ir.DOI ]
    elif isinstance(ir['Source URLs'],str) and ir['Source URLs'].find(sep)>-1:
        arxiv_id=ir['Source URLs'].split(sep)[-1].split()
        if not arxiv_id: #nothing after 'arxiv.org/abs/'
            continue
        q=[ 'arXiv',arxiv_id[0] ]
    else:
        continue
    fos_raw=ir['Fields of Study']
    if isinstance(fos_raw,str):
        FoS=list(fields_of_study.intersection( fos_raw.split('; ') ) )
    else: #missing 'Fields of Study'
        FoS=[]
    time.sleep(sleep)
    #(1) literature API
    #`params` URL-encodes the query, so identifiers containing characters such
    #as '#', '&' or '+' are sent intact
    try:
        r=requests.get('https://inspirehep.net/api/literature',
                       params={'q':f'{q[0]}:{q[1]}'},timeout=lit_timeout)
    except requests.RequestException:
        continue #Next doi or arXiv:id
    if r.status_code!=200:
        continue #Next doi or arXiv:id
    try:
        d=r.json()
    except ValueError: #body is not valid JSON
        continue #Next doi or arXiv:id

    #authors...
    l=(d.get('hits') or {}).get('hits')
    if not (isinstance(l,list) and len(l)==1): #Unique result required
        #print(f'WARNING: {q} ignored by hits or wrong count')
        continue #Next doi or arXiv:id
    lit=l[0].get('metadata') or {}
    if (lit.get('author_count') or 0)>10: #collaboration papers need to be analyzed separately
        continue #Next doi or arXiv:id
    la=lit.get('authors') or []
    PAC=lit.get('primary_arxiv_category') or []

    for a in la:
        #For an author already in db, get_work returns FoS/PAC merged with that
        #author's stored values; they are received in a_FoS/a_PAC so that the
        #article-level FoS/PAC used for the next authors stay untouched
        CONTINUE,author_id,url_author,a_FoS,a_PAC=get_work(db,a,idsLA,idsAU,db_index,FoS,PAC)
        if CONTINUE:
            continue #Next author
        #affiliations... → LA required
        li=a.get('affiliations')
        if not li:
            continue #Next author
        profile=None #author profile, requested at most once per author
        for i in li:
            #(2) institutions API
            CONTINUE,inst_id,inst_name,country,esi=get_institutions(db,i,idsLA,idsAU,db_index,FoS,sleep)
            if CONTINUE: #Not LA institution
                continue #next institution
            #(3) author API
            if profile is None:
                profile=get_author(url_author)
            positions,email_addresses,name,aids=profile
            db.append(
                {'author_id':author_id,
                 'name':name,
                 'LA_institution_id':inst_id,
                 'external_system_identifiers':esi,
                 'LA_country':country,
                 'LA_institution':inst_name,
                 'email_addresses':email_addresses,
                 'positions':positions,
                 'ids':aids,
                 'primary_arxiv_category':PAC,
                 'Fields of Study': FoS # in ['Particle physics','Cosmology','Astrophysics','Mathematical physics']
                }
            )

#Final checkpoint: keep the records gathered after the last periodic save
if len(db)!=len_db_old:
    len_db_old=len(db)
    _save_db(db)
    print('')
    print(f'db_LA.json updated with size: {len(db)}')

## Search for collaborations
* https://inspirehep.net/literature?sort=mostrecent&q=ac%3E10
* Search authors from db and check if the institution is from LA

### [collaborations.ipynb](./collaborations.ipynb)

## Other countries
https://inspirehep.net/institutions?sort=mostrecent&size=25&page=1&q=Dominican+Republic

## Load database and simplify fields

In [84]:
import pandas as pd

In [85]:
#Load the harvested author records; `author_id` and `LA_institution_id` are read as strings
#NOTE(review): the harvesting loop writes 'db_LA.json' in the working directory while
#this cell reads 'data/db_LA.json' — presumably the file is moved by hand; confirm.
df=pd.read_json('data/db_LA.json',dtype={'author_id':str, 'LA_institution_id':str})

In [86]:
df.shape

(2761, 11)

In [87]:
# Keep only the records with at least one e-mail address
has_email=df['email_addresses'].map(len)>0
df=df[has_email].reset_index(drop=True)

In [88]:
df.shape

(1672, 11)

In [89]:
df['author_id'].drop_duplicates().shape

(1473,)

In [90]:
#Each `name` entry is a dict; keep the string stored under its 'value' key
df['full_name']=df['name'].str.get('value')

In [91]:
def get_email(l):
    """
    Pick one e-mail address from a list of e-mail dictionaries.

    Returns the 'value' of the last entry flagged as 'current'; if no entry is
    flagged, the 'value' of the first entry. Returns None when `l` is empty or
    is not a list (the previous version raised UnboundLocalError for []).
    """
    if not isinstance(l,(list,tuple)) or not l:
        return None
    email=l[0].get('value') #fallback: first address
    for d in l:
        if d.get('current'):
            email=d.get('value')
    return email

In [92]:
df['email']=df['email_addresses'].apply(get_email)

In [93]:
def _current_ranks(positions):
    """Ranks of the positions flagged as current; positions without a rank are skipped."""
    return [p.get('rank') for p in positions if p.get('rank') and p.get('current')]

# First current rank of each author, 'Unknown' when there is none
df['current_position']=df['positions'].apply(_current_ranks).str[0].fillna('Unknown')

In [94]:
df.columns

Index(['author_id', 'name', 'LA_institution_id', 'external_system_identifiers',
       'LA_country', 'LA_institution', 'email_addresses', 'positions', 'ids',
       'primary_arxiv_category', 'Fields of Study', 'full_name', 'email',
       'current_position'],
      dtype='object')

In [95]:
def get_institution(row):
    """
    Institution to report for one record.

    Takes the 'institution' of the first entry of row['positions'] that has
    both a 'rank' and a true 'current' flag; when there is no such entry,
    falls back to row['LA_institution'].
    """
    current_with_rank=[
        position.get('institution')
        for position in row.get('positions')
        if position.get('rank') and position.get('current')
    ]
    return current_with_rank[0] if current_with_rank else row.get('LA_institution')

In [96]:
df['current_institution']=df.apply(get_institution, axis='columns')

In [97]:
def _orcids(ids):
    """Values of the identifiers whose 'schema' is 'ORCID'."""
    return [d.get('value') for d in ids if d.get('schema')=='ORCID']

# First ORCID of each author, '' when there is none
df['ORCID']=df['ids'].apply(_orcids).str[0].fillna('')

In [98]:
# Flatten each list of fields of study into a single '; '-separated string
df['Fields_of_Study']=df['Fields of Study'].map('; '.join)

In [99]:
# Flatten each list of arXiv categories into a single '; '-separated string
# (new column 'primary_arXiv_category'; the list column 'primary_arxiv_category' is kept)
df['primary_arXiv_category']=df['primary_arxiv_category'].map('; '.join)

In [100]:
pd.set_option('display.max_rows', 50)

In [101]:
df[:4]

Unnamed: 0,author_id,name,LA_institution_id,external_system_identifiers,LA_country,LA_institution,email_addresses,positions,ids,primary_arxiv_category,Fields of Study,full_name,email,current_position,current_institution,ORCID,Fields_of_Study,primary_arXiv_category
0,1051173,"{'value': 'Pedraza, Omar', 'preferred_name': '...",909015,"[{'value': 'grid.412866.f', 'schema': 'GRID'},...",Mexico,"UAEH, Pachuca","[{'value': 'omarp@uaeh.edu.mx', 'current': True}]",[{'record': {'$ref': 'https://inspirehep.net/a...,"[{'value': 'O.Pedraza.1', 'schema': 'INSPIRE B...",[gr-qc],"[Particle physics, Mathematical physics]","Pedraza, Omar",omarp@uaeh.edu.mx,Unknown,"UAEH, Pachuca",0000-0002-0260-0910,Particle physics; Mathematical physics,gr-qc
1,1008027,"{'value': 'Gonçalves, Victor Paulo Barros', 'n...",907173,"[{'value': 'grid.411221.5', 'schema': 'GRID'},...",Brazil,Pelotas U.,"[{'value': 'barros@ufpel.edu.br', 'current': T...","[{'rank': 'STAFF', 'record': {'$ref': 'https:/...","[{'value': '0000-0003-4943-9973', 'schema': 'O...","[hep-ph, hep-lat, astro-ph.HE, gr-qc, math-ph,...","[Particle physics, Mathematical physics, Astro...","Gonçalves, Victor Paulo Barros",barros@ufpel.edu.br,STAFF,Pelotas U.,0000-0003-4943-9973,Particle physics; Mathematical physics; Astrop...,hep-ph; hep-lat; astro-ph.HE; gr-qc; math-ph; ...
2,1079084,"{'value': 'Cottin Buracchio, Giovanna Francesc...",910504,"[{'value': 'grid.440617.0', 'schema': 'GRID'},...",Chile,Adolfo Ibanez U.,"[{'value': 'giovanna.cottin@uai.cl'}, {'value'...","[{'rank': 'JUNIOR', 'hidden': False, 'record':...","[{'value': '0000-0002-5308-5808', 'schema': 'O...","[hep-ph, hep-ex]",[Particle physics],"Cottin Buracchio, Giovanna Francesca",giovanna.cottin@uai.cl,JUNIOR,Adolfo Ibanez U.,0000-0002-5308-5808,Particle physics,hep-ph; hep-ex
3,997820,"{'value': 'Mendez, A.', 'preferred_name': 'A. ...",904336,"[{'value': 'grid.7870.8', 'schema': 'GRID'}, {...",Chile,"Chile U., Catolica","[{'value': 'mendez@ifae.es', 'current': True}]",[{'record': {'$ref': 'https://inspirehep.net/a...,"[{'value': 'A.Mendez.1', 'schema': 'INSPIRE BA...",[hep-ph],[Particle physics],"Mendez, A.",mendez@ifae.es,Unknown,"Chile U., Catolica",,Particle physics,hep-ph


In [None]:
# Flat (non-nested) columns written to the CSV and Excel files
export_columns=[
    'author_id','full_name','email',
    'LA_institution_id','LA_country','LA_institution',
    'current_institution','current_position','ORCID',
    'Fields_of_Study','primary_arXiv_category',
]
dfx=df[export_columns]
dfx.to_csv('data/db_LA.csv',index=False)
dfx.to_excel('data/db_LA.xlsx',index=False)

In [None]:
#
dfx[dfx.full_name.str.contains('Acero')]