# HEP articles in lens.org from Latin America

https://link.lens.org/CuptXEnD9Hk

[Fields of Study](https://docs.openalex.org/about-the-data/concept)

* Level 1: Physics
  * Level 2: Particle physics
  * Level 2: Cosmology
  * Level 2: Astrophysics
  * Level 2: Mathematical physics

Countries with non-zero counts of articles

In [29]:
import pandas as pd
import requests
import time
import json

In [2]:
# Countries accepted as Latin American; compared with the `country` of INSPIRE-HEP institution records
LA_countries=(
    'Brazil','Mexico','Colombia','Chile','Argentina','Ecuador','Guatemala','Peru','Venezuela',
    'Uruguay','Bolivia','Costa Rica','Paraguay','Nicaragua','Panama','El Salvador','Cuba',
)
# Level-2 fields of study recorded for each author
fields_of_study={
    'Particle physics',
    'Cosmology',
    'Astrophysics',
    'Mathematical physics',
}
# Text that precedes the arXiv identifier inside a source URL
sep='arxiv.org/abs/'
# Rate limit, from https://github.com/inspirehep/rest-api-doc#rate-limiting:
# every IP address is allowed 15 requests in a 5 s window (one every ~0.33 s),
# so pause slightly longer than that before each request.
sleep=0.4

In [3]:
# FULL selects the CSV that is read: 'LA_HEP.csv' when True,
# the gzip-compressed 'data/LA_HEP_not_cn.csv.gz' otherwise.
FULL=False
df=pd.read_csv('LA_HEP.csv' if FULL else 'data/LA_HEP_not_cn.csv.gz')

In [4]:
# Show the column labels of the loaded table
df.columns

Index(['Lens ID', 'Title', 'Date Published', 'Publication Year',
       'Publication Type', 'Source Title', 'ISSNs', 'Publisher',
       'Source Country', 'Author/s', 'Abstract', 'Volume', 'Issue Number',
       'Start Page', 'End Page', 'Fields of Study', 'Keywords', 'MeSH Terms',
       'Chemicals', 'Funding', 'Source URLs', 'External URL', 'PMID', 'DOI',
       'Microsoft Academic ID', 'PMCID', 'Citing Patents Count', 'References',
       'Citing Works Count', 'author_count'],
      dtype='object')

In [5]:
# Number of authors per article: 'Author/s' is a '; '-separated string.
# .str.len() yields NaN for a missing author list, where .apply(len) would raise
# TypeError on the float NaN; such rows then fail any later `author_count<=N` filter.
df['author_count']=df['Author/s'].str.split('; ').str.len()

Keep only articles with at most 10 authors, to avoid very large API responses. Large collaborations are analysed separately.

In [6]:
# Keep the articles with at most 10 authors and renumber the rows from 0
df=df.loc[df['author_count'].le(10)].reset_index(drop=True)

In [7]:
# (rows, columns) of the current table
df.shape

(17508, 30)

In [8]:
# Sanity check: shape of the subset whose fields of study mention 'Physics'
df.loc[df['Fields of Study'].str.contains('Physics')].shape

(17508, 30)

In [9]:
# Number of articles whose fields of study mention 'Particle physics'
PP=len(df[df['Fields of Study'].str.contains('Particle physics')])
PP

3758

In [10]:
# Number of articles whose fields of study mention 'Cosmology'
PC=len(df[df['Fields of Study'].str.contains('Cosmology')])
PC

829

In [11]:
# Number of articles whose fields of study mention 'Astrophysics'
PA=len(df[df['Fields of Study'].str.contains('Astrophysics')])
PA

8809

In [12]:
# Number of articles whose fields of study mention 'Mathematical physics'
PM=len(df[df['Fields of Study'].str.contains('Mathematical physics')])
PM

5076

In [13]:
# Sum of the four per-field counts. An article tagged with several of the
# fields is counted once per field, so this can exceed the number of articles.
sum([PP,PA,PC,PM])

18472

In [21]:
# Number of articles that have a DOI
ndois=len(df['DOI'].dropna())
ndois

15146

In [15]:
#nd[nd['Source URLs'].str.contains('arxiv.org')]['Source URLs'].iloc[0]

In [22]:
# Articles without a DOI that still list source URLs ...
nd=df[df['DOI'].isna()].dropna(subset=['Source URLs']).reset_index(drop=True)
# ... and how many of those URLs point to arXiv
narxivs=len(nd[nd['Source URLs'].str.contains('arxiv.org')])
narxivs

1496

Estimated time required, in hours

In [24]:
# Estimated run time in seconds: every article that can be queried (by DOI or
# by arXiv id) is assumed to cost about 3 rate-limited requests.
# Uses `ndois` (the name `dois` is never defined) and the shared `sleep` pause.
s=(ndois+narxivs)*3*sleep
s/60/60

5.5473333333333334

Schema: dicts and lists have the same structure as in INSPIRE-HEP
```python
{'author_id':str,
 'name':dict, #of author
 'LA_institution_id':str,
 'external_system_identifiers':list, #of institution
 'LA_country':str,
 'LA_institution':str,
 'email_addresses':list, #of author
 'positions':list, #of author
 'ids':list, #of author
 'Fields of Study': list # in ['Particle physics','Cosmology','Astrophysics','Mathematical physics']
}
```

Sort by Publication Year, newest first, so that the most recent affiliations are found first

In [25]:
# Newest publications first; rows renumbered from 0 after sorting
df=(
    df
    .sort_values(by='Publication Year',ascending=False)
    .reset_index(drop=True)
)

In [26]:
# Preview the first row
df.head(1)

Unnamed: 0,Lens ID,Title,Date Published,Publication Year,Publication Type,Source Title,ISSNs,Publisher,Source Country,Author/s,...,Source URLs,External URL,PMID,DOI,Microsoft Academic ID,PMCID,Citing Patents Count,References,Citing Works Count,author_count
0,025-688-286-323-660,Possible Relationship of Meteor Disintegration...,,2022,journal article,Advances in Space Research,02731177; 18791948,Elsevier BV,United Kingdom,A. A. Pimenta; Paulo Batista; Vania Fatima And...,...,https://www.sciencedirect.com/science/article/...,http://dx.doi.org/10.1016/j.asr.2021.11.028,,10.1016/j.asr.2021.11.028,3216041575,,0,001-562-150-814-802; 006-975-738-231-584; 007-...,0,5


In [42]:
# List of author-record dicts; the harvesting loop appends to it and dumps it to 'db_LA.json'
db=[]

In [45]:
# Position in `df` at which the harvesting loop starts (it iterates over df[istart:]).
# NOTE(review): a non-zero value looks like a resume point; that presumably requires
# `db` to still hold the records collected from the earlier rows — confirm before re-running `db=[]`.
istart=538

In [None]:
def get_index(db):
    """Map each record's 'author_id' to its position in `db`.

    When several records share an author_id the last position wins; records
    without that key are indexed under None.
    """
    return {record.get('author_id'):position for position,record in enumerate(db)}

def _get_json(url,params=None):
    """GET `url` after the rate-limit pause.

    Returns the parsed JSON body, or None when the status code is not 200.
    `params` is passed to requests so that query values are URL-encoded.
    """
    time.sleep(sleep)
    response=requests.get(url,params=params)
    if response.status_code!=200:
        return None
    return response.json()


def _save_db(db,path='db_LA.json'):
    """Dump the author records to `path`; `with` closes the file even if the dump fails."""
    with open(path,'w') as f:
        json.dump(db,f)


def _add_LA_authors(ir,db):
    """Look up one article in INSPIRE-HEP and append to `db` its not-yet-known
    authors affiliated with an institution located in one of `LA_countries`.

    Parameters
    ----------
    ir : row of `df`; 'DOI', 'Source URLs' and 'Fields of Study' are read.
    db : list of author-record dicts, modified in place.

    The article is skipped when it has neither a DOI nor an arXiv URL, when the
    literature request does not return status 200, or when the search does not
    return exactly one hit. For an author already in `db` only the
    'Fields of Study' of the existing record is extended.
    """
    #Build the search query: DOI when available, arXiv id otherwise
    if isinstance(ir.DOI,str):
        query=f'doi:{ir.DOI}'
    elif isinstance(ir['Source URLs'],str) and sep in ir['Source URLs']:
        tokens=ir['Source URLs'].split(sep)[-1].split()
        if not tokens:
            return #URL ends right after the marker: no identifier
        query=f'arXiv:{tokens[0]}'
    else:
        return

    #Kept as a set for the whole paper; every record gets its own list copy
    FoS=fields_of_study.intersection( ir['Fields of Study'].split('; ') )

    #`params` URL-encodes the query, so DOIs containing '#', '&', '<' ... are sent intact
    d=_get_json('https://inspirehep.net/api/literature',params={'q':query})
    if d is None:
        return

    #authors...
    hits=(d.get('hits') or {}).get('hits')
    if not (isinstance(hits,list) and len(hits)==1): #Unique result required
        return
    authors=(hits[0].get('metadata') or {}).get('authors') or []

    #Lookups over what is already stored
    author_index=get_index(db)
    institutions={} #LA_institution_id → first record that carries it
    for rec in db:
        institutions.setdefault(rec.get('LA_institution_id'),rec)

    for a in authors:
        try:
            url_author=a.get('record').get('$ref')
        except AttributeError:
            continue #next author (no author profile)
        if not isinstance(url_author,str):
            continue #next author
        author_id=url_author.split('/')[-1]

        if author_id in author_index:
            #Known author: extend the stored fields of study. The merge goes into
            #the record itself, not into `FoS`, which belongs to the whole paper.
            known=db[author_index[author_id]]
            known['Fields of Study']=sorted(FoS.union(known.get('Fields of Study') or []))
            continue #next author

        #affiliations... → LA required
        profile=None #author profile, requested at most once per author
        for i in a.get('affiliations') or []:
            try:
                inst_name=i.get('value')
                inst_url=i.get('record').get('$ref')
            except AttributeError:
                continue #next institution
            if not isinstance(inst_url,str):
                continue #next institution (no institution record)
            inst_id=inst_url.split('/')[-1]

            known_inst=institutions.get(inst_id)
            if known_inst is not None:
                #Institution already stored, hence from LA: reuse its metadata
                country=known_inst.get('LA_country')
                esi=known_inst.get('external_system_identifiers')
            else:
                #new institution
                di=_get_json(inst_url)
                if di is None:
                    continue #next institution
                try:
                    country=di.get('metadata').get('addresses')[0].get('country')
                except (AttributeError,TypeError,IndexError):
                    country=None
                if country not in LA_countries:
                    continue #next institution
                #Get institution metadata from INSPIRE
                esi=di.get('metadata').get('external_system_identifiers') or []

            if profile is None:
                profile=_get_json(url_author)
                if profile is None:
                    break #profile not available: give up on this author
            meta=profile.get('metadata') or {}

            record={'author_id':author_id,
                    'name':meta.get('name') or {},
                    'LA_institution_id':inst_id,
                    'external_system_identifiers':esi,
                    'LA_country':country,
                    'LA_institution':inst_name,
                    'email_addresses':meta.get('email_addresses') or [],
                    'positions':meta.get('positions') or [],
                    'ids':meta.get('ids') or [],
                    'Fields of Study':sorted(FoS) # in ['Particle physics','Cosmology','Astrophysics','Mathematical physics']
                   }
            db.append(record)
            #Coauthors from the same institution do not trigger a new request
            institutions.setdefault(inst_id,record)


#Harvest the authors article by article. A checkpoint is written every 100 rows
#and once more when the loop ends or is interrupted, so progress is not lost.
try:
    for ii,ir in df[istart:].iterrows():
        print(ii,end='\r')
        _add_LA_authors(ir,db)
        if (ii+1)%100==0 and db:
            _save_db(db)
finally:
    if db: #never overwrite an existing file with an empty list
        _save_db(db)

2310

In [None]:
2214