# Construction of JSON object Authors from INSPIRE-HEP
For authors with declared affiliation
<img src="https://raw.githubusercontent.com/restrepo/inspire/master/img/authors.svg" width=700>

### TODO:
1. Missing countries object
3. Test RAM with sys.getsizeof()

## Design of the object-oriented algorithm
Each object must be initialized with the relevant part of the JSON from the INSPIRE-HEP API, and must have a method to obtain the relevant part of the "author" object.

For example, the basic class is `profile`, and its input must be the JSON, `p`, from the profile API response:
```
>>> au=profile(p)
>>> au.get_author()
```
`au` has the attributes according to the diagram above.

In [1]:
import pandas as pd
import sys
from IPython.display import JSON

In [2]:
import json
import requests
import time
import copy
#https://towardsdatascience.com/do-not-use-if-else-for-validating-data-objects-in-python-anymore-17203e1d65fe
from cerberus import Validator
import pycountry

# Country names known to pycountry; `institutions.get_authors` checks its
# query string against this list to decide whether the query is a country.
countries = [entry.name for entry in pycountry.countries]

# Attribute names that `profile.to_dict` leaves out of the exported dictionary
# (methods, schemas, samples, raw inputs and query bookkeeping).
_excluded_keys = [
    'get_author', 'author', 'sample_profile', 'schema_profile',
    'to_dict', 'to_json',
    'get_authors', 'db', 'sample_author', 'schema_author', 'sleep', 'profile',
    'sample_work', 'schema_work', 'work',
    'institutions_list', 'literature_size', 'not_collaboration',
    'q', 'q_inst', 'size', 'size_inst', 'work_list',
]

class empty_json:
    """Placeholder response used when no HTTP request was made.

    It offers the two members the callers read from a `requests` response:
    a `status_code` (0, so it never compares equal to 200) and `json()`.
    """

    status_code = 0

    def json(self):
        """Return a new, empty payload dictionary."""
        return dict()

class profile:
    """Wrap the metadata dictionary of one INSPIRE-HEP author profile.

    Parameters
    ----------
    p : dict or None
        Profile metadata (see `sample_profile`). An empty or missing value is
        accepted and skips validation.

    Raises
    ------
    ValueError
        If a non-empty `p` does not satisfy `schema_profile`.
    """
    #Default behaviour: if key exists must be 'type'
    schema_profile={"name":{'type':'dict','required':True},
            "email_addresses":{'type':'list'},
            "positions":{'type':'list'},
            "ids":{'type':'list'},
            }
    sample_profile={"advisors":[{"ids":[{"value":"INSPIRE-00133260",
                                    "schema":"INSPIRE ID"}],
                "name":"Valle, Jose W.F.",
                "hidden":False,
                "record":{"$ref":"https://inspirehep.net/api/authors/985058"},
                "degree_type":"phd","curated_relation":False},
               {"name":"Ponce, William A.","hidden":False,
                "degree_type":"master",
                "curated_relation":False}],
                "email_addresses":[{"value":"restrepo@udea.edu.co","current":True}],
    "positions":[{"rank":"SENIOR",
                  "hidden":False,
                  "record":{"$ref":"https://inspirehep.net/api/institutions/903906"},
                  "current":True,
                  "start_date":"2004",
                  "institution":"Antioquia U.",
                  "curated_relation":True},
                 {"rank":"PHD",
                  "hidden":False,
                  "record":{"$ref":"https://inspirehep.net/api/institutions/907907"},
                  "current":False,
                  "end_date":"2001","start_date":"1997",
                  "institution":"Valencia U., IFIC","curated_relation":True}],
    "ids":[{"value":"D.Restrepo.1","schema":"INSPIRE BAI"},
           {"value":"INSPIRE-00119748","schema":"INSPIRE ID"},
           {"value":"0000-0001-6455-5564","schema":"ORCID"},
           {"value":"diego-restrepo-209b7927","schema":"LINKEDIN"}],
    "name":{"value":"Restrepo Quintero, Diego Alejandro",
            "preferred_name":"Diego Restrepo"},
    "stub":False,
    "urls":[{"value":"http://gfif.udea.edu.co"},
            {"value":"https://scholar.google.com/citations?user=1sKULCoAAAAJ"},
            {"value":"https://www.researchgate.net/profile/Diego_Restrepo2"}],
    "status":"active",
    "$schema":"https://inspirehep.net/schemas/records/authors.json",
    "deleted":False,
    "control_number":991924,
    "legacy_version":"20210323213044.0",
    "arxiv_categories":["hep-ph"],
    "legacy_creation_date":"1999-08-23"}

    def __init__(self,p):
        if p:
            v=Validator(self.schema_profile,allow_unknown=True)
            if not v.validate(p):
                # ValueError is a subclass of Exception, so callers that
                # catch the previously raised bare Exception still work.
                raise ValueError(f'''
                    Input is not an INSPIRE-HEP profile dictionary:
                    {v.errors}                    
                    See `self.sample_profile`'''
                               )
        self.profile=p

    def get_author(self):
        '''
        Copy the following profile fields onto the instance:
        * name
        * positions
        * email_addresses
        * ids

        Fields absent from the profile are set to None.

        Returns the instance itself, or an empty dict when the instance
        has no `profile` attribute (i.e. `__init__` was never run).
        '''
        if not hasattr(self,'profile'):
            return {}
        # A missing profile (None) is treated like an empty one instead of
        # failing with AttributeError on `.get`.
        p=self.profile or {}
        self.name=p.get('name')
        self.positions=p.get('positions')
        self.email_addresses=p.get('email_addresses')
        self.ids=p.get('ids')

        return self

    def to_dict(self):
        '''
        Return {attribute name: value} for every attribute reported by
        `dir(self)` whose name contains no double underscore and is not
        listed in the module-level `_excluded_keys`.
        '''
        # getattr replaces the former eval(f'self.{x}'): same lookup, without
        # evaluating a string as code.
        return {key: getattr(self,key) for key in dir(self)
                if '__' not in key and key not in _excluded_keys}
    
class author(profile):
    """One author entry of an INSPIRE-HEP literature record.

    `get_authors` fetches the author's profile (when the entry carries a
    record URL) and registers one database object per affiliation.

    Parameters
    ----------
    a : dict
        Author entry (see `sample_author`). An empty value skips validation.
    db : list, optional
        Database of author objects, updated in place. A new list is created
        when omitted, so separate instances never share one by accident.

    Raises
    ------
    ValueError
        If a non-empty `a` does not satisfy `schema_author`.
    """
    # Pause, in seconds, after each HTTP request.
    sleep=0.4
    #`ids → [{"schema": "INSPIRE BAI"}]` required for author_id
    #`affiliations → [{...}] required for institution_id
    schema_author={"ids": {'type':'list','required':True,
                           'schema':{'type':'dict','required':True,
                                     'schema':{'schema':{'type':'string',
                                                         'required':True},
                                               'value': {'type':'string'}
                                              }
                                    }
                          },
            "record":{'type':'dict'},
            "full_name": {'type':'string'},
            "affiliations":{'type':'list',
                           'schema':{'type':'dict','required':True}}
           }
    sample_author={"ids":[{"value": "D.Restrepo.1",
                            "schema": "INSPIRE BAI"}
                          ],
                   "uuid": "300c9b2c-15b0-4937-ad36-8ccff33fd09d",
                   "emails": ["restrepo@udea.edu.co"],
                   "record": {"$ref": "https://inspirehep.net/api/authors/991924"},
                   "full_name": "Restrepo, Diego",
                   "affiliations": [{"value": "Antioquia U.",
                                     "record": {"$ref": "https://inspirehep.net/api/institutions/903906"}},
                                   {"value":"Campinas State U.",
                                    "record":{"$ref":"https://inspirehep.net/api/institutions/902714"}}],
                   "signature_block": "RASTRAPd",
                   "raw_affiliations": [{"value": "Instituto de Física, Universidad de Antioquia, Calle 70 No 52-21, Medellín, Colombia"},
                                        {"value": "Instituto de Física Gleb Wataghin, UNICAMP, 13083-859, Campinas, SP, Brazil"}]}

    def __init__(self,a,db=None):
        self.author=a
        # `is None` (not truthiness): an empty list supplied by the caller
        # must be kept, because it is updated in place.
        self.db=[] if db is None else db
        if a:
            v=Validator(self.schema_author,allow_unknown=True)
            if not v.validate(a):
                # ValueError is a subclass of Exception, so callers that
                # catch the previously raised bare Exception still work.
                raise ValueError(f'''
                    Input is not an INSPIRE-HEP author dictionary:
                    {v.errors}
                    See `self.sample_author`''')

    def get_authors(self):
        """Register this author in `self.db`, one object per affiliation.

        Sets `full_name`, `author_id` (the "INSPIRE BAI" id), `profile_id`
        (only when the entry has a record URL) and the profile fields on the
        instance. For each affiliation that has an institution record, a deep
        copy of the instance is appended to `self.db` unless an object with
        the same `author_id` and `institution_id` is already there.
        Affiliations without an institution record are skipped.

        Performs one HTTP GET when the entry has a record URL.

        Returns
        -------
        list
            `self.db`, updated in place.

        Raises
        ------
        IndexError
            If the entry has no id with schema "INSPIRE BAI".
        """
        #TODO: Check previous analysis for more metadata
        a=self.author
        # The instance is reused for consecutive authors by subclasses; drop
        # the previous author's profile_id so that it cannot leak into an
        # author that has no record URL.
        vars(self).pop('profile_id',None)
        self.full_name=a.get('full_name')
        self.author_id=[i for i in a.get('ids') if i.get('schema')=='INSPIRE BAI'
                  ][0].get('value')

        #a.get('affiliations') is not always there
        affiliations=[{k:d.get(k) for k in d if k in ('value','record')}
                      for d in a.get('affiliations') or []]

        #Profile
        try:
            url=a.get('record').get("$ref")
        except AttributeError:
            url=''
        r=empty_json()
        if url and isinstance(url,str):
            self.profile_id=url.split('/')[-1]
            r=requests.get(url)
            time.sleep(self.sleep)
        if r.status_code==200:
            p=r.json().get('metadata')
        else:
            p={}

        #We assume that the affiliation is defined at least for
        #one of the authors of the paper.
        #Authors without affiliations to get their institution are not considered
        super().__init__(p)
        super().get_author() #→ name, positions, email_addresses, ids

        self.institution=''
        self.institution_id=0
        for aff in affiliations:
            ref=(aff.get('record') or {}).get('$ref')
            if not ref:
                # No institution record → no institution_id can be derived.
                continue
            self.institution=aff.get('value')
            self.institution_id=ref.split('/')[-1]
            # Checked against the live db so that an affiliation repeated
            # inside the same entry is registered only once.
            registered=any(hasattr(x,'author_id') and hasattr(x,'institution_id')
                           and x.author_id==self.author_id
                           and x.institution_id==self.institution_id
                           for x in self.db)
            if registered:
                continue
            # Detach db while copying: deep-copying it as part of `self`
            # would embed a copy of the whole database (with its own nested
            # copies) in every stored object.
            shared_db,self.db=self.db,[]
            try:
                entry=copy.deepcopy(self)
            finally:
                self.db=shared_db
            self.db.append(entry)
        return self.db

    def to_json(self):
        """Return the `to_dict()` export of every object in `self.db`."""
        return [d.to_dict() for d in self.db]
        
class work(author):
    """One INSPIRE-HEP literature record and the authors it lists.

    Parameters
    ----------
    w : dict
        Literature metadata (see `self.sample_work`). An empty value skips
        validation.
    db : list, optional
        Database of author objects, updated in place. A new list is created
        when omitted.

    Raises
    ------
    ValueError
        If a non-empty `w` does not satisfy `schema_work`.
    """
    # 'preprint_date' is the key read by get_authors (it was misspelled
    # 'preprint_data' here, so the rule never applied).
    schema_work={'citation_count':{'type':'integer'},
            'control_number':{'type':'integer','required':True},
            'primary_arxiv_category':{'type':'list'},
            'preprint_date':{'type':'string'},
            'legacy_creation_date':{'type':'string'}
            }

    def __init__(self,w,db=None):
        self.sample_work={'citation_count': 0,
                          'control_number':2080612,
                          'publication_info':[{'year':2022}],
                          'primary_arxiv_category':['hep-ph'],
                          'authors':[self.sample_author]#publication_info,preprint_date,legacy_creation_date
                                }
        if w:
            v=Validator(self.schema_work,allow_unknown=True)
            if not v.validate(w):
                # ValueError is a subclass of Exception, so callers that
                # catch the previously raised bare Exception still work.
                raise ValueError(f'''
                    Input is not an INSPIRE-HEP work dictionary:
                    {v.errors}
                    See `self.sample_work`''')

        # `is None` (not truthiness): an empty list supplied by the caller
        # must be kept, because it is updated in place.
        self.db=[] if db is None else db
        self.work=w

    def get_authors(self):
        '''
        Register every author of the work in `self.db` and attach the work,
        as a small `paper` dictionary, to the `papers` list of each database
        object of that author.

        The year is taken from the first of: `publication_info[0].year`,
        `preprint_date`, `legacy_creation_date`; '0000' when none is usable.

        Returns `self.db`, updated in place.
        '''
        l=self.work
        #TODO: Check previous analysis for more metadata
        primary_arxiv_category=l.get('primary_arxiv_category')
        if not primary_arxiv_category:
            try:
                primary_arxiv_category=l.get('arxiv_eprints')[0].get('categories')
            except (TypeError,IndexError,AttributeError):
                # key missing, empty list, or unexpected entry type
                primary_arxiv_category=[]

        try:
            published=l.get('publication_info')[0].get('year')
        except (TypeError,IndexError,AttributeError):
            published=None
        # Convert only a real value: str(None) would give the truthy 'None'
        # and hide the fallbacks below.
        year=str(published) if published else 0
        for date_key in ('preprint_date','legacy_creation_date'):
            if year:
                break
            date=l.get(date_key)
            if isinstance(date,str):
                year=date.split('-')[0]
        if not year:
            year='0000'

        paper={'recid':l.get('control_number'),'year':year,
               'citation_count':l.get('citation_count'),
               'primary_arxiv_category':primary_arxiv_category}

        for a in l.get('authors') or []: #same self.author_id but several institute_ids for several affiliations
            super().__init__(a,self.db)
            super().get_authors() #add and replace self attributes → author*..., institution*..
            paper['author_id']=self.author_id
            #Each d object keeps its RAM memory space independent of reassignment from db to adb
            adb=[d for d in self.db
                 if hasattr(d,'author_id') and d.author_id==self.author_id]
            # Reuse the list already attached to this author, if any; all
            # objects of the same author share one `papers` list.
            papers=next((d.papers for d in adb if hasattr(d,'papers')),[])
            for d in adb:
                d.papers=papers
                # NOTE(review): the key is misspelled ('instituion_id'); it is
                # kept as is because it is part of the exported data.
                paper['instituion_id']=d.institution_id
                if paper not in d.papers:
                    #store an independent copy: `paper` is mutated on every iteration
                    d.papers.append(copy.deepcopy(paper))

        return self.db

class literature(work):
    """Literature search on INSPIRE-HEP, processed page by page.

    Parameters
    ----------
    q : str
        Query string, inserted as given into the request URL.
    db : list, optional
        Database of author objects, updated in place. A new list is created
        when omitted.
    size : int
        Number of records requested per page.
    """
    def __init__(self,q,db=None,size=25):
        self.q=q
        # `is None` (not truthiness): an empty list supplied by the caller
        # must be kept, because it is updated in place.
        self.db=[] if db is None else db
        self.size=size

    def get_authors(self):
        """Register the authors of every work matched by the query.

        Follows the `links.next` URL of each response until there is none.
        On a non-200 status or an unreadable payload a warning is printed
        and the database collected so far is returned.

        Returns
        -------
        list
            `self.db`, updated in place.
        """
        url=f'https://inspirehep.net/api/literature?size={self.size}&page=1&q={self.q}'
        page=0
        while url:
            r=requests.get(url)
            time.sleep(self.sleep)
            if r.status_code!=200:
                print(f'WARNING → bad request for q={self.q}')
                return self.db
            try:
                payload=r.json() # parsed once per page
                hits=payload.get('hits')
                self.work_list=hits.get('hits')
            except (AttributeError,ValueError):
                # ValueError: body is not JSON; AttributeError: unexpected shape
                print(f'WARNING → bad request for q={self.q}')
                return self.db
            for hit in self.work_list or []:
                w=hit.get('metadata')
                if not w:
                    continue # nothing to process without metadata
                super().__init__(w,self.db)
                super().get_authors() #add and replace self attributes → author*..., institution*..
            url=(payload.get('links') or {}).get('next')
            #EMERGENCY EXIT: stop once more pages were read than `total` allows
            if page>(hits.get('total') or 0)//self.size+1:
                break
            page+=1
        return self.db
    
class institutions(literature):
    """Institution search on INSPIRE-HEP followed by one literature search
    per institution found.

    Parameters
    ----------
    q : str
        Institution query. When it equals a name in the module-level
        `countries` list, institutions whose first address reports another
        country are skipped.
    not_collaboration : bool
        When true, 'ac 1->10' is added to each literature query.
    db : list, optional
        Database of author objects, updated in place. A new list is created
        when omitted.
    size : int
        Institutions requested per page.
    literature_size : int
        Literature records requested per page.
    """
    def __init__(self,q,not_collaboration=True,db=None,size=25,literature_size=25):
        self.q_inst=q
        # `is None` (not truthiness): an empty list supplied by the caller
        # must be kept, because it is updated in place.
        self.db=[] if db is None else db
        self.size_inst=size
        self.literature_size=literature_size
        self.not_collaboration=not_collaboration

    def get_authors(self):
        """Register the authors of the works affiliated with each institution
        matched by the query.

        On a non-200 status or an unreadable payload a warning is printed
        and the database collected so far is returned. Institutions without
        a `legacy_ICN` are skipped.

        Returns
        -------
        list
            `self.db`, updated in place.
        """
        ac='ac 1->10' if self.not_collaboration else ''
        # Does not change between pages, so it is evaluated once.
        is_country=self.q_inst in countries
        #TODO: add author number restriction
        url=f'https://inspirehep.net/api/institutions/?q={self.q_inst}&size={self.size_inst}&page=1'
        page=0
        while True:
            r=requests.get(url)
            time.sleep(self.sleep)
            if r.status_code!=200:
                print(f'WARNING → bad request for q={self.q_inst}')
                return self.db
            try:
                payload=r.json() # parsed once per page
                hits=payload.get('hits')
                self.institutions_list=hits.get('hits')
            except (AttributeError,ValueError):
                # ValueError: body is not JSON; AttributeError: unexpected shape
                print(f'WARNING → bad request for q={self.q_inst}')
                return self.db
            links=payload.get('links') or {}
            total=hits.get('total') or 0
            #get aff_legacy name and aff really belongs to country: https://stackoverflow.com/a/46249796
            for ist in self.institutions_list or []:
                metadata=ist.get('metadata') or {}
                if is_country:
                    try:
                        country=metadata.get('addresses')[0].get('country')
                    except (AttributeError,TypeError,IndexError):
                        country=''
                    if self.q_inst!=country:
                        print(f'WARNING: affiliation is in {country}, not in {self.q_inst}')
                        continue
                aff=metadata.get('legacy_ICN')
                if not aff:
                    # Without a name the query would literally be 'aff None'.
                    continue
                #TODO: check old code for encoding
                q_aff=f'aff {aff}'
                if ac:
                    q_aff=f'{q_aff} and {ac}'
                print(q_aff)
                q_aff=requests.utils.quote(q_aff)
                super().__init__(q_aff,self.db,self.literature_size) #WARNING → self.q and self.size produced
                super().get_authors() #add and replace self attributes → author*..., institution*..
            next_url=links.get('next')
            if next_url:
                url=next_url
            #EMERGENCY EXIT: stop once more pages were read than `total` allows
            runaway=page>total//self.size_inst+1
            print(f'page: {page} of {total} → {url}')
            page+=1
            if not next_url or runaway:
                break
        return self.db

In [3]:
# Smoke test for `profile`, using the sample bundled with the class.
au=profile({})  # an empty input skips validation; used only to reach the sample
p=au.sample_profile
au=profile(p)
P=au.get_author()  # returns the instance itself, so P and au are the same object
assert P.name.get('value') =='Restrepo Quintero, Diego Alejandro'
assert au.name.get('value')=='Restrepo Quintero, Diego Alejandro'
assert au.to_dict().get('name').get('value')=='Restrepo Quintero, Diego Alejandro'

In [4]:
# Smoke test for `author`. The sample entry has a record URL, so
# get_authors() issues an HTTP request to inspirehep.net (network required).
AU=author({})  # an empty input skips validation; used only to reach the sample
a=AU.sample_author
AU=author(a,db=[])
db=AU.get_authors()
au=db[0]
assert au.profile_id=='991924'
assert au.author_id =='D.Restrepo.1'
assert au.email_addresses[0]['value']=='restrepo@udea.edu.co'
assert au.full_name=='Restrepo, Diego'
assert [ d.get('value') for d in au.ids if d.get('schema')=='ORCID'
   ][0]=='0000-0001-6455-5564'
assert au.institution=='Antioquia U.'
assert au.institution_id=='903906'
assert au.name.get('value')=='Restrepo Quintero, Diego Alejandro'
assert au.positions[0].get('rank')=='SENIOR'
# The sample lists two affiliations → one db object per affiliation.
assert db[1].institution=='Campinas State U.'
assert db[1].profile_id==db[0].profile_id
#add a third value to db
new_au=copy.deepcopy(db[0])
new_au.profile_id='991925'
db.append(new_au)
# Re-processing the same author must leave the existing objects untouched.
AU=author(a,db) #TODO →Check update scenarios
ll=AU.get_authors()
assert ll[2].profile_id=='991925'
assert AU.to_json()[1].get('institution')=='Campinas State U.'
assert AU.to_json()[0].get('profile_id')==AU.to_json()[1].get('profile_id')

In [5]:
# Smoke test for `work`, using the sample built in work.__init__
# (its only author is `sample_author`; network required).
W=work({})  # an empty input skips validation; used only to reach the sample
w=W.sample_work
W=work(w)
db=W.get_authors()
assert db[1].papers[0]['recid']==2080612
# Processing the same work again with the same db must not add objects.
W=work(w,db)
db=W.get_authors()
assert len(db)==2
assert W.to_json()[1].get('papers')[0]['recid']==2080612

In [6]:
# Test with a literature record stored on disk. The context manager closes
# the file even if parsing fails; JSON is read as UTF-8 regardless of locale.
with open('data/work.json','r',encoding='utf-8') as f:
    w=json.load(f)
W=work(w,[])
db=W.get_authors()
assert len(db[0].papers)==1 and db[0].papers[0].get('author_id')=='N.Bernal.1'
assert len(db[1].papers)==1 and db[1].papers[0].get('author_id')=='D.Restrepo.1'
assert len(W.to_json()[0].get('papers'))==1 and W.to_json()[0].get('papers')[0].get('author_id')=='N.Bernal.1'
assert len(W.to_json()[1].get('papers'))==1 and W.to_json()[1].get('papers')[0].get('author_id')=='D.Restrepo.1'

In [7]:
# db is updated only if necessary: processing the same work again must add
# neither objects nor papers.
W=work(w,db)
db=W.get_authors()
assert len(db[0].papers)==1 and db[0].papers[0].get('author_id')=='N.Bernal.1'
assert len(db[1].papers)==1 and db[1].papers[0].get('author_id')=='D.Restrepo.1'
assert len(W.to_json()[0].get('papers'))==1 and W.to_json()[0].get('papers')[0].get('author_id')=='N.Bernal.1'
assert len(W.to_json()[1].get('papers'))==1 and W.to_json()[1].get('papers')[0].get('author_id')=='D.Restrepo.1'

In [8]:
# db is only updated with new papers: the second pass of the loop must not
# change the counts reached in the first pass.
# NOTE(review): 3 is presumably the paper already stored plus the sample work
# recorded once per affiliation of the sample author (2) — confirm.
w=W.sample_work
for i in range(2):
    W=work(w,db)
    db=W.get_authors()
    assert len(db[1].papers)==3
    assert len( W.to_json()[1].get('papers') )==3

test with author without profile

In [19]:
# Author entry without a "record" key: get_authors() finds no profile URL,
# so no profile request is made and the profile fields stay None.
a={"raw_affiliations":
   [{"value":"Don Bosco University, San Salvador, El Salvador"}],
   "full_name_unicode_normalized":"terezón, brisa",
   "full_name":"Terezón, Brisa",
   "affiliations":[{"record":{"$ref":"https://inspirehep.net/api/institutions/907440"},
                    "value":"El Salvador U."}],
   "ids":[{"schema":"INSPIRE BAI","value":"B.Terezon.1"}],
   "last_name":"Terezón",
   "bai":"B.Terezon.1",
   "signature_block":"TARASANb",
   "uuid":"78752d3e-e818-4deb-b9bb-482560136993",
   "first_name":"Brisa"}

In [20]:
A=author(a)

In [21]:
A.get_authors()

[<__main__.author at 0x7fd6e6dfa910>]

In [23]:
A.to_json()[0]

{'author': {'raw_affiliations': [{'value': 'Don Bosco University, San Salvador, El Salvador'}],
  'full_name_unicode_normalized': 'terezón, brisa',
  'full_name': 'Terezón, Brisa',
  'affiliations': [{'record': {'$ref': 'https://inspirehep.net/api/institutions/907440'},
    'value': 'El Salvador U.'}],
  'ids': [{'schema': 'INSPIRE BAI', 'value': 'B.Terezon.1'}],
  'last_name': 'Terezón',
  'bai': 'B.Terezon.1',
  'signature_block': 'TARASANb',
  'uuid': '78752d3e-e818-4deb-b9bb-482560136993',
  'first_name': 'Brisa'},
 'author_id': 'B.Terezon.1',
 'email_addresses': None,
 'full_name': 'Terezón, Brisa',
 'ids': None,
 'institution': 'El Salvador U.',
 'institution_id': '907440',
 'name': None,
 'positions': None}

test with small institution

In [8]:
'El Salvador' in countries

True

test with medium institution (one-page)

test with large institution (several-pages)

test with small country

In [25]:
# The second positional parameter of `institutions` is `not_collaboration`,
# not `db`: the former positional `[]` was falsy and therefore disabled the
# 'ac 1->10' restriction. Keywords keep that behaviour and make it explicit.
C=institutions('El Salvador',not_collaboration=False,db=[])

In [26]:
db=C.get_authors()

aff El Salvador U.
aff Unlisted, SV
page: 0 of 2 → https://inspirehep.net/api/institutions/?q=El Salvador&size=25&page=1


In [27]:
# NOTE: sys.getsizeof is shallow — it measures the list object only, not the
# author objects it refers to, so it does not reflect the real RAM usage.
sys.getsizeof(db)

120

In [30]:
pd.DataFrame(C.to_json())

Unnamed: 0,author,author_id,email_addresses,full_name,ids,institution,institution_id,name,papers,positions,profile_id
0,"{'affiliations': [{'value': 'Roraima U.', 'rec...",M.De.Campos.2,"[{'value': 'miguel.campos@ufrr.br', 'current':...","De Campos, Miguel","[{'value': 'M.De.Campos.2', 'schema': 'INSPIRE...",Roraima U.,906911,"{'value': 'Campos, Miguel', 'name_variants': [...","[{'recid': 1646671, 'year': '2021', 'citation_...",[{'record': {'$ref': 'https://inspirehep.net/a...,1040220
1,"{'ids': [{'value': 'R.Hurtado.1', 'schema': 'I...",R.Hurtado.1,"[{'value': 'rghurtadoh@unal.edu.co', 'current'...","Hurtado, Rafael","[{'value': 'R.Hurtado.1', 'schema': 'INSPIRE B...",Perugia U.,904144,"{'value': 'Hurtado, Rafael', 'preferred_name':...","[{'recid': 469153, 'year': '1998', 'citation_c...","[{'rank': 'SENIOR', 'record': {'$ref': 'https:...",1005244
2,"{'ids': [{'value': 'R.Hurtado.1', 'schema': 'I...",R.Hurtado.1,"[{'value': 'rghurtadoh@unal.edu.co', 'current'...","Hurtado, Rafael","[{'value': 'R.Hurtado.1', 'schema': 'INSPIRE B...","INFN, Perugia",904996,"{'value': 'Hurtado, Rafael', 'preferred_name':...","[{'recid': 469153, 'year': '1998', 'citation_c...","[{'rank': 'SENIOR', 'record': {'$ref': 'https:...",1005244
3,"{'ids': [{'value': 'G.Violini.1', 'schema': 'I...",G.Violini.1,,"Violini, Galileo","[{'value': 'G.Violini.1', 'schema': 'INSPIRE B...",Calabria U.,903615,"{'value': 'Violini, Galileo', 'preferred_name'...","[{'recid': 469153, 'year': '1998', 'citation_c...",[{'record': {'$ref': 'https://inspirehep.net/a...,984616
4,"{'ids': [{'value': 'G.Violini.1', 'schema': 'I...",G.Violini.1,,"Violini, Galileo","[{'value': 'G.Violini.1', 'schema': 'INSPIRE B...","INFN, Cosenza",906035,"{'value': 'Violini, Galileo', 'preferred_name'...","[{'recid': 469153, 'year': '1998', 'citation_c...",[{'record': {'$ref': 'https://inspirehep.net/a...,984616
5,"{'ids': [{'value': 'G.Violini.1', 'schema': 'I...",G.Violini.1,,"Violini, Galileo","[{'value': 'G.Violini.1', 'schema': 'INSPIRE B...",El Salvador U.,907440,"{'value': 'Violini, Galileo', 'preferred_name'...","[{'recid': 469153, 'year': '1998', 'citation_c...",[{'record': {'$ref': 'https://inspirehep.net/a...,984616


test with medium country (one-page)

test with large country (several-pages)

authors → list of author objects
```
[a_1,a_2,...a_n]
```
where `a_i` is just a dictionary which is either created or updated

In [None]:
institution

In [None]:
aus=inspire.country(country)
aus=inspire.legacy_instituion(legacy_instituion)
aus=inspire.literature(inspire_authors)
#TODO
aus=inspire.author(inspire_author) # An author with several affiliations generates several au objects → self.paper is the only difference (add email to paper)

In [None]:
au =inspire.profile(profile) # → a_i

Input:

In [None]:
authors=inspire.read_json('authors.json')
authors=inspire.read_list(author_list)

In [102]:
r=requests.get('https://inspirehep.net/api/literature/2080612')

In [116]:
JSON(r.json().get('metadata'))

<IPython.core.display.JSON object>

In [45]:
import requests

In [None]:
Can be exported to JSON or Excel