# Construction of JSON object Authors from INSPIRE-HEP
<img src="https://raw.githubusercontent.com/restrepo/inspire/master/img/authors.svg" width=700>

## Design of the object-oriented alghorithm
each object must be initialized with the relevant part of the JSON from the INSPIRE-HEP api, and must have a method to obtain the relevant part of the "auhtor" object.

For example. The basic class must be `profile` and the input must be the JSON, `p`, from the profile API response
```
>>> au=profile(p)
>>> au.get_author()
```
`au` has the attributes accoding to the previous diagram

In [1]:
import pandas as pd

In [49]:
import json
import requests
import time
import copy
#https://towardsdatascience.com/do-not-use-if-else-for-validating-data-objects-in-python-anymore-17203e1d65fe
from cerberus import Validator

class empty_json:
    status_code=0
    def json(self):
        return {}

class profile:
    #Default behaviour: if key exists must be 'type'
    schema_profile={"name":{'type':'dict','required':True},
            "email_addresses":{'type':'list'},
            "positions":{'type':'list'},
            "ids":{'type':'list'},
            }        
    sample_profile={"advisors":[{"ids":[{"value":"INSPIRE-00133260",
                                    "schema":"INSPIRE ID"}],
                "name":"Valle, Jose W.F.",
                "hidden":False,
                "record":{"$ref":"https://inspirehep.net/api/authors/985058"},
                "degree_type":"phd","curated_relation":False},
               {"name":"Ponce, William A.","hidden":False,
                "degree_type":"master",
                "curated_relation":False}],
                "email_addresses":[{"value":"restrepo@udea.edu.co","current":True}],
    "positions":[{"rank":"SENIOR",
                  "hidden":False,
                  "record":{"$ref":"https://inspirehep.net/api/institutions/903906"},
                  "current":True,
                  "start_date":"2004",
                  "institution":"Antioquia U.",
                  "curated_relation":True},
                 {"rank":"PHD",
                  "hidden":False,
                  "record":{"$ref":"https://inspirehep.net/api/institutions/907907"},
                  "current":False,
                  "end_date":"2001","start_date":"1997",
                  "institution":"Valencia U., IFIC","curated_relation":True}],
    "ids":[{"value":"D.Restrepo.1","schema":"INSPIRE BAI"},
           {"value":"INSPIRE-00119748","schema":"INSPIRE ID"},
           {"value":"0000-0001-6455-5564","schema":"ORCID"},
           {"value":"diego-restrepo-209b7927","schema":"LINKEDIN"}],
    "name":{"value":"Restrepo Quintero, Diego Alejandro",
            "preferred_name":"Diego Restrepo"},
    "stub":False,
    "urls":[{"value":"http://gfif.udea.edu.co"},
            {"value":"https://scholar.google.com/citations?user=1sKULCoAAAAJ"},
            {"value":"https://www.researchgate.net/profile/Diego_Restrepo2"}],
    "status":"active",
    "$schema":"https://inspirehep.net/schemas/records/authors.json",
    "deleted":False,
    "control_number":991924,
    "legacy_version":"20210323213044.0",
    "arxiv_categories":["hep-ph"],
    "legacy_creation_date":"1999-08-23"}
    def __init__(self,p):
        if p:
            v=Validator(self.schema_profile,allow_unknown=True)
            if not v.validate(p):
                raise Exception(f'''
                    Input is not an INSPIRE-HEP profile dictionary:
                    {v.errors}                    
                    See `self.sample_profile`'''
                               )
        self.profile=p
            
    def get_author(self):
        '''
        Select:
        * current position
        * non-INSPIRE ids
        * name
        * control_number
        * arxiv_categories
        '''
        p=self.profile
        self.is_profile=True
        self.name=p.get('name')
        self.positions=p.get('positions')
        self.email_addresses=p.get('email_addresses')
        self.ids=p.get('ids')
        
        return {'name':self.name,
                'positions':self.positions,
                'email_addresses':self.email_addresses,
                'ids':self.ids}
    
class author(profile):
    sleep=0.4
    #`ids → [{"schema": "INSPIRE BAI"}]` required for author_id
    #`affiliations → [{...}] required for institution_id
    schema_author={"ids": {'type':'list','required':True,
                           'schema':{'type':'dict','required':True,
                                     'schema':{'schema':{'type':'string',
                                                         'required':True}
                                              }
                                    }
                          }, 
            "record":{'type':'dict'}, 
            "full_name": {'type':'string'}, 
            "affiliations":{'type':'list',
                           'schema':{'type':'dict','required':True}}
           }
    sample_author={"ids":[{"value": "D.Restrepo.1", 
                            "schema": "INSPIRE BAI"}
                          ], 
                   "uuid": "300c9b2c-15b0-4937-ad36-8ccff33fd09d", 
                   "emails": ["restrepo@udea.edu.co"], 
                   "record": {"$ref": "https://inspirehep.net/api/authors/991924"}, 
                   "full_name": "Restrepo, Diego", 
                   "affiliations": [{"value": "Antioquia U.", 
                                     "record": {"$ref": "https://inspirehep.net/api/institutions/903906"}},
                                   {"value":"Campinas State U.",
                                    "record":{"$ref":"https://inspirehep.net/api/institutions/902714"}}], 
                   "signature_block": "RASTRAPd", 
                   "raw_affiliations": [{"value": "Instituto de Física, Universidad de Antioquia, Calle 70 No 52-21, Medellín, Colombia"}, 
                                        {"value": "Instituto de Física Gleb Wataghin, UNICAMP, 13083-859, Campinas, SP, Brazil"}]}
    def __init__(self,a,db=[]):
        self.author=a
        self.db=db
        if a:
            v=Validator(self.schema_author,allow_unknown=True)
            if not v.validate(a):
                raise Exception(f'''
                    Input is not an INSPIRE-HEP author dictionary:
                    {v.errors}
                    See `self.sample_author`''')
        #super(author, self).__init__(p)
    def get_authors(self):
        #TODO: Check previous analysis for more metadata
        #use requests
        a=self.author
        db=self.db
        r=empty_json()
        self.author_id=[i for i  in a.get('ids') if i.get('schema')=='INSPIRE BAI'
                  ][0].get('value')

        self.profile_id=''
        affiliations=[dict([(k,d.get(k)) for k in d if k in ['value','record']]) 
                               for d in a.get('affiliations') ]
        url=a.get('record').get("$ref")
        if url and isinstance(url,str):
            self.profile_id=url.split('/')[-1]
            r=requests.get(url)
            time.sleep(self.sleep)
        if r.status_code==200:
            self.profile=r.json().get('metadata')
        d=super(author, self).get_author()
        self.full_name=a.get('full_name')
        ll=[x for x in db if hasattr(x,'author_id') and hasattr(x,'institution_id')]
        #db[i] affiliation not in affiliation are not touched 
        for aff in affiliations:
            #We assumme that aff have at least the institution infomation
            self.institution=aff.get('value')
            self.institution_id=aff.get('record').get('$ref').split('/')[-1]
            filtered_db=[x for x in ll if x.profile_id==self.profile_id and x.institution_id==self.institution_id]
            if filtered_db:
                au=filtered_db[0] #must be unique!
                au.new_attribute=True
            else: #aff not in db → New affiliation
                ai=copy.deepcopy(self)
                db.append(ai)
        return db        

In [50]:
au=profile({})
p=au.sample_profile
au=profile(p)
d=au.get_author()
assert d['name']['value']=='Restrepo Quintero, Diego Alejandro'
assert au.name.get('value')=='Restrepo Quintero, Diego Alejandro'

In [276]:
AU=author({})
a=AU.sample_author
AU=author(a,db=[])
db=AU.get_authors()
au=db[0]
assert au.profile_id=='991924'
assert au.author_id =='D.Restrepo.1'
assert au.email_addresses[0]['value']=='restrepo@udea.edu.co'
assert au.full_name=='Restrepo, Diego'
assert [ d.get('value') for d in au.ids if d.get('schema')=='ORCID'
   ][0]=='0000-0001-6455-5564'
assert au.institution=='Antioquia U.'
assert au.institution_id=='903906'
assert au.name.get('value')=='Restrepo Quintero, Diego Alejandro'
assert au.positions[0].get('rank')=='SENIOR'
assert db[1].institution=='Campinas State U.'
assert db[1].profile_id==db[0].profile_id
#add a third value to db
new_au=copy.deepcopy(db[0])
new_au.profile_id='991925'
db.append(new_au)
AU=author(a,db) #TODO →Check update scenarios
ll=AU.get_authors()
assert ll[2].profile_id=='991925'

In [298]:
class work(author):
    schema_work={'citation_count':{'type':'integer'},
            'control_number':{'type':'integer','required':True},
            'primary_arxiv_category':{'type':'list'},
            'preprint_data':{'type':'string'},
            'legacy_creation_date':{'type':'string'}
            }
    def __init__(self,w,db=[]):
        self.sample_work={'citation_count': 0,
                          'control_number':2080612,
                          'publication_info':[{'year':2022}],
                          'primary_arxiv_category':['hep-ph'],
                          'authors':[self.sample_author]#publication_info,preprint_date,legacy_creation_date
                                }
        if w:
            v=Validator(self.schema_work,allow_unknown=True)
            if not v.validate(w):
                raise Exception(f'''
                    Input is not an INSPIRE-HEP work dictionary:
                    {v.errors}
                    See `self.sample_work`''')
        
        self.db=db
        self.work=w
    def get_authors(self):
        '''
        l: list of author objects
        '''
        l=self.work
        #TODO: Check previous analysis for more metadata
        primary_arxiv_category=l.get('primary_arxiv_category')
        if not primary_arxiv_category:
            primary_arxiv_category=l.get('arxiv_eprints')[0].get('categories')
        if not primary_arxiv_category:
            primary_arxiv_category=[]
        year=str(l.get('publication_info')[0].get('year')) #scheme validate
        if not year:
             year=l.get('preprint_date').split('-')[0]
        if not year:
            year=l.get('legacy_creation_date').split('-')[0]
        if not year:
                year='0000'
        #'inst_id':aff_id
        paper={'recid':l.get('control_number'),'year':year,
               'citation_count':l.get('citation_count'),
               'primary_arxiv_category':primary_arxiv_category}
        #In update
        #'primary_arxiv_category':primary_arxiv_category
        #super(work, self).__init__(w.get('authors')[0])
               
        aus=l.get('authors')
        for a in aus: #same self.author_id but several institute_ids for several affiliations
            super(work, self).__init__(a,self.db)
            super(work, self).get_authors() #add and replace self attributes → author*..., institution*..
            paper['author_id']=self.author_id
            #Each d object keeps its RAM memory space independent of reasignation list from db to adb
            adb=[d for d in self.db if d.author_id==self.author_id]
            print(self.author_id,len(adb))
            papers=[]
            for d in adb:
                if hasattr(d,'papers'):
                    papers=d.papers
                    break #found papers for self.author_id
            for d in adb:
                d.papers=papers #reatach papers if already have
                print(self.db[0].papers,paper)
                paper['instituion_id']=d.institution_id
                if paper not in d.papers:
                    #detach from RAM!
                    d.papers.append(copy.deepcopy(paper))
            #break
                
        return self.db

In [299]:
f=open('data/work.json','r')
w=json.load(f)
f.close()

In [283]:
A=author(w['authors'][0],[])

In [164]:
db=A.get_authors()

In [302]:
W=work({})
w=W.sample_work
W=work(w)
db=W.get_authors()
assert db[1].papers[0]['recid']==2080612
W=work(w,db)
db=W.get_authors()
assert len(db)==2

D.Restrepo.1 2
[] {'recid': 2080612, 'year': '2022', 'citation_count': 0, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'D.Restrepo.1'}
[{'recid': 2080612, 'year': '2022', 'citation_count': 0, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'D.Restrepo.1', 'instituion_id': '903906'}] {'recid': 2080612, 'year': '2022', 'citation_count': 0, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'D.Restrepo.1', 'instituion_id': '903906'}
D.Restrepo.1 2
[{'recid': 2080612, 'year': '2022', 'citation_count': 0, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'D.Restrepo.1', 'instituion_id': '903906'}, {'recid': 2080612, 'year': '2022', 'citation_count': 0, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'D.Restrepo.1', 'instituion_id': '902714'}] {'recid': 2080612, 'year': '2022', 'citation_count': 0, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'D.Restrepo.1'}
[{'recid': 2080612, 'year': '2022', 'citation_count': 0, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'D.Restr

In [300]:
W=work(w,[])
db=W.get_authors()
assert len(db[0].papers)==1 and db[0].papers[0].get('author_id')=='N.Bernal.1'
assert len(db[1].papers)==1 and db[1].papers[0].get('author_id')=='D.Restrepo.1'

N.Bernal.1 1
[] {'recid': 1904961, 'year': '2021', 'citation_count': 4, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'N.Bernal.1'}
D.Restrepo.1 1
[{'recid': 1904961, 'year': '2021', 'citation_count': 4, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'N.Bernal.1', 'instituion_id': '908168'}] {'recid': 1904961, 'year': '2021', 'citation_count': 4, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'D.Restrepo.1', 'instituion_id': '908168'}


In [301]:
W=work(w,db)
db=W.get_authors()
assert len(db[0].papers)==1 and db[0].papers[0].get('author_id')=='N.Bernal.1'
assert len(db[1].papers)==1 and db[1].papers[0].get('author_id')=='D.Restrepo.1'

N.Bernal.1 1
[{'recid': 1904961, 'year': '2021', 'citation_count': 4, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'N.Bernal.1', 'instituion_id': '908168'}] {'recid': 1904961, 'year': '2021', 'citation_count': 4, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'N.Bernal.1'}
D.Restrepo.1 1
[{'recid': 1904961, 'year': '2021', 'citation_count': 4, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'N.Bernal.1', 'instituion_id': '908168'}] {'recid': 1904961, 'year': '2021', 'citation_count': 4, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'D.Restrepo.1', 'instituion_id': '908168'}


In [291]:
db[0].papers[0].get('author_id')

'N.Bernal.1'

In [292]:
db[1].papers[0].get('author_id')

'D.Restrepo.1'

In [294]:
w=W.sample_work
W=work(w,db)
db=W.get_authors()

D.Restrepo.1 2
[{'recid': 1904961, 'year': '2021', 'citation_count': 4, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'N.Bernal.1', 'instituion_id': '908168'}] {'recid': 2080612, 'year': '2022', 'citation_count': 0, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'D.Restrepo.1'}
[{'recid': 1904961, 'year': '2021', 'citation_count': 4, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'N.Bernal.1', 'instituion_id': '908168'}] {'recid': 2080612, 'year': '2022', 'citation_count': 0, 'primary_arxiv_category': ['hep-ph'], 'author_id': 'D.Restrepo.1', 'instituion_id': '903906'}


In [295]:
db[0].papers

[{'recid': 1904961,
  'year': '2021',
  'citation_count': 4,
  'primary_arxiv_category': ['hep-ph'],
  'author_id': 'N.Bernal.1',
  'instituion_id': '908168'}]

In [296]:
db[1].papers

[{'recid': 1904961,
  'year': '2021',
  'citation_count': 4,
  'primary_arxiv_category': ['hep-ph'],
  'author_id': 'D.Restrepo.1',
  'instituion_id': '903906'},
 {'recid': 2080612,
  'year': '2022',
  'citation_count': 0,
  'primary_arxiv_category': ['hep-ph'],
  'author_id': 'D.Restrepo.1',
  'instituion_id': '903906'},
 {'recid': 2080612,
  'year': '2022',
  'citation_count': 0,
  'primary_arxiv_category': ['hep-ph'],
  'author_id': 'D.Restrepo.1',
  'instituion_id': '902714'}]

In [297]:
db[2].papers

[{'recid': 1904961,
  'year': '2021',
  'citation_count': 4,
  'primary_arxiv_category': ['hep-ph'],
  'author_id': 'D.Restrepo.1',
  'instituion_id': '903906'},
 {'recid': 2080612,
  'year': '2022',
  'citation_count': 0,
  'primary_arxiv_category': ['hep-ph'],
  'author_id': 'D.Restrepo.1',
  'instituion_id': '903906'},
 {'recid': 2080612,
  'year': '2022',
  'citation_count': 0,
  'primary_arxiv_category': ['hep-ph'],
  'author_id': 'D.Restrepo.1',
  'instituion_id': '902714'}]

In [128]:
db[0].author_id,db[1].author_id

('N.Bernal.1', 'D.Restrepo.1')

In [131]:
d=db[1]

In [132]:
d.papers

[{'recid': 1904961,
  'year': '2021',
  'citation_count': 4,
  'primary_arxiv_category': ['hep-ph'],
  'institution_id': '908168',
  'author_id': 'N.Bernal.1'},
 {'recid': 1904961,
  'year': '2021',
  'citation_count': 4,
  'primary_arxiv_category': ['hep-ph'],
  'institution_id': '903906',
  'author_id': 'D.Restrepo.1'}]

In [305]:
q='recid:2080612+or+recid:1904961'
r=requests.get(f'https://inspirehep.net/api/literature?size=25&page=1&q={q}')

In [311]:
r.json().get('hits').get('hits')[1].get('metadata')

{'citation_count': 0,
 'citation_count_without_self_citations': 0,
 'authors': [{'uuid': '300c9b2c-15b0-4937-ad36-8ccff33fd09d',
   'recid': 991924,
   'ids': [{'value': 'D.Restrepo.1', 'schema': 'INSPIRE BAI'}],
   'raw_affiliations': [{'value': 'Instituto de Física, Universidad de Antioquia, Calle 70 No 52-21, Medellín, Colombia'},
    {'value': 'Instituto de Física Gleb Wataghin, UNICAMP, 13083-859, Campinas, SP, Brazil'}],
   'record': {'$ref': 'https://inspirehep.net/api/authors/991924'},
   'bai': 'D.Restrepo.1',
   'first_name': 'Diego',
   'full_name': 'Restrepo, Diego',
   'last_name': 'Restrepo',
   'affiliations': [{'record': {'$ref': 'https://inspirehep.net/api/institutions/903906'},
     'value': 'Antioquia U.'},
    {'record': {'$ref': 'https://inspirehep.net/api/institutions/902714'},
     'value': 'Campinas State U.'}],
   'full_name_unicode_normalized': 'restrepo, diego',
   'signature_block': 'RASTRAPd'},
  {'uuid': '6c7aac55-0895-43a2-b068-9e5bc905012d',
   'recid': 

In [None]:
#in general len(l)!=0
authors=inspire.literature(l)

authors → list of authors object
```
[a_1,a_2,...a_n]
```
where `a_i` is just a dictionary which is either created or updated

In [None]:
aus=inspire.country(country)
aus=inspire.legacy_instituion(legacy_instituion)
aus=inspire.literature(inspire_authors)
#TODO
aus=inspire.author(inspire_author) #A author with severeal affiliations generates several au → self.paper is the only different (add email to paper)

In [None]:
au =inspire.profile(profile) # → a_i

Input:

In [None]:
authors=inspire.read_json('authors.json')
authors=inspire.read_list(author_list)

In [102]:
r=requests.get('https://inspirehep.net/api/literature/2080612')

In [115]:
from IPython.display import JSON

In [116]:
JSON(r.json().get('metadata'))

<IPython.core.display.JSON object>

In [45]:
import requests

In [None]:
Can be exported to JSON or Excel