# Construction of the JSON object for experimental authors from INSPIRE-HEP
<img src="https://raw.githubusercontent.com/restrepo/inspire/master/img/authors-ex.svg" width=1000>

In [1]:
from inspirehep import *
class experimental_author(profile):
    """Author of an experiment, resolved against the INSPIRE-HEP authors API.

    An instance wraps one author record (as found in an authors search hit),
    downloads the full author profile, extracts the author and institution
    identifiers plus the institution country, and stores a snapshot of itself
    in the shared list ``db``.
    """

    # Pause, in seconds, after every HTTP request.
    sleep = 0.4
    # Schema handed to `Validator` in `__init__` (unknown keys are allowed).
    # `ids → [{"schema": "INSPIRE BAI"}]` is required for author_id.
    # `affiliations → [{...}]` is required for institution_id.
    schema_experimental_author = {
        "ids": {'type': 'list',
                'schema': {'type': 'dict',
                           'schema': {'schema': {'type': 'string'},
                                      'value': {'type': 'string'}}}},
        "record": {'type': 'dict'},
        "full_name": {'type': 'string'},
        "affiliations": {'type': 'list',
                         'schema': {'type': 'dict', 'required': True}},
        # Key spelled as in the INSPIRE records (see the sample below); the
        # former spelling "proyect_membership" never matched any input.
        "project_membership": {'type': 'list',
                               'schema': {'type': 'dict',
                                          'schema': {'record': {'type': 'dict'},
                                                     'name': {'type': 'string'},
                                                     'current': {'type': 'boolean'}}}},
        "name": {'type': 'dict',
                 'schema': {'value': {'type': 'string'},
                            'preferred_name': {'type': 'string'}}},
        "control_number": {'type': 'integer'},
        "arxiv_categories": {'type': 'list'},
    }
    # Minimal example of an accepted input dictionary.
    sample_experimental_author = {
        'project_membership': [{'name': 'CERN-LHC-CMS',
                                'record': {'$ref': 'https://inspirehep.net/api/experiments/1108642'},
                                'current': False,
                                'curated_relation': True}],
        'ids': [{'value': 'Y.Andreev.1', 'schema': 'INSPIRE BAI'}],
        'name': {'value': 'Andreev, Yuri',
                 'preferred_name': 'Yuri Andreev'},
        'control_number': 1018372,
        'arxiv_categories': ['hep-ex'],
    }

    def __init__(self, a, db=None):
        """Store and validate the author record.

        Parameters
        ----------
        a : dict
            Author metadata; validated against `schema_experimental_author`
            when truthy.
        db : list, optional
            List that accumulates author snapshots. It is used as given (not
            copied), so callers can share one list between instances. A new
            empty list is created when omitted.

        Raises
        ------
        Exception
            If `a` is truthy and does not validate.
        """
        self.author = a
        # A fresh list per instance: a `[]` default would be shared by every
        # instance created without an explicit `db`.
        self.db = [] if db is None else db
        if a:
            v = Validator(self.schema_experimental_author, allow_unknown=True)
            if not v.validate(a):
                raise Exception(f'''
                    Input is not an INSPIRE-HEP author dictionary:
                    {v.errors}
                    See `self.sample_experimental_author`''')

    def get_authors(self):
        """Fetch the author profile and register it in `self.db`.

        Sets `project_membership`, `full_name`, `profile_id`, `author_id`,
        `institution`, `institution_id` and `country` on the instance. When
        `self.db` holds no entry with the same `author_id` and
        `institution_id`, a shallow copy of the instance is appended to it.

        Returns
        -------
        list
            `self.db`.
        """
        # TODO: Check previous analysis for more metadata
        a = self.author

        self.project_membership = a.get('project_membership')
        control_number = a.get('control_number')
        url = f"https://inspirehep.net/api/authors/{control_number}"

        r = empty_json()
        try:
            r = requests.get(url, timeout=timeout)
            time.sleep(self.sleep)
        except Exception:
            # Best effort: a failed request is treated as "no profile".
            r.status_code = -1
        if r.status_code == 200:
            # A body without 'metadata' is handled like a failed request.
            p = r.json().get('metadata') or {}
        else:
            p = {}

        # We assume that the affiliation is defined for at least one of the
        # authors of the paper. Authors without an affiliation from which to
        # get their institution are not considered.
        super(experimental_author, self).__init__(p)
        # Per the original note, this call is what populates self.profile.
        super(experimental_author, self).get_author()

        try:
            self.full_name = p.get('name').get('value')
        except Exception:
            self.full_name = ''
        self.profile_id = p.get('control_number')
        try:
            self.author_id = [i for i in self.profile.get('ids')
                              if i.get('schema') == 'INSPIRE BAI'][0].get('value')
        except Exception:
            self.author_id = None

        # TODO: Check if the author has multiple current positions
        try:
            current_position = self.profile.get('positions')[0]
        except Exception:
            current_position = {}
        self.institution = current_position.get('institution')
        try:
            aff_url = current_position.get('record').get('$ref')
        except Exception:
            aff_url = ''
        # Last path segment of the institution URL ('' when there is none).
        self.institution_id = aff_url.split('/')[-1]

        # ==== country ====
        self.country = None
        if aff_url:
            rr = empty_json()
            try:
                rr = requests.get(aff_url, timeout=timeout)
                time.sleep(self.sleep)
            except Exception:
                rr.status_code = -1
            if rr.status_code == 200:
                try:
                    self.country = rr.json().get('metadata').get('addresses')[0].get('country')
                except Exception:
                    self.country = None
        # =================

        already_stored = any(
            x.author_id == self.author_id and x.institution_id == self.institution_id
            for x in self.db
            if hasattr(x, 'author_id') and hasattr(x, 'institution_id')
        )
        if not already_stored:
            # New (author, institution) pair: store a shallow snapshot so that
            # later reassignments of self attributes do not alter the entry.
            self.db.append(copy.copy(self))

        return self.db

    def to_json(self):
        """Return `d.to_dict()` for every entry of `self.db`.

        `to_dict` is not defined in this class; presumably it is inherited
        from `profile`.
        """
        return [d.to_dict() for d in self.db]


class experiment(experimental_author):
    '''
    Experiment record from INSPIRE-HEP together with its member authors.

    `get_authors` queries the authors API for every author whose
    `project_membership` references the experiment and attaches to each
    stored author a `papers` list holding a summary of the experiment with
    the keys: legacy_name, control_number, number_of_papers, collaboration,
    literature_link, year, inspire_classification, dates, long_name,
    description, accelerator, project_type, author_id and profile_id.
    '''
    # Schema handed to `Validator` in `__init__` (unknown keys are allowed).
    schema_experiment={'control_number':{'type':'integer','required':True},
                       'number_of_papers':{'type':'integer'},
                       'legacy_name':{'type':'string'},
                       'collaboration':{'type':'dict',
                                        'schema':{'value':{'type':'string'}}},
                       'description':{'type':'string'},
                       'accelerator':{'type':'dict',
                                      'schema':{'value':{'type':'string'}}},
                       'date_approved':{'type':'string'},
                       'date_proposed':{'type':'string'},
                       'date_started':{'type':'string'},
                       'date_completed':{'type':'string'},
                       'project_type':{'type':'list'},
                       'inspire_classification':{'type':'list'}
            }
    def __init__(self,e,db=None,size=250):
        '''
        Store and validate the experiment record.

        e: experiment metadata dictionary; validated against
           `schema_experiment` when truthy.
        db: list that accumulates author snapshots; used as given (not
            copied). A new empty list is created when omitted.
        size: page size used for the authors query.

        Raises Exception if `e` is truthy and does not validate.
        '''
        # Kept as an instance attribute (set first), exactly as before.
        self.sample_experiment = {'number_of_papers': 13049, #
                                    'self': {'$ref': 'https://inspirehep.net/api/experiments/1108642'}, 
                                    'long_name': 'CMS: The Compact Muon Solenoid',
                                    'accelerator': {'value': 'LHC'},
                                    'description': 'The 27-km Large Hadron Collider (LHC) is the largest and most powerful particle accelerator ever built. It accelerates protons to nearly the velocity of light -- in clockwise and anti-clockwise directions -- and then collides them at four locations around its ring. At these points, the energy of the particle collisions gets transformed into mass, spraying particles in all directions. The Compact Muon Solenoid (or CMS) detector sits at one of these four collision points. It is a general-purpose detector; that is, it is designed to observe any new physics phenomena that the LHC might reveal. CMS acts as a giant, high-speed camera, taking 3D “photographs” of particle collisions from all directions up to 40 million times each second. Although most of the particles produced in the collisions are “unstable”, they transform rapidly into stable particles that can be detected by CMS. By identifying (nearly) all the stable particles produced in each collision, measuring their momenta and energies, and then piecing together the information of all these particles like putting together the pieces of a puzzle, the detector can recreate an “image” of the collision for further analysis.',
                                    'legacy_name': 'CERN-LHC-CMS',
                                    'date_approved': '1996-01-31', #
                                    'date_proposed': '1992-10-01', #
                                    'date_started': '2009-11-23', ##
                                    'date_completed': '9999', #
                                    'project_type': ['experiment'],
                                    'collaboration': {'value': 'CMS'}, #
                                    'control_number': 1108642, #
                                    'inspire_classification': ['Collider Experiments|Hadrons|p p'] #
                                    }
        if e:
            v=Validator(self.schema_experiment,allow_unknown=True)
            if not v.validate(e):
                raise Exception(f'''
                    Input is not an INSPIRE-HEP experiment dictionary:
                    {v.errors}
                    See `self.sample_experiment`''')

        # A fresh list per instance: a `[]` default would be shared by every
        # instance created without an explicit `db`.
        self.db = [] if db is None else db
        self.experiment = e
        self.authors_size = size


    def get_authors(self):
        '''
        Collect every author of the experiment into `self.db`.

        Pages through the authors search, runs the parent `get_authors` for
        each hit and attaches the experiment summary to the `papers` list of
        every stored entry with the same `author_id`.

        Returns `self.db`.
        '''
        exp=self.experiment

        #TODO: Check previous analysis for more metadata
        inspire_classification=exp.get('inspire_classification')
        if not inspire_classification:
            inspire_classification = []

        control_number=exp.get('control_number')

        # Year taken from 'date_started'; 9999 when missing or unparsable.
        try:
            year=int(exp.get('date_started').split('-')[0])
        except Exception:
            year=0
        if not year:
            year=9999

        # Summary of the experiment stored under each author's 'papers'.
        # Key insertion order is kept as it fixes the order of the output.
        paper={}
        paper['legacy_name'] = exp.get('legacy_name')
        paper['control_number'] = control_number
        paper['number_of_papers']=exp.get('number_of_papers')
        try:
            paper['collaboration']=exp.get("collaboration").get('value')
        except Exception:
            paper['collaboration']=None
        paper['literature_link']=f"https://inspirehep.net/api/literature?size=25&page=1&q=collaboration:{paper.get('collaboration')}"
        paper['year']=year
        paper['inspire_classification']=inspire_classification
        paper['dates'] = {'approved':exp.get('date_approved'),
                          'proposed':exp.get('date_proposed'),
                          'started':exp.get('date_started'),
                          'completed':exp.get('date_completed')}
        paper['long_name']=exp.get('long_name')
        paper['description']=exp.get('description')
        try:
            paper['accelerator']=exp.get('accelerator').get('value')
        except Exception:
            paper['accelerator']=None
        paper['project_type']=exp.get('project_type')

        r=empty_json()
        url=f'https://inspirehep.net/api/authors?q=project_membership.record.%24ref%3A{control_number}&page=1&size={self.authors_size}'
        NEXT=True
        i=0
        while NEXT:
            try:
                r=requests.get(url,timeout=timeout)
                time.sleep(self.sleep)
            except Exception:
                # `r` still holds the previous response (or the placeholder):
                # flag the failure so that this page yields no authors.
                r.status_code=-1

            # Parse the body once. Authors are only read from a 200 response,
            # but 'total' and 'links' are read from whatever body `r` carries,
            # as before.
            aus,total,next_url=[],0,None
            try:
                payload=r.json()
                hits=payload.get('hits') or {}
                if r.status_code==200:
                    aus=hits.get('hits') or []
                total=hits.get('total') or 0
                next_url=(payload.get('links') or {}).get('next')
            except Exception:
                # Missing or malformed body: keep the defaults assigned so
                # far; without a next link the pagination stops below.
                pass

            # NOTE(review): 250 is hard-coded and equals the default page
            # size; presumably a full first page marks a large collaboration
            # whose description is shortened to save space — confirm whether
            # it should follow `self.authors_size`.
            if i==0 and len(aus)==250:
                try:
                    paper['description']=paper['description'][:20]+'...'
                except Exception:
                    paper['description']=''

            for a in aus: #same self.author_id but several institution_ids for several affiliations
                super(experiment, self).__init__(a.get('metadata'),self.db)
                super(experiment, self).get_authors() #add and replace self attributes → author*..., institution*..
                paper['author_id']=self.author_id
                paper['profile_id']=self.profile_id
                # The entries are the same objects as in self.db, so changes
                # made through `adb` are visible there.
                adb=[d for d in self.db if d.author_id==self.author_id]

                # Reuse the papers list already attached to this author, if any.
                papers = []
                for d in adb:
                    if hasattr(d,'papers'):
                        papers=d.papers
                        break #found papers for self.author_id
                for d in adb:
                    d.papers=papers #reattach papers if already present
                    if paper not in d.papers:
                        # Store a copy: `paper` is mutated for the next author.
                        d.papers.append(copy.copy(paper))

            if next_url:
                url=next_url
            else:
                NEXT=False
            #EMERGENCY EXIT: never request more pages than 'total' allows
            if i > total//self.authors_size+1:
                NEXT=False
            i+=1

        return self.db



In [10]:

# Smoke test: resolve a single author record and collect it into `db`.
sample_experimental_author = {
    'project_membership': [
        {
            'name': 'CERN-LHC-CMS',
            'record': {'$ref': 'https://inspirehep.net/api/experiments/1108642'},
            'current': False,
            'curated_relation': True,
        },
    ],
    'ids': [
        {'value': 'Y.Andreev.1', 'schema': 'INSPIRE BAI'},
    ],
    'name': {
        'value': 'Andreev, Yuri',
        'preferred_name': 'Yuri Andreev',
    },
    'control_number': 1018372,
    'arxiv_categories': ['hep-ex'],
}
a = experimental_author(sample_experimental_author)
db = a.get_authors()


In [2]:
# Smoke test: collect the authors of one experiment, five per page.
sample_experiment = {
    'number_of_papers': 13049,
    'self': {'$ref': 'https://inspirehep.net/api/experiments/1108642'},
    'long_name': 'CMS: The Compact Muon Solenoid',
    'accelerator': {'value': 'LHC'},
    'description': 'The 27-km Large Hadron Collider (LHC) is the largest and most powerful particle accelerator ever built. It accelerates protons to nearly the velocity of light -- in clockwise and anti-clockwise directions -- and then collides them at four locations around its ring. At these points, the energy of the particle collisions gets transformed into mass, spraying particles in all directions. The Compact Muon Solenoid (or CMS) detector sits at one of these four collision points. It is a general-purpose detector; that is, it is designed to observe any new physics phenomena that the LHC might reveal. CMS acts as a giant, high-speed camera, taking 3D “photographs” of particle collisions from all directions up to 40 million times each second. Although most of the particles produced in the collisions are “unstable”, they transform rapidly into stable particles that can be detected by CMS. By identifying (nearly) all the stable particles produced in each collision, measuring their momenta and energies, and then piecing together the information of all these particles like putting together the pieces of a puzzle, the detector can recreate an “image” of the collision for further analysis.',
    'legacy_name': 'CERN-LHC-CMS',
    'date_approved': '1996-01-31',
    'date_proposed': '1992-10-01',
    'date_started': '2009-11-23',
    'date_completed': '9999',
    'project_type': ['experiment'],
    'collaboration': {'value': 'CMS'},
    'control_number': 1108642,
    'inspire_classification': ['Collider Experiments|Hadrons|p p'],
}
e = experiment(sample_experiment, size=5)
db = e.get_authors()

0 https://inspirehep.net/api/authors?q=project_membership.record.%24ref%3A1108642&page=1&size=5
1 https://inspirehep.net/api/authors/?q=project_membership.record.%24ref%3A1108642&size=5&page=2


In [75]:
# Fixture: a one-element list holding what appears to be a raw hit of the
# INSPIRE-HEP experiments search (CLEO, control number 1109984). Its
# 'metadata' value has the shape that `experiment(...)` receives in the
# harvesting loop.
test=[{'updated': '2022-01-28T11:24:41.532294+00:00',
  'created': '2012-04-13T00:00:00+00:00',
  'metadata': {'number_of_papers': 954,
   'facet_inspire_classification': ['Collider|Heavy Flavor Factory',
    'Collider|e+ e-'],
   'normalized_name_variants': ['CLEO', 'CLEO'],
   'core': True,
   'self': {'$ref': 'https://inspirehep.net/api/experiments/1109984'},
   'urls': [{'value': 'http://w4.lns.cornell.edu/public/CLEO/'}],
   '$schema': 'https://inspirehep.net/schemas/records/experiments.json',
   'long_name': 'The {CLEO} Experiment at {CESR}',
   'accelerator': {'value': 'CESR'},
   'description': 'Since 1979 the collaboration has conducted studies of b, c, tau and gamma-gamma physics in e+ e- interactions near 10 GeV. Current topics include determination of the CKM parameters and the Standard Model tests in decays of heavy flavors, as well as QCD tests in a variety of processes. Successive detector upgrades have kept pace with luminosity improvements to the Cornell Electron Storage Ring (CESR), which has delivered over 6/fb of integrated luminosity to date. The CLEO-II detector (proposed 1983, approved 1984, operational since 1989) consists of drift chambers for tracking charged particles and measuring dE / dx, time-of-flight counters, a 7800-element CsI electromagnetic calorimeter, a 1.5-tesla superconducting solenoid, iron for flux return and muon identification, and muon chambers. A three-layer silicon vertex detector was added in the Fall of 1995. Taking data (July 96). A major upgrade, the CLEO-III detector, was proposed and approved in 1994, and the installation is planned for 1998.',
   'legacy_name': 'CESR-CLEO',
   'date_started': '1979-10',
   'institutions': [{'value': 'Cornell U.',
     'record': {'$ref': 'https://inspirehep.net/api/institutions/911819'},
     'curated_relation': True}],
   'project_type': ['experiment'],
   'collaboration': {'value': 'CLEO', 'curated_relation': False},
   'date_approved': '1977',
   'date_proposed': '1975',
   'name_variants': ['CLEO'],
   'control_number': 1109984,
   'date_completed': '9999',
   'legacy_version': '20200123205452.0',
   'legacy_creation_date': '2012-04-13',
   'inspire_classification': ['Collider Experiments|Heavy Flavor Factory',
    'Collider Experiments|e+ e-'],
   'external_system_identifiers': [{'value': 'EXPERIMENT-6963',
     'schema': 'SPIRES'}]},
  'links': {'json': 'https://inspirehep.net/api/experiments/1109984?format=json'},
  'id': '1109984'}]

## Run for all the experiment entries in inspirehep.net

In [2]:
# Start from empty accumulators: all authors, and authors of experiments
# with at least one Latin American institution.
dbj_all, dbj_la = [], []

In [2]:
import json
# Resume from the files written by a previous run. The context managers
# close each file even when `json.load` raises.
with open('inspire-ex.json','r') as f:
    dbj_all=json.load(f)
with open('inspire_LA-ex.json','r') as f:
    dbj_la=json.load(f)

In [None]:
def update_json(dbj,dbj_all):
    """Merge the author entries `dbj` into the accumulated list `dbj_all`.

    For every entry `d` of `dbj`:

    * if no entry of `dbj_all` has the same 'profile_id', `d` is appended;
    * else, if an equal entry is already stored, nothing is done;
    * else the first element of `d['papers']` is appended to the 'papers'
      list of every stored entry with that 'profile_id' that lacks it.

    A missing or ``None`` 'papers' value is treated as an empty list.

    Parameters
    ----------
    dbj : list of dict
        New author entries.
    dbj_all : list of dict
        Accumulated entries; modified in place.

    Returns
    -------
    list of dict
        `dbj_all`, the same list object that was passed in.
    """
    # Built once and kept in sync, instead of being rebuilt for every entry.
    known_ids = {x.get('profile_id') for x in dbj_all}
    for d in dbj:
        profile_id = d.get('profile_id')
        if profile_id not in known_ids:
            # Unknown author: an equal entry cannot be stored yet.
            dbj_all.append(d)
            known_ids.add(profile_id)
            continue
        # Decided once, before merging: appending papers below can make a
        # stored entry equal to `d` halfway through the scan.
        if d in dbj_all:
            continue
        new_papers = d.get('papers') or []
        if not new_papers:
            continue
        first_paper = new_papers[0]
        for dd in dbj_all:
            if dd.get('profile_id') != profile_id:
                continue
            papers = dd.get('papers')
            if papers is None:
                papers = dd['papers'] = []
            if first_paper not in papers:
                papers.append(first_paper)
    return dbj_all

# Harvest the authors of every experiment listed by the INSPIRE-HEP
# experiments search, page by page. The accumulated results are written to
# 'inspire-ex.json' (all authors) and 'inspire_LA-ex.json' (authors of
# experiments with at least one Latin American institution) after each page,
# so an interrupted run can be resumed.
LA_countries=['Brazil', 'Mexico', 'Argentina','Chile', 'Colombia','Bolivia','Cuba',
              'Costa Rica', 'Ecuador', 'El Salvador', 'Guatemala', 'Honduras',
               'Nicaragua', 'Panama', 'Paraguay', 'Peru',
              'Dominican Republic','Uruguay','Venezuela']
q=''#'FNAL-E-0690'
size=250
sleep=0.4
r = empty_json()

_la_countries=set(LA_countries)
# Control numbers of the experiments already present in `dbj_all`. Built
# once and extended after each experiment instead of being recomputed from
# the whole of `dbj_all` for every experiment.
_harvested={paper.get('control_number')
            for d in dbj_all for paper in d.get('papers') or []}

url=f'https://inspirehep.net/api/experiments?size={size}'
if q:
    url=f'{url}&q={q}'
NEXT=True
i=0
while NEXT:
    try:
        r=requests.get(url,timeout=timeout)
        time.sleep(sleep)
    except Exception:
        # `r` still holds the previous response (or the placeholder): flag
        # the failure so that this page yields no experiments.
        r.status_code=-1
    print(i,url,r.status_code)

    # Parse the body once. Experiments are only read from a 200 response,
    # but 'total' and 'links' are read from whatever body `r` carries.
    exps,total,next_url=[],0,None
    try:
        payload=r.json()
        hits=payload.get('hits') or {}
        if r.status_code==200:
            exps=hits.get('hits') or []
        total=hits.get('total') or 0
        next_url=(payload.get('links') or {}).get('next')
    except Exception:
        # Missing or malformed body: keep the defaults assigned so far;
        # without a next link the pagination stops below.
        pass

    for e in exps:
        metadata=e.get('metadata')
        if metadata.get('control_number') in _harvested:
            continue
        expr=experiment(metadata,db=[],size=250)
        db_ex=expr.get_authors()
        dbj=expr.to_json()
        dbj_all=update_json(dbj,dbj_all)
        _harvested.update(paper.get('control_number')
                          for d in dbj for paper in d.get('papers') or [])
        if _la_countries.intersection(d.country for d in db_ex):
            dbj_la=update_json(dbj,dbj_la)
        print(str(i).zfill(4),'→',metadata.get('control_number'),
                      str(len(db_ex)).zfill(4),str(len(dbj)).zfill(4),
                      str(len(dbj_all)).zfill(5),str(len(dbj_la)).zfill(5),end='\r')

    # Checkpoint after every page. The files were previously written a
    # second time at the end of the iteration with unchanged data.
    with open('inspire-ex.json','w') as f:
        json.dump(dbj_all,f)
    with open('inspire_LA-ex.json','w') as f:
        json.dump(dbj_la,f)

    if next_url:
        url=next_url
    else:
        NEXT=False
    #EMERGENCY EXIT: never request more pages than 'total' allows
    if i > total//size+1:
        NEXT=False
    i+=1