# Inspire
#### [Inspire Schemas](http://inspire-schemas.readthedocs.io/en/latest/schemas/records/)

## 1. Initialize

### 1.1 Import packages

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import sys
#import json
import requests
import pickle
import pandas as pd
import random as rnd
from dateutil import parser
#
import Inspiretools as inspire

#### 1.1.1 Parallel (skip if no parallel engines are running; uses ipyparallel)

In [2]:
# Parallel module. Before using it, launch the engines from the Anaconda prompt
# with the command
# ipcluster start -n 4
# See the guide at https://ipyparallel.readthedocs.io/en/latest/intro.html
import ipyparallel as ipp

### 1.2 Initialize the parallel engines (Ignore if there are no running parallel engines. Uses ipyparallel)

In [3]:
# Connect to the running ipyparallel cluster and build a view over every engine.
c = ipp.Client()
dview = c[:]
# Report the number of registered engines. `len(c.ids)` is used instead of
# `max(c.ids)+1` because the latter is only correct when the engine ids happen
# to be exactly 0..N-1.
print("Running " + str(len(c.ids)) + " engines")

Running 4 engines


            Controller appears to be listening on localhost, but not on this machine.
            If this is true, you should specify Client(...,sshserver='you@host.riccardotorre.com')
            or instruct your controller to listen on an external IP.


In [4]:
# Run the same imports locally and on every engine of `dview`, so that functions
# shipped to the engines can resolve these module names remotely.
# NOTE(review): the engine log printed by this cell reports "importing pandas" /
# "importing random" rather than the `pd` / `rnd` aliases — confirm whether the
# aliases are actually defined on the engines before relying on them remotely.
with dview.sync_imports():
    import requests
    import pickle
    import pandas as pd
    import random as rnd
    from dateutil import parser

importing requests on engine(s)
importing pickle on engine(s)
importing pandas on engine(s)
importing random on engine(s)
importing parser from dateutil on engine(s)


### 1.3 Define styles and options for Pandas

In [5]:
# Raise the pandas display limits so that long cell contents, wide frames and
# long frames are shown in full instead of being truncated.
pd.set_option("display.max_colwidth", 10000)
pd.set_option("display.width", 10000)
pd.set_option("display.max_rows", 10000)
pd.set_option("display.max_columns", 10000)

# CSS rules handed to Styler.set_table_styles(): hide the index column,
# left-align headers and cells, and emphasise the table caption.
mytablestyle = [
    dict(selector=".row_heading, .blank", props=[('display', 'none;')]),
    dict(selector=".col_heading, .blank", props=[("text-align", "left")]),
    dict(selector="td", props=[("text-align", "left")]),
    dict(selector="caption", props=[("font-size", "120%"),
                                    ("font-weight", "bold"),
                                    ("text-decoration", "underline")]),
]

### 1.4 Define basic functions, repository, etc.

In [6]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list.

    Only one level of nesting is removed.
    """
    flat = []
    for sublist in l:
        for item in sublist:
            flat.append(item)
    return flat
def save_object(obj, filename, mode):
    """Pickle ``obj`` into ``filename`` using the highest pickle protocol.

    ``mode`` is passed straight to ``open``; with ``'wb'`` an existing file is
    overwritten.
    """
    with open(filename, mode) as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)
def load_object(filename):
    """Return the first object unpickled from the binary file ``filename``."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(filename, mode='rb') as source:
        restored = pickle.load(source)
    return restored
def StringToDate(date):
    """Convert a date into an approximate fractional year.

    Parameters
    ----------
    date : object or None
        Anything whose ``str()`` form ``dateutil.parser.parse`` understands.
        Parsing is done with ``dayfirst=True``.

    Returns
    -------
    float or None
        ``year + (month - 1)/12 + (day - 1)/365``, or ``None`` when ``date``
        is ``None``. This is an approximation: every month counts as 1/12 of a
        year and every day as 1/365, regardless of month length or leap years.

    Raises
    ------
    Whatever ``parser.parse`` raises for text it cannot interpret.
    """
    # Identity test: `== None` can be hijacked by objects overriding __eq__.
    if date is None:
        return None
    parsed = parser.parse(str(date), dayfirst=True)
    return parsed.year + (parsed.month - 1) / 12 + (parsed.day - 1) / 365

class ListTable(list):
    """A list of rows (e.g. ``[[1, 2, 3], [4, 5, 6]]``) that IPython renders
    as an HTML table through the ``_repr_html_`` hook."""

    def _repr_html_(self):
        """Return the table markup: one ``<tr>`` per row, one ``<td>`` per cell."""
        rendered_rows = (
            "<tr>" + "".join("<td>{0}</td>".format(cell) for cell in row) + "</tr>"
            for row in self
        )
        return "<table>" + "".join(rendered_rows) + "</table>"
    
# Host serving the INSPIRE literature API, and the two URL prefixes built on it:
# `urlrecords` for single records, `urlsearchbase` for most-recent-first searches.
repository = "labs.inspirehep.net"
urlrecords = "https://{0}/api/literature/".format(repository)
urlsearchbase = "https://{0}/api/literature?sort=mostrecent".format(repository)

def urlrecord(num):
    """Return the API URL of record ``num`` (``urlrecords`` followed by ``str(num)``)."""
    return "".join((urlrecords, str(num)))
def datajson(i):
    """Download record ``i`` and return its decoded JSON body.

    Best effort: any exception (building the URL, the request itself, or the
    JSON decoding) is swallowed and ``0`` is returned instead.
    """
    try:
        response = requests.get(urlrecord(i))
        with response:
            payload = response.json()
    except Exception:
        return 0
    return payload
# THIS FUNCTION CAN RUN IN PARALLEL
def paralleldatajson(i):
    """Self-contained counterpart of ``datajson`` for use on parallel engines.

    The record URL is rebuilt locally rather than read from notebook-level
    names, so the only outside name needed is ``requests``. Returns the decoded
    JSON of record ``i``, or ``0`` if anything goes wrong.
    """
    records_root = "https://" + "labs.inspirehep.net" + "/api/literature/"
    try:
        with requests.get(records_root + str(i)) as response:
            return response.json()
    except Exception:
        return 0
def importjsonrecords(i):
    """Fetch, in one request, the records whose recid lies in ``[i, i + 199)``.

    Returns the list of hits sorted by their ``"id"`` field, or ``None`` if the
    request or the decoding of the answer fails.
    """
    bunchsize = 199
    size = bunchsize + 11  # requested page size, slightly above the recid window
    urlsearch = "".join([
        urlrecords,
        "?page=1&size=", str(size),
        "&cc=literature&q=recid%20%3E%3D%20", str(i),
        "%20and%20recid%20%3C", str(i + bunchsize),
    ])
    try:
        with requests.get(urlsearch) as response:
            hits = response.json()["hits"]["hits"]
            return sorted(hits, key=lambda hit: hit["id"])
    except Exception:
        return None
def importjsonurl(urlimport, headervalue = None):
    """GET ``urlimport`` (optionally with ``headervalue`` as request headers)
    and return the ``["hits"]["hits"]`` list of the JSON answer.

    The hits are returned in the order delivered by the server. Any failure
    yields ``None``.
    """
    try:
        with requests.get(urlimport, headers=headervalue) as response:
            return response.json()["hits"]["hits"]
    except Exception:
        return None
# Sample search URL: the single most recent literature record matching
# "a torre, r,". Not referenced by the other cells visible in this notebook.
urlprova = 'https://labs.inspirehep.net/api/literature?sort=mostrecent&size=1&q=a+torre,+r,'

### 1.5 Define search function

In [7]:
def importjsonsearch(string, n, parallel = 0):
    """Search the literature API and return up to ``n`` matching record ids.

    Parameters
    ----------
    string : str
        Search query. Whitespace-separated words are joined with ``+``; no
        other URL escaping is applied.
    n : int
        Maximum number of ids wanted. If fewer records match, a message is
        printed and all matching ids are returned.
    parallel : int, optional
        ``1`` downloads the result pages through the notebook-level ``dview``
        (ipyparallel); any other value downloads them serially.

    Returns
    -------
    list
        Record ids sorted in decreasing order and truncated to the requested
        number. Empty when nothing matched or when the initial request failed
        (a failure is reported as "did not match any result").
    """
    bunchsize = 250  # number of records requested per results page
    headervalue = {"Accept": "application/vnd+inspire.record.ui+json"}
    searchurl = urlsearchbase + "&q=" + "+".join(string.split())

    # Probe with a one-hit request just to learn how many records match.
    try:
        with requests.get(searchurl + "&size=1", headers=headervalue) as url:
            existing = url.json()["hits"]["total"]
    except Exception:
        existing = 0
    if existing == 0:
        print("The search did not match any result.")
        return []

    if n > existing:
        print(str(existing)+ " records found. This is  less than " + str(n) + ". Search reduced to " + str(existing) + " records.")
        number = existing
    else:
        number = n

    # Ceiling division in pure integer arithmetic: pages needed to cover `number`.
    split = -(-number // bunchsize)
    # `searchurl` already carries the query, so only paging parameters are added.
    # The page parameter must be written "page=<i>"; without the "=" every
    # request would fall back to the same default page.
    pages = [searchurl + "&page=" + str(i) + "&size=" + str(bunchsize)
             for i in range(1, split + 1)]

    def importpage(page):
        """Return the ids listed on one results page ([] if the request fails)."""
        try:
            with requests.get(page, headers=headervalue) as url:
                return [hit["id"] for hit in url.json()["hits"]["hits"]]
        except Exception:
            return []

    if parallel == 1:
        print('Parallel mode')
        batches = dview.map_sync(importpage, pages)
    else:
        print('Serial mode')
        batches = map(importpage, pages)

    # Flatten in linear time (sum(list_of_lists, []) re-copies the accumulator).
    out = [recid for batch in batches for recid in batch]
    return sorted(out, reverse=True)[:number]

## 2. Import and save records

### 2.1 Make search and load recids

In [8]:
%%time
# Serial search, capped at 23 record ids.
recids = importjsonsearch("a torre, r and -cn:**", n=23)
print(len(recids))

Serial mode
23
CPU times: user 39.8 ms, sys: 12.6 ms, total: 52.5 ms
Wall time: 1.03 s


In [9]:
%%time
# Same query through the parallel engines, asking for more ids than exist.
recids = importjsonsearch("a torre, r and -cn:**", n=122, parallel=1)
print(len(recids))

47 records found. This is  less than 122. Search reduced to 47 records.
Parallel mode
47
CPU times: user 21.7 ms, sys: 5.55 ms, total: 27.3 ms
Wall time: 562 ms


In [10]:
%%time
# Broader query, serial download.
recids = importjsonsearch("a torre, r", n=1200)
print(len(recids))

53 records found. This is  less than 1200. Search reduced to 53 records.
Serial mode
53
CPU times: user 37.5 ms, sys: 7.57 ms, total: 45 ms
Wall time: 537 ms


In [11]:
%%time
# Broader query, parallel download.
recids = importjsonsearch("a torre, r", n=1200, parallel=1)
print(len(recids))

53 records found. This is  less than 1200. Search reduced to 53 records.
Parallel mode
53
CPU times: user 18.3 ms, sys: 5.17 ms, total: 23.5 ms
Wall time: 559 ms


In [12]:
# Final id list used by the rest of the notebook.
recids = importjsonsearch("r.torre.1", n=100, parallel=1)
print(len(recids))
print(recids)

48 records found. This is  less than 100. Search reduced to 48 records.
Parallel mode
48
[1718784, 1718163, 1713706, 1713705, 1713704, 1713703, 1710078, 1710056, 1677568, 1664629, 1637603, 1624179, 1591995, 1518447, 1512122, 1488288, 1469453, 1467223, 1449991, 1409951, 1409747, 1389171, 1380186, 1374543, 1373918, 1343319, 1343133, 1295914, 1294778, 1281686, 1252848, 1246166, 1223633, 1207605, 1201959, 1121023, 1111661, 1094573, 945500, 940384, 926294, 892770, 892720, 866732, 856300, 856011, 843274, 836568]


### 2.2 Import corresponding records (serial and parallel)

In [22]:
%%time
# Serial download, one request per record id; slow when there are many large records.
recordsser = [datajson(recid) for recid in recids]

CPU times: user 986 ms, sys: 131 ms, total: 1.12 s
Wall time: 11.4 s


In [14]:
%%time
# Same download, spread over the engines behind `dview`.
recordspar = [record for record in dview.map_sync(paralleldatajson, recids)]

CPU times: user 223 ms, sys: 32 ms, total: 255 ms
Wall time: 4.48 s


In [23]:
# Number of record ids that were downloaded above.
len(recids)

48

In [24]:
# Check that the serial and the parallel downloads produced identical lists.
recordsser == recordspar

True

In [25]:
# Keep a single copy of the downloaded records and release the two working names.
records = recordspar
del recordspar, recordsser

In [26]:
# Map each record id to its downloaded record wrapped in a HEPObject
# (`records` was built from `recids`, so the two lists line up one-to-one).
data = {recid: inspire.HEP.HEPObject(record) for recid, record in zip(recids, records)}

In [27]:
# Show the wrapper object stored for the first record id.
data[recids[0]]

<Inspiretools.HEP.HEPObject at 0x116f94828>

### 2.3. Save and load data

In [28]:
%%time
# Persist the id -> HEPObject mapping to disk ('wb' replaces any previous file).
save_object(data, filename="data.pkl", mode='wb')

CPU times: user 112 ms, sys: 27.6 ms, total: 140 ms
Wall time: 154 ms


In [29]:
%%time
# Reload the mapping written by the previous cell.
data = load_object(filename='data.pkl')

CPU times: user 137 ms, sys: 14.5 ms, total: 152 ms
Wall time: 151 ms


In [31]:
# Record ids available in the loaded mapping (iterating a dict yields its keys).
existingrecordsIDs = list(data)
print(existingrecordsIDs)

[1718784, 1718163, 1713706, 1713705, 1713704, 1713703, 1710078, 1710056, 1677568, 1664629, 1637603, 1624179, 1591995, 1518447, 1512122, 1488288, 1469453, 1467223, 1449991, 1409951, 1409747, 1389171, 1380186, 1374543, 1373918, 1343319, 1343133, 1295914, 1294778, 1281686, 1252848, 1246166, 1223633, 1207605, 1201959, 1121023, 1111661, 1094573, 945500, 940384, 926294, 892770, 892720, 866732, 856300, 856011, 843274, 836568]


## 3. Visualize (and understand) data

In [32]:
# Examples (see the Help section below for the full schema)
rec = recids[1]
data[rec].Recid()
data[rec].Collections()
data[rec].Abstract()
[[eprint.Categories(), eprint.PrimaryCategory(), eprint.Value()]
 for eprint in data[rec].ArXivEprints()]

1718163

['Literature', 'Fermilab']

'The discovery of the Higgs boson in 2012, by the ATLAS and CMS experiments, was a success achieved with only a percent of the entire dataset foreseen for the LHC. It opened a landscape of possibilities in the study of Higgs boson properties, Electroweak Symmetry breaking and the Standard Model in general, as well as new avenues in probing new physics beyond the Standard Model. Six years after the discovery, with a conspicuously larger dataset collected during LHC Run 2 at a 13 TeV centre-of-mass energy, the theory and experimental particle physics communities have started a meticulous exploration of the potential for precision measurements of its properties. This includes studies of Higgs boson production and decays processes, the search for rare decays and production modes, high energy observables, and searches for an extended electroweak symmetry breaking sector. This report summarises the potential reach and opportunities in Higgs physics during the High Luminosity phase of the LHC

[[['hep-ph', 'hep-ex'], 'hep-ph', '1902.00134']]

In [37]:
# Collections of the selected record, wrapped in a one-element list.
# (A stray trailing "s" after the closing bracket made this cell a SyntaxError.)
[data[rec].Collections()]

[['Literature', 'Fermilab']]

In [40]:
# JSON schema reference of the selected record, wrapped in a one-element list.
[data[rec].JSONSchema()]

['https://labs.inspirehep.net/schemas/records/hep.json']

In [None]:
# Dump every field of one record as a sequence of styled tables.
# To inspect another record change `rec` (e.g. rec = 1469453, rec = 1637603,
# rec = existingrecordsIDs[6]) or loop over list(data.keys()).
rec = recids[1]
_hep = data[rec]


def _table(rows, columns, caption=None):
    """Return a Styler for DataFrame(rows, columns=columns) using `mytablestyle`,
    with `caption` set when one is given."""
    styler = pd.DataFrame(rows, columns=columns).style
    if caption is not None:
        styler = styler.set_caption(caption)
    return styler.set_table_styles(mytablestyle)


# NOTE(review): the single-column tables below wrap the accessor result in a
# one-row list. If an accessor returns a list with more than one element the
# DataFrame constructor raises (row width != number of columns) — confirm the
# return types of DocumentType(), Languages(), etc. against the library.
_table([_hep.Recid()], ["Recid"])
_table([_hep.Link()], ["Link"])
_table([_hep.JSONSchema()], ["Schema"])
# Collections() returns a list (e.g. ['Literature', 'Fermilab']), so it is passed
# as the rows themselves — one collection per row — instead of as a single row.
_table(_hep.Collections(), ["Collections:"])
# NOTE(review): `Ientifier` looks like a misspelling of `Identifier`; kept as
# written because the library's method name cannot be checked here — confirm.
_table([[entry.Date(),
         entry.Expert(),
         entry.Ientifier(),
         entry.Status()] for entry in _hep.DesyBookkeeping()],
       ["Date", "Expert", "Identifier", "Status"], "Desy Bookkeeping")
# NOTE(review): the "Legacy Creation Date" row reads EarliestDate(), exactly like
# the row above it — presumably a dedicated accessor was intended; confirm.
_table([[label, value, StringToDate(value)] for label, value in [
            ("Date Created", _hep.DateCreated()),
            ("Date Updated", _hep.DateUpdated()),
            ("Earliest Date", _hep.EarliestDate()),
            ("Legacy Creation Date", _hep.EarliestDate()),
            ("Preprint Date", _hep.PreprintDate())]],
       ["Date", "String", "Float"], "Dates")
_table([[item.Source(), item.Value()] for item in _hep.Abstracts()],
       ["Source", "Value"], "Abstracts")
_table([_hep.Abstract()], ["Abstract"])
_table([[exp.Recid(),
         exp.Record().JSONReference(),
         exp.Accelerator(),
         exp.CuratedRelation(),
         exp.Experiment(),
         exp.Institution(),
         exp.LegacyName()] for exp in _hep.AcceleratorExperiments()],
       ["Recid", "Record", "Accelerator", "CuratedRelation",
        "Experiment", "Institution", "LegacyName"], "Experiments")
# AcquisitionSource() is a single object, hence the one-element list.
_table([[src.DateTime(),
         src.Email(),
         src.InternalUID(),
         src.Method(),
         src.ORCID(),
         src.Source(),
         src.SubmissionNumber()] for src in [_hep.AcquisitionSource()]],
       ["DateTime", "Email", "InternalUID", "Method", "ORCID",
        "Source", "SubmissionNumber"], "Acquisition Source")
_table([[eprint.Categories(), eprint.PrimaryCategory(), eprint.Value()]
        for eprint in _hep.ArXivEprints()],
       ["Categories", "Primary Category", "Value"], "Arxiv Eprints")
_table([_hep.AuthorsCount()], ["Authors Count"])
_table([[author.Recid(),
         author.Record().JSONReference(),
         [[ident.Schema(), ident.Value()] for ident in author.IDs()],
         author.FullName(),
         author.AlternativeNames(),
         [[aff.Record().JSONReference(), aff.Value(), aff.CuratedRelation()]
          for aff in author.Affiliations()],
         author.Emails(),
         author.RawAffiliations(),
         author.CreditRoles(),
         author.CuratedRelation(),
         author.InspireRoles(),
         author.SignatureBlock(),
         author.UUID()] for author in _hep.Authors()],
       ["Recid", "Record", str(["Schema", "Value"]), "FullName",
        "AlternativeNames", str(["Record", "Value", "CuratedRelation"]),
        "Emails", "RawAffiliations", "CreditRoles", "CuratedRelation",
        "InspireRoles", "SignatureBlock", "UUID"], "Authors")
_table([[series.Title(), series.Volume()] for series in _hep.BookSeries()],
       ["Title", "Volume"], "Book Series")
_table([_hep.CiteableFlag()], ["Citeable"])
_table([[collab.Recid(), collab.Record().JSONReference(), collab.Value()]
        for collab in _hep.Collaborations()],
       ["Recid", "Record", "Value"], "Collaboration")
_table([_hep.ControlNumber()], ["Control Number"])
_table([_hep.Core()], ["Core"])
_table([_hep.CorporateAuthor()], ["CorporateAuthor"])
_table([_hep.Curated()], ["Curated"])
_table([_hep.Deleted()], ["Deleted"])
_table([deleted.JSONReference() for deleted in _hep.DeletedRecords()], ["Deleted Records"])
_table([_hep.DocumentType()], ["Document Type"])
_table([[doc.Description(),
         doc.FullText(),
         doc.Hidden(),
         doc.Key(),
         doc.Material(),
         doc.OriginalURL(),
         doc.Source(),
         doc.URL()] for doc in _hep.Documents()],
       ["Description", "FullText", "Hidden", "Key",
        "Material", "OriginalURL", "Source", "URL"], "Documents")
_table([[doi.Material(), doi.Source(), doi.Value()] for doi in _hep.DOIs()],
       ["Material", "Source", "Value"], "DOIs")
_table(_hep.Editions(), ["Editions"])
_table(_hep.EnergyRanges(), ["Energy Range"])
_table([[ext.Schema(), ext.Value()] for ext in _hep.ExternalSystemIdentifiers()],
       ["Schema", "Value"], "External Identifiers")
_table(_hep.FacetInspireDocType(), ["Doc type"])
_table([[fig.Caption(),
         fig.Key(),
         fig.Label(),
         fig.Material(),
         fig.Source(),
         fig.URL()] for fig in _hep.Figures()],
       ["Caption", "Key", "Label", "Material", "Source", "URL"], "Figures")
_table([_hep.FiguresCount()], ["Figures Count"])
_table([[fund.Agency(), fund.GrantNumber(), fund.ProjectNumber()]
        for fund in _hep.FundingInfo()],
       ["Agency", "Grant Number", "Project Number"], "Funding Info")
# Caption corrected from the misspelled "Imptints".
_table([[imprint.Date(), imprint.Place(), imprint.Publisher()]
        for imprint in _hep.Imprints()],
       ["Date", "Place", "Publisher"], "Imprints")
_table([[cat.Source(), cat.Term()] for cat in _hep.InspireCategories()],
       ["Source", "Term"], "Inspire Categories")
_table([[isbn.Medium(), isbn.Value()] for isbn in _hep.ISBNs()],
       ["Medium", "Value"], "ISBNs")
_table([[kw.Schema(), kw.Source(), kw.Value()] for kw in _hep.Keywords()],
       ["Schema", "Source", "Value"], "Keywords")
_table([_hep.Languages()], ["Languages"])
_table([[lic.Imposing(), lic.License(), lic.Material(), lic.URL()]
        for lic in _hep.Licenses()],
       ["Imposing", "License", "Material", "URL"], "Licenses")
_table([_hep.NewRecord().JSONReference()], ["New Record"])
# Column header corrected: it had been copy-pasted as "New Record".
_table([_hep.PagesCount()], ["Pages Count"])
_table([[pid.Material(), pid.Schema(), pid.Source(), pid.Value()]
        for pid in _hep.PersistentIdentifiers()],
       ["Material", "Schema", "Source", "Value"], "Persistent Identifier")
_table([[pub.ArtID(),
         pub.Cnum(),
         pub.ConfAcronym(),
         pub.ConferenceRecord().JSONReference(),
         pub.CuratedRelation(),
         pub.Hidden(),
         pub.JournalIssue(),
         pub.JournalTitle(),
         pub.JournalVolume(),
         pub.Material(),
         pub.PageStart(),
         pub.PageEnd(),
         pub.ParentISBN(),
         pub.ParentRecord().JSONReference(),
         pub.ParentReportNumber(),
         pub.PubInfoFreeText(),
         pub.Year()] for pub in _hep.PublicationInfo()],
       ["ArtID", "Conf Num", "Conf Acronym", "Conf Record",
        "Curated Relation", "Hidden", "Journal Issue", "Journal Title",
        "Journal Volume", "Material", "Page Start", "Page End",
        "Parent ISBN", "Parent Record", "ParentReportNumber",
        "PubInfoFreeText", "Year"], "Publication Info")
_table([_hep.Refereed()], ["Refereed"])
# Only part of each reference is shown; 'etc' is a literal placeholder cell.
_table([[ref.Record().JSONReference(),
         ref.LegacyCurated(),
         [[raw.Schema(), raw.Source(), raw.Value()] for raw in ref.RawRefs()],
         ref.Reference().ArXivEprint(),
         ref.Reference().Collaborations(),
         'etc'] for ref in _hep.References()],
       ["Record", "Legacy Curated", "Raw Refs", "ArXiv", "Collaborations", "Etc."],
       "References")

## 4. Help (Inspiretools manual)

In [58]:
# Print the built-in documentation of the HEP class from the Inspiretools module.
help(inspire.HEP)

Help on class HEP in module inspiretools:

class HEP(builtins.object)
 |  Class that defines all Objects and Attributes of HEP records
 |  
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  AcceleratorExperimentObject = <class 'inspiretools.HEP.AcceleratorExpe...
 |      Class containing the object HEP.AcceleratorExperimentObject defined by the
 |      HEP.HEPObject.Experiments method of the HEP.HEPObject class
 |      Inherits the Parent Classes
 |      HEP.InitParent (indirectly)
 |      HEP.CuratedRelationParent
 |      HEP.RecidParent
 |      HEP.RecordParent
 |      Contains the methods
 |      HEP.AcceleratorExperimentObject.Accelerator
 |      HEP.AcceleratorExperimentObject.CuratedRelation
 |      HEP.Accelerato