This notebook contains an arxivRecord class, used to store the metadata associated with a given document on the arXiv.

In [2]:
class arxivRecord:
    """Metadata of a single document: identifier, title, date, authors,
    subjects and abstract.

    Instances can be built directly, from an ElementTree record element
    (``from_et``) or from a plain dictionary (``from_dict``), and converted
    back to a dictionary with ``to_dict``.
    """

    # prefix removed from the identifier found in the record header
    # (14 characters, the amount the original code sliced off)
    _OAI_PREFIX = 'oai:arXiv.org:'
    # XML namespace of the Dublin Core metadata elements
    _DC_NS = '{http://purl.org/dc/elements/1.1/}'

    def __init__(self, identifier='ISBN 0-399-14986-4', title="Pattern Recognition",
                 date='2003-02-03', authors=None, subjects=None,
                 abstract=("Set in August and September 2002, the story follows Cayce Pollard, \n"
                           "                 a 32-year-old marketing consultant who has a "
                           "psychological sensitivity to corporate symbols.")):
        """Store the given metadata on the instance.

        ``authors`` and ``subjects`` default to ``['William Gibson']`` and
        ``['Sci-Fi']``. A fresh list is created for every instance so that
        mutating one record's lists never leaks into another record.
        """
        self.identifier = identifier
        self.title = title
        self.date = date
        self.subjects = ['Sci-Fi'] if subjects is None else subjects
        self.authors = ['William Gibson'] if authors is None else authors
        self.abstract = abstract

    @staticmethod
    def _clean_up(s):
        """Collapse every run of whitespace (newlines included) in ``s`` into
        a single space and strip both ends; ``None`` becomes ``''``."""
        return ' '.join((s or '').split())

    # instantiates a new arxivRecord from an ElementTree retrieved from arxiv.org
    @classmethod
    def from_et(cls, e):
        """Build a record from an ElementTree record element.

        ``e[0]`` must be the header (its first child holds the identifier) and
        ``e[1][0]`` the element whose children are the Dublin Core fields.

        Raises ``IndexError`` when the metadata element or a title,
        description or date field is missing.
        """
        # we get the ETs corresponding to header and metadata
        header = e[0]
        metadata = e[1][0]

        # strip the OAI prefix only when it is really there, instead of
        # blindly dropping the first 14 characters
        raw_identifier = header[0].text or ''
        if raw_identifier.lower().startswith(cls._OAI_PREFIX.lower()):
            identifier = raw_identifier[len(cls._OAI_PREFIX):]
        else:
            identifier = raw_identifier

        def texts(name):
            # text of every Dublin Core child called `name`, in document order
            tag = cls._DC_NS + name
            return [child.text for child in metadata if child.tag == tag]

        # we now read each of the metadata fields; for the single-valued ones
        # the first occurrence is used
        title = cls._clean_up(texts('title')[0])
        abstract = cls._clean_up(texts('description')[0])
        date = texts('date')[0]
        authors = texts('creator')

        # keep only the arXiv categories corresponding to arXiv math subjects
        subjects = [s for s in texts('subject') if s and 'Mathematics - ' in s]

        return cls(identifier=identifier, title=title, date=date,
                   authors=authors, abstract=abstract, subjects=subjects)

    def to_dict(self):
        """Return the six metadata fields as a plain dictionary."""
        return {'identifier': self.identifier, 'title': self.title, 'date': self.date,
                'authors': self.authors, 'subjects': self.subjects, 'abstract': self.abstract}

    @classmethod
    def from_dict(cls, d):
        """Build a record from a dictionary shaped like the result of
        ``to_dict``. Raises ``KeyError`` when one of the six keys is missing."""
        return cls(identifier=d['identifier'], title=d['title'], date=d['date'],
                   authors=d['authors'], subjects=d['subjects'], abstract=d['abstract'])
        

In [12]:
import xml.etree.ElementTree as ET
import requests
import time


# returns a list of arxiv records in mathematics, starting with records from a given year and younger,
# to a given maximum of records
def get_arxiv_records(since = 2019, maximum = 100):
    """Harvest arXiv mathematics records through the OAI-PMH interface.

    Parameters:
        since: year; the request asks for records from ``<since>-01-01`` on.
        maximum: upper bound on the number of records returned.

    Returns:
        A list of at most ``maximum`` arxivRecord objects.

    Raises:
        requests.HTTPError: the server answered with an HTTP error status.
        RuntimeError: the response carries an OAI error other than
            ``noRecordsMatch`` or has no ``ListRecords`` element.

    The function sleeps 11 seconds before every request, so each page of
    results costs at least that long.
    """
    oai_ns = '{http://www.openarchives.org/OAI/2.0/}'
    base_url = "http://export.arxiv.org/oai2"

    result = []
    resumption_token = ''

    # toggles debug mode
    DEBUG_MODE = True

    # in the loop, the code fetches a page of records and then checks whether it ends with a "resumption token"
    # which means that the returned list is incomplete, in which case another request is made
    while True:
        time.sleep(11)
        if resumption_token == '':
            params = {'verb': 'ListRecords', 'from': str(since) + "-01-01",
                      'metadataPrefix': 'oai_dc', 'set': 'math'}
        else:
            params = {'verb': 'ListRecords', 'resumptionToken': resumption_token}

        response = requests.get(base_url, params=params)
        # fail loudly on an HTTP error instead of trying to parse an error page as XML
        response.raise_for_status()
        # parse the raw bytes so the XML parser applies the encoding declared
        # in the document rather than the one guessed from the HTTP headers
        parsed = ET.fromstring(response.content)

        list_records = parsed.find(oai_ns + 'ListRecords')
        if list_records is None:
            error = parsed.find(oai_ns + 'error')
            if error is not None and error.get('code') == 'noRecordsMatch':
                # nothing matches the query: not a failure, just no more data
                break
            if error is not None:
                raise RuntimeError("OAI error " + str(error.get('code')) + ": " + str(error.text))
            raise RuntimeError("No ListRecords element in the response.")

        # select the records by tag: the resumption token element is not
        # always present, so "everything but the last child" can drop a record
        for record in list_records.findall(oai_ns + 'record'):
            # a record without a metadata element (e.g. a deleted record)
            # has nothing to store
            if record.find(oai_ns + 'metadata') is None:
                continue
            result.append(arxivRecord.from_et(record))

        token_element = list_records.find(oai_ns + 'resumptionToken')
        if token_element is not None and token_element.text:
            resumption_token = token_element.text.strip()
        else:
            resumption_token = ''

        # stop as soon as enough records are held; no need for one more page
        if len(result) >= maximum:
            break
        if DEBUG_MODE:
            print("Currently " + str(len(result)) + " records downloaded.")
        if resumption_token == '':
            break
    return result[:maximum]

Here's an example of using the get_arxiv_records() function to fetch some data from arxiv.org. 

In [4]:
# fetch at most 10 records with the default starting year and turn each one into a plain dict
some_ar = [ar.to_dict() for ar in get_arxiv_records(maximum = 10)]

In [None]:
import json

# download up to 100000 records starting from 1990 and convert them to dicts;
# the download completes before the output file is opened
records = [ar.to_dict() for ar in get_arxiv_records(since=1990, maximum = 100000)]
# write the whole list of dicts to disk as a single JSON document
with open('arxiv_database.json', 'w') as json_file:  
    json.dump(records, json_file)

Currently 1000 records downloaded.
Currently 2000 records downloaded.
Currently 3000 records downloaded.
Currently 4000 records downloaded.
Currently 5000 records downloaded.
Currently 6000 records downloaded.


In [6]:
# read the JSON file written above back into memory
with open('arxiv_database.json', 'r') as json_file:
    loaded_list = json.load(json_file)

In [7]:
# display the loaded data (a list of dictionaries, one per record)
loaded_list

[{'identifier': '0705.1265',
  'title': "A noncommutative Bohnenblust-Spitzer identity for Rota-Baxter algebras  solves Bogoliubov's recursion",
  'date': '2007-05-09',
  'authors': ['Ebrahimi-Fard, Kurusch',
   'Manchon, Dominique',
   'Patras, Frederic'],
  'subjects': ['Mathematics - Combinatorics',
   'High Energy Physics - Theory',
   'Mathematical Physics',
   'Mathematics - Rings and Algebras'],
  'abstract': "The Bogoliubov recursion is a particular procedure appearing in the process of renormalization in perturbative quantum field theory. It provides convergent expressions for otherwise divergent integrals. We develop here a theory of functional identities for noncommutative Rota-Baxter algebras which is shown to encode, among others, this process in the context of Connes-Kreimer's Hopf algebra of renormalization. Our results generalize the seminal Cartier-Rota theory of classical Spitzer-type identities for commutative Rota-Baxter algebras. In the classical, commutative, case

In [11]:
# rebuild an arxivRecord object from the first loaded dictionary
sample = arxivRecord.from_dict(loaded_list[0])

In [12]:
# convert the rebuilt record back to a dictionary and display it
sample.to_dict()

{'identifier': '0705.1265',
 'title': "A noncommutative Bohnenblust-Spitzer identity for Rota-Baxter algebras  solves Bogoliubov's recursion",
 'date': '2007-05-09',
 'authors': ['Ebrahimi-Fard, Kurusch',
  'Manchon, Dominique',
  'Patras, Frederic'],
 'subjects': ['Mathematics - Combinatorics',
  'High Energy Physics - Theory',
  'Mathematical Physics',
  'Mathematics - Rings and Algebras'],
 'abstract': "The Bogoliubov recursion is a particular procedure appearing in the process of renormalization in perturbative quantum field theory. It provides convergent expressions for otherwise divergent integrals. We develop here a theory of functional identities for noncommutative Rota-Baxter algebras which is shown to encode, among others, this process in the context of Connes-Kreimer's Hopf algebra of renormalization. Our results generalize the seminal Cartier-Rota theory of classical Spitzer-type identities for commutative Rota-Baxter algebras. In the classical, commutative, case, these ide