## arXiv API

In [6]:
import re
import itertools
import json
import sys
import os
import time
import traceback
from lxml import etree
import pandas as pd
import glob
import numpy as np

In [25]:
 import xml.etree.ElementTree as ET

### Getting to know the API

In [11]:
import urllib.request

# Smoke test: request the first two results for the query "all:quantum".
url = 'http://export.arxiv.org/api/query?search_query=all:quantum&start=0&max_results=2'

# The context manager closes the HTTP connection as soon as the body has been
# read; previously the response object was left open. `data` stays bound to
# the (now closed) response and `f` holds the decoded response text.
with urllib.request.urlopen(url) as data:
    f = data.read().decode('utf-8')
#print(f)

In [26]:
import urllib.request as libreq

# The arXiv API answers with an Atom feed, so its elements are namespaced.
# An unqualified ".//title" search matches no element in such a document,
# which is why the search has to be namespace-qualified.
ATOM_NS = {'atom': 'http://www.w3.org/2005/Atom'}

urlx = 'http://export.arxiv.org/api/query?search_query=all:quantum&start=0&max_results=1'

# `resp` (rather than `url`) so the URL string defined in the previous cell
# is not silently replaced by a response object.
with libreq.urlopen(urlx) as resp:
    response = resp.read()

# Parse the raw bytes and print every title in the feed
# (the feed's own title as well as each entry's title).
tree = ET.fromstring(response)
for docTitle in tree.findall(".//atom:title", ATOM_NS):
    print(docTitle.text)

In [29]:
#!pip install feedparser

### Systematic Parsing

python_arXiv_parsing_example.py

This sample script illustrates a basic `arXiv API` call
followed by parsing of the results using the
feedparser Python module.

Please see the documentation at 
http://export.arxiv.org/api_help/docs/user-manual.html
for more information, or email the arXiv api 
mailing list at arxiv-api@googlegroups.com.

urllib is included in the standard python library.
feedparser can be downloaded from http://feedparser.org/ .

*Author: Julius B. Lucks*

This is free software.  Feel free to do what you want
with it, but please play nice with the arXiv API!

https://static.arxiv.org/static/arxiv.marxdown/0.1/help/api/examples/python_arXiv_parsing_example.txt

In [35]:
import urllib
import feedparser

In [36]:
# Endpoint to which every arXiv API query string is appended
base_url = 'http://export.arxiv.org/api/query?'

# Search parameters
search_query = 'all:quantum'  # look for "quantum" in all fields
start = 0                     # index of the first result to return
max_results = 5               # number of results to retrieve

# Assemble the query string from the three parameters above
query = f'search_query={search_query}&start={start:d}&max_results={max_results:d}'


In [43]:
# Opensearch metadata such as totalResults, startIndex, 
# and itemsPerPage live in the opensearch namespace.
# Some entry metadata lives in the arXiv namespace.
# This is a hack to expose both of these namespaces in
# feedparser v4.1
#feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
#feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

In [39]:
# Imported here because a bare `import urllib` is not guaranteed to load the
# `urllib.request` submodule; without this the cell only works if an earlier
# cell happened to import it.
import urllib.request

# Perform a GET request using base_url and query. The context manager closes
# the connection once the body has been read.
with urllib.request.urlopen(base_url + query) as http_response:
    response = http_response.read()

# Parse the raw response using feedparser
feed = feedparser.parse(response)

#### Feed Information

In [40]:
# Show the feed-level title and last-updated timestamp
print(f'Feed title: {feed.feed.title}')
print(f'Feed last updated: {feed.feed.updated}')

Feed title: ArXiv Query: search_query=all:quantum&amp;id_list=&amp;start=0&amp;max_results=5
Feed last updated: 2021-05-27T00:00:00-04:00


#### Metadata

In [41]:
# Show the opensearch paging metadata reported for this query
print(f'totalResults for this query: {feed.feed.opensearch_totalresults}')
print(f'itemsPerPage for this query: {feed.feed.opensearch_itemsperpage}')
print(f'startIndex for this query: {feed.feed.opensearch_startindex}')

totalResults for this query: 324135
itemsPerPage for this query: 5
startIndex for this query: 0


#### Entry by entry

In [45]:
# Run through each entry, and print out information
for entry in feed.entries:
    print('e-print metadata')
    # The id is the abs-page URL; everything after '/abs/' is the arXiv id
    print('arxiv-id: %s' % entry.id.split('/abs/')[-1])
    print('Published: %s' % entry.published)
    print('Title:  %s' % entry.title)

    # entry.author holds a single author name (in the output recorded with
    # this notebook it is the last listed author, hence the label below).
    author_string = entry.author

    # Append the affiliation from <arxiv:affiliation> if present.
    # Only one affiliation is exposed this way, so it may not cover every
    # author of the paper.
    if hasattr(entry, 'arxiv_affiliation'):
        author_string += ' (%s)' % entry.arxiv_affiliation

    print('Last Author:  %s' % author_string)

    # Print the full author list when the parser exposes one; skipped
    # silently if `authors` (or an author's `name`) is missing.
    try:
        print('Authors:  %s' % ', '.join(author.name for author in entry.authors))
    except AttributeError:
        pass

    # Get the links to the abs page and pdf for this e-print.
    # .get() is used because a link without a `rel` or `title` key would
    # otherwise raise AttributeError and abort the whole loop.
    for link in entry.links:
        if link.get('rel') == 'alternate':
            print('abs page link: %s' % link.href)
        elif link.get('title') == 'pdf':
            print('pdf link: %s' % link.href)

    # The journal reference and comments live under the arxiv namespace and
    # are optional, so fall back to a placeholder when they are absent.
    journal_ref = getattr(entry, 'arxiv_journal_ref', 'No journal ref found')
    print('Journal reference: %s' % journal_ref)

    comment = getattr(entry, 'arxiv_comment', 'No comment found')
    print('Comments: %s' % comment)

    # Since the <arxiv:primary_category> element has no data, only
    # attributes, feedparser does not store anything inside
    # entry.arxiv_primary_category. As a workaround the first element of
    # entry.tags is taken as the primary category.
    print('Primary Category: %s' % entry.tags[0]['term'])

    # Collect all the categories
    all_categories = [t['term'] for t in entry.tags]
    print('All Categories: %s' % ', '.join(all_categories))

    # The abstract is in the <summary> element
    print('Abstract: %s' % entry.summary)

    print("----------------------------------------")

e-print metadata
arxiv-id: quant-ph/0201082v1
Published: 2002-01-18T15:08:05Z
Title:  Quantum Computers and Quantum Computer Languages: Quantum Assembly
  Language and Quantum C Language
Last Author:  Stephen Blaha
Authors:  Stephen Blaha
abs page link: http://arxiv.org/abs/quant-ph/0201082v1
pdf link: http://arxiv.org/pdf/quant-ph/0201082v1
Journal reference: No journal ref found
Comments: 32 pages
Primary Category: quant-ph
All Categories: quant-ph, cs.PL
Abstract: We show a representation of Quantum Computers defines Quantum Turing Machines
with associated Quantum Grammars. We then create examples of Quantum Grammars.
Lastly we develop an algebraic approach to high level Quantum Languages using
Quantum Assembly language and Quantum C language as examples.
----------------------------------------
e-print metadata
arxiv-id: quant-ph/0407102v1
Published: 2004-07-14T14:47:27Z
Title:  Quantum Networks for Generating Arbitrary Quantum States
Last Author:  Michele Mosca
Authors:  Phillip K

### Data Parser

In [48]:
def _entry_to_record(entry):
    """Flatten one parsed feed entry into a plain dict of metadata.

    Parameters
    ----------
    entry
        One element of ``feed.entries`` as produced by feedparser.

    Returns
    -------
    dict
        Keys, in insertion order: 'arxiv-id', 'published', 'title',
        'Authors' (only if an author list is available), 'abs page link' /
        'pdf link' (only if the matching link exists), 'Journal reference',
        'Comments', 'Primary Category', 'All Categories', 'Abstract'.
    """
    record = {
        # The id is the abs-page URL; everything after '/abs/' is the arXiv id
        'arxiv-id': entry.id.split('/abs/')[-1],
        'published': entry.published,
        'title': entry.title,
    }

    # Comma-separated author names; left out if `authors` (or an author's
    # `name`) is missing.
    try:
        record['Authors'] = ', '.join(author.name for author in entry.authors)
    except AttributeError:
        pass

    # Links to the abs page and pdf for this e-print. .get() is used because
    # a link without a `rel` or `title` key would otherwise raise
    # AttributeError and abort the whole run.
    for link in entry.links:
        if link.get('rel') == 'alternate':
            record['abs page link'] = link.href
        elif link.get('title') == 'pdf':
            record['pdf link'] = link.href

    # Optional fields from the arxiv namespace, with placeholder fallbacks
    record['Journal reference'] = getattr(entry, 'arxiv_journal_ref',
                                          'No journal ref found')
    record['Comments'] = getattr(entry, 'arxiv_comment', 'No comment found')

    # feedparser stores nothing for <arxiv:primary_category> (it has only
    # attributes), so the first tag is taken as the primary category.
    record['Primary Category'] = entry.tags[0]['term']
    record['All Categories'] = ', '.join(t['term'] for t in entry.tags)

    # The abstract is in the <summary> element
    record['Abstract'] = entry.summary
    return record


# One dict per entry; a list of dicts can be turned into a DataFrame in one go.
DATA = []
for entry in feed.entries:
    data = _entry_to_record(entry)

    print(data)
    DATA.append(data)

    print("----------------------------------------")

{'arxiv-id': 'quant-ph/0201082v1', 'published': '2002-01-18T15:08:05Z', 'title': 'Quantum Computers and Quantum Computer Languages: Quantum Assembly\n  Language and Quantum C Language', 'Authors': 'Stephen Blaha', 'abs page link': 'http://arxiv.org/abs/quant-ph/0201082v1', 'pdf link': 'http://arxiv.org/pdf/quant-ph/0201082v1', 'Journal reference': 'No journal ref found', 'Comments': '32 pages', 'Primary Category': 'quant-ph', 'All Categories': 'quant-ph, cs.PL', 'Abstract': 'We show a representation of Quantum Computers defines Quantum Turing Machines\nwith associated Quantum Grammars. We then create examples of Quantum Grammars.\nLastly we develop an algebraic approach to high level Quantum Languages using\nQuantum Assembly language and Quantum C language as examples.'}
----------------------------------------
{'arxiv-id': 'quant-ph/0407102v1', 'published': '2004-07-14T14:47:27Z', 'title': 'Quantum Networks for Generating Arbitrary Quantum States', 'Authors': 'Phillip Kaye, Michele Mo