## arXiv API

In [6]:
import re
import itertools
import json
import sys
import os
import time
import traceback
from lxml import etree
import pandas as pd
import glob
import numpy as np

In [25]:
 import xml.etree.ElementTree as ET

In [29]:
#!pip install feedparser

### Systematic Parsing

python_arXiv_parsing_example.py

This sample script illustrates a basic ```arXiv api``` call
followed by parsing of the results using the 
feedparser python module.

Please see the documentation at 
http://export.arxiv.org/api_help/docs/user-manual.html
for more information, or email the arXiv api 
mailing list at arxiv-api@googlegroups.com.

urllib is included in the standard python library.
feedparser can be downloaded from http://feedparser.org/ .

*Author: Julius B. Lucks*

This is free software.  Feel free to do what you want
with it, but please play nice with the arXiv API!

https://static.arxiv.org/static/arxiv.marxdown/0.1/help/api/examples/python_arXiv_parsing_example.txt

In [83]:
import urllib
import urllib.parse
import urllib.request

import feedparser

In [84]:
# Base URL of the arXiv API query endpoint
base_url = 'http://export.arxiv.org/api/query?'

# Search parameters
search_query = 'all:quantum'  # match "quantum" in all fields
start = 0                     # index of the first result to retrieve
max_results = 5000            # number of results requested in this call

# Build the query string with urlencode so that search terms containing
# spaces or other reserved characters still produce a valid URL.  ':' is
# kept literal (safe=':') so the "field:term" form of search_query is
# passed through unchanged.
query = urllib.parse.urlencode(
    {
        'search_query': search_query,
        'start': start,
        'max_results': max_results,
    },
    safe=':',
)


In [85]:
# Opensearch metadata such as totalResults, startIndex, 
# and itemsPerPage live in the opensearch namespace.
# Some entry metadata lives in the arXiv namespace.
# This is a hack to expose both of these namespaces in
# feedparser v4.1
#feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
#feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

In [86]:
# Perform a GET request against base_url + query.  The response object is
# used as a context manager so that it is closed as soon as the body has
# been read, instead of being left open until garbage collection.
with urllib.request.urlopen(base_url + query) as http_response:
    response = http_response.read()

# Parse the raw response bytes using feedparser
feed = feedparser.parse(response)

#### Feed Information

In [87]:
# Show the feed-level title and last-updated timestamp.
# print() joins its arguments with a single space, giving the same text
# as the "%s"-formatted form.
print('Feed title:', feed.feed.title)
print('Feed last updated:', feed.feed.updated)

Feed title: ArXiv Query: search_query=all:quantum&amp;id_list=&amp;start=0&amp;max_results=5000
Feed last updated: 2021-05-27T00:00:00-04:00


#### Metadata

In [88]:
# Report the opensearch paging metadata attached to the feed, one line per
# field, in the order totalResults / itemsPerPage / startIndex.
for field_label, field_attr in (
    ('totalResults', 'opensearch_totalresults'),
    ('itemsPerPage', 'opensearch_itemsperpage'),
    ('startIndex', 'opensearch_startindex'),
):
    print('%s for this query: %s' % (field_label, getattr(feed.feed, field_attr)))

totalResults for this query: 324135
itemsPerPage for this query: 5000
startIndex for this query: 0


#### Entry by entry

In [89]:
def _entry_to_record(entry):
    """Flatten one parsed feed entry into a plain dict of strings.

    Parameters
    ----------
    entry
        One element of ``feed.entries``.  It is read through the attributes
        ``id``, ``published``, ``title``, ``authors``, ``links``, ``tags``,
        ``summary`` and, when present, ``arxiv_journal_ref`` and
        ``arxiv_comment``.

    Returns
    -------
    dict
        Keys, in insertion order: 'arxiv-id', 'published', 'title',
        'Authors' (omitted when the entry has no usable author list),
        'abs page link' / 'pdf link' (only when a matching link exists),
        'Journal reference', 'Comments', 'Primary Category',
        'All Categories', 'Abstract'.
    """
    record = {
        # Everything after '/abs/' in the entry id is kept as the arXiv id
        'arxiv-id': entry.id.split('/abs/')[-1],
        'published': entry.published,
        'title': entry.title,
    }

    # Join all author names; best-effort, the key is simply left out when
    # the entry (or one of its authors) lacks the expected attribute.
    try:
        record['Authors'] = ', '.join(author.name for author in entry.authors)
    except AttributeError:
        pass

    # Get the links to the abs page and the pdf for this e-print.
    # .get() is used so that a link without a 'rel' or 'title' attribute is
    # skipped instead of raising AttributeError and aborting the whole run.
    for link in entry.links:
        if link.get('rel') == 'alternate':
            record['abs page link'] = link.href
        elif link.get('title') == 'pdf':
            record['pdf link'] = link.href

    # The journal reference and comment live under the arxiv namespace and
    # are optional; fall back to a placeholder string when absent.
    record['Journal reference'] = getattr(
        entry, 'arxiv_journal_ref', 'No journal ref found')
    record['Comments'] = getattr(entry, 'arxiv_comment', 'No comment found')

    # The primary category is taken as the first element of entry.tags
    # (workaround kept from the original example script).
    record['Primary Category'] = entry.tags[0]['term']

    # All categories, comma separated
    record['All Categories'] = ', '.join(tag['term'] for tag in entry.tags)

    # The abstract is in the <summary> element
    record['Abstract'] = entry.summary

    return record


# One flat record per feed entry
DATA = [_entry_to_record(entry) for entry in feed.entries]

In [90]:
# Persist the collected records to arxiv.json as a JSON array
with open("arxiv.json", mode="w") as json_file:
    json.dump(DATA, fp=json_file)

In [91]:
# Tabulate the records: one row per entry, one column per record key
df = pd.DataFrame(data=DATA)

In [92]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,arxiv-id,published,title,Authors,abs page link,pdf link,Journal reference,Comments,Primary Category,All Categories,Abstract
0,quant-ph/0201082v1,2002-01-18T15:08:05Z,Quantum Computers and Quantum Computer Languag...,Stephen Blaha,http://arxiv.org/abs/quant-ph/0201082v1,http://arxiv.org/pdf/quant-ph/0201082v1,No journal ref found,32 pages,quant-ph,"quant-ph, cs.PL",We show a representation of Quantum Computers ...
1,quant-ph/0407102v1,2004-07-14T14:47:27Z,Quantum Networks for Generating Arbitrary Quan...,"Phillip Kaye, Michele Mosca",http://arxiv.org/abs/quant-ph/0407102v1,http://arxiv.org/pdf/quant-ph/0407102v1,"Phillip Kaye, Michele Mosca, ""Quantum Networks...","3 pages, 2 figures. This paper previously appe...",quant-ph,quant-ph,Quantum protocols often require the generation...
2,0804.3401v1,2008-04-21T20:07:38Z,Quantum Computational Complexity,John Watrous,http://arxiv.org/abs/0804.3401v1,http://arxiv.org/pdf/0804.3401v1,No journal ref found,"44 pages, 14 figures. To appear in the Springe...",quant-ph,quant-ph,This article surveys quantum computational com...
3,1311.4939v1,2013-11-20T02:23:12Z,Geometrical perspective on quantum states and ...,Zeqian Chen,http://arxiv.org/abs/1311.4939v1,http://arxiv.org/pdf/1311.4939v1,No journal ref found,4 pages,quant-ph,quant-ph,We interpret quantum computing as a geometric ...
4,1611.03472v1,2016-11-10T20:30:19Z,Universal Quantum Algorithm,Avatar Tulsi,http://arxiv.org/abs/1611.03472v1,http://arxiv.org/pdf/1611.03472v1,No journal ref found,This is a preliminary draft. Comments are most...,quant-ph,quant-ph,Quantum amplitude amplification and quantum ph...


In [93]:
# Export the table to arxiv.csv (the row index is written as the first column)
df.to_csv(path_or_buf="arxiv.csv")

In [94]:
# (number of rows, number of columns) of the table
(len(df.index), len(df.columns))

(5000, 11)