In [None]:
# Download arXiv documents

In [3]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

def get_arxiv(topic, number, timeout=30):
    """Query the arXiv export API and return the matching entries as a DataFrame.

    Args:
        topic: Free-text search term, sent as ``search_query=all:<topic>``.
        number: Maximum number of results to request (``max_results``).
        timeout: Seconds to wait for the HTTP response; passed straight to
            ``requests.get`` so a stalled connection cannot hang the kernel.

    Returns:
        A DataFrame with the columns 'Title', 'Summary', 'Published Date',
        'Categories' and 'Authors' (one row per Atom ``entry``), or ``None``
        when the server answers with a status code other than 200.
    """
    atom = '{http://www.w3.org/2005/Atom}'
    columns = ['Title', 'Summary', 'Published Date', 'Categories', 'Authors']

    def text_of(node, tag):
        # A missing child element yields None instead of an AttributeError.
        child = node.find(atom + tag)
        return child.text if child is not None else None

    # Let requests build the query string so that reserved characters in
    # `topic` (&, #, +, %) are percent-encoded instead of corrupting the URL.
    response = requests.get(
        "http://export.arxiv.org/api/query",
        params={"search_query": "all:%s" % topic, "max_results": int(number)},
        timeout=timeout,
    )

    if response.status_code != 200:
        print("Error: Unable to fetch content from the URL.")
        return None

    # The API answers with an Atom feed; every <entry> is one paper.
    root = ET.fromstring(response.content)

    records = []
    for entry in root.findall(atom + 'entry'):
        records.append({
            'Title': text_of(entry, 'title'),
            'Summary': text_of(entry, 'summary'),
            'Published Date': text_of(entry, 'published'),
            # Categories carry their label in the `term` attribute; skip any
            # element that lacks it rather than raising KeyError.
            'Categories': [
                category.attrib['term']
                for category in entry.findall(atom + 'category')
                if 'term' in category.attrib
            ],
            'Authors': [
                text_of(author, 'name')
                for author in entry.findall(atom + 'author')
            ],
        })

    # Passing `columns` keeps the schema stable even when no entry matched.
    return pd.DataFrame(records, columns=columns)

In [4]:
# Fetch up to 1000 arXiv entries for the chosen topic.
data = get_arxiv("my favourite topic", 1000)

# get_arxiv returns None when the HTTP request does not succeed; stop here
# with a clear message instead of failing on len(None) below.
if data is None:
    raise RuntimeError("get_arxiv returned no data; the arXiv request failed.")

print("Dataset size:", len(data))
data.head()

Dataset size: 1000


Unnamed: 0,Title,Summary,Published Date,Categories,Authors
0,Astroparticle physics - A Personal Outlook,"At the request of the organizers, this talk ...",1996-02-15T17:41:36Z,[astro-ph],[John Ellis]
1,Une liste de problèmes,This is a structured compilation of some of ...,2022-12-12T09:34:12Z,"[math.AG, 14-01 14-02 14D10 14E08 14G12]",[Jean-Louis Colliot-Thélène]
2,Noetherianity up to symmetry,These lecture notes for the 2013 CIME/CIRM s...,2013-10-07T09:00:53Z,[math.AG],[Jan Draisma]
3,Multi-boson correlations using wave-packets II,We investigate the analytically solvable pio...,2007-09-24T21:23:17Z,[nucl-th],"[M. I. Nagy, T. Csorgo]"
4,Topological quantum field theories,Following my plenary lecture on ICMP2000 I r...,2000-11-29T00:29:54Z,[hep-th],[Albert Schwarz]
