# 1. Using an HTTP Request

In [2]:
from bs4 import BeautifulSoup
import urllib.request

# Query the arXiv Atom API for the first 100 entries matching "electron".
url = 'http://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=100'

# Open the HTTP response in a context manager so the underlying connection is
# closed as soon as the feed has been parsed (the original left it open).
with urllib.request.urlopen(url) as data:
    # Parse the XML data
    soup = BeautifulSoup(data, 'xml')

# Collect one dictionary per <entry> element of the feed
results = []

for entry in soup.find_all('entry'):
    authors = entry.find_all('author')

    result = {
        # Abstract and title, with surrounding whitespace removed
        'abstract': entry.find('summary').text.strip(),
        'title': entry.find('title').text.strip(),
        # One string per author: only the first line of the <name> text is kept
        # (this is the full author name, not the given name).
        'authors': [author.find('name').text.strip().split('\n')[0] for author in authors],
    }

    # Append the result to the list of results
    results.append(result)

# Print the list of results
print(results)



[{'abstract': 'The effect of the electron-electron cusp on the convergence of configuration\ninteraction (CI) wave functions is examined. By analogy with the\npseudopotential approach for electron-ion interactions, an effective\nelectron-electron interaction is developed which closely reproduces the\nscattering of the Coulomb interaction but is smooth and finite at zero\nelectron-electron separation. The exact many-electron wave function for this\nsmooth effective interaction has no cusp at zero electron-electron separation.\nWe perform CI and quantum Monte Carlo calculations for He and Be atoms, both\nwith the Coulomb electron-electron interaction and with the smooth effective\nelectron-electron interaction. We find that convergence of the CI expansion of\nthe wave function for the smooth electron-electron interaction is not\nsignificantly improved compared with that for the divergent Coulomb interaction\nfor energy differences on the order of 1 mHartree. This shows that, contrary to\

# 2. Using the `arxiv` package

In [36]:
import arxiv, logging, PyPDF2, requests, io, os
logging.basicConfig(level=logging.DEBUG)

class ArxivScraper(object):
  """Search arXiv for a query and extract the full text of each matching paper.

  For every search result the PDF is downloaded, its text is extracted with
  PyPDF2, and the text is saved as ``<query>/<paper id>.txt`` relative to the
  current working directory.
  """

  def __init__(self, query, max_results=10):
    """Prepare the API client and the search.

    Args:
      query: Search string passed to ``arxiv.Search``; also used as the name
        of the folder the extracted text files are written to.
      max_results: Maximum number of search results to process.
    """
    self.query = query

    # Construct the default API client.
    self.client = arxiv.Client()

    # Build the search for `query`, limited to `max_results` entries.
    self.search = arxiv.Search(
      query=query,
      max_results=max_results,
    )

  def scrape(self):
    """Download, extract and store the text of every search result.

    Returns:
      A list with one dict per paper containing the keys ``entry_id``,
      ``title``, ``abstract``, ``authors``, ``pdf_url``, ``doi``, ``updated``,
      ``published``, ``categories``, ``text`` (extracted full text) and
      ``ref`` (the text after the last "References" line, or the whole text
      when no such line exists).

    Raises:
      requests.HTTPError: If a PDF download returns an error status.
    """
    papers = []
    for r in self.client.results(self.search):
      # Rewrite the PDF link to point at the export.arxiv.org host over HTTPS.
      url = "https://export." + r.pdf_url.split("://")[-1]
      # A timeout keeps a stalled connection from hanging the notebook forever.
      req = requests.get(url, timeout=60)
      # Fail with a clear HTTP error instead of feeding an error page to PyPDF2.
      req.raise_for_status()

      pdf_reader = PyPDF2.PdfReader(io.BytesIO(req.content))
      # `or ""` guards against pages for which no text could be extracted.
      text = "\n".join((page.extract_text() or "") for page in pdf_reader.pages)

      # Old-style arXiv ids (e.g. "hep-th/9901001v1") contain a slash, which
      # would otherwise be interpreted as a non-existent sub-directory.
      file_name = f"{url.split('pdf/')[-1].replace('/', '_')}.txt"

      # Save the text in a folder named after the query; the folder is only
      # created once there is at least one result to store.
      os.makedirs(self.query, exist_ok=True)
      with open(os.path.join(self.query, file_name), "w", encoding='utf-8') as f:
        f.write(text)

      papers.append({
        'entry_id': r.entry_id,
        'title': r.title,
        'abstract': r.summary,
        'authors': r.authors,
        'pdf_url': r.pdf_url,
        'doi': r.doi,
        'updated': r.updated,
        'published': r.published,
        'categories': r.categories,
        'text': text,
        'ref': text.split("\nReferences\n")[-1]
      })

    return papers
    



In [37]:
# Scrape the first 5 arXiv results for the query; the extracted texts are also
# written to a folder named after the query.
papers = ArxivScraper(query="Mixture of Experts", max_results=5).scrape()

INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=Mixture+of+Experts&id_list=&sortBy=relevance&sortOrder=descending&start=0&max_results=100
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): export.arxiv.org:443
DEBUG:urllib3.connectionpool:https://export.arxiv.org:443 "GET /api/query?search_query=Mixture+of+Experts&id_list=&sortBy=relevance&sortOrder=descending&start=0&max_results=100 HTTP/1.1" 200 56454
INFO:arxiv:Got first page: 100 of 2367007 total results
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): export.arxiv.org:443
DEBUG:urllib3.connectionpool:https://export.arxiv.org:443 "GET /pdf/1806.08200v1 HTTP/1.1" 200 1363398
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): export.arxiv.org:443
DEBUG:urllib3.connectionpool:https://export.arxiv.org:443 "GET /pdf/1312.4314v3 HTTP/1.1" 200 1928905
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): export.arxiv.org:443
DEBUG:urllib3.

In [None]:
# Read the OpenAI API key from the environment instead of hardcoding it in the
# notebook (falls back to an empty string when the variable is not set).
# SECURITY: the literal key that used to live in this cell was stored in plain
# text and must be considered leaked -- revoke and rotate it.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")


In [38]:
# Inspect the scraped records: as the last expression of the cell, the list of
# paper dicts is rendered as the cell output.
papers

[{'entry_id': 'http://arxiv.org/abs/1806.08200v1',
  'title': 'Mixtures of Experts Models',
  'abstract': 'Mixtures of experts models provide a framework in which covariates may be\nincluded in mixture models. This is achieved by modelling the parameters of the\nmixture model as functions of the concomitant covariates. Given their mixture\nmodel foundation, mixtures of experts models possess a diverse range of\nanalytic uses, from clustering observations to capturing parameter\nheterogeneity in cross-sectional data. This chapter focuses on delineating the\nmixture of experts modelling framework and demonstrates the utility and\nflexibility of mixtures of experts models as an analytic tool.',
  'authors': [arxiv.Result.Author('Isobel Claire Gormley'),
   arxiv.Result.Author('Sylvia Frühwirth-Schnatter')],
  'pdf_url': 'http://arxiv.org/pdf/1806.08200v1',
  'doi': None,
  'updated': datetime.datetime(2018, 6, 21, 12, 30, 12, tzinfo=datetime.timezone.utc),
  'published': datetime.datetime