# XML to CSV

To get some sample data entries to work with in the database, I use the query endpoint of [https://export.arxiv.org](https://export.arxiv.org).

The query: [https://export.arxiv.org/api/query?search_query=cat:cs.*&start=0&max_results=300&sortBy=submittedDate&sortOrder=descending](https://export.arxiv.org/api/query?search_query=cat:cs.*&start=0&max_results=300&sortBy=submittedDate&sortOrder=descending).

In [17]:
import requests, os

# Download the newest cs.* submissions from the arXiv API in small batches and
# store them in `file_path` as ONE well-formed Atom document.
import xml.etree.ElementTree as ET

n_entries = 100
batch_size = 10  # fetch the entries in batches of 10 per request
file_path = 'arxiv_papers.xml'

api_url = 'https://export.arxiv.org/api/query'
atom_entry_tag = '{http://www.w3.org/2005/Atom}entry'

# Every response is a complete Atom document with its own <feed> root.
# Writing the raw responses back to back would create a file with several
# roots, which XML parsers reject. Instead, keep the first <feed> and append
# only the <entry> elements of the following batches to it.
feed_root = None

for start in range(0, n_entries, batch_size):
    # Let requests build and encode the query string; a multi-line string
    # literal would smuggle newlines and indentation into the URL.
    params = {
        'search_query': 'cat:cs.*',
        'start': start,
        # Do not over-fetch when n_entries is not a multiple of batch_size.
        'max_results': min(batch_size, n_entries - start),
        'sortBy': 'submittedDate',
        'sortOrder': 'descending',
    }
    response = requests.get(api_url, params=params, timeout=30)
    response.raise_for_status()  # fail loudly instead of saving an error page

    # Parse the raw bytes so the encoding from the XML declaration is honoured.
    batch_root = ET.fromstring(response.content)
    if feed_root is None:
        feed_root = batch_root
    else:
        feed_root.extend(batch_root.findall(atom_entry_tag))

# Opening the target for writing replaces any previous file, so no explicit
# delete is needed. Feed-level metadata comes from the first batch only, and
# the serializer may rename namespace prefixes; the namespace URIs (and thus
# the qualified tag names) are unchanged.
if feed_root is not None:
    ET.ElementTree(feed_root).write(
        file_path, encoding='utf-8', xml_declaration=True
    )




In [2]:
import xml.etree.ElementTree as ET
import csv

# Convert the Atom feed stored in 'arxiv_papers.xml' into 'arxiv_papers.csv'
# with one row per <entry> and the columns title, author, summary, published.

# All elements read below live in the Atom namespace.
atom_ns = '{http://www.w3.org/2005/Atom}'

# Parse the XML file
tree = ET.parse('arxiv_papers.xml')
root = tree.getroot()

# Open a new CSV file to write. newline='' leaves line-ending handling to the
# csv module; the explicit UTF-8 encoding keeps non-ASCII names and titles
# from depending on the platform's default encoding.
with open('arxiv_papers.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)

    # Write header
    csvwriter.writerow(['title', 'author', 'summary', 'published'])

    # Extract data. findtext() yields '' for a missing child element instead
    # of failing with AttributeError on `None.text`.
    for entry in root.findall(f'{atom_ns}entry'):
        title = entry.findtext(f'{atom_ns}title', default='')
        # An entry can have several <author> elements; join their names.
        author = ', '.join(
            author_node.findtext(f'{atom_ns}name', default='')
            for author_node in entry.findall(f'{atom_ns}author')
        )
        summary = entry.findtext(f'{atom_ns}summary', default='')
        published = entry.findtext(f'{atom_ns}published', default='')

        # Write row to CSV
        csvwriter.writerow([title, author, summary, published])