### Automate fetching the most recent arXiv papers on a specific topic (search query) and save the information to a CSV

arXiv API: https://info.arxiv.org/help/api/basics.html#quickstart

In [1]:
#install feedparser to parse the arxiv api response
!pip install feedparser



In [2]:
# Search term: interpolated into the API's `search_query` parameter (as "all:<query>")
# and into the name of the CSV file written at the end of the notebook.
# NOTE(review): presumably "+" stands for a URL-encoded space between terms — confirm against the arXiv API docs.
query = "gen+ai"

### Query the API with 'gen+ai' and get 20 results

In [3]:
import feedparser
import pandas as pd

# arXiv API endpoint; the query string is appended directly after the "?"
base_url = "http://export.arxiv.org/api/query?"

# Request settings: first 20 matches for the search term, newest submissions first
params = dict(
    search_query=f"all:{query}",
    start=0,
    max_results=20,
    sortBy="submittedDate",
    sortOrder="descending",
)

# Assemble "key=value" pairs in dict order, joined with "&".
# Values are inserted as-is (no percent-encoding is applied).
query_url = base_url + "&".join(
    f"{name}={setting}" for name, setting in params.items()
)

# Fetch the URL and parse the returned feed
feed = feedparser.parse(query_url)




Inspect the response to find appropriate fields to save

In [4]:
# Check what kind of object feedparser.parse returned
type(feed)


feedparser.util.FeedParserDict

In [5]:
# List the top-level keys available on the parsed feed
list(feed.keys())

['bozo',
 'entries',
 'feed',
 'headers',
 'href',
 'status',
 'encoding',
 'version',
 'namespaces']

In [6]:
# Check the container type that holds the individual results
type(feed.entries)

list

In [7]:
# Show the first result to see which fields each entry carries
# (indexing [0] fails if the query returned no entries)
feed.entries[0]

{'id': 'http://arxiv.org/abs/2502.14865v1',
 'guidislink': True,
 'link': 'http://arxiv.org/abs/2502.14865v1',
 'updated': '2025-02-20T18:59:51Z',
 'updated_parsed': time.struct_time(tm_year=2025, tm_mon=2, tm_mday=20, tm_hour=18, tm_min=59, tm_sec=51, tm_wday=3, tm_yday=51, tm_isdst=0),
 'published': '2025-02-20T18:59:51Z',
 'published_parsed': time.struct_time(tm_year=2025, tm_mon=2, tm_mday=20, tm_hour=18, tm_min=59, tm_sec=51, tm_wday=3, tm_yday=51, tm_isdst=0),
 'title': 'Time Travel: A Comprehensive Benchmark to Evaluate LMMs on Historical\n  and Cultural Artifacts',
 'title_detail': {'type': 'text/plain',
  'language': None,
  'base': 'http://export.arxiv.org/api/query?search_query=all:gen+ai&start=0&max_results=20&sortBy=submittedDate&sortOrder=descending',
  'value': 'Time Travel: A Comprehensive Benchmark to Evaluate LMMs on Historical\n  and Cultural Artifacts'},
 'summary': "Understanding historical and cultural artifacts demands human expertise and\nadvanced computational 

In [8]:
# Check the type of a single entry
type(feed.entries[0])

feedparser.util.FeedParserDict

In [9]:
# Dump the first entry field by field, one "name: content" line per field

for field, content in feed.entries[0].items():
    print(f"{field}: {content}")


id: http://arxiv.org/abs/2502.14865v1
guidislink: True
link: http://arxiv.org/abs/2502.14865v1
updated: 2025-02-20T18:59:51Z
updated_parsed: time.struct_time(tm_year=2025, tm_mon=2, tm_mday=20, tm_hour=18, tm_min=59, tm_sec=51, tm_wday=3, tm_yday=51, tm_isdst=0)
published: 2025-02-20T18:59:51Z
published_parsed: time.struct_time(tm_year=2025, tm_mon=2, tm_mday=20, tm_hour=18, tm_min=59, tm_sec=51, tm_wday=3, tm_yday=51, tm_isdst=0)
title: Time Travel: A Comprehensive Benchmark to Evaluate LMMs on Historical
  and Cultural Artifacts
title_detail: {'type': 'text/plain', 'language': None, 'base': 'http://export.arxiv.org/api/query?search_query=all:gen+ai&start=0&max_results=20&sortBy=submittedDate&sortOrder=descending', 'value': 'Time Travel: A Comprehensive Benchmark to Evaluate LMMs on Historical\n  and Cultural Artifacts'}
summary: Understanding historical and cultural artifacts demands human expertise and
advanced computational techniques, yet the process remains complex and
time-inten

### Pick title, authors, abstract, summary, paper_url (link), pdf_url (link to PDF), and published date for saving to CSV

In [10]:
def _entry_to_record(entry):
    """Flatten one parsed arXiv feed entry into a plain dict for the CSV.

    Parameters
    ----------
    entry : an item of ``feed.entries`` exposing ``title``, ``summary``,
        ``link``, ``id``, ``published`` and (usually) ``authors``.

    Returns
    -------
    dict with the keys ``title``, ``authors``, ``abstract``, ``summary``,
    ``paper_url``, ``pdf_url`` and ``date`` (publication date as a string).
    """
    # The feed hard-wraps long titles ("...Historical\n  and Cultural...");
    # collapse every whitespace run so the CSV holds a single-line title.
    title = " ".join(entry.title.split())

    # Fall back to an empty author list instead of raising when the field is absent.
    authors = ", ".join(author.name for author in entry.get("authors", []))

    # The feed's `summary` field is the paper's abstract; both columns carry the same text.
    abstract = entry.summary.strip()

    # Swap only the "/abs/" path segment (first occurrence) for "/pdf/" so that
    # an "abs" substring elsewhere in the identifier is left untouched.
    pdf_url = entry.id.replace("/abs/", "/pdf/", 1) + ".pdf"

    return {
        "title": title,
        "authors": authors,
        "abstract": abstract,
        "summary": abstract,
        "paper_url": entry.link,
        "pdf_url": pdf_url,
        "date": entry.published,  # publication date as string
    }


# One record per feed entry, in the order the API returned them
data = [_entry_to_record(entry) for entry in feed.entries]



Add these records to a DataFrame

In [11]:
# Build the table from the collected records and parse the "date" strings into datetimes
df = pd.DataFrame(data).assign(date=lambda frame: pd.to_datetime(frame["date"]))

# Order rows newest-first
df_sorted = df.sort_values("date", ascending=False)



In [13]:

# Write the date-sorted table to disk; the file name embeds the search term, and the row index is omitted
csv_filename = "latest_arxiv_research_papers_on_" + query + ".csv"
df_sorted.to_csv(csv_filename, index=False)

# Report the file name and how many papers were written
paper_count = len(df_sorted)
print("CSV file '{}' created successfully with {} papers.".format(csv_filename, paper_count))

CSV file 'latest_arxiv_research_papers_on_gen+ai.csv' created successfully with 20 papers.
