In [1]:
import os
import time
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

In [2]:
#Getting user inputs
def _read_int(prompt, default):
    """Ask the user for a whole number and return it as an int.

    Returns `default` (after printing a short notice) when the reply cannot be
    parsed by int(), so a typo does not abort the cell with a ValueError.
    """
    raw = input(prompt).strip()
    try:
        return int(raw)
    except ValueError:
        print(f"'{raw}' is not a whole number, using {default} instead.")
        return default


query_input   = input("\nEnter your search term for arXiv: ").strip() #what you need to search in arxiv
choice_papers = _read_int("\nEnter how many Papers per page (can only input number [25,50,100,200] only if not default value is 50): ", 50)
#at least one page is always scraped: zero or a negative count is raised to 1
num_pages     = max(1, _read_int("\nEnter how many pages to scrape (each has above mentioned papers): ", 1))
choice        = input("\nEnter the choice for paper sorting\n"
                        "1. Latest by announce date\n"
                        "2. Latest by submition date\n"
                        "3. Oldest by announce date\n"
                        "4. Oldest by submition date\n"
                        "5. By Relevence\n"
                        "   Default option is '1'\n"
                        "Entered Choice >>>").strip()


Enter your search term for arXiv:  architecture security

Enter how many Papers per page (can only input number [25,50,100,200] only if not default value is 50):  150

Enter how many pages to scrape (each has above mentioned papers):  2

Enter the choice for paper sorting
1. Latest by announce date
2. Latest by submition date
3. Oldest by announce date
4. Oldest by submition date
5. By Relevence
   Default option is '1'
Entered Choice >>> 2


In [20]:
#Translate the menu number typed by the user into the value of the `order` URL parameter.
#Any reply that is not one of the menu numbers falls back to option 1 (latest by announce date).
order = {
    "1": "-announced_date_first",  # latest by announce date
    "2": "-submitted_date",        # latest by submission date
    "3": "announced_date_first",   # oldest by announce date
    "4": "submitted_date",         # oldest by submission date
    "5": "",                       # by relevance
}.get(choice, "-announced_date_first")

#Results (papers) per page: keep the user's value only if it is one of the
#accepted page sizes, otherwise use the default of 50
num_papers = choice_papers if choice_papers in (25, 50, 100, 200) else 50

In [21]:
#Summarise what is about to be scraped: how many papers, from how many pages, for which
#search term and in which sort order
total_papers = num_papers * num_pages

print(
    f"Scraping {total_papers} Papers...\nfrom {num_pages} pages and per page {num_papers} papers\n",
    f"Related to '{query_input}'\n",
    f"Taking papers based on the sorting : {order}",
    sep="\n",
)

Scraping 100 Papers...
from 2 pages and per page 50 papers

Related to 'architecture security'

Taking papers based on the sorting : -submitted_date


In [22]:
#Url creation according to arxiv
#quote_plus turns spaces into "+" (as before) and additionally percent-encodes reserved
#characters such as & # ? +, so a search term containing them cannot break the query string
query = quote_plus(query_input)
base_url = "https://arxiv.org/search/"
url_temp = f"{base_url}?query={query}&searchtype=all&abstracts=show&order={order}&size={num_papers}"

#showing how the url would look like (the "&start=..." offset is appended per page later)
url_temp

'https://arxiv.org/search/?query=architecture+security&searchtype=all&abstracts=show&order=-submitted_date&size=50'

In [23]:
#Debugging - check that the search URL can be fetched at all
#The timeout makes the call raise instead of waiting forever if the server never answers
response = requests.get(url_temp, timeout=30)
response.raise_for_status() #raise an HTTPError for 4xx/5xx responses

In [24]:
#debugging
#a="&start=100"
#x=f"{url}{a}"
#x

In [25]:
#Sanity check: display the number of results per page that will be requested
num_papers

50

In [27]:
REQUEST_TIMEOUT_SECONDS = 30 #give up on a request instead of hanging if the server never answers
PAGE_DELAY_SECONDS = 1 #pause between two page requests so the server is not spammed


def _tag_text(tag, fallback):
    """Return the text inside `tag` with all whitespace runs collapsed to single spaces.

    `fallback` is returned when `tag` is None (the element was not found).
    The text is taken as-is and normalised afterwards instead of using
    get_text(strip=True): that call strips every text fragment and joins them
    with no separator, which glues words together whenever the element
    contains inline child tags.
    """
    if tag is None:
        return fallback
    return " ".join(tag.get_text().split())


papers=[] #list of (title, meta_text, abstract) tuples, one per scraped paper
count=0 #for counting how many papers fetched
for page in range(num_pages):
    if page > 0:
        #the delay belongs between HTTP requests, not between papers parsed from one response
        time.sleep(PAGE_DELAY_SECONDS)

    url = f"{url_temp}&start={page*num_papers}" #offset of the first result of this page

    print(f"\n\nFetching page number {page+1}\n{url}")
    print("Paper number: ")

    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS) #getting the http request
    response.raise_for_status() #if http request failed raise an error
    soup = BeautifulSoup(response.text, "html.parser") #creating soup object for further extraction

    #"li.arxiv-result" selects the section of each paper on the current page
    results = soup.select("li.arxiv-result")
    if not results:
        #an empty page means the search has no further results, so later pages are pointless
        print("No more results found, stopping early")
        break

    for item in results:
        #extracting title of paper
        title = _tag_text(item.select_one("p.title"), "No Title Found")

        #extract publication info, date, ....
        meta_tag = item.select_one("p.is-size-7")
        meta_text = meta_tag.get_text(" ", strip=True) if meta_tag else "No metadata found"

        #extracting Abstract
        abstract = _tag_text(item.select_one("span.abstract-full"), "No Abstract Found")

        #progress output: running number of the paper just parsed
        count=count+1
        print(f"{count}, ",end="")

        #saving each in to papers
        papers.append((title, meta_text, abstract))


print("\n\n\nCompleted fetching data\n")



Fetching page number 1
https://arxiv.org/search/?query=architecture+security&searchtype=all&abstracts=show&order=-submitted_date&size=50&start=0
Paper number: 
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 

Fetching page number 2
https://arxiv.org/search/?query=architecture+security&searchtype=all&abstracts=show&order=-submitted_date&size=50&start=50
Paper number: 
51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 


Completed fetching data



In [28]:
#Report how many papers were collected in `papers`
print(f"Totally {len(papers)} papers fetched")

Totally 100 papers fetched


In [29]:
#Make the search term usable as part of a file name: "/" and the other characters that
#are not allowed in Windows file names (\ : * ? " < > |) are each replaced by "+"
file_name = query_input.translate(str.maketrans({ch: "+" for ch in '/\\:*?"<>|'}))

Note for the future:
the output path and the file type in which the data should be stored still need to be made configurable.

In [30]:
#Store the result in a UTF-8 text file inside the folder named arxivscraped
folder_name = "arxivscraped"
os.makedirs(folder_name, exist_ok=True) #create the folder on first use, reuse it afterwards
filename = f"arxiv_{file_name}_{total_papers}_{order}.txt" #name records search term, requested paper count and sort order
filepath = os.path.join(folder_name, filename) #portable path instead of a hard-coded "/"

with open(filepath, "w", encoding="utf-8") as f:
    for idx, (title,meta_text, abstract) in enumerate(papers, 1):
        f.write(f"Paper {idx}:\n")
        f.write(f"Title   : {title}\n")
        f.write(f"Source  : {meta_text}\n")
        f.write(f"Abstract: {abstract}\n")
        f.write("\n"+"-"*100+"\n\n") #visual separator between two papers

#Describe the papers by the sort order that was actually used instead of always calling
#them "newest": a leading "-" is the latest-first order, an empty value is relevance
if order.startswith("-"):
    sort_label = "newest"
elif order:
    sort_label = "oldest"
else:
    sort_label = "most relevant"

print(f"Done Scraping {len(papers)} {sort_label} papers for  '{query_input}' .")

Done Scraping 100 newest papers for  'architecture security' .
