# Section 1: Scrape all Subtopics from arxiv.org

In [None]:
pip install bs4

In [None]:
import requests
from bs4 import BeautifulSoup

def scrape_homepage():
    """Collect the "recent" listing URL of every subtopic on the arXiv homepage.

    Each <h2> heading on the page is treated as a main category, and the
    first <ul> that follows it as that category's list of subtopics. Every
    link in that list whose href contains both "/list/" and "/recent" is
    kept. All collected URLs are written, one per line, to ``subtopics.txt``
    in the current working directory.

    Returns:
        dict: maps each category heading text to the list of absolute
        listing URLs found under it.

    Raises:
        requests.RequestException: if the homepage cannot be fetched or the
            server answers with an error status.
    """
    from urllib.parse import urljoin

    url = 'https://arxiv.org/'
    # Fail fast instead of hanging forever or silently parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    categories = {}      # category heading -> its own list of subtopic URLs
    all_subtopics = []   # flat list, in page order, for the output file

    for heading in soup.find_all('h2'):
        subtopic_list = heading.find_next('ul')
        if not subtopic_list:
            continue

        # A fresh list per category, so categories do not share one list.
        subtopics = []
        for link in subtopic_list.find_all('a'):
            href = link.get('href')
            # Anchors without an href yield None; skip them.
            if href and "/list/" in href and "/recent" in href:
                # urljoin avoids the "https://arxiv.org//list/..." double slash.
                subtopics.append(urljoin(url, href))

        categories[heading.text.strip()] = subtopics
        all_subtopics.extend(subtopics)

    with open('subtopics.txt', 'w', encoding='utf-8') as f:
        f.writelines(subtopic + '\n' for subtopic in all_subtopics)

    return categories

# Run the homepage scraper; it writes the collected URLs to subtopics.txt.
scrape_homepage()

# Section 2: Scrape all papers from subtopic

In [None]:
# This code extracts more information than we need (authors and abstracts as well as titles).
# I used the functions from get_all_from_subtopic.py instead, to get just the titles.
import requests
from bs4 import BeautifulSoup

def scrape_all_papers(base_url='https://arxiv.org/list/cs.AI/recent',
                      output_file="csAItest.txt"):
    """Scrape title, authors and abstract of every paper in an arXiv listing.

    Starting at ``base_url``, each page is parsed for ``<div class="meta">``
    entries, then the "next page" link inside ``<div class="paging">`` is
    followed until there is none (or it leads to a page already visited).
    The results are written to ``output_file`` as blank-line-separated
    records.

    Args:
        base_url: URL of the first listing page to scrape.
        output_file: path of the text file to write the results to.

    Raises:
        requests.RequestException: if a page cannot be fetched or the server
            answers with an error status.
    """
    from urllib.parse import urljoin

    papers = []

    def text_of(parent, tag, css_class, default):
        # Stripped text of the first matching child, or `default` if absent.
        node = parent.find(tag, class_=css_class)
        return node.text.strip() if node else default

    def scrape_page(url):
        # Parse one listing page; return the next page's URL or None.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        for paper in soup.find_all('div', class_='meta'):
            papers.append({
                'title': text_of(paper, 'div', 'list-title', 'No title'),
                'authors': text_of(paper, 'div', 'list-authors', 'No authors'),
                'abstract': text_of(paper, 'p', 'abstract', 'No abstract'),
            })

        paging_div = soup.find('div', class_='paging')
        if paging_div:
            for link in paging_div.find_all('a'):
                href = link.get('href')
                if href and ('Next' in link.text or '>' in link.text):
                    # urljoin handles both relative and absolute hrefs.
                    return urljoin(url, href)
        return None

    # Track visited pages so a link that loops back cannot spin forever.
    visited = set()
    next_url = base_url
    while next_url and next_url not in visited:
        visited.add(next_url)
        next_url = scrape_page(next_url)

    # Write all papers to a file
    with open(output_file, "w", encoding='utf-8') as f:
        for paper in papers:
            f.write(f"{paper['title']}\n")
            f.write(f"Authors: {paper['authors']}\n")
            f.write(f"Abstract: {paper['abstract']}\n")
            f.write("\n")

# Scrape the default listing (cs.AI/recent); results are written to csAItest.txt.
scrape_all_papers()
 

In [None]:
from get_all_from_subtopic import scrape_articles

# Read the subtopic URLs as UTF-8 text (subtopics.txt is written with that
# encoding) so each link is a str rather than bytes, and drop blank lines.
with open('subtopics.txt', 'r', encoding='utf-8') as f:
    itemlist = [line.strip() for line in f if line.strip()]

# Scrape the articles of every subtopic listing in turn.
for link in itemlist:
    scrape_articles(link)

In [None]:
import os
from glob import glob

# Glob pattern matching the scraped files to merge (edit before running).
directory_path = "CHANGE ME/SUBTOPIC OF INTEREST/*"
output_file = "cs_titles.txt"

# Keep regular files only (the pattern can also match sub-directories, which
# cannot be opened), never merge the output file into itself, and sort so the
# merge order is the same on every run.
output_abspath = os.path.abspath(output_file)
txt_files = sorted(
    path for path in glob(directory_path)
    if os.path.isfile(path) and os.path.abspath(path) != output_abspath
)

merged_content = []

for file_path in txt_files:
    with open(file_path, 'r', encoding='utf-8') as file:
        merged_content.append(file.read())

merged_content_str = "\n".join(merged_content)

# Use a separate name for the handle so `output_file` stays the path string.
with open(output_file, 'w', encoding='utf-8') as out:
    out.write(merged_content_str)

print(f"Merged {len(txt_files)} files into {output_file}")
