In [1]:
import os
import pandas as pd
import requests

from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
def _parse_arxiv_results(html):
    '''
    Parse the HTML of an arXiv search-results page into one record per article.

    Every field is looked up inside the article's own result container, so a
    missing field becomes None for that article instead of shifting the values
    of all following articles (which is what happens when titles, authors,
    abstracts and links are collected as four independent page-wide lists).

    Args:
        - html: HTML text of the search-results page.

    Returns:
        List of dicts with the keys 'title', 'authors', 'abstract', 'pdf_link'.
    '''
    soup = BeautifulSoup(html, 'html.parser')
    records = []

    # NOTE(review): assumes arXiv wraps each hit in <li class="arxiv-result">;
    # confirm against the live markup if the function starts returning 0 rows.
    for item in soup.find_all('li', class_='arxiv-result'):
        title_tag = item.find('p', class_='title is-5 mathjax')
        authors_tag = item.find('p', class_='authors')
        abstract_tag = item.find('span', class_='abstract-full has-text-grey-dark mathjax')
        # `string=` is the current name of the deprecated `text=` argument.
        pdf_tag = item.find('a', string='pdf')

        authors = None
        if authors_tag is not None:
            # Collapse whitespace first, then drop the "Authors:" label, rather
            # than slicing off a fixed number of characters.
            authors = ' '.join(authors_tag.text.split())
            if authors.startswith('Authors:'):
                authors = authors[len('Authors:'):].strip()

        abstract = None
        if abstract_tag is not None:
            # The full abstract ends with a "△ Less" toggle link; cut it off.
            abstract = abstract_tag.text.strip().rsplit('△ Less', 1)[0].strip()

        records.append({
            'title': title_tag.text.strip() if title_tag is not None else None,
            'authors': authors,
            'abstract': abstract,
            'pdf_link': pdf_tag.get('href') if pdf_tag is not None else None,
        })

    return records


def scrape_arxiv(url, output_filepath='output.xlsx', timeout=30):
    '''
    Function to scrape an arXiv search.

    Only the single results page returned by `url` is parsed; further pages of
    the same search are not followed.

    Args:
        - url: search strategy URL.
        - output_filepath: Excel file the results are written to ('output.xlsx' by default).
        - timeout: seconds to wait for the server before giving up (30 by default).

    Returns:
        df: DataFrame with one row per article and the columns
            id, title, authors, abstract, pdf_link.
            A field that is absent for an article is stored as None.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    '''
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were search results.
    response.raise_for_status()

    records = _parse_arxiv_results(response.text)

    # Explicit columns keep the schema stable even when there are no results.
    df = pd.DataFrame(records, columns=['title', 'authors', 'abstract', 'pdf_link'])

    df.insert(0, 'id', range(1, len(df) + 1))
    df.to_excel(output_filepath, index=False)

    print(f'{df.shape[0]} results were found.')
    return df


def download_papers(links_list, path='papers', timeout=30):
    '''
    Function to download a list of .pdf links.

    Files are saved as '<position>.pdf' inside `path`, where position starts at 1
    and follows the order of `links_list`. A link that is missing or cannot be
    downloaded is skipped (its number is left unused) and reported at the end,
    so an error page is never saved as a .pdf file.

    Args:
        - links_list: List (or any iterable, e.g. a pandas Series) of .pdf links.
        - path: Folder path where files will be downloaded. Created if needed.
        - timeout: seconds to wait for the server on each request (30 by default).

    Returns:
        None.

    @author: Enrique Callejas Castro
    '''
    # Create the folder the caller asked for, not a hard-coded one.
    os.makedirs(path, exist_ok=True)

    # Materialise the input so links are taken by position; indexing a pandas
    # Series with links_list[i] is label-based and breaks on a filtered Series.
    links = list(links_list)
    failed = []

    for number, link in tqdm(enumerate(links, start=1), total=len(links), desc='Downloading papers'):
        if not isinstance(link, str) or not link:
            failed.append((number, link, 'missing link'))
            continue

        try:
            response = requests.get(link, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as error:
            failed.append((number, link, str(error)))
            continue

        filename = os.path.join(path, f'{number}.pdf')
        with open(filename, 'wb') as f:
            f.write(response.content)

    if failed:
        print(f'{len(failed)} of {len(links)} papers could not be downloaded:')
        for number, link, reason in failed:
            print(f'  {number}: {link} ({reason})')

In [3]:
# Demo run against a saved arXiv search.
# search strategy: title="deep learning"; AND title="health*"
url = (
    'https://arxiv.org/search/advanced?advanced='
    '&terms-0-operator=AND&terms-0-term=%22deep+learning%22&terms-0-field=title'
    '&terms-1-operator=AND&terms-1-term=%22health*%22&terms-1-field=title'
    '&classification-physics_archives=all&classification-include_cross_list=include'
    '&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date='
    '&date-date_type=submitted_date'
    '&abstracts=show&size=50&order=-announced_date_first'
)

# Scrape the results page, save it to Excel and preview the first rows.
df = scrape_arxiv(url, output_filepath='example.xlsx')
display(df.head())

# Fetch every PDF listed in the scraped table.
download_papers(links_list=df['pdf_link'])

38 results were found.


Unnamed: 0,id,title,authors,abstract,pdf_link
0,1,AlerTiger: Deep Learning for AI Model Health M...,"Zhentao Xu, Ruoying Wang, Girish Balaji, Manas...",Data-driven companies use AI models extensivel...,https://arxiv.org/pdf/2306.01977
1,2,Mitigating climate and health impact of small-...,"Usman Nazir, Murtaza Taj, Momin Uppal, Sara Kh...",Industrial air pollution has a direct health i...,https://arxiv.org/pdf/2303.11654
2,3,Deep Learning Mental Health Dialogue System,"Lennart Brocki, George C. Dyer, Anna Gładka, N...",Mental health counseling remains a major chall...,https://arxiv.org/pdf/2301.09412
3,4,Enhancing the prediction of disease outcomes u...,"Zhichao Yang, Weisong Liu, Dan Berlowitz, Hong Yu",Question: Can an encoder-decoder architecture ...,https://arxiv.org/pdf/2212.12067
4,5,Deep learning for structural health monitoring...,"Fabio Carrara, Fabrizio Falchi, Maria Girardi,...",Thanks to recent advancements in numerical met...,https://arxiv.org/pdf/2211.10351


Downloading papers: 100%|██████████| 38/38 [02:37<00:00,  4.15s/it]
