## Requirements

In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

## Get abstract URLs

In [3]:
# Fetch and parse the arXiv listing page that the link extraction below reads.
url = "https://arxiv.org/list/astro-ph.SR/new"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
res = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as the listing.
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')

In [4]:
# Each <dt> on the listing page is treated as one entry: take its first anchor
# that carries an href and prefix the site-relative path with the arXiv host.
links = [
    f"https://arxiv.org{entry.find_all('a', href=True)[0]['href']}"
    for entry in soup.find_all('dt')
]

## Scraping from URLs

In [13]:
# Column-oriented accumulator: one independent, initially empty list per
# scraped field, to be filled with one element per article.
dict_art = {
    field: []
    for field in ('title', 'authors', 'comment', 'subjects', 'abstract')
}

In [14]:
# Visit every abstract page in `links` and append one value per article to each
# list in `dict_art` (title, authors, comment, subjects, abstract).

def _clean_text(text, label):
    """Collapse whitespace runs in `text` to single spaces and drop a leading `label`.

    Line breaks become spaces rather than being deleted, so words on adjacent
    lines are not fused together. If `text` does not start with `label`, the
    whole (whitespace-normalised) text is returned.
    """
    text = ' '.join(text.split())
    if text.startswith(label):
        text = text[len(label):]
    return text.strip()


def _meta_value(rows, label):
    """Return the text of the second cell of the first row whose first cell reads `label`.

    Rows are matched by their label rather than by position, so a missing or
    additional row does not shift the lookup. Returns None when no row matches.
    """
    for row in rows:
        cells = row.find_all('td')
        if len(cells) >= 2 and cells[0].text.strip() == label:
            return cells[1].text
    return None


for link in links:
    # Distinct names: the listing page's `res`/`soup` must stay intact so the
    # link-extraction cell can be re-run after this one.
    abs_res = requests.get(link, timeout=30)
    abs_res.raise_for_status()
    abs_soup = BeautifulSoup(abs_res.text, 'html.parser')

    # Title
    title_tag = abs_soup.find('h1', {"class": "title mathjax"})
    title = _clean_text(title_tag.text, 'Title:') if title_tag is not None else None

    # Authors (an empty list when the authors block is absent)
    authors_tag = abs_soup.find('div', {"class": "authors"})
    if authors_tag is not None:
        authors_list = [anchor.text for anchor in authors_tag.find_all('a')]
    else:
        authors_list = []

    # Comments and subjects come from the labelled rows of the page's tables.
    rows = abs_soup.find_all('tr')
    comment = _meta_value(rows, 'Comments:')
    subjects = _meta_value(rows, 'Subjects:')
    if subjects is not None:
        subjects = subjects.replace('\n', '')

    # Abstract
    abstract_tag = abs_soup.find('blockquote', {"class": "abstract mathjax"})
    if abstract_tag is not None:
        abstract = _clean_text(abstract_tag.text, 'Abstract:')
    else:
        abstract = None

    # Append all fields together, only after the whole page parsed, so every
    # list in `dict_art` always has the same length.
    record = {
        'title': title,
        'authors': authors_list,
        'comment': comment,
        'subjects': subjects,
        'abstract': abstract,
    }
    for field, value in record.items():
        dict_art[field].append(value)


In [16]:
# Build the frame with one row per field of `dict_art`, then flip it so each
# field becomes a column and each article a row.
df = pd.DataFrame.from_dict(dict_art, orient='index').transpose()

In [20]:
# Display the per-article author lists.
df['authors']

0     [Kareem El-Badry, Hans-Walter Rix, Yvette Cend...
1      [Philippe Z. Yao, Eliot Quataert, Andy Goulding]
2     [M. Lafarga, I. Ribas, M. Zechmeister, A. Rein...
3                                   [A. Ali, A. Mindil]
4     [E. Antonucci, C. Downs, G. E. Capuano, D. Spa...
5                 [M. Kriginsky, R. Oliver, D. Kuridze]
6                     [Gregory W. Henry, Jacob L. Bean]
7     [Niall Whiteford, Alistair Glasse, Katy L. Chu...
8     [Viraj Manwadkar, Alessandro Alberto Trani, Ba...
9     [Sven Kiefer, David Gobrecht, Leen Decin, Chri...
10    [Omkar Dhamane, Anil Raghav, Zubair Shaikh, Ut...
11    [Florian Kirchschlager, Franziska D. Schmidt, ...
12        [Markus J. Aschwanden, Nived Vilangot Nhalil]
13    [P. Beniamini, Z. Wadiasingh, J. Hare, K. Rajw...
14            [Justin Kin Jun Hew, Christoph Federrath]
15                                 [Janosz W. Dewberry]
16    [H. Barzegar, M. Bigdeli, G. H. Bordbar, B. Es...
Name: authors, dtype: object

In [21]:
# Number of authors per article (each `authors` entry is a list of names).
df['nb_of_aut'] = df.authors.apply(len)
# An empty path cannot be opened for writing, so export to a named file.
OUTPUT_CSV = "arxiv_astro-ph_SR_new.csv"
df.to_csv(OUTPUT_CSV)