In [25]:
import os
import re
import urllib.request as libreq
import xml.etree.ElementTree as ET
from datetime import datetime

import certifi
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from groq import Groq
from IPython.display import display, Latex
from scholarly import ProxyGenerator, scholarly, MaxTriesExceededException
from tqdm import tqdm

load_dotenv()

# Export certifi's CA bundle path through the SSL_CERT_FILE environment variable.
os.environ['SSL_CERT_FILE'] = certifi.where()

In [5]:
# Echo the current and previous calendar day as compact YYYYMMDD stamps.
todays_date = f"{datetime.today():%Y%m%d}"
print(todays_date)
yesterdays_date = f"{datetime.today() - pd.Timedelta(days=1):%Y%m%d}"
print(yesterdays_date)

# Override the computed stamps with a fixed two-day window.
todays_date, yesterdays_date = '20241231', '20241230'

20250103
20250102


In [17]:
# Endpoint plus query string: astro-ph* submissions inside the date window,
# at most 80 results, ordered by ascending submission date.
base_url = 'http://export.arxiv.org/api/query?'
query = (
    f'search_query=cat:astro-ph*+AND+submittedDate:[{yesterdays_date}+TO+{todays_date}]'
    '&start=0&max_results=80&sortBy=submittedDate&sortOrder=ascending'
)
print(query)

# Download the raw response body (bytes) and echo it.
with libreq.urlopen(base_url + query) as url:
    r = url.read()
print(r)

# Turn the response into an ElementTree root element for the cells below.
root = ET.fromstring(r)

# Function to print the XML in a readable format
def print_readable_xml(element, indent=""):
    """Print every descendant of ``element`` as ``tag: text``, one per line.

    Children are visited depth-first; each nesting level adds two spaces to
    ``indent``. Element text is stripped, and a missing text is shown as an
    empty string.
    """
    deeper_indent = indent + "  "
    for node in element:
        text = node.text.strip() if node.text else ''
        print(f"{indent}{node.tag}: {text}")
        print_readable_xml(node, deeper_indent)


search_query=cat:astro-ph*+AND+submittedDate:[20241230+TO+20241231]&start=0&max_results=80&sortBy=submittedDate&sortOrder=ascending


In [18]:
# XML namespace prefix map used for every lookup in the Atom feed.
namespace = {'atom': 'http://www.w3.org/2005/Atom'}

# Collect one record (dict) per <entry> element in the parsed feed.
entries = []
for entry in root.findall('atom:entry', namespace):
    title = entry.find('atom:title', namespace).text.strip()
    abstract = entry.find('atom:summary', namespace).text.strip()
    # All author names of the entry, joined into one comma-separated string.
    authors = ', '.join([author.find('atom:name', namespace).text.strip() for author in entry.findall('atom:author', namespace)])
    published = entry.find('atom:published', namespace).text.strip()
    link = entry.find('atom:link[@rel="alternate"]', namespace).attrib['href']
    # Only the first <category> element of the entry is recorded.
    category = entry.find('atom:category', namespace).attrib['term']

    entries.append({'title': title, 'abstract': abstract, 'authors': authors, 'published': published, 'link': link, 'category': category})

# Build the frame once, directly from the records, instead of concatenating
# onto an empty placeholder frame. Passing `columns` fixes the column order
# and keeps the schema intact even when the feed contains no entries.
data = pd.DataFrame(entries, columns=['title', 'abstract', 'authors', 'published', 'link', 'category'])

print(data)

                                                title  \
0   Lower Limits on Scalar Ultralight Dark Matter ...   
1   The Advantage of Early Detection and Localizat...   
2   SgrA* spin and mass estimates through the dete...   
3   Gravitational Lensing and Image Distortion by ...   
4   Solar Filaments Detection using Active Contour...   
5   Probing Long-Range Forces Between Neutrinos wi...   
6   The Pristine survey. XXVI. Chemical abundances...   
7   Pre-trained Audio Transformer as a Foundationa...   
8   Dynamical system describing cloud of particles...   
9   Robust Bayesian inference with gapped LISA dat...   
10  Rotational excitation cross sections for chlor...   
11  Constraining the $f$-mode oscillations frequen...   
12  Constraining the modified symmetric teleparall...   
13  Diprotodon on the sky. The Large Galactic Supe...   
14  Revisiting the flaring activity in early 2015 ...   
15  Mergers of Binary Primordial Black Holes in Ev...   
16  Recovering 21cm global sign

In [14]:
# Parse the `published` column into datetimes, then keep only the date part of each value.
data['published'] = pd.to_datetime(data['published']).dt.date

In [15]:
# Two-letter astro-ph subcategory codes mapped to their full names.
category_replacements = {
    'CO': 'Cosmology and Nongalactic Astrophysics',
    'EP': 'Earth and Planetary Astrophysics',
    'GA': 'Astrophysics of Galaxies',
    'HE': 'High Energy Astrophysical Phenomena',
    'IM': 'Instrumentation and Methods for Astrophysics',
    'SR': 'Solar and Stellar Astrophysics'
}

def replace_category(category):
    """Translate an astro-ph category code into its full name.

    The ``'astro-ph.'`` prefix is stripped and the remaining code is looked up
    in ``category_replacements`` as a whole string. Codes without an exact
    match are returned with only the prefix removed, so categories from other
    archives (``'gr-qc'``, ``'math.CO'``, ...) pass through unchanged.

    Parameters
    ----------
    category : str
        Category term such as ``'astro-ph.CO'`` or ``'hep-ph'``.

    Returns
    -------
    str
        The full subcategory name, or the input without the ``'astro-ph.'``
        prefix when the code is not in ``category_replacements``.
    """
    subcategory = category.removeprefix('astro-ph.')
    # Exact lookup rather than substring replacement: a substring replace
    # would rewrite the "CO" inside e.g. "math.CO" or "stat.CO".
    return category_replacements.get(subcategory, subcategory)

# Run every category value through replace_category and store the result back in place.
category_labels = data['category'].map(replace_category)
data['category'] = category_labels

In [16]:
# Show the `data` frame as this cell's output.
data

Unnamed: 0,title,abstract,authors,published,link,category
0,Lower Limits on Scalar Ultralight Dark Matter ...,Oscillations of scalar ultralight dark matter ...,"Tejas Deshpande, Andra Ionescu, Nicholas Mille...",2024-12-30,http://arxiv.org/abs/2412.20623v1,hep-ex
1,The Advantage of Early Detection and Localizat...,Early detection and localization of gravitatio...,"Tao Yang, Rong-Gen Cai, Zhoujian Cao, Hyung Mo...",2024-12-30,http://arxiv.org/abs/2412.20664v1,gr-qc
2,SgrA* spin and mass estimates through the dete...,We analyze the parameter estimation accuracy t...,"Verónica Vázquez-Aceves, Yiren Lin, Alejandro ...",2024-12-30,http://arxiv.org/abs/2412.20738v1,High Energy Astrophysical Phenomena
3,Gravitational Lensing and Image Distortion by ...,We investigate gravitational lensing by \texti...,"Shafia Maryam, Mubasher Jamil, Mustapha Azreg-...",2024-12-30,http://arxiv.org/abs/2412.20745v1,gr-qc
4,Solar Filaments Detection using Active Contour...,"In this article, an active contours without ed...","Sanmoy Bandyopadhyay, Vaibhav Pant",2024-12-30,http://arxiv.org/abs/2412.20749v1,cs.CV
5,Probing Long-Range Forces Between Neutrinos wi...,We study the consequences of new long-range fo...,"David E. Kaplan, Xuheng Luo, Surjeet Rajendran",2024-12-30,http://arxiv.org/abs/2412.20766v1,hep-ph
6,The Pristine survey. XXVI. Chemical abundances...,Context: The C-19 stellar stream is the most m...,"P. Bonifacio, E. Caffau, P. François, N. Marti...",2024-12-30,http://arxiv.org/abs/2412.20776v1,Astrophysics of Galaxies
7,Pre-trained Audio Transformer as a Foundationa...,As gravitational wave detectors become more ad...,"Chayan Chatterjee, Abigail Petulante, Karan Ja...",2024-12-30,http://arxiv.org/abs/2412.20789v1,gr-qc
8,Dynamical system describing cloud of particles...,We consider fairly general class of dynamical ...,"Robert Stańczy, Dorota Bors",2024-12-30,http://arxiv.org/abs/2412.20791v1,math-ph
9,Robust Bayesian inference with gapped LISA dat...,"The Laser Interferometer Space Antenna (LISA),...","Niklas Houba, Jean-Baptiste Bayle, Michele Val...",2024-12-30,http://arxiv.org/abs/2412.20793v1,Instrumentation and Methods for Astrophysics


In [20]:
# Listing page that the following cells scrape.
link = 'https://arxiv.org/list/astro-ph/new'

# Read the page inside a context manager so the HTTP response is closed as
# soon as the body has been read, instead of being left open.
with libreq.urlopen(link) as page:
    html = page.read().decode('utf-8')
print(html)

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">

<head>  <title>Astrophysics  </title>
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <link rel="apple-touch-icon" sizes="180x180" href="/static/browse/0.3.4/images/icons/apple-touch-icon.png">
  <link rel="icon" type="image/png" sizes="32x32" href="/static/browse/0.3.4/images/icons/favicon-32x32.png">
  <link rel="icon" type="image/png" sizes="16x16" href="/static/browse/0.3.4/images/icons/favicon-16x16.png">
  <link rel="manifest" href="/static/browse/0.3.4/images/icons/site.webmanifest">
  <link rel="mask-icon" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" color="#5bbad5">
  <meta name="msapplication-TileColor" content="#da532c">
  <meta name="theme-color" content="#ffffff">
  <link rel="stylesheet" type="text/css" 

In [None]:
soup = BeautifulSoup(html, 'html.parser')

# First <h3> whose text mentions "New submissions".
h3_tag = soup.find('h3', string=lambda text: text and 'New submissions' in text)

if not h3_tag:
    print("Tag not found")
else:
    # Take what follows the first "(" and read its second
    # whitespace-separated token as the paper count.
    inside_parentheses = h3_tag.string.split('(')[1]
    number_of_papers = int(inside_parentheses.split()[1])
    print(f"Number of papers: {number_of_papers}")

Number of papers: 41


In [28]:
def _count_from_comments(comments, noun):
    """Return the integer written directly before ``noun`` (or ``noun`` + "s") in ``comments``, else None."""
    match = re.search(rf'(\d+)\s*{re.escape(noun)}s?\b', comments, flags=re.IGNORECASE)
    return int(match.group(1)) if match else None


def extract_paper_metadata(html_content):
    """Extract metadata for the first paper found in ``html_content``.

    Parameters
    ----------
    html_content : str
        HTML markup of a listing page or of a single listing item.

    Returns
    -------
    dict
        Keys:

        - ``title``: text of the first ``list-title mathjax`` div with the
          ``'Title:'`` label removed, or ``None``.
        - ``abstract``: text of the first ``<p class="mathjax">``, or ``None``.
        - ``authors``: list of link texts whose ``href`` contains
          ``'searchtype=author'``.
        - ``figures`` / ``pages``: integer found before "figure(s)" /
          "page(s)" in the comments line, or ``None`` when absent.
        - ``pdf_link``: ``href`` of the first ``<a title="Download PDF">``,
          or ``None``.
        - ``primary_subject``: text of the first ``primary-subject`` span,
          or ``None``.
        - ``submitted_journal``: comment text after the last
          ``'Submitted to '``, or ``None``.
        - ``metadata``: ``{name: content}`` for every ``<meta>`` tag that has
          both attributes.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # title
    title_tag = soup.find('div', class_='list-title mathjax')
    title = title_tag.get_text(strip=True).replace('Title:', '').strip() if title_tag else None

    # abstract
    abstract_tag = soup.find('p', class_='mathjax')
    abstract = abstract_tag.get_text(strip=True) if abstract_tag else None

    # authors: every other field takes the FIRST match in the document, so
    # limit the author links to the first authors container as well. Without
    # this, passing a whole listing page returns the authors of every paper.
    # NOTE(review): assumes each paper's authors sit in <div class="list-authors">
    # -- confirm against the live markup. If no such div exists, the whole
    # document is scanned, as before.
    authors_scope = soup.find('div', class_='list-authors')
    if authors_scope is None:
        authors_scope = soup
    authors = [
        author.get_text(strip=True)
        for author in authors_scope.find_all('a')
        if 'searchtype=author' in author.get('href', '')
    ]

    # figures, tables, and pages / comments
    comments_tag = soup.find('div', class_='list-comments mathjax')
    comments = comments_tag.get_text(strip=True).replace('Comments:', '').strip() if comments_tag else ''
    # Search for "<number> figures" / "<number> pages" anywhere in the free-text
    # comment. Reading fixed comma-separated positions raised ValueError or
    # IndexError, or picked the wrong number, whenever the comment was not
    # exactly "N pages, M figures, ...".
    figures = _count_from_comments(comments, 'figure')
    pages = _count_from_comments(comments, 'page')

    # PDF link
    pdf_tag = soup.find('a', title='Download PDF')
    pdf_link = pdf_tag['href'] if pdf_tag else None

    # primary subject
    primary_subject_tag = soup.find('span', class_='primary-subject')
    primary_subject = primary_subject_tag.get_text(strip=True) if primary_subject_tag else None

    # journal
    submitted_journal = comments.split('Submitted to ')[-1] if 'Submitted to' in comments else None

    # metadata
    metadata = {}
    for meta_tag in soup.find_all('meta'):
        name = meta_tag.get('name')
        content = meta_tag.get('content')
        if name and content:
            metadata[name] = content

    return {
        'title': title,
        'abstract': abstract,
        'authors': authors,
        'figures': figures,
        'pages': pages,
        'pdf_link': pdf_link,
        'primary_subject': primary_subject,
        'submitted_journal': submitted_journal,
        'metadata': metadata
    }


In [None]:
# Run the extractor on the downloaded listing page and echo the resulting dict.
metadata = extract_paper_metadata(html_content=html)
print(metadata)

{'title': 'A new constraint on galaxy-halo connections of [O II] emitters via HOD modelling with angular clustering and luminosity functions from the Subaru HSC survey', 'abstract': 'Establishing a robust connection model between emission-line galaxies (ELGs) and their host dark haloes is of paramount importance in anticipation of upcoming redshift surveys. In this paper, we propose a novel halo occupation distribution (HOD) framework that incorporates galaxy luminosity, a key observable reflecting ELG star-formation activity, into the galaxy occupation model. This innovation enables prediction of galaxy luminosity functions (LFs) and facilitates joint analyses using both angular correlation functions (ACFs) and LFs. Using physical information from luminosity, our model provides more robust constraints on the ELG-halo connection compared to methods relying solely on ACF and number density constraints. Our model was applied to [O II]-emitting galaxies observed at two redshift slices at 

In [30]:
# Flatten the metadata dict into a list of (key, value) tuples, in insertion order.
metadata_list = list(metadata.items())
print(metadata_list)

[('title', 'A new constraint on galaxy-halo connections of [O II] emitters via HOD modelling with angular clustering and luminosity functions from the Subaru HSC survey'), ('abstract', 'Establishing a robust connection model between emission-line galaxies (ELGs) and their host dark haloes is of paramount importance in anticipation of upcoming redshift surveys. In this paper, we propose a novel halo occupation distribution (HOD) framework that incorporates galaxy luminosity, a key observable reflecting ELG star-formation activity, into the galaxy occupation model. This innovation enables prediction of galaxy luminosity functions (LFs) and facilitates joint analyses using both angular correlation functions (ACFs) and LFs. Using physical information from luminosity, our model provides more robust constraints on the ELG-halo connection compared to methods relying solely on ACF and number density constraints. Our model was applied to [O II]-emitting galaxies observed at two redshift slices 