In [68]:
import pandas as pd
import numpy as np
import urllib.request as libreq
from groq import Groq
import xml.etree.ElementTree as ET
from IPython.display import display, Latex
from scholarly import ProxyGenerator, scholarly, MaxTriesExceededException
import certifi
import os
from datetime import datetime
from tqdm import tqdm
from dotenv import load_dotenv
load_dotenv()
from bs4 import BeautifulSoup
import re
import PyPDF2
import io

os.environ['SSL_CERT_FILE'] = certifi.where()

In [5]:
todays_date = datetime.today().strftime('%Y%m%d')
print(todays_date)
yesterdays_date = (datetime.today() - pd.Timedelta(days=1)).strftime('%Y%m%d')
print(yesterdays_date)
todays_date = '20241231'
yesterdays_date = '20241230'

20250103
20250102


In [17]:
query = 'search_query=cat:astro-ph*+AND+submittedDate:[' + str(yesterdays_date) + '+TO+' + str(todays_date) + ']&start=0&max_results=80&sortBy=submittedDate&sortOrder=ascending'
print(query)
base_url = 'http://export.arxiv.org/api/query?'

with libreq.urlopen(base_url + query) as url:
    r = url.read()
print(r)

# Parse the XML content
root = ET.fromstring(r)

# Function to print the XML in a readable format
def print_readable_xml(element, indent=""):
    for child in element:
        print(f"{indent}{child.tag}: {child.text.strip() if child.text else ''}")
        print_readable_xml(child, indent + "  ")


search_query=cat:astro-ph*+AND+submittedDate:[20241230+TO+20241231]&start=0&max_results=80&sortBy=submittedDate&sortOrder=ascending


In [85]:
link = 'https://arxiv.org/list/astro-ph/new'

page = libreq.urlopen(link)
html = page.read().decode('utf-8')
print(html)

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">

<head>  <title>Astrophysics  </title>
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <link rel="apple-touch-icon" sizes="180x180" href="/static/browse/0.3.4/images/icons/apple-touch-icon.png">
  <link rel="icon" type="image/png" sizes="32x32" href="/static/browse/0.3.4/images/icons/favicon-32x32.png">
  <link rel="icon" type="image/png" sizes="16x16" href="/static/browse/0.3.4/images/icons/favicon-16x16.png">
  <link rel="manifest" href="/static/browse/0.3.4/images/icons/site.webmanifest">
  <link rel="mask-icon" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" color="#5bbad5">
  <meta name="msapplication-TileColor" content="#da532c">
  <meta name="theme-color" content="#ffffff">
  <link rel="stylesheet" type="text/css" 

In [86]:
soup = BeautifulSoup(html, 'html.parser')
h3_tag = soup.find('h3', string=lambda x: x and 'New submissions' in x)
if h3_tag:
    number_of_papers = int(h3_tag.string.split('(')[1].split()[1])
    print(f"Number of papers: {number_of_papers}")
else:
    print("Tag not found")

Number of papers: 55


In [164]:
def extract_paper_metadata(xml_part):
    soup = BeautifulSoup(xml_part, 'html.parser')

    # title
    title_tag = soup.find('div', class_='list-title mathjax')
    title = title_tag.get_text(strip=True).replace('Title:', '').strip() if title_tag else None

    # abstract
    abstract_tag = soup.find('p', class_='mathjax')
    abstract = abstract_tag.get_text(strip=True) if abstract_tag else None

    # authors
    authors_section = soup.find('div', class_='list-authors')
    authors = [author.get_text(strip=True) for author in authors_section.find_all('a')] if authors_section else []

    # comments
    comments_tag = soup.find('div', class_='list-comments mathjax')
    comments = comments_tag.get_text(strip=True).replace('Comments:', '').strip() if comments_tag else ''
    
    # figures, pages, tables
    figures_match = re.search(r'(\d+)\s+figures', comments)
    figures = int(figures_match.group(1)) if figures_match else None
    pages_match = re.search(r'(\d+)\s+pages', comments)
    pages = int(pages_match.group(1)) if pages_match else None
    tables_match = re.search(r'(\d+)\s+table[s]?', comments)
    tables = int(tables_match.group(1)) if tables_match else None

    # PDF link
    pdf_tag = soup.find('a', title='Download PDF')
    pdf_link = pdf_tag['href'] if pdf_tag else None

    # primary subject
    primary_subject_tag = soup.find('span', class_='primary-subject')
    primary_subject = primary_subject_tag.get_text(strip=True) if primary_subject_tag else None

    # secondary subjects
    secondary_subjects_section = soup.find('div', class_='list-subjects').get_text(strip=True)
    subjects_split = secondary_subjects_section.split(';')
    secondary_subjects = [subject.strip() for subject in subjects_split[1:]] if len(subjects_split) > 1 else None

    # journal
    submitted_journal = comments.split('Submitted to ')[-1] if 'Submitted to' in comments else None
    submitted_journal = comments.split('Accepted to ')[-1] if 'Accepted to' in comments else submitted_journal
    submitted_journal = comments.split('Accepted for publication in ')[-1] if 'Accepted for publication in' in comments else submitted_journal
    submitted_journal = comments.split('Accepted by ')[-1] if 'Accepted by' in comments else submitted_journal
    submitted_journal = comments.split('Submitted by ')[-1] if 'Submitted by' in comments else submitted_journal

    return {
        'title': title,
        'abstract': abstract,
        'authors': authors,
        'figures': figures,
        'pages': pages,
        'tables': tables,
        'pdf_link': 'arxiv.org' + pdf_link,
        'primary_subject': primary_subject,
        'secondary_subjects': secondary_subjects,
        'submitted_journal': submitted_journal
    }

def extract_all_papers(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    all_metadata = []

    # <a name='itemX'>
    items = soup.find_all('a', attrs={'name': True})

    for i in range(number_of_papers - 1):
        start = items[i]
        end = items[i + 1]

        start_index = str(soup).find(str(start))
        end_index = str(soup).find(str(end))
        xml_part = str(soup)[start_index:end_index]

        metadata = extract_paper_metadata(xml_part)
        all_metadata.append(metadata)

    last_item = end
    start_index = str(soup).find(str(last_item))
    xml_part = str(soup)[start_index:]
    metadata = extract_paper_metadata(xml_part)
    all_metadata.append(metadata)

    return all_metadata


def metadata_to_dataframe(metadata_list):
    return pd.DataFrame(metadata_list)


In [74]:
metadata = extract_paper_metadata(html)
print(metadata)

{'title': 'Insights on Galaxy Evolution from Interpretable Sparse Feature Networks', 'abstract': "Galaxy appearances reveal the physics of how they formed and evolved. Machine learning models can now exploit galaxies' information-rich morphologies to predict physical properties directly from image cutouts. Learning the relationship between pixel-level features and galaxy properties is essential for building a physical understanding of galaxy evolution, but we are still unable to explicate the details of how deep neural networks represent image features. To address this lack of interpretability, we present a novel neural network architecture called a Sparse Feature Network (SFNet). SFNets produce interpretable features that can be linearly combined in order to estimate galaxy properties like optical emission line ratios or gas-phase metallicity. We find that SFNets do not sacrifice accuracy in order to gain interpretability, and that they perform comparably well to cutting-edge models o

In [75]:
metadata_list = [(key, value) for key, value in metadata.items()]
print(metadata_list)

[('title', 'Insights on Galaxy Evolution from Interpretable Sparse Feature Networks'), ('abstract', "Galaxy appearances reveal the physics of how they formed and evolved. Machine learning models can now exploit galaxies' information-rich morphologies to predict physical properties directly from image cutouts. Learning the relationship between pixel-level features and galaxy properties is essential for building a physical understanding of galaxy evolution, but we are still unable to explicate the details of how deep neural networks represent image features. To address this lack of interpretability, we present a novel neural network architecture called a Sparse Feature Network (SFNet). SFNets produce interpretable features that can be linearly combined in order to estimate galaxy properties like optical emission line ratios or gas-phase metallicity. We find that SFNets do not sacrifice accuracy in order to gain interpretability, and that they perform comparably well to cutting-edge model

In [76]:
df = pd.DataFrame([metadata])
df

Unnamed: 0,title,abstract,authors,figures,pages,tables,pdf_link,primary_subject,submitted_journal
0,Insights on Galaxy Evolution from Interpretabl...,Galaxy appearances reveal the physics of how t...,[John F. Wu],4,10,2,/pdf/2501.00089,Astrophysics of Galaxies (astro-ph.GA),"AAS Journals. 10 pages, 4 figures, 2 tables"


In [165]:
metadata_list = extract_all_papers(html)
df = metadata_to_dataframe(metadata_list)

In [170]:
def remove_brackets(text):
    return re.sub(r'\(.*?\)', '', text).strip()

df['primary_subject'] = df['primary_subject'].map(remove_brackets)
df['secondary_subjects'] = df['secondary_subjects'].map(lambda x: [remove_brackets(subject) for subject in x], na_action='ignore') 

In [175]:
df['submitted_journal'] = df['submitted_journal'].str.split(r'[,;:.]').str[0]

In [173]:
for i in range(len(df)):
    if pd.isna(df['pages'][i]) or pd.isna(df['figures'][i]) or pd.isna(df['tables'][i]):

        pdf_link = df['pdf_link'][i]
        pdf_response = libreq.urlopen('https://' + pdf_link)
        pdf_file = pdf_response.read()
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))

        # Number of Pages
        if pd.isna(df['pages'][i]):
            num_pages = len(pdf_reader.pages)
            df['pages'][i] = num_pages

        # Number of Figures
        if pd.isna(df['figures'][i]):
            highest_figure_number = 0
            for page in pdf_reader.pages:
                text = page.extract_text()
                figure_numbers = re.findall(r'(?i)(?:Figure|Fig.|Figure.|Fig})\s+(\d+)', text)
                if figure_numbers:
                    highest_figure_number = max(highest_figure_number, max(map(int, figure_numbers)))
            df['figures'][i] = highest_figure_number

        # Number of Tables
        if pd.isna(df['tables'][i]):
            highest_table_number = 0
            for page in pdf_reader.pages:
                text = page.extract_text()
                table_numbers = re.findall(r'(?i)(?:Table|Table.})\s+(\d+)', text)
                if table_numbers:
                    highest_table_number = max(highest_table_number, max(map(int, table_numbers)))
            df['tables'][i] = highest_table_number

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pages'][i] = num_pages
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['figures'][i] = highest_figure_number
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tables'][i] = highest_table_number
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['figures'][i] = highest_figure_number
A value is trying to

In [176]:
df

Unnamed: 0,title,abstract,authors,figures,pages,tables,pdf_link,primary_subject,secondary_subjects,submitted_journal
0,Insights on Galaxy Evolution from Interpretabl...,Galaxy appearances reveal the physics of how t...,[John F. Wu],4.0,10.0,2.0,arxiv.org/pdf/2501.00089,Astrophysics of Galaxies,[Machine Learning],AAS Journals
1,Hunting for the candidates of Changing-Look Bl...,The changing-look blazars (CLBs) are the blaza...,"[Shi-Ju Kang, Shan-Shan Ren, Yong-Gang Zheng, ...",7.0,13.0,3.0,arxiv.org/pdf/2501.00094,High Energy Astrophysical Phenomena,,ApJ
2,First unambiguous detection of ammonia in the ...,The newly accessible mid-infrared (MIR) window...,"[Mathilde Mâlin, Anthony Boccaletti, Clément P...",6.0,18.0,4.0,arxiv.org/pdf/2501.00104,Earth and Planetary Astrophysics,,A&A
3,Exchange of meteorites between the terrestrial...,The evolution of the orbits of bodies ejected ...,[S.I. Ipatov],0.0,6.0,0.0,arxiv.org/pdf/2501.00134,Earth and Planetary Astrophysics,,
4,Autoencoder Reconstruction of Cosmological Mic...,Enhanced modeling of microlensing variations i...,"[Somayeh Khakpash, Federica Bianco, Georgios V...",11.0,18.0,1.0,arxiv.org/pdf/2501.00153,Instrumentation and Methods for Astrophysics,,The Astrophysical Journal
5,The Extreme Space Weather Event of 1872 Februa...,"We review observations of solar activity, geom...","[Hisashi Hayakawa, Edward W. Cliver, Frédéric ...",13.0,20.0,2.0,arxiv.org/pdf/2501.00176,Solar and Stellar Astrophysics,"[Earth and Planetary Astrophysics, Geophysics,...",
6,Electromotive field in space and astrophysical...,The concept of electromotive field appears in ...,"[Philippe-A. Bourdin, Yasuhito Narita]",8.0,23.0,0.0,arxiv.org/pdf/2501.00181,Solar and Stellar Astrophysics,"[Earth and Planetary Astrophysics, High Energy...",
7,Can tensor-scalar induced GWs dominate PTA obs...,Observational constraints on small-scale primo...,"[Di Wu, Jing-Zhi Zhou, Yu-Ting Kuang, Zhi-Chao...",7.0,19.0,1.0,arxiv.org/pdf/2501.00228,Cosmology and Nongalactic Astrophysics,"[High Energy Astrophysical Phenomena, General ...",
8,Improving image quality of the Solar Disk Imag...,The in-flight calibration and performance of t...,"[Hui Liu, Hui Li, Sizhong Zou, Kaifan Ji, Zhen...",10.0,14.0,1.0,arxiv.org/pdf/2501.00231,Solar and Stellar Astrophysics,[Instrumentation and Methods for Astrophysics],
9,On the Duration of Gamma-Ray Bursts,"Recently, a short-duration GRB with supernova ...",[Bing Zhang],1.0,10.0,0.0,arxiv.org/pdf/2501.00239,High Energy Astrophysical Phenomena,,
