In [39]:
import pandas as pd
import numpy as np
import urllib.request as libreq
from groq import Groq
import xml.etree.ElementTree as ET
from IPython.display import display, Latex
from scholarly import ProxyGenerator, scholarly
import certifi
import os
from datetime import datetime
from tqdm import tqdm
from dotenv import load_dotenv
load_dotenv()

os.environ['SSL_CERT_FILE'] = certifi.where()

In [19]:
todays_date = datetime.today().strftime('%Y%m%d')
print(todays_date)
yesterdays_date = (datetime.today() - pd.Timedelta(days=1)).strftime('%Y%m%d')
print(yesterdays_date)
todays_date = '20241231'
yesterdays_date = '20241230'

20250102
20250101


In [20]:
query = 'search_query=cat:cs*+AND+submittedDate:[' + str(yesterdays_date) + '+TO+' + str(todays_date) + ']&start=0&max_results=50&sortBy=submittedDate&sortOrder=ascending'
print(query)
base_url = 'http://export.arxiv.org/api/query?'

with libreq.urlopen(base_url + query) as url:
    r = url.read()
print(r)

# Parse the XML content
root = ET.fromstring(r)

# Function to print the XML in a readable format
def print_readable_xml(element, indent=""):
    for child in element:
        print(f"{indent}{child.tag}: {child.text.strip() if child.text else ''}")
        print_readable_xml(child, indent + "  ")


search_query=cat:cs*+AND+submittedDate:[20241230+TO+20241231]&start=0&max_results=50&sortBy=submittedDate&sortOrder=ascending
b'<?xml version="1.0" encoding="UTF-8"?>\n<feed xmlns="http://www.w3.org/2005/Atom">\n  <link href="http://arxiv.org/api/query?search_query%3Dcat%3Acs%2A%20AND%20submittedDate%3A%5B20241230%20TO%2020241231%5D%26id_list%3D%26start%3D0%26max_results%3D50" rel="self" type="application/atom+xml"/>\n  <title type="html">ArXiv Query: search_query=cat:cs* AND submittedDate:[20241230 TO 20241231]&amp;id_list=&amp;start=0&amp;max_results=50</title>\n  <id>http://arxiv.org/api/fbaYJmFEKyiMDiaDZQ3ckj88AEI</id>\n  <updated>2025-01-01T00:00:00-05:00</updated>\n  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">243</opensearch:totalResults>\n  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>\n  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">50</opensearch:

In [21]:
data = pd.DataFrame(columns=['title', 'abstract', 'authors', 'published', 'link', 'category'])
namespace = {'atom': 'http://www.w3.org/2005/Atom'}

entries = []
for entry in root.findall('atom:entry', namespace):
    title = entry.find('atom:title', namespace).text.strip()
    abstract = entry.find('atom:summary', namespace).text.strip()
    authors = ', '.join([author.find('atom:name', namespace).text.strip() for author in entry.findall('atom:author', namespace)])
    published = entry.find('atom:published', namespace).text.strip()
    link = entry.find('atom:link[@rel="alternate"]', namespace).attrib['href']
    category = entry.find('atom:category', namespace).attrib['term']
    
    entries.append({'title': title, 'abstract': abstract, 'authors': authors, 'published': published, 'link': link, 'category': category})

data = pd.concat([data, pd.DataFrame(entries)], ignore_index=True)

print(data)

                                                title  \
0                Computing with D-Algebraic Sequences   
1   Slow Perception: Let's Perceive Geometric Figu...   
2   EVOLVE: Emotion and Visual Output Learning via...   
3   Graph Neural Networks for Next-Generation-IoT:...   
4   NetFlowGen: Leveraging Generative Pre-training...   
5   Knowledge Editing for Large Language Model wit...   
6   Predicting Long Term Sequential Policy Value U...   
7   SafeSynthDP: Leveraging Large Language Models ...   
8   Uncertainty Herding: One Active Learning Metho...   
9   YOLO-UniOW: Efficient Universal Open-World Obj...   
10  Enhancing Visual Representation for Text-based...   
11  Latent Drifting in Diffusion Models for Counte...   
12  Impact of Cognitive Load on Human Trust in Hyb...   
13  Environmental and Economic Impact of I/O Devic...   
14  Overcoming Class Imbalance: Unified GNN Learni...   
15  Diffgrasp: Whole-Body Grasping Synthesis Guide...   
16  Nanosatellite Design Consid

In [22]:
data

Unnamed: 0,title,abstract,authors,published,link,category
0,Computing with D-Algebraic Sequences,A sequence is difference algebraic (or D-algeb...,Bertrand Teguia Tabuguia,2024-12-30T00:39:57Z,http://arxiv.org/abs/2412.20630v1,math.AG
1,Slow Perception: Let's Perceive Geometric Figu...,"Recently, ""visual o1"" began to enter people's ...","Haoran Wei, Youyang Yin, Yumeng Li, Jia Wang, ...",2024-12-30T00:40:35Z,http://arxiv.org/abs/2412.20631v1,cs.CV
2,EVOLVE: Emotion and Visual Output Learning via...,Human acceptance of social robots is greatly e...,"Jordan Sinclair, Christopher Reardon",2024-12-30T00:43:31Z,http://arxiv.org/abs/2412.20632v1,cs.RO
3,Graph Neural Networks for Next-Generation-IoT:...,Graph Neural Networks (GNNs) have emerged as a...,"Nguyen Xuan Tung, Le Tung Giang, Bui Duc Son, ...",2024-12-30T00:46:48Z,http://arxiv.org/abs/2412.20634v1,cs.IT
4,NetFlowGen: Leveraging Generative Pre-training...,Understanding the traffic dynamics in networks...,"Jiawei Zhou, Woojeong Kim, Zhiying Xu, Alexand...",2024-12-30T00:47:49Z,http://arxiv.org/abs/2412.20635v1,cs.LG
5,Knowledge Editing for Large Language Model wit...,As real-world knowledge is constantly evolving...,"Yongchang Li, Yujin Zhu, Tao Yan, Shijian Fan,...",2024-12-30T00:58:00Z,http://arxiv.org/abs/2412.20637v1,cs.CL
6,Predicting Long Term Sequential Policy Value U...,"Performing policy evaluation in education, hea...","Hyunji Nam, Allen Nie, Ge Gao, Vasilis Syrgkan...",2024-12-30T01:01:15Z,http://arxiv.org/abs/2412.20638v1,cs.AI
7,SafeSynthDP: Leveraging Large Language Models ...,Machine learning (ML) models frequently rely o...,"Md Mahadi Hasan Nahid, Sadid Bin Hasan",2024-12-30T01:10:10Z,http://arxiv.org/abs/2412.20641v1,cs.LG
8,Uncertainty Herding: One Active Learning Metho...,Most active learning research has focused on m...,"Wonho Bae, Gabriel L. Oliveira, Danica J. Suth...",2024-12-30T01:33:42Z,http://arxiv.org/abs/2412.20644v1,cs.LG
9,YOLO-UniOW: Efficient Universal Open-World Obj...,Traditional object detection models are constr...,"Lihao Liu, Juexiao Feng, Hui Chen, Ao Wang, Li...",2024-12-30T01:34:14Z,http://arxiv.org/abs/2412.20645v1,cs.CV


In [35]:
data['summary'] = np.nan

client_summary = Groq(
    api_key=os.environ['GROQ_API_KEY_1'],
)

for i in range(len(data)):
    content = data['abstract'][i]
    summary = client_summary.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You need to provide a one line summary of the provided abstract. You should not fabricate any information. Please do not include any other text other than the requested content.",
            },
            
            {
                "role": "user",
                "content": content,
            }
        ],
        model="llama3-8b-8192",
        stream=False,
    )
    data['summary'][i] = summary.choices[0].message.content

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['summary'][i] = summary.choices[0].message.content


In [None]:
data['keywords'] = np.nan

client_keywords = Groq(
    api_key= os.environ['GROQ_API_KEY_2'],
)

for i in range(len(data)):
    title = data['title'][i]
    keywords = client_keywords.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You need to pick out 2-3 keywords from the provided title that are relevnt to the topic of the paper. Please do not include any other text explaining what is the response other than the requested content itself. No need to number the keywords.",
            },
            
            {
                "role": "user",
                "content": title,
            }
        ],
        model="llama3-8b-8192",
        stream=False,
    )
    data['keywords'][i] = keywords.choices[0].message.content

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['keywords'][i] = keywords.choices[0].message.content


In [25]:
data['published'] = pd.to_datetime(data['published'])
data['published'] = data['published'].dt.date

In [15]:
category_replacements = {
    'CO': 'Cosmology and Nongalactic Astrophysics',
    'EP': 'Earth and Planetary Astrophysics',
    'GA': 'Astrophysics of Galaxies',
    'HE': 'High Energy Astrophysical Phenomena',
    'IM': 'Instrumentation and Methods for Astrophysics',
    'SR': 'Solar and Stellar Astrophysics'
}

def replace_category(category):
    category = category.removeprefix('astro-ph.')
    for key, value in category_replacements.items():
        category = category.replace(key, value)
    return category

data['category'] = data['category'].map(replace_category)

In [None]:
data['affiliation'] = np.nan

for i in range(len(data)):
    authors = data['authors'][i].split(', ')
    if len(authors) > 5:
        authors = authors[:5]
    
    print(authors)
    
    affiliation = []

    for j in range(len(authors)):
        print(authors[j])
        pg = ProxyGenerator()
        pg.FreeProxies()
        scholarly.use_proxy(pg)
        
        search_query = scholarly.search_author(authors[j])

        check = next(search_query, None)

        if check is None:
            affiliation.append('')
            continue
        
        # Retrieve all the details for the author
        author = scholarly.fill(check)
        if author is None:
            affiliation.append('')

        scholarly.pprint(author)
        affil = author['affiliation']
        print(affil)

        if ',' in affil:
            parts = affil.split(',')
            position = parts[0]
            a = ','.join(parts[1:]).strip()
        else:
            position = ''
            a = affil

        print(a)

        affiliation.append(a)
    
    print(affiliation)
    
    data['affiliation'][i] = affiliation


['Bertrand Teguia Tabuguia']
Bertrand Teguia Tabuguia
['']
['Haoran Wei', 'Youyang Yin', 'Yumeng Li', 'Jia Wang', 'Liang Zhao']
Haoran Wei


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['affiliation'][i] = affiliation


b"{'affiliation': 'Assistant Professor, University of Wisconsin\xe2\x80\x93Madison',\n 'citedby': 3398,\n 'citedby5y': 2188,\n 'cites_per_year': {2013: 22,\n                    2014: 49,\n                    2015: 154,\n                    2016: 163,\n                    2017: 214,\n                    2018: 290,\n                    2019: 309,\n                    2020: 367,\n                    2021: 455,\n                    2022: 446,\n                    2023: 448,\n                    2024: 450,\n                    2025: 21},\n 'coauthors': [{'affiliation': 'Professor of Civil and Environmental '\n                               'Engineering, Virginia Tech',\n                'filled': False,\n                'name': 'Peter Vikesland',\n                'scholar_id': 'utmwmjwAAAAJ',\n                'source': 'CO_AUTHORS_LIST'},\n               {'affiliation': 'University Distinguished Professor, Charles P. '\n                               'Lunsford Professor, CEE, Virginia Tech',

MaxTriesExceededException: Cannot Fetch from Google Scholar.

In [38]:
data

Unnamed: 0,title,abstract,authors,published,link,category,summary,keywords,affiliation
0,Computing with D-Algebraic Sequences,A sequence is difference algebraic (or D-algeb...,Bertrand Teguia Tabuguia,2024-12-30,http://arxiv.org/abs/2412.20630v1,math.AG,Subsequences of difference algebraic sequences...,Sequences D-Algebraic Computing,[]
1,Slow Perception: Let's Perceive Geometric Figu...,"Recently, ""visual o1"" began to enter people's ...","Haoran Wei, Youyang Yin, Yumeng Li, Jia Wang, ...",2024-12-30,http://arxiv.org/abs/2412.20631v1,cs.CV,"The abstract introduces ""slow perception"" (SP)...",slow perception geometric figures,
2,EVOLVE: Emotion and Visual Output Learning via...,Human acceptance of social robots is greatly e...,"Jordan Sinclair, Christopher Reardon",2024-12-30,http://arxiv.org/abs/2412.20632v1,cs.RO,The development of large language models (LLMs...,"Emotion, Learning, LLM",
3,Graph Neural Networks for Next-Generation-IoT:...,Graph Neural Networks (GNNs) have emerged as a...,"Nguyen Xuan Tung, Le Tung Giang, Bui Duc Son, ...",2024-12-30,http://arxiv.org/abs/2412.20634v1,cs.IT,This survey explores the application of Graph ...,Graph Neural Networks IoT Advances Challenges,
4,NetFlowGen: Leveraging Generative Pre-training...,Understanding the traffic dynamics in networks...,"Jiawei Zhou, Woojeong Kim, Zhiying Xu, Alexand...",2024-12-30,http://arxiv.org/abs/2412.20635v1,cs.LG,Here is a one-line summary of the abstract:\n\...,"generative pre-training, network traffic, dyna...",
5,Knowledge Editing for Large Language Model wit...,As real-world knowledge is constantly evolving...,"Yongchang Li, Yujin Zhu, Tao Yan, Shijian Fan,...",2024-12-30,http://arxiv.org/abs/2412.20637v1,cs.CL,A novel knowledge editing method called Knowle...,Knowledge Neural,
6,Predicting Long Term Sequential Policy Value U...,"Performing policy evaluation in education, hea...","Hyunji Nam, Allen Nie, Ge Gao, Vasilis Syrgkan...",2024-12-30,http://arxiv.org/abs/2412.20638v1,cs.AI,The authors develop methods to estimate the pe...,Sequential Policy Value Softer Surrogates,
7,SafeSynthDP: Leveraging Large Language Models ...,Machine learning (ML) models frequently rely o...,"Md Mahadi Hasan Nahid, Sadid Bin Hasan",2024-12-30,http://arxiv.org/abs/2412.20641v1,cs.LG,The paper investigates the use of Large Langua...,"Large Language Models, Synthetic Data, Differe...",
8,Uncertainty Herding: One Active Learning Metho...,Most active learning research has focused on m...,"Wonho Bae, Gabriel L. Oliveira, Danica J. Suth...",2024-12-30,http://arxiv.org/abs/2412.20644v1,cs.LG,"A new active learning approach, uncertainty co...",uncertainty herding active learning,
9,YOLO-UniOW: Efficient Universal Open-World Obj...,Traditional object detection models are constr...,"Lihao Liu, Juexiao Feng, Hui Chen, Ao Wang, Li...",2024-12-30,http://arxiv.org/abs/2412.20645v1,cs.CV,This work introduces Universial Open-World Obj...,"Object Detection, Open-World, Efficient",


In [31]:
data.to_csv('astro-ph.csv', index=False)