In [18]:
import arxiv
from semanticscholar import SemanticScholar
from tqdm import tqdm
from pathlib import Path
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, utility
from typing import List, Dict, Any
from config import Config
import requests
from src.embed.vector_db import MilvusManager

In [4]:
# Construct the default API client.
client = arxiv.Client()
sch = SemanticScholar()


In [13]:
search = arxiv.Search(
  query = "ti:\"Lightweight G-YOLOv11: Advancing Efficient Fracture Detection\"",
  max_results = 1,
  sort_by = arxiv.SortCriterion.SubmittedDate
)

results = client.results(search)

In [14]:
for r in results:
    print(r.title)
    print(r.published)
    print(r.primary_category)


Lightweight G-YOLOv11: Advancing Efficient Fracture Detection in Pediatric Wrist X-rays
2024-12-31 21:07:40+00:00
eess.IV


In [9]:
r.pdf_url

'http://arxiv.org/pdf/2502.19414v1'

In [21]:
# https://arxiv.org/category_taxonomy

arxiv.Result(entry_id='http://arxiv.org/abs/2502.19414v1', updated=datetime.datetime(2025, 2, 26, 18, 58, 13, tzinfo=datetime.timezone.utc), published=datetime.datetime(2025, 2, 26, 18, 58, 13, tzinfo=datetime.timezone.utc), title='Can Language Models Falsify? Evaluating Algorithmic Reasoning with Counterexample Creation', authors=[arxiv.Result.Author('Shiven Sinha'), arxiv.Result.Author('Shashwat Goel'), arxiv.Result.Author('Ponnurangam Kumaraguru'), arxiv.Result.Author('Jonas Geiping'), arxiv.Result.Author('Matthias Bethge'), arxiv.Result.Author('Ameya Prabhu')], summary="There is growing excitement about the potential of Language Models (LMs) to\naccelerate scientific discovery. Falsifying hypotheses is key to scientific\nprogress, as it allows claims to be iteratively refined over time. This process\nrequires significant researcher effort, reasoning, and ingenuity. Yet current\nbenchmarks for LMs predominantly assess their ability to generate solutions\nrather than challenge them. 

In [16]:
query = "2412.19446"  # Replace with the arXiv ID you're interested in
paper = sch.get_paper(query)

In [57]:
print(x['numCitedBy']) # number of citations
print(x['influentialCitationCount']) # number of influential citations
print(x['citationVelocity']) # citation velocity 

0
0
0


In [122]:
def get_citation_info(arxiv_id):
    base_url = "https://api.semanticscholar.org/v1/paper/"
    url = f"{base_url}arXiv:{arxiv_id}"

    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        return data.get('citationCount', 0), data.get('influentialCitationCount', 0), data.get('citationVelocity', 0)



In [77]:
config = Config()

In [16]:
# This creates a search for all CS papers from 2024
search = arxiv.Search(
    query=f"cat:cs.* AND submittedDate:[2024 TO 2025]",
    max_results=1000,  # Fetch all papers
    sort_by=arxiv.SortCriterion.SubmittedDate
)

results = client.results(search)

In [17]:
len(list(results))

1000

In [None]:
for 

In [114]:
papers = []
chunk_count = 0
results = client.results(search)

for paper in tqdm(results):
    try:
        papers.append({
            'title': paper.title,
            'arxiv_id': paper.get_short_id(),
            'category': paper.primary_category,
            'summary': paper.summary,
            'submitted_date': paper.published.strftime('%Y-%m-%d')
        })
        citation_count, influential_citation_count, citation_velocity = get_citation_info(paper.get_short_id())
    except Exception as e:
        print(f"Error fetching paper {paper.get_short_id()}: {e}")
        continue





100it [00:01, 86.68it/s]


In [117]:
papers[5]

{'title': 'Titans: Learning to Memorize at Test Time',
 'arxiv_id': '2501.00663v1',
 'category': 'cs.LG',
 'summary': 'Over more than a decade there has been an extensive research effort on how to\neffectively utilize recurrent models and attention. While recurrent models aim\nto compress the data into a fixed-size memory (called hidden state), attention\nallows attending to the entire context window, capturing the direct\ndependencies of all tokens. This more accurate modeling of dependencies,\nhowever, comes with a quadratic cost, limiting the model to a fixed-length\ncontext. We present a new neural long-term memory module that learns to\nmemorize historical context and helps attention to attend to the current\ncontext while utilizing long past information. We show that this neural memory\nhas the advantage of fast parallelizable training while maintaining a fast\ninference. From a memory perspective, we argue that attention due to its\nlimited context but accurate dependency modeli

In [20]:
from pathlib import Path
import json

def analyze_jsonl_files(directory: str = "data/raw"):
    """Analyze JSONL files in directory"""
    path = Path(directory)
    jsonl_files = list(path.glob("*.jsonl"))
    
    total_files = len(jsonl_files)
    total_papers = 0
    papers_by_file = {}
    
    for file in jsonl_files:
        paper_count = sum(1 for _ in open(file))
        papers_by_file[file.name] = paper_count
        total_papers += paper_count
    
    print(f"\nFound {total_files} JSONL files:")
    for filename, count in papers_by_file.items():
        print(f"- {filename}: {count} papers")
        
    print(f"\nTotal papers across all files: {total_papers}")
    
    return total_files, total_papers

total_files, total_papers = analyze_jsonl_files()



Found 38 JSONL files:
- machine_learning_2024.jsonl: 1359 papers
- game_theory_2024.jsonl: 68 papers
- computation_and_language_2024.jsonl: 1030 papers
- neural_and_evolutionary_computing_2024.jsonl: 57 papers
- information_theory_2024.jsonl: 125 papers
- performance_2024.jsonl: 8 papers
- networking_and_internet_2024.jsonl: 98 papers
- logic_in_computer_science_2024.jsonl: 65 papers
- robotics_2024.jsonl: 382 papers
- computational_geometry_2024.jsonl: 25 papers
- software_engineering_2024.jsonl: 196 papers
- mathematical_software_2024.jsonl: 1 papers
- computers_and_society_2024.jsonl: 155 papers
- computational_engineering_2024.jsonl: 63 papers
- multimedia_2024.jsonl: 21 papers
- human_computer_interaction_2024.jsonl: 199 papers
- information_retrieval_2024.jsonl: 147 papers
- hardware_architecture_2024.jsonl: 54 papers
- programming_languages_2024.jsonl: 28 papers
- artificial_intelligence_2024.jsonl: 420 papers
- data_structures_and_algorithms_2024.jsonl: 74 papers
- sound_2024.