In [1]:
import arxiv
from semanticscholar import SemanticScholar
from tqdm import tqdm
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, utility
from typing import List, Dict, Any
from config import Config
import requests
from src.embed.vector_db import MilvusManager
from src.embed.embedder import embed_batch
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader
from github import Github

In [60]:
# Construct the API clients shared by the cells below, both with default settings:
#   client -> arXiv client (arxiv.Client)
#   sch    -> Semantic Scholar client (SemanticScholar)
client = arxiv.Client()
sch = SemanticScholar()


In [None]:
# Smoke test: request at most one cs.* paper, sorted by submission date,
# and dump its main metadata fields.
search = arxiv.Search(
    query="cat:cs.* AND submittedDate:[2024 TO 2025]",
    max_results=1,
    sort_by=arxiv.SortCriterion.SubmittedDate,
)

res = client.results(search)

for p in res:
    # One field per output line, in this order: title, abstract, publication
    # timestamp, PDF link, primary category, author names.
    # (A `p.arxiv_id` print was tried here and left disabled.)
    print(
        p.title,
        p.summary,
        p.published,
        p.pdf_url,
        p.primary_category,
        [a.name for a in p.authors],
        sep="\n",
    )


Deeply Learned Robust Matrix Completion for Large-scale Low-rank Data Recovery
Robust matrix completion (RMC) is a widely used machine learning tool that
simultaneously tackles two critical issues in low-rank data analysis: missing
data entries and extreme outliers. This paper proposes a novel scalable and
learnable non-convex approach, coined Learned Robust Matrix Completion (LRMC),
for large-scale RMC problems. LRMC enjoys low computational complexity with
linear convergence. Motivated by the proposed theorem, the free parameters of
LRMC can be effectively learned via deep unfolding to achieve optimum
performance. Furthermore, this paper proposes a flexible
feedforward-recurrent-mixed neural network framework that extends deep
unfolding from fix-number iterations to infinite iterations. The superior
empirical performance of LRMC is verified with extensive experiments against
state-of-the-art on synthetic datasets and real applications, including video
background subtraction, ultrasou

In [5]:
paper.

['Jianheng Tang', 'Qifan Zhang', 'Yuhan Li', 'Nuo Chen', 'Jia Li']

In [None]:
# Point a PyPDFLoader at one locally downloaded arXiv PDF (2407.00379v2, "GraphArena").
# NOTE(review): this is an absolute, machine-specific path, so the cell only works where
# that file exists — consider a path relative to a configurable data directory.
pdf = PyPDFLoader("/home/ssaeed/arxiv-paper-research-agent/2407.00379v2.GraphArena__Evaluating_and_Exploring_Large_Language_Models_on_Graph_Computation.pdf")

In [25]:
# Run the loader; the result is iterated item by item (reading `.page_content`) in the next cell.
loaded_pdf = pdf.load()

In [None]:
# Join the text content of every loaded page into one string, with a newline between pages.
text = "\n".join(page.page_content for page in loaded_pdf)
# Bare expression: shows the whole extracted text as the cell output.
text

In [None]:
# Reference for the arXiv category codes used in the search queries (e.g. cs.*):
# https://arxiv.org/category_taxonomy

In [16]:
# Look up a single paper through the Semantic Scholar client `sch`.
# NOTE(review): the ID is passed without a prefix, while the REST helpers in this
# notebook build "arXiv:<id>" URLs — confirm that a bare arXiv ID is accepted here.
query = "2412.19446"  # Replace with the arXiv ID you're interested in
paper = sch.get_paper(query)

In [None]:
# Print three citation metrics from a paper record `x`.
# NOTE(review): `x` is not defined anywhere in the visible notebook, so on a fresh
# kernel this cell raises NameError — presumably it was a Semantic Scholar record
# bound in a since-removed cell; confirm and rebind before running.
print(x['numCitedBy']) # number of citations
print(x['influentialCitationCount']) # number of influential citations
print(x['citationVelocity']) # citation velocity 

In [122]:
def get_citation_info(arxiv_id, timeout=10):
    """Fetch citation metrics for an arXiv paper from the Semantic Scholar v1 REST API.

    Args:
        arxiv_id: arXiv identifier; it is inserted into the request URL as
            ``arXiv:<arxiv_id>``.
        timeout: Seconds to wait for the HTTP response (passed to
            ``requests.get``) so a stalled connection cannot hang the caller.

    Returns:
        A ``(citation_count, influential_citation_count, citation_velocity)``
        tuple. A metric that is absent from the response is reported as 0.

    Raises:
        RuntimeError: If the API answers with any status code other than 200.
            Previously the function fell through and returned ``None`` here,
            which made callers that unpack the tuple fail with an unrelated
            ``TypeError``.
    """
    base_url = "https://api.semanticscholar.org/v1/paper/"
    url = f"{base_url}arXiv:{arxiv_id}"

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to fetch citation info for {arxiv_id} "
            f"(HTTP {response.status_code})"
        )

    data = response.json()
    return (
        data.get('citationCount', 0),
        data.get('influentialCitationCount', 0),
        data.get('citationVelocity', 0),
    )



In [20]:
# This creates a search for all CS papers from 2024
search = arxiv.Search(
    query=f"cat:cs.* AND submittedDate:[20240101 TO 20240131]",
    max_results=None,  # Fetch all papers
#    sort_by=arxiv.SortCriterion.SubmittedDate
)

results = client.results(search)

In [None]:
# Collect one metadata record per search result, enriched with citation metrics.
papers = []
chunk_count = 0
results = client.results(search)

# Keys under which the three values returned by get_citation_info() are stored.
citation_fields = ('citation_count', 'influential_citation_count', 'citation_velocity')

for paper in tqdm(results):
    try:
        record = {
            'title': paper.title,
            'arxiv_id': paper.get_short_id(),
            'category': paper.primary_category,
            'summary': paper.summary,
            'submitted_date': paper.published.strftime('%Y-%m-%d'),
            # Filled in below; stay None when the citation lookup fails.
            'citation_count': None,
            'influential_citation_count': None,
            'citation_velocity': None,
        }
        # Append first so the paper is kept even if the citation lookup raises.
        papers.append(record)

        # Previously the metrics were fetched and then thrown away; store them
        # on the record instead.
        # NOTE(review): confirm the citation endpoint accepts the ID exactly as
        # get_short_id() returns it.
        citation_info = get_citation_info(record['arxiv_id'])
        if citation_info is not None:
            record.update(zip(citation_fields, citation_info))
    except Exception as e:
        print(f"Error fetching paper {paper.get_short_id()}: {e}")
        continue


In [5]:
from pathlib import Path
import json

def analyze_jsonl_files(directory: str = "data/raw"):
    """Count the papers stored in every JSONL file below ``directory``.

    Files are discovered recursively. Each file is grouped under the name of
    its parent directory (e.g. ``20240101``), and every non-blank line counts
    as one paper. A per-directory breakdown is printed to stdout.

    Args:
        directory: Root directory to scan. A missing directory is treated as
            containing no files.

    Returns:
        A ``(total_files, total_papers)`` tuple.
    """
    path = Path(directory)
    # Recursively find all JSONL files in all subdirectories
    jsonl_files = list(path.glob("**/*.jsonl"))

    total_files = len(jsonl_files)
    total_papers = 0
    papers_by_month = {}

    for file in jsonl_files:
        # Group by the name of the directory that directly contains the file.
        month_dir = file.parent.name

        # The context manager closes the handle right away (the former bare
        # open() leaked one descriptor per file). UTF-8 matches how the JSONL
        # files are written elsewhere in this notebook. Blank lines are not papers.
        with open(file, encoding="utf-8") as handle:
            paper_count = sum(1 for line in handle if line.strip())

        papers_by_month.setdefault(month_dir, {})[file.name] = paper_count
        total_papers += paper_count

    print(f"\nFound {total_files} JSONL files across all months:")
    for month in sorted(papers_by_month.keys()):
        month_total = sum(papers_by_month[month].values())
        print(f"\n{month} (Total: {month_total} papers):")
        for filename, count in papers_by_month[month].items():
            print(f"  - {filename}: {count} papers")

    print(f"\nTotal papers across all files: {total_papers}")

    return total_files, total_papers

# Run the analysis on the default "data/raw" directory and keep both totals.
total_files, total_papers = analyze_jsonl_files()


Found 460 JSONL files across all months:

20240101 (Total: 7870 papers):
  - game_theory.jsonl: 60 papers
  - discrete_mathematics.jsonl: 18 papers
  - hardware_architecture.jsonl: 57 papers
  - information_theory.jsonl: 324 papers
  - other_computer_science.jsonl: 5 papers
  - databases.jsonl: 35 papers
  - machine_learning.jsonl: 1096 papers
  - operating_systems.jsonl: 7 papers
  - cryptography_and_security.jsonl: 266 papers
  - digital_libraries.jsonl: 24 papers
  - data_structures_and_algorithms.jsonl: 67 papers
  - programming_languages.jsonl: 27 papers
  - neural_and_evolutionary_computing.jsonl: 76 papers
  - multiagent_systems.jsonl: 39 papers
  - performance.jsonl: 10 papers
  - human_computer_interaction.jsonl: 228 papers
  - computer_vision.jsonl: 1362 papers
  - networking_and_internet.jsonl: 135 papers
  - mathematical_software.jsonl: 8 papers
  - computational_engineering.jsonl: 56 papers
  - emerging_technologies.jsonl: 29 papers
  - graphics.jsonl: 34 papers
  - distr

In [7]:
# Find the paper with the longest `authors` field across all JSONL files.
# Get all JSONL files in the data directory
data_dir = Path("data/raw")  # Adjust path as needed
jsonl_files = list(data_dir.rglob("*.jsonl"))

max_authors_length = 0
file_with_max = None
paper_with_max = None
authors_with_max = None

# Process each JSONL file
for file_path in jsonl_files:
    print(f"Processing {file_path}")
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            paper = json.loads(line)
            # `or ''` also covers an explicit null in the authors field.
            authors_str = paper.get('authors') or ''
            authors_length = len(authors_str)
            if authors_length > max_authors_length:
                max_authors_length = authors_length
                file_with_max = file_path
                paper_with_max = paper.get('title')
                authors_with_max = authors_str

if authors_with_max is None:
    # No files, or no record with a non-empty authors field: report that
    # instead of crashing on None[:200] below.
    print(f"\nNo non-empty authors field found under {data_dir}")
else:
    print(f"\nMaximum authors field length: {max_authors_length}")
    print(f"Found in file: {file_with_max}")
    print(f"Paper title: {paper_with_max}")
    print("\nFirst 200 chars of authors field:")
    # Only mark the preview as truncated when something was actually cut off.
    print(authors_with_max[:200] + ("..." if len(authors_with_max) > 200 else ""))
    print("\nNumber of authors:", len(authors_with_max.split(',')))

Processing data/raw/20240401/game_theory.jsonl
Processing data/raw/20240401/discrete_mathematics.jsonl
Processing data/raw/20240401/hardware_architecture.jsonl
Processing data/raw/20240401/information_theory.jsonl
Processing data/raw/20240401/other_computer_science.jsonl
Processing data/raw/20240401/databases.jsonl
Processing data/raw/20240401/machine_learning.jsonl
Processing data/raw/20240401/operating_systems.jsonl
Processing data/raw/20240401/cryptography_and_security.jsonl
Processing data/raw/20240401/general_literature.jsonl
Processing data/raw/20240401/digital_libraries.jsonl
Processing data/raw/20240401/data_structures_and_algorithms.jsonl
Processing data/raw/20240401/programming_languages.jsonl
Processing data/raw/20240401/neural_and_evolutionary_computing.jsonl
Processing data/raw/20240401/multiagent_systems.jsonl
Processing data/raw/20240401/performance.jsonl
Processing data/raw/20240401/human_computer_interaction.jsonl
Processing data/raw/20240401/computer_vision.jsonl
Proc

In [3]:
import json


def remove_duplicates(file_path):
    """Remove duplicate entries from a JSONL file based on arxiv_id.

    The first entry seen for each ``arxiv_id`` is kept; later entries with the
    same id are dropped. The file is rewritten in place.

    Handling of irregular lines:
      * blank lines are ignored silently;
      * lines that are not valid JSON are skipped with a warning and are NOT
        written back;
      * entries without an ``arxiv_id`` cannot be de-duplicated, so they are
        kept as-is with a warning (previously they raised ``KeyError``).

    Args:
        file_path: Path of the JSONL file to clean (``str`` or path-like).

    Returns:
        The number of entries written back to the file.
    """
    import os

    # Read all lines and keep track of seen arxiv_ids
    seen_ids = set()
    unique_entries = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            try:
                entry = json.loads(stripped)
            except json.JSONDecodeError:
                print(f"Warning: Skipping invalid JSON line in {file_path}")
                continue

            arxiv_id = entry.get('arxiv_id') if isinstance(entry, dict) else None
            if arxiv_id is None:
                print(f"Warning: Keeping entry without arxiv_id in {file_path}")
                unique_entries.append(entry)
                continue

            if arxiv_id not in seen_ids:
                seen_ids.add(arxiv_id)
                unique_entries.append(entry)

    # Write to a sibling temp file and swap it in with os.replace, so an
    # interruption mid-write cannot leave the original file truncated.
    tmp_path = f"{os.fspath(file_path)}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            for entry in unique_entries:
                json.dump(entry, f, ensure_ascii=False)
                f.write('\n')
        os.replace(tmp_path, file_path)
    except BaseException:
        # Do not leave a half-written temp file behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    return len(unique_entries)

In [4]:

# def main():
#     # Process all JSONL files in data/raw directory
#     import glob
#     data_dir = Path('data/raw')
#     jsonl_files = glob.glob(str(data_dir / '**' / '*.jsonl'), recursive=True)
#     print(jsonl_files)
#     total_files = len(jsonl_files)
#     print(f"Found {total_files} JSONL files")
    
#     for i, file_path in enumerate(jsonl_files, 1):
#         print(f"Processing file {i}/{total_files}: {file_path}")
#         unique_count = remove_duplicates(file_path)
#         print(f"Kept {unique_count} unique entries")

# if __name__ == '__main__':
#     main()

['data/raw/20240401/game_theory.jsonl', 'data/raw/20240401/discrete_mathematics.jsonl', 'data/raw/20240401/hardware_architecture.jsonl', 'data/raw/20240401/information_theory.jsonl', 'data/raw/20240401/other_computer_science.jsonl', 'data/raw/20240401/databases.jsonl', 'data/raw/20240401/machine_learning.jsonl', 'data/raw/20240401/operating_systems.jsonl', 'data/raw/20240401/cryptography_and_security.jsonl', 'data/raw/20240401/general_literature.jsonl', 'data/raw/20240401/digital_libraries.jsonl', 'data/raw/20240401/data_structures_and_algorithms.jsonl', 'data/raw/20240401/programming_languages.jsonl', 'data/raw/20240401/neural_and_evolutionary_computing.jsonl', 'data/raw/20240401/multiagent_systems.jsonl', 'data/raw/20240401/performance.jsonl', 'data/raw/20240401/human_computer_interaction.jsonl', 'data/raw/20240401/computer_vision.jsonl', 'data/raw/20240401/networking_and_internet.jsonl', 'data/raw/20240401/mathematical_software.jsonl', 'data/raw/20240401/computational_engineering.js

In [2]:
from src.embed.vector_db import MilvusManager
import json
# Load the category definitions; the context manager closes the file as soon as
# it is parsed (the former json.load(open(...)) left the handle open).
with open("categories.json", encoding="utf-8") as categories_file:
    CATEGORIES = json.load(categories_file)

# Build the vector-store manager from the categories. Per the recorded output,
# this connects to Milvus and loads one collection per category.
vector_db = MilvusManager(CATEGORIES)



Connected to Milvus
Collection artificial_intelligence already exists. Skipping...
Loaded collection: artificial_intelligence
Collection hardware_architecture already exists. Skipping...
Loaded collection: hardware_architecture
Collection computational_complexity already exists. Skipping...
Loaded collection: computational_complexity
Collection computational_engineering already exists. Skipping...
Loaded collection: computational_engineering
Collection computational_geometry already exists. Skipping...
Loaded collection: computational_geometry
Collection computation_and_language already exists. Skipping...
Loaded collection: computation_and_language
Collection cryptography_and_security already exists. Skipping...
Loaded collection: cryptography_and_security
Collection computer_vision already exists. Skipping...
Loaded collection: computer_vision
Collection computers_and_society already exists. Skipping...
Loaded collection: computers_and_society
Collection databases already exists. Ski

In [11]:
from github import Github
import os
from dotenv import load_dotenv

# Load environment variables via python-dotenv before reading the token.
load_dotenv()
# GitHub client built from the GITHUB_TOKEN environment variable; os.getenv
# returns None when the variable is unset.
# NOTE(review): confirm the intended behaviour when no token is configured.
g = Github(os.getenv("GITHUB_TOKEN"))




In [13]:
# Search GitHub with several phrasings of the same query and merge the results.
query = 'time series dense encoder'
search_queries = [
    query,                     # Original query
    f"{query} implementation", # Look for implementations
    f"{query} official"        # Look for official repos
]

# Only the top hits of each query are inspected; iterating a full result list
# would page through every match.
max_repos_per_query = 5

all_repos = []
seen_urls = set()

for search_query in search_queries:
    # Search repositories
    repos = g.search_repositories(
        query=search_query,
        sort="stars",      # Sort by stars
        order="desc"       # Highest stars first
    )

    # Previously `all_repos` / `seen_urls` were never filled and each iteration
    # simply overwrote `repos`. Merge the hits, skipping repositories that an
    # earlier query already returned.
    for repo in repos[:max_repos_per_query]:
        if repo.html_url in seen_urls:
            continue
        seen_urls.add(repo.html_url)
        all_repos.append(repo)

# `all_repos` now holds the merged, de-duplicated repositories. The cell still
# displays `repos` (the result list of the last query), as in the recorded output.
repos

<github.PaginatedList.PaginatedList at 0x7fefea332cd0>

In [20]:
# Run one GitHub repository search (sorted by stars, descending) and print the
# full name of every hit.
repositories = g.search_repositories(
    query='time series dense encoder implementation',
    sort='stars',
    order='desc',
)
for repo in repositories:
    print(repo.full_name)

lich99/TiDE
Helloworld2345567/Google_TiDE_implementation
frinkleko/TiDE-Applications
martins0n/tide
ZihangHLiu/TiDE
HatcherRobotics/TiDE
TommyNan/TiDE


In [16]:
import requests

# Base URL of the Semantic Scholar v1 paper endpoint used by get_citations().
semantic_scholar_base_url = "https://api.semanticscholar.org/v1/paper/"

def get_citations(arxiv_id: str, timeout: float = 10) -> Dict:
    """Get citation metrics from Semantic Scholar API.

    Args:
        arxiv_id: arXiv identifier; it is appended to the base URL as
            ``arXiv:<arxiv_id>``.
        timeout: Seconds to wait for the HTTP response (passed to
            ``requests.get``) so a stalled connection cannot hang the caller.

    Returns:
        A dict with the keys ``citation_count``, ``influential_citation_count``
        and ``citation_velocity``. A metric absent from the response is 0.

    Raises:
        RuntimeError: If the API answers with any status code other than 200.
            The message includes the status code.
    """
    url = f"{semantic_scholar_base_url}arXiv:{arxiv_id}"
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to fetch citation info for {arxiv_id} "
            f"(HTTP {response.status_code})"
        )

    # The leftover debug print of the whole payload was removed here.
    data = response.json()
    return {
        'citation_count': data.get('citationCount', 0),
        'influential_citation_count': data.get('influentialCitationCount', 0),
        'citation_velocity': data.get('citationVelocity', 0),
    }
    

In [48]:
from config import Config
# Instantiate the project configuration (Config is imported from the local `config` module).
config = Config()

# GitHub API client built from the token held in the configuration (config.github.token).
github_client = Github(config.github.token)


def get_github_repo(arxiv_title: str, max_results: int = 2) -> List[str]:
    """Get candidate code repositories of a paper.

    GitHub is searched for ``"<arxiv_title> implementation"``, sorted by stars
    in descending order, using the module-level ``github_client``.

    Args:
        arxiv_title: Title of the paper.
        max_results: Maximum number of repository names to return. Defaults
            to 2, the limit that used to be hard-coded.

    Returns:
        The ``full_name`` of up to ``max_results`` repositories. The list is
        empty when the title is blank, ``max_results`` is not positive, or the
        search finds nothing.
    """
    # A blank title would search for just " implementation" and return
    # unrelated repositories.
    if not arxiv_title or not arxiv_title.strip() or max_results <= 0:
        return []

    query = f"{arxiv_title} implementation"
    repos = github_client.search_repositories(query=query, sort='stars', order='desc')

    # Slicing an empty result yields nothing, so the former `if repos:` / `else`
    # guard is not needed.
    # NOTE(review): the result object is believed to be always truthy, which
    # would have made the old `else` branch unreachable — confirm.
    return [repo.full_name for repo in repos[:max_results]]

In [2]:
# Recursively collect every *.jsonl file under data/raw and list the paths.
data_dir = Path("data") / "raw"
jsonl_files = [*data_dir.rglob("*.jsonl")]

for jsonl_file in jsonl_files:
    print(jsonl_file)


data/raw/20240401/game_theory.jsonl
data/raw/20240401/discrete_mathematics.jsonl
data/raw/20240401/hardware_architecture.jsonl
data/raw/20240401/information_theory.jsonl
data/raw/20240401/other_computer_science.jsonl
data/raw/20240401/databases.jsonl
data/raw/20240401/machine_learning.jsonl
data/raw/20240401/operating_systems.jsonl
data/raw/20240401/cryptography_and_security.jsonl
data/raw/20240401/general_literature.jsonl
data/raw/20240401/digital_libraries.jsonl
data/raw/20240401/data_structures_and_algorithms.jsonl
data/raw/20240401/programming_languages.jsonl
data/raw/20240401/neural_and_evolutionary_computing.jsonl
data/raw/20240401/multiagent_systems.jsonl
data/raw/20240401/performance.jsonl
data/raw/20240401/human_computer_interaction.jsonl
data/raw/20240401/computer_vision.jsonl
data/raw/20240401/networking_and_internet.jsonl
data/raw/20240401/mathematical_software.jsonl
data/raw/20240401/computational_engineering.jsonl
data/raw/20240401/emerging_technologies.jsonl
data/raw/202

In [6]:
# Display the tail of the discovered file list, from index 371 onwards.
# NOTE(review): 371 is a hard-coded offset into a list whose order comes from the
# directory scan — confirm it still points at the intended files.
jsonl_files[371:]

[PosixPath('data/raw/20240701/unknown.jsonl'),
 PosixPath('data/raw/20240701/social_and_information_networks.jsonl'),
 PosixPath('data/raw/20240701/formal_languages.jsonl'),
 PosixPath('data/raw/20240701/logic_in_computer_science.jsonl'),
 PosixPath('data/raw/20240701/computation_and_language.jsonl'),
 PosixPath('data/raw/20240701/sound.jsonl'),
 PosixPath('data/raw/20240701/symbolic_computation.jsonl'),
 PosixPath('data/raw/20240701/information_retrieval.jsonl'),
 PosixPath('data/raw/20240701/computers_and_society.jsonl'),
 PosixPath('data/raw/20240701/multimedia.jsonl'),
 PosixPath('data/raw/20240701/computational_complexity.jsonl'),
 PosixPath('data/raw/20240701/artificial_intelligence.jsonl'),
 PosixPath('data/raw/20240701/robotics.jsonl'),
 PosixPath('data/raw/20241101/game_theory.jsonl'),
 PosixPath('data/raw/20241101/discrete_mathematics.jsonl'),
 PosixPath('data/raw/20241101/hardware_architecture.jsonl'),
 PosixPath('data/raw/20241101/information_theory.jsonl'),
 PosixPath('dat