In [1]:
import os
import time
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import re
import requests
import pandas as pd
from tqdm import tqdm

import fitz  # pymupdf
import arxiv

# --- Embeddings for Chonkie's semantic chunker ---
from sentence_transformers import SentenceTransformer

# Try to import chonkie (required for semantic chunking here)
import importlib
# Chonkie is a hard requirement for this notebook: record whether it could be
# imported and stop immediately with an actionable message when it could not.
_HAS_CHONKIE = False
try:
    chonkie = importlib.import_module("chonkie")
except Exception:
    raise ImportError("Chonkie is required for the semantic chunker. Please install it: `pip install chonkie`")
else:
    _HAS_CHONKIE = True

# On-disk layout for everything fetched from arXiv: raw PDFs and their extracted text.
DATA_DIR = Path('data/raw/arxiv')
PDF_DIR, TEXT_DIR = DATA_DIR / 'pdfs', DATA_DIR / 'texts'
for d in (DATA_DIR, PDF_DIR, TEXT_DIR):
    # Safe to re-run: existing directories are left untouched.
    d.mkdir(parents=True, exist_ok=True)

# Pause (in seconds) used between requests to arXiv.
ARXIV_REQUEST_DELAY_S = 3.0
# Headers sent with PDF downloads.
HEADERS = {
    'User-Agent': 'Arxiv-RAG-Preproc/0.1 (contact: shrutikmkulkarni.work@gmail.com)',  # <-- put your email
}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def download_pdf(pdf_url: str, out_path: Path, headers: Optional[dict] = None, retry: int = 3) -> Path:
    """Stream a PDF from ``pdf_url`` to ``out_path``, retrying on failure.

    The response is written to a sibling ``<name>.part`` file and only moved to
    ``out_path`` once the download finished without error. A failed or
    interrupted attempt therefore never leaves a truncated file at ``out_path``
    that a later ``out_path.exists()`` check would mistake for a valid PDF.

    Args:
        pdf_url: URL to fetch.
        out_path: Destination file; parent directories are created if missing.
        headers: Optional HTTP headers passed to ``requests.get``.
        retry: Number of attempts. At least one attempt is always made.

    Returns:
        ``out_path`` as a ``Path``.

    Raises:
        Exception: Whatever the final attempt raised (HTTP error, I/O error, ...).
    """
    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = out_path.with_name(out_path.name + ".part")
    attempts = max(1, retry)  # retry <= 0 used to return without downloading anything

    for attempt in range(attempts):
        try:
            with requests.get(pdf_url, stream=True, headers=headers, timeout=60) as r:
                r.raise_for_status()
                with open(tmp_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:  # skip empty keep-alive chunks
                            f.write(chunk)
            # Publish the file only after the whole body was written.
            tmp_path.replace(out_path)
            return out_path
        except Exception:
            # Drop the partial download before retrying or giving up.
            tmp_path.unlink(missing_ok=True)
            if attempt == attempts - 1:
                raise
            time.sleep(2 + attempt)  # back off a little longer after each failure

def extract_pdf_text(pdf_path: Path) -> Tuple[str, List[Tuple[int, str]]]:
    """Extract plain text from every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A tuple ``(full_text, pages)`` where ``pages`` is a list of
        ``(page_index, text)`` pairs with zero-based indices, and ``full_text``
        is the page texts joined with a newline.
    """
    # Open as a context manager so the document handle is released even if
    # text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        pages = [(i, page.get_text("text")) for i, page in enumerate(doc)]
    full_text = "\n".join(text for _, text in pages)
    return full_text, pages

In [3]:
# --- Hugging Face auth (optional) ---
import os
# Look the token up by the *name* of the environment variable. The token value
# itself must never be written into the notebook.
# NOTE(review): a literal token was previously committed on this line; treat it
# as compromised and revoke/rotate it.
HF_TOKEN = os.getenv("HF_TOKEN")  # None when the variable is not set

# ---------- Chonkie Semantic Chunker adapter ----------
# Use all-MiniLM-L6-v2 by name; pass only kwargs supported by SentenceTransformer
def _semantic_chunk_with_known_apis(full_text: str) -> list[dict]:
    import importlib
    chonkie = importlib.import_module("chonkie")

    # Kwargs that SentenceTransformer actually understands
    st_kwargs = {}
    if HF_TOKEN:
        st_kwargs["use_auth_token"] = HF_TOKEN

    # Prefer giving the model name string. Chonkie will instantiate it.
    # You may also add device="mps" on Apple Silicon, or "cpu"/"cuda" elsewhere.
    sc = None
    if hasattr(chonkie, "SemanticChunker"):
        sc = chonkie.SemanticChunker(
            embedding_model="sentence-transformers/all-MiniLM-L6-v2",
            # device="mps",        # uncomment for Apple Silicon
            **st_kwargs
        )
    elif hasattr(chonkie, "semantic") and hasattr(chonkie.semantic, "SemanticChunker"):
        sc = chonkie.semantic.SemanticChunker(
            embedding_model="sentence-transformers/all-MiniLM-L6-v2",
            # device="mps",
            **st_kwargs
        )
    else:
        raise ImportError("Could not locate Chonkie's SemanticChunker in this version.")

    # Try the common methods in order
    if hasattr(sc, "chunk_text") and callable(sc.chunk_text):
        pieces = sc.chunk_text(full_text)
    elif hasattr(sc, "split") and callable(sc.split):
        pieces = sc.split(full_text)
    elif hasattr(sc, "chunk") and callable(sc.chunk):
        pieces = sc.chunk(full_text)
    else:
        raise AttributeError("SemanticChunker found, but no method chunk_text/split/chunk detected.")

    out = []
    for i, p in enumerate(pieces):
        if isinstance(p, dict):
            text = p.get("text") or p.get("chunk") or ""
            title = p.get("title") or p.get("section_title") or f"Chunk {i}"
            sp = p.get("start_page")
            ep = p.get("end_page")
        else:
            text = getattr(p, "text", str(p))
            title = getattr(p, "title", f"Chunk {i}")
            sp = getattr(p, "start_page", None)
            ep = getattr(p, "end_page", None)
        out.append(
            {"section_title": title, "text": text, "start_page": sp, "end_page": ep}
        )
    return out

def chunk_document_semantic(full_text: str) -> list[dict]:
    """Chunk a document's text semantically.

    Public entry point that hands ``full_text`` to
    ``_semantic_chunk_with_known_apis`` and returns its result unchanged.
    """
    chunk_records = _semantic_chunk_with_known_apis(full_text)
    return chunk_records

In [4]:
def fetch_four_arxiv_papers(query: str = 'cat:cs.CL', max_results: int = 4) -> List[arxiv.Result]:
    """Search arXiv and return the matching results as a list.

    Args:
        query: arXiv search query; defaults to the ``cs.CL`` category.
        max_results: Maximum number of results requested (four by default).

    Returns:
        The results of the search, sorted by submission date.
    """
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    # The client waits ARXIV_REQUEST_DELAY_S between requests and retries up to 3 times.
    client = arxiv.Client(page_size=25, delay_seconds=ARXIV_REQUEST_DELAY_S, num_retries=3)
    return list(client.results(search))

# Fetch papers using the function's defaults (query 'cat:cs.CL', max_results 4).
results = fetch_four_arxiv_papers()
# Final expression of the cell: shows how many results came back and their entry ids.
len(results), [r.entry_id for r in results]

(4,
 ['http://arxiv.org/abs/2508.13152v1',
  'http://arxiv.org/abs/2508.13144v1',
  'http://arxiv.org/abs/2508.13142v1',
  'http://arxiv.org/abs/2508.13141v1'])

In [5]:
# For each fetched paper: make sure its PDF is on disk, extract the text,
# save it next to the PDFs, and collect one metadata record for the DataFrame.
paper_rows = []
for r in tqdm(results, desc='Downloading PDFs'):
    paper_id = r.get_short_id()
    pdf_path = PDF_DIR / f"{paper_id}.pdf"

    # Only contact arXiv when the PDF is not cached locally, then pause before the next request.
    if not pdf_path.exists():
        download_pdf(r.pdf_url, pdf_path, headers=HEADERS)
        time.sleep(ARXIV_REQUEST_DELAY_S)

    full_text, _pages = extract_pdf_text(pdf_path)
    text_path = TEXT_DIR / f"{paper_id}.txt"
    text_path.write_text(full_text, encoding='utf-8')

    author_names = [a.name for a in r.authors]
    meta = dict(
        paper_id=paper_id,
        title=r.title,
        authors=", ".join(author_names),
        abstract=r.summary,
        primary_category=r.primary_category,
        categories=", ".join(r.categories),
        published=r.published,
        updated=r.updated,
        pdf_url=r.pdf_url,
        entry_id=r.entry_id,
        source='arxiv',
        raw_text=full_text,
    )
    paper_rows.append(meta)

# Build the frame once from the collected records and preview it.
papers_df = pd.DataFrame(paper_rows)
papers_df.head()

Downloading PDFs: 100%|██████████| 4/4 [00:01<00:00,  2.72it/s]


Unnamed: 0,paper_id,title,authors,abstract,primary_category,categories,published,updated,pdf_url,entry_id,source,raw_text
0,2508.13152v1,RepreGuard: Detecting LLM-Generated Text by Re...,"Xin Chen, Junchao Wu, Shu Yang, Runzhe Zhan, Z...",Detecting content generated by large language ...,cs.CL,"cs.CL, cs.AI",2025-08-18 17:59:15+00:00,2025-08-18 17:59:15+00:00,http://arxiv.org/pdf/2508.13152v1,http://arxiv.org/abs/2508.13152v1,arxiv,RepreGuard: Detecting LLM-Generated Text by Re...
1,2508.13144v1,Signal and Noise: A Framework for Reducing Unc...,"David Heineman, Valentin Hofmann, Ian Magnusso...",Developing large language models is expensive ...,cs.CL,"cs.CL, cs.LG",2025-08-18 17:56:04+00:00,2025-08-18 17:56:04+00:00,http://arxiv.org/pdf/2508.13144v1,http://arxiv.org/abs/2508.13144v1,arxiv,Signal and Noise: A Framework for Reducing\nUn...
2,2508.13142v1,Has GPT-5 Achieved Spatial Intelligence? An Em...,"Zhongang Cai, Yubo Wang, Qingping Sun, Ruisi W...",Multi-modal models have achieved remarkable pr...,cs.CV,"cs.CV, cs.CL, cs.LG, cs.MM, cs.RO",2025-08-18 17:55:17+00:00,2025-08-18 17:55:17+00:00,http://arxiv.org/pdf/2508.13142v1,http://arxiv.org/abs/2508.13142v1,arxiv,Has GPT-5 Achieved Spatial Intelligence?\nAn E...
3,2508.13141v1,OptimalThinkingBench: Evaluating Over and Unde...,"Pranjal Aggarwal, Seungone Kim, Jack Lanchanti...",Thinking LLMs solve complex tasks at the expen...,cs.CL,"cs.CL, cs.LG",2025-08-18 17:53:10+00:00,2025-08-18 17:53:10+00:00,http://arxiv.org/pdf/2508.13141v1,http://arxiv.org/abs/2508.13141v1,arxiv,OptimalThinkingBench: Evaluating Over and\nUnd...


In [6]:
# Chunk every paper's raw text and flatten the chunks into one record per chunk.
chunk_rows = []
for row in tqdm(papers_df.itertuples(index=False), total=len(papers_df), desc='Semantic chunking'):
    chunks = chunk_document_semantic(row.raw_text)
    # chunk_id is the paper id plus a zero-padded running index within that paper.
    chunk_rows.extend(
        {
            'paper_id': row.paper_id,
            'chunk_id': f"{row.paper_id}::chunk_{idx:04d}",
            'section_title': ch.get('section_title'),
            'text': ch.get('text'),
            'start_page': ch.get('start_page'),
            'end_page': ch.get('end_page'),
        }
        for idx, ch in enumerate(chunks)
    )

# Build the frame once from the collected records and preview it.
chunks_df = pd.DataFrame(chunk_rows)
chunks_df.head()

Semantic chunking: 100%|██████████| 4/4 [00:52<00:00, 13.18s/it]


Unnamed: 0,paper_id,chunk_id,section_title,text,start_page,end_page
0,2508.13152v1,2508.13152v1::chunk_0000,Chunk 0,RepreGuard: Detecting LLM-Generated Text by Re...,,
1,2508.13152v1,2508.13152v1::chunk_0001,Chunk 1,"Junchao Wu1,∗\nShu Yang3\nRunzhe Zhan1\nZeyu W...",,
2,2508.13152v1,2508.13152v1::chunk_0002,Chunk 2,"1NLP2CT Lab, Department of Computer and Inform...",,
3,2508.13152v1,2508.13152v1::chunk_0003,Chunk 3,"min.yang@siat.ac.cn, cszyluo@comp.hkbu.edu.hk,...",,
4,2508.13152v1,2508.13152v1::chunk_0004,Chunk 4,Abstract\nDetecting content generated by large...,,
