In [7]:
import urllib, urllib.request
from urllib.parse import urlencode, quote
import json
import uuid

import xml.etree.ElementTree as ET
from documents import ChunkType, Document

# Atom XML namespace written in ElementTree's "{uri}" form; it is prefixed to
# tag names when searching the feed.
namespace = "{http://www.w3.org/2005/Atom}"


# URL template for the arXiv query endpoint. Fill in `category`, `start` and
# `num_results` with str.format; results are sorted by last-updated date,
# newest first.
arxiv_api = (
  "http://export.arxiv.org/api/query"
  "?search_query=cat:{category}"
  "&start={start}"
  "&max_results={num_results}"
  "&sortBy=lastUpdatedDate"
  "&sortOrder=descending"
)

In [2]:
def parse_entry(entry: ET.Element):
  """Pull the fields of interest out of one Atom `<entry>` element.

  Args:
    entry: An `<entry>` element whose children are tagged with the
      module-level `namespace` prefix.

  Returns:
    A 4-tuple `(title_text, summary_text, published_on, pdf_url)`.
    Newline characters are removed from the title. `pdf_url` is the `href`
    of the first `<link>` child whose `title` attribute is "pdf", or `None`
    when the entry carries no such link.

  Raises:
    AttributeError: If the `<summary>`, `<title>` or `<published>` child is
      missing, or the title element has no text.
  """
  summary_text = entry.find(f"{namespace}summary").text
  title_text = entry.find(f"{namespace}title").text.replace('\n','')
  published_on = entry.find(f"{namespace}published").text

  # Fall back to None so an entry without a PDF link no longer raises
  # UnboundLocalError when the tuple is built below.
  pdf_url = next(
    (
      link.get("href")
      for link in entry.findall(f"{namespace}link")
      if link.get("title") == "pdf"
    ),
    None,
  )

  return title_text, summary_text, published_on, pdf_url

In [8]:
def get_articles(category: str, num_results=20, batch=10) -> list[Document]:
  """Fetch the most recently updated arXiv entries for one category.

  The query endpoint is paged: entries are requested up to `batch` at a time
  until `num_results` entries have been asked for in total.

  Args:
    category: arXiv category code substituted into the `cat:` search term.
      It is percent-encoded before being placed in the URL.
    num_results: Total number of entries to request.
    batch: Maximum number of entries to request per HTTP call.

  Returns:
    One `Document` per `<entry>` element in the responses, in feed order.
    The list is shorter than `num_results` when the feed returns fewer
    entries than were requested.

  Raises:
    Errors from `urllib.request.urlopen`, `ET.fromstring` and `parse_entry`
    are not caught here and propagate to the caller.
  """

  documents = []

  for start in range(0, num_results, batch):
    # Shrink the final page so the total requested never exceeds num_results
    # when it is not a multiple of batch.
    page_size = min(batch, num_results - start)
    url = arxiv_api.format(
      category=quote(category),
      start=start,
      num_results=page_size,
    )

    # Context manager closes the HTTP response as soon as the body is read.
    with urllib.request.urlopen(url) as response:
      file_string = response.read().decode('utf-8')
    root = ET.fromstring(file_string)

    for entry_tag in root.findall(f"{namespace}entry"):
      title, summary, published_on, pdf_url = parse_entry(entry_tag)
      documents.append(
        Document(
          id=uuid.uuid4(),
          category=category,
          title=title,
          summary=summary,
          published_on=published_on,
          pdf_url=pdf_url
        )
      )

  return documents

In [9]:
# Load the mapping of top-level arXiv section -> list of category codes.
with open("arxiv_taxonomy.json") as taxonomy_file:
  arxiv_taxonomy: dict[str, list[str]] = json.load(taxonomy_file)

docs: list[Document] = []

# Walk every section, echoing its name as a progress marker, and collect the
# fetched documents for each of its categories into one flat list.
for section, categories in arxiv_taxonomy.items():
  print(section)
  for category in categories:
    fetched = get_articles(category, num_results=100, batch=50)
    docs.extend(fetched)

q-fin
cs
stat
eess


In [10]:
from chroma import ChromaDB

# Build the project's ChromaDB wrapper with the gte-large-en-v1.5 model.
# NOTE(review): "hf" presumably selects a Hugging Face embedding backend;
# confirm against the chroma module.
chroma = ChromaDB(
  hf_model="Alibaba-NLP/gte-large-en-v1.5",
  embedding_model="hf",
)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: Alibaba-NLP/gte-large-en-v1.5


INFO:chromadb.api.segment:Collection default is not created.


In [12]:
# Convert each fetched Document with to_chunk_type() and pass the whole list
# to chroma.add in a single call.
chroma.add([document.to_chunk_type() for document in docs])

INFO:root:Embeddings Result: (3200, 1024)
