# Fact Rank

In [1]:
import chromadb

from config import DATA_DIR

In [2]:
# Absolute location of the on-disk Chroma database, kept under DATA_DIR.
chroma_db_path = (DATA_DIR / "chroma").resolve()

# Persistent client backed by that directory; the "factrank" collection is
# fetched if it already exists and created otherwise.
chroma = chromadb.PersistentClient(path=str(chroma_db_path))
vecstore = chroma.get_or_create_collection("factrank")

## Load Data

In [3]:
import json

from buff.openalex import Work
from buff.openalex.download import get_paper_text
from buff.llm.utils import get_token_count

In [4]:
def _read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# The focus paper is identified by the "id" field of works.json; citations and
# references are identified by the top-level keys of their respective files.
FOCUS_PAPER = _read_json("works.json").get("id")
CITATIONS = list(_read_json("citations.json").keys())
REFERENCES = list(_read_json("references.json").keys())

print("Main paper:", FOCUS_PAPER)
print("Citations:", len(CITATIONS))
print("References:", len(REFERENCES))

Main paper: https://openalex.org/W2994792393
Citations: 39
References: 23


In [5]:
WORKS = {
    work_id: await Work(work_id).data
    for work_id in [FOCUS_PAPER] + CITATIONS + REFERENCES
}
print("Works:", len(WORKS))

Works: 63


In [6]:
TEXTS = {
    work_id: await get_paper_text(work)
    for work_id, work in WORKS.items()
}
print("Texts:", len(TEXTS))

Texts: 63


## Process and Prepare

In [7]:
# One-off step, intentionally left commented out: split the text of every work
# into chunks with `split_text` and write the resulting mapping
# (work id -> list of chunks) to chunks.json. Uncomment to regenerate that file.
#
# from buff.llm.split import split_text
# 
# TEXT_CHUNKS = {
#     work_id: split_text(text)
#     for work_id, text in TEXTS.items()
# }
# 
# # Save the chunks
# with open("chunks.json", "w", encoding="utf-8") as f:
#     json.dump(TEXT_CHUNKS, f, indent=2)

In [8]:
# Load the chunks
with open("chunks.json", "r", encoding="utf-8") as f:
    TEXT_CHUNKS = json.load(f)

In [9]:
# Total number of chunks across all works.
print("Chunks:", sum(map(len, TEXT_CHUNKS.values())))

Chunks: 2500


### Embed

In [10]:
# One-off step, intentionally left commented out: embed the chunks of every
# work with `embed_texts` and write the resulting mapping
# (work id -> list of embeddings) to embeddings.json. Uncomment to regenerate
# that file.
#
# from buff.llm.embed import embed_texts
# 
# EMBEDDINGS = {
#     work_id: await embed_texts(chunks)
#     for work_id, chunks in TEXT_CHUNKS.items()
# }
# 
# # Save the embeddings
# with open("embeddings.json", "w", encoding="utf-8") as f:
#     json.dump(EMBEDDINGS, f, indent=2)

In [11]:
# Load the embeddings
with open("embeddings.json", "r", encoding="utf-8") as f:
    EMBEDDINGS = json.load(f)

In [12]:
# Total number of embeddings across all works.
print("Embeddings:", sum(map(len, EMBEDDINGS.values())))

Embeddings: 2500


### Store

In [16]:
# One-off step, intentionally left commented out: wrap every embedding in a
# Document whose id is "<work_id>-<chunk index>" and whose metadata carries the
# chunk index, work id, DOI and chunk text; flatten the documents into parallel
# lists of ids, embeddings and metadata; verify the three lists have the same
# length; then add them to `vecstore`.
# NOTE(review): presumably this should not be re-run against a collection that
# already holds these ids -- confirm how the vector store treats duplicate ids
# before uncommenting.
#
# from buff.llm.models import Document, DocumentMetadata
# 
# # Build the documents
# DOCUMENTS = {
#     work_id: [
#         Document(
#             metadata=DocumentMetadata(
#                 index=i,
#                 work_id=work_id,
#                 doi=str(WORKS[work_id].doi),
#                 text=TEXT_CHUNKS[work_id][i]
#             ),
#             id=f"{work_id}-{i}",
#             values=embed
#         )
#         for i, embed in enumerate(embeds)
#     ]
#     for work_id, embeds in EMBEDDINGS.items()
# }
# 
# # Create the following lists of data from the documents
# TEXT_CHUNK_IDS = [doc.id for docs in DOCUMENTS.values() for doc in docs]
# TEXT_CHUNK_EMBEDDINGS = [doc.values for docs in DOCUMENTS.values() for doc in docs]
# TEXT_CHUNK_METADATA = [doc.metadata.model_dump(mode="json") for docs in DOCUMENTS.values() for doc in docs]
# print(f"Total: {len(TEXT_CHUNK_IDS)} documents.")
# if not len(TEXT_CHUNK_EMBEDDINGS) == len(TEXT_CHUNK_METADATA) == len(TEXT_CHUNK_IDS):
#     raise ValueError("Invalid data")
# 
# # Store the embeddings
# vecstore.add(
#     ids=TEXT_CHUNK_IDS,
#     embeddings=TEXT_CHUNK_EMBEDDINGS,
#     metadatas=TEXT_CHUNK_METADATA
# )
# print("Embeddings stored")

Total: 2500 documents.
Embeddings stored


### Stats

In [17]:
# Summary statistics, each printed as "min - mean - max".
# The middle value is the arithmetic mean (sum / count), not the median.

# Number of chunks for each work, computed once and reused for all three figures.
chunks_per_work = [len(chunks) for chunks in TEXT_CHUNKS.values()]
print(
    "Chunks per work:",
    min(chunks_per_work),
    "-",
    sum(chunks_per_work) / len(chunks_per_work),
    "-",
    max(chunks_per_work),
)

# Token count of every chunk across all works. Counting tokens once per chunk
# avoids running `get_token_count` three times over the whole corpus.
tokens_per_chunk = [
    get_token_count(chunk)
    for chunks in TEXT_CHUNKS.values()
    for chunk in chunks
]
print(
    "Tokens per chunk:",
    min(tokens_per_chunk),
    "-",
    sum(tokens_per_chunk) / len(tokens_per_chunk),
    "-",
    max(tokens_per_chunk),
)

Chunks per work: 11 - 39.682539682539684 - 81
Tokens per chunk: 25 - 526.578 - 929
