# Citations Analysis using Embeddings

Initialize ChromaDB

In [1]:
import chromadb

from config import DATA_DIR

In [2]:
# Keep the Chroma store in a "chroma" folder under the project's data directory.
chroma_db_path = (DATA_DIR / "chroma").resolve()

# Open the on-disk client, then fetch the "analysis" collection
# (created on first use, reused afterwards).
chroma = chromadb.PersistentClient(path=str(chroma_db_path))
embeddings = chroma.get_or_create_collection(name="analysis")

## Load Data

In [3]:
from buff.openalex import Work
from buff.openalex.download import get_paper_text
from buff.llm.utils import get_token_count

## Papers

**Paper 2 - Cites - Paper 1**
$\text{Paper 2} \rightarrow \text{Paper 1}$

In [4]:
# Work identifiers handed to `Work` below: paper_1 is the cited paper,
# paper_2 is the paper that cites it.
paper_1, paper_2 = "W2994792393", "W3190631166"

In [5]:
# Await the `data` attribute of each Work wrapper, one after the other
# (cited paper first, then the citing paper).
# Top-level `await` only works because the notebook kernel runs cells with autoawait.
# NOTE(review): presumably `.data` fetches the OpenAlex record — confirm in buff.openalex.
work_1 = await Work(paper_1).data
work_2 = await Work(paper_2).data

In [6]:
# Show both titles as a quick check that the expected works were resolved.
for _label, _work in (("Paper 1: ", work_1), ("Paper 2: ", work_2)):
    print(_label, _work.title)

Paper 1:  The autophagy receptor p62/SQST-1 promotes proteostasis and longevity in C. elegans by inducing autophagy
Paper 2:  Autophagy in healthy aging and disease


In [7]:
# Retrieve the text of each paper; the two calls are awaited sequentially.
text_1 = await get_paper_text(work=work_1)
text_2 = await get_paper_text(work=work_2)

# Report the size of each text in tokens, as measured by get_token_count.
print(f"Paper 1 loaded: {get_token_count(text_1)} tokens")
print(f"Paper 2 loaded: {get_token_count(text_2)} tokens")

Paper 1 loaded: 22380 tokens
Paper 2 loaded: 32836 tokens


## Text Processing

In [ ]:
# One-off generation step, kept commented out for reference: it splits both
# paper texts into chunks and writes them to chunks.json, the file the next
# cell reads back.
# NOTE: re-enabling this cell on a fresh kernel also needs `import json`,
# which is currently only imported in the following cell.
#
# from buff.llm.split import split_text
# 
# text_1_chunks = split_text(text_1)
# text_2_chunks = split_text(text_2)
# 
# # Save the chunks
# with open("chunks.json", "w", encoding="utf-8") as f:
#     f.write(json.dumps({
#         "paper_1": text_1_chunks,
#         "paper_2": text_2_chunks
#     }))
#     print("Chunks saved")

In [9]:
import json

# Read the cached chunk lists back from chunks.json; only the read itself
# needs the file handle, so everything else happens after it is closed.
with open("chunks.json", "r", encoding="utf-8") as chunks_file:
    text_chunks = json.load(chunks_file)

# The cache holds one list of chunks per paper.
text_1_chunks = text_chunks["paper_1"]
text_2_chunks = text_chunks["paper_2"]
print("Chunks loaded")

print(f"Paper 1 split into {len(text_1_chunks)} chunks")
print(f"Paper 2 split into {len(text_2_chunks)} chunks")

Chunks saved
Paper 1 split into 42 chunks
Paper 2 split into 66 chunks


Print the minimum, mean, and maximum chunk token count for each paper

In [10]:
# Token count of every chunk, kept as one list per paper.
text_1_chunks_tokens = list(map(get_token_count, text_1_chunks))
text_2_chunks_tokens = list(map(get_token_count, text_2_chunks))

# Printed as "min - mean - max"; the mean uses floor division.
print(
    f"Paper 1 chunk stats: {min(text_1_chunks_tokens)} - {sum(text_1_chunks_tokens) // len(text_1_chunks_tokens)} - {max(text_1_chunks_tokens)}"
)
print(
    f"Paper 2 chunk stats: {min(text_2_chunks_tokens)} - {sum(text_2_chunks_tokens) // len(text_2_chunks_tokens)} - {max(text_2_chunks_tokens)}"
)

Paper 1 chunk stats: 267 - 532 - 815
Paper 2 chunk stats: 81 - 497 - 684


### Embeddings

In [11]:
# One-off generation step, kept commented out for reference: it embeds every
# chunk and writes the result to embeddings.json. The live code below only
# reads that cached file.
#
# from buff.llm.embed import embed_texts
#
# text_1_embeddings = await embed_texts(text_1_chunks)
# text_2_embeddings = await embed_texts(text_2_chunks)
#
# # Save the embeddings
# with open("embeddings.json", "w", encoding="utf-8") as f:
#     f.write(json.dumps({
#         "paper_1": text_1_embeddings,
#         "paper_2": text_2_embeddings
#     }))
#     print("Embeddings saved")

# Read the cached embeddings back from disk; the handle is only needed for the read.
with open("embeddings.json", "r", encoding="utf-8") as embeddings_file:
    text_embeddings = json.load(embeddings_file)

# The cache holds one list of embeddings per paper.
text_1_embeddings = text_embeddings["paper_1"]
text_2_embeddings = text_embeddings["paper_2"]
print("Embeddings loaded")

print(f"Paper 1 embedded into {len(text_1_embeddings)} chunks")
print(f"Paper 2 embedded into {len(text_2_embeddings)} chunks")

Embeddings saved
Paper 1 embedded into 42 chunks
Paper 2 embedded into 66 chunks


Store the documents

In [15]:
# One-off ingestion step, left commented out. It wraps every chunk of both
# papers in a Document (metadata plus embedding vector), flattens the two
# lists into parallel ids / embeddings / metadatas lists, checks the three
# lists have equal length, and adds them to the `embeddings` Chroma collection.
# NOTE(review): the collection is persistent, so re-running `embeddings.add`
# with ids that already exist will presumably not overwrite them — consider
# `upsert` if this cell must be re-runnable. Confirm against the installed
# chromadb version.
#
# from buff.llm.models import Document, DocumentMetadata
# 
# # Build the documents
# text_1_documents = [
#     Document(
#         metadata=DocumentMetadata(
#             index=i,
#             title=work_1.title,
#             work_id=str(work_1.id),
#             doi=str(work_1.doi),
#             text=text_1_chunks[i]
#         ),
#         id=f"{work_1.id}-{i}",
#         values=text_1_embeddings[i]
#     )
#     for i in range(len(text_1_chunks))
# ]
# 
# text_2_documents = [
#     Document(
#         metadata=DocumentMetadata(
#             index=i,
#             title=work_2.title,
#             work_id=str(work_2.id),
#             doi=str(work_2.doi),
#             text=text_2_chunks[i]
#         ),
#         id=f"{work_2.id}-{i}",
#         values=text_2_embeddings[i]
#     )
#     for i in range(len(text_2_chunks))
# ]
# print(f"Paper 1: {len(text_1_documents)} documents")
# print(f"Paper 2: {len(text_2_documents)} documents")
# 
# 
# # Create the following lists of data from the 2 documents arrays
# text_chunk_ids = [doc.id for doc in text_1_documents + text_2_documents]
# text_chunk_embeddings = [doc.values for doc in text_1_documents + text_2_documents]
# text_chunk_metadata = [doc.metadata.model_dump(mode="json") for doc in text_1_documents + text_2_documents]
# print(f"Total: {len(text_chunk_ids)} documents.")
# if not len(text_chunk_embeddings) == len(text_chunk_metadata) == len(text_chunk_ids):
#     raise ValueError("Invalid data")
# 
# # Store the embeddings
# embeddings.add(
#     ids=text_chunk_ids,
#     embeddings=text_chunk_embeddings,
#     metadatas=text_chunk_metadata
# )
# print("Embeddings stored")

Paper 1: 42 documents
Paper 2: 66 documents
Total: 108 documents.
Embeddings stored
