# Citations Analysis using Embeddings

Initialize ChromaDB

In [1]:
import chromadb

from config import DATA_DIR

In [2]:
chroma_db_path = DATA_DIR.joinpath("chroma").resolve()

chroma = chromadb.PersistentClient(path=str(chroma_db_path))
embeddings = chroma.get_or_create_collection("analysis")

## Load Data

In [3]:
from buff.openalex import Work
from buff.openalex.download import get_paper_text
from buff.llm.utils import get_token_count

## Papers

**Paper 2 - Cites - Paper 1**
$\text{Paper 2} \rightarrow \text{Paper 1}$

In [4]:
paper_1 = "W2994792393"
paper_2 = "W3190631166"

In [5]:
work_1 = await Work(paper_1).data
work_2 = await Work(paper_2).data

In [6]:
print("Paper 1: ", work_1.title)
print("Paper 2: ", work_2.title)

Paper 1:  The autophagy receptor p62/SQST-1 promotes proteostasis and longevity in C. elegans by inducing autophagy
Paper 2:  Autophagy in healthy aging and disease


In [7]:
text_1 = await get_paper_text(work=work_1)
text_2 = await get_paper_text(work=work_2)

print(f"Paper 1 loaded: {get_token_count(text_1)} tokens")
print(f"Paper 2 loaded: {get_token_count(text_2)} tokens")

Paper 1 loaded: 22380 tokens
Paper 2 loaded: 32836 tokens


## Text Processing

In [ ]:
# from buff.llm.split import split_text
# 
# text_1_chunks = split_text(text_1)
# text_2_chunks = split_text(text_2)
# 
# # Save the chunks
# with open("chunks.json", "w", encoding="utf-8") as f:
#     f.write(json.dumps({
#         "paper_1": text_1_chunks,
#         "paper_2": text_2_chunks
#     }))
#     print("Chunks saved")

In [9]:
import json

# Load the chunks
with open("chunks.json", "r", encoding="utf-8") as f:
    text_chunks = json.load(f)
    text_1_chunks = text_chunks["paper_1"]
    text_2_chunks = text_chunks["paper_2"]
    print("Chunks loaded")

print(f"Paper 1 split into {len(text_1_chunks)} chunks")
print(f"Paper 2 split into {len(text_2_chunks)} chunks")

Chunks saved
Paper 1 split into 42 chunks
Paper 2 split into 66 chunks


Print min-mean-max token count for each chunk

In [10]:
text_1_chunks_tokens = [get_token_count(chunk) for chunk in text_1_chunks]
text_2_chunks_tokens = [get_token_count(chunk) for chunk in text_2_chunks]

print(
    f"Paper 1 chunk stats: {min(text_1_chunks_tokens)} - {sum(text_1_chunks_tokens) // len(text_1_chunks_tokens)} - {max(text_1_chunks_tokens)}")
print(
    f"Paper 2 chunk stats: {min(text_2_chunks_tokens)} - {sum(text_2_chunks_tokens) // len(text_2_chunks_tokens)} - {max(text_2_chunks_tokens)}")

Paper 1 chunk stats: 267 - 532 - 815
Paper 2 chunk stats: 81 - 497 - 684


### Embeddings

In [11]:
# from buff.llm.embed import embed_texts
# 
# text_1_embeddings = await embed_texts(text_1_chunks)
# text_2_embeddings = await embed_texts(text_2_chunks)
# 
# # Save the embeddings
# with open("embeddings.json", "w", encoding="utf-8") as f:
#     f.write(json.dumps({
#         "paper_1": text_1_embeddings,
#         "paper_2": text_2_embeddings
#     }))
#     print("Embeddings saved")

# Load the embeddings
with open("embeddings.json", "r", encoding="utf-8") as f:
    text_embeddings = json.load(f)
    text_1_embeddings = text_embeddings["paper_1"]
    text_2_embeddings = text_embeddings["paper_2"]
    print("Embeddings loaded")

print(f"Paper 1 embedded into {len(text_1_embeddings)} chunks")
print(f"Paper 2 embedded into {len(text_2_embeddings)} chunks")

Embeddings saved
Paper 1 embedded into 42 chunks
Paper 2 embedded into 66 chunks


Store the documents

In [15]:
# from buff.llm.models import Document, DocumentMetadata
# 
# # Build the documents
# text_1_documents = [
#     Document(
#         metadata=DocumentMetadata(
#             index=i,
#             title=work_1.title,
#             work_id=str(work_1.id),
#             doi=str(work_1.doi),
#             text=text_1_chunks[i]
#         ),
#         id=f"{work_1.id}-{i}",
#         values=text_1_embeddings[i]
#     )
#     for i in range(len(text_1_chunks))
# ]
# 
# text_2_documents = [
#     Document(
#         metadata=DocumentMetadata(
#             index=i,
#             title=work_2.title,
#             work_id=str(work_2.id),
#             doi=str(work_2.doi),
#             text=text_2_chunks[i]
#         ),
#         id=f"{work_2.id}-{i}",
#         values=text_2_embeddings[i]
#     )
#     for i in range(len(text_2_chunks))
# ]
# print(f"Paper 1: {len(text_1_documents)} documents")
# print(f"Paper 2: {len(text_2_documents)} documents")
# 
# 
# # Create the following lists of data from the 2 documents arrays
# text_chunk_ids = [doc.id for doc in text_1_documents + text_2_documents]
# text_chunk_embeddings = [doc.values for doc in text_1_documents + text_2_documents]
# text_chunk_metadata = [doc.metadata.model_dump(mode="json") for doc in text_1_documents + text_2_documents]
# print(f"Total: {len(text_chunk_ids)} documents.")
# if not len(text_chunk_embeddings) == len(text_chunk_metadata) == len(text_chunk_ids):
#     raise ValueError("Invalid data")
# 
# # Store the embeddings
# embeddings.add(
#     ids=text_chunk_ids,
#     embeddings=text_chunk_embeddings,
#     metadatas=text_chunk_metadata
# )
# print("Embeddings stored")

Paper 1: 42 documents
Paper 2: 66 documents
Total: 108 documents.
Embeddings stored


In [29]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Convert the lists to numpy arrays
text_1_vecs = np.array(text_1_embeddings)
text_2_vecs = np.array(text_2_embeddings)

# Ignore the first and last vectors from each array
text_1_vecs = text_1_vecs[4:-4]
text_2_vecs = text_2_vecs[4:-4]

# Calculate the cosine similarity matrix
similarity_matrix = cosine_similarity(text_1_vecs, text_2_vecs)

# Find the indices of the 5 most common embeddings
most_common_indices = np.unravel_index(similarity_matrix.argpartition(-5, axis=None)[-5:], similarity_matrix.shape)

# Find the indices of the 5 most different embeddings
most_different_indices = np.unravel_index(similarity_matrix.argpartition(5, axis=None)[:5], similarity_matrix.shape)

# Print the index pairs and similarity scores for the most common embeddings
print("5 most common embeddings:")
for i in range(5):
    index_1, index_2 = most_common_indices[0][i], most_common_indices[1][i]
    similarity_score = similarity_matrix[index_1, index_2]
    print(f"Index pair: ({index_1 + 1}, {index_2 + 1}), Similarity score: {similarity_score:.4f}")

# Print the index pairs and similarity scores for the most different embeddings
print("\n5 most different embeddings:")
for i in range(5):
    index_1, index_2 = most_different_indices[0][i], most_different_indices[1][i]
    similarity_score = similarity_matrix[index_1, index_2]
    print(f"Index pair: ({index_1 + 1}, {index_2 + 1}), Similarity score: {similarity_score:.4f}")

5 most common embeddings:
Index pair: (33, 45), Similarity score: 0.8268
Index pair: (33, 44), Similarity score: 0.8282
Index pair: (34, 45), Similarity score: 0.8310
Index pair: (31, 46), Similarity score: 0.8319
Index pair: (31, 42), Similarity score: 0.8380

5 most different embeddings:
Index pair: (10, 53), Similarity score: 0.2544
Index pair: (11, 10), Similarity score: 0.2403
Index pair: (10, 10), Similarity score: 0.2470
Index pair: (10, 24), Similarity score: 0.2588
Index pair: (10, 20), Similarity score: 0.2668


In [30]:
# Print the text chunks for the most common embeddings
print("\n5 most common text chunks:")
for i in range(5):
    index_1, index_2 = most_common_indices[0][i], most_common_indices[1][i]
    print(f"\nText 1 chunk: {text_1_chunks[index_1+4]}")
    print(f"\nText 2 chunk: {text_2_chunks[index_2+4]}")
    print("-"*64 + "\n\n\n")


5 most common text chunks:

Text 1 chunk: Iron-starvation-induced mitophagy mediates lifespan extension upon mitochondrial stress in C. elegans. Curr. Biol. 25, 1810–1822 (2015). 24. Lin, L. et al. The scaffold protein EPG-7 links cargo-receptor complexes with the autophagic assembly machinery. J. Cell Biol. 201, 113–129 (2013). 25. Du, Y., Wooten, M. C., Gearing, M. & Wooten, M. W. Age-associated oxidative damage to the p62 promoter: implications for Alzheimer disease. Free Radic. Biol. Med. 46, 492–501 (2009). 26. Du, Y., Wooten, M. C. & Wooten, M. W. Oxidative damage to the promoter region of SQSTM1/p62 is common to neurodegenerative disease. Neurobiol. Dis. 35, 302–310 (2009). 27. Richter, K., Haslbeck, M. & Buchner, J. The heat shock response: life on the verge of death. Mol. Cell 40, 253–266 (2010). 28. Vabulas, R. M., Raychaudhuri, S., Hayer-Hartl, M. & Hartl, F. U. Protein folding in the cytoplasm and the heat shock response. Cold Spring Harb. Perspect. Biol. 2, a004390 (2010)

In [31]:
# Print the text chunks for the most different embeddings
print("\n5 most different text chunks:")
for i in range(5):
    index_1, index_2 = most_different_indices[0][i], most_different_indices[1][i]
    print(f"\nText 1 chunk: {text_1_chunks[index_1+4]}")
    print(f"\nText 2 chunk: {text_2_chunks[index_2+4]}")
    print("-"*64 + "\n\n\n")


5 most different text chunks:

Text 1 chunk: WT-CTRL animals (N = 126) compared with WT-HS animals (N = 110): P < 0.0001; sqst-1-CTRL animals (N = 128) (P = 0.3 to WT) compared with sqst-1-HS animals (N = 115): P = 0.004, by log-rank test. See Supplementary Table 2 for experimental details and additional repeats. c Neuronal PolyQ (rgef-1p::Q40::yfp) aggregates detected on day 4 of adulthood in WT and sqst-1(ok2892) (sqst-1) animals maintained under control conditions (20 °C, CTRL) or subjected to HS (1 h at 36 °C) on day 1 of adulthood (HS). Scale bar 20 μm. The number of neuronal PolyQ aggregates were quantiﬁed from three independent experiments in WT-CTRL, N = 43; WT-HS, N = 50; sqst-1-CTRL, N = 48, sqst-1-HS, N = 45 animals. Error bars indicate 95% CI. ns: P > 0.6, **P < 0.01, ****P < 0.0001, by two-way ANOVA with Tukey’s multiple comparisons test. Scale bar 20 and 10 μm for close-up. d, e FRAP measurements of neuronal PolyQ (rgef-1p::Q40::yfp) aggregates on day 8 of adulthood in W