# Citations Analysis using Embeddings

Initialize ChromaDB

In [1]:
import chromadb

from config import DATA_DIR

In [2]:
# On-disk location of the Chroma store: the "chroma" folder under DATA_DIR, as an absolute path
chroma_db_path = (DATA_DIR / "chroma").resolve()

# Open the persistent client at that path and fetch the "analysis" collection,
# creating it if it does not exist yet
chroma = chromadb.PersistentClient(path=str(chroma_db_path))
embeddings = chroma.get_or_create_collection(name="analysis")

## Load Data

In [3]:
from buff.openalex import Work
from buff.openalex.download import get_paper_text
from buff.llm.utils import get_token_count

## Papers

**Paper 2 - Cites - Paper 1**
$\text{Paper 2} \rightarrow \text{Paper 1}$

In [4]:
# Work ids of the cited paper (paper_1) and the citing paper (paper_2)
paper_1, paper_2 = "W2994792393", "W3190631166"

In [5]:
work_1 = await Work(paper_1).data
work_2 = await Work(paper_2).data

In [6]:
# Show the title of each work next to its label
for heading, work in (("Paper 1: ", work_1), ("Paper 2: ", work_2)):
    print(heading, work.title)

Paper 1:  The autophagy receptor p62/SQST-1 promotes proteostasis and longevity in C. elegans by inducing autophagy
Paper 2:  Autophagy in healthy aging and disease


In [7]:
text_1 = await get_paper_text(work=work_1)
text_2 = await get_paper_text(work=work_2)

print(f"Paper 1 loaded: {get_token_count(text_1)} tokens")
print(f"Paper 2 loaded: {get_token_count(text_2)} tokens")

Paper 1 loaded: 22380 tokens
Paper 2 loaded: 32836 tokens


## Text Processing

In [ ]:
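# Disabled one-off step, kept for reference: splits both papers into chunks and
# writes them to chunks.json, which the next cell reads back.
# Note: re-enabling it also requires `import json` to run before this cell.
#
# from buff.llm.split import split_text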
# 
# text_1_chunks = split_text(text_1)
# text_2_chunks = split_text(text_2)
# 
# # Save the chunks
# with open("chunks.json", "w", encoding="utf-8") as f:
#     f.write(json.dumps({
#         "paper_1": text_1_chunks,
#         "paper_2": text_2_chunks
#     }))
#     print("Chunks saved")

In [9]:
import json

# Read the chunk lists of both papers back from chunks.json
with open("chunks.json", encoding="utf-8") as chunks_file:
    text_chunks = json.load(chunks_file)

# The file holds one list of chunks per paper
text_1_chunks = text_chunks["paper_1"]
text_2_chunks = text_chunks["paper_2"]
print("Chunks loaded")

paper_1_chunk_count = len(text_1_chunks)
paper_2_chunk_count = len(text_2_chunks)
print(f"Paper 1 split into {paper_1_chunk_count} chunks")
print(f"Paper 2 split into {paper_2_chunk_count} chunks")

Chunks saved
Paper 1 split into 42 chunks
Paper 2 split into 66 chunks


Print the minimum, mean, and maximum chunk token counts for each paper

In [10]:
def _min_mean_max(token_counts):
    """Format token counts as "min - mean - max"; the mean uses floor division."""
    return f"{min(token_counts)} - {sum(token_counts) // len(token_counts)} - {max(token_counts)}"


# Token count of every chunk, per paper
text_1_chunks_tokens = list(map(get_token_count, text_1_chunks))
text_2_chunks_tokens = list(map(get_token_count, text_2_chunks))

print(f"Paper 1 chunk stats: {_min_mean_max(text_1_chunks_tokens)}")
print(f"Paper 2 chunk stats: {_min_mean_max(text_2_chunks_tokens)}")

Paper 1 chunk stats: 267 - 532 - 815
Paper 2 chunk stats: 81 - 497 - 684


### Embeddings

In [11]:
# Disabled reference snippet: embeds the chunks of both papers and writes the
# result to embeddings.json. Only the loading code below it is active.
#
# from buff.llm.embed import embed_texts
#
# text_1_embeddings = await embed_texts(text_1_chunks)
# text_2_embeddings = await embed_texts(text_2_chunks)
#
# # Save the embeddings
# with open("embeddings.json", "w", encoding="utf-8") as f:
#     f.write(json.dumps({
#         "paper_1": text_1_embeddings,
#         "paper_2": text_2_embeddings
#     }))
#     print("Embeddings saved")

# Read the embeddings of both papers back from embeddings.json
with open("embeddings.json", encoding="utf-8") as embeddings_file:
    text_embeddings = json.load(embeddings_file)

# The file holds one list of embeddings per paper
text_1_embeddings = text_embeddings["paper_1"]
text_2_embeddings = text_embeddings["paper_2"]
print("Embeddings loaded")

paper_1_embedding_count = len(text_1_embeddings)
paper_2_embedding_count = len(text_2_embeddings)
print(f"Paper 1 embedded into {paper_1_embedding_count} chunks")
print(f"Paper 2 embedded into {paper_2_embedding_count} chunks")

Embeddings saved
Paper 1 embedded into 42 chunks
Paper 2 embedded into 66 chunks


Store the documents

In [15]:
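# Disabled one-off step, kept for reference: builds one Document per chunk for
# both papers and adds their ids, embeddings and metadata to the `embeddings`
# collection.
# NOTE(review): presumably disabled because the collection is persisted on disk
# and was already populated - confirm before re-enabling.
#
# from buff.llm.models import Document, DocumentMetadata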
# 
# # Build the documents
# text_1_documents = [
#     Document(
#         metadata=DocumentMetadata(
#             index=i,
#             title=work_1.title,
#             work_id=str(work_1.id),
#             doi=str(work_1.doi),
#             text=text_1_chunks[i]
#         ),
#         id=f"{work_1.id}-{i}",
#         values=text_1_embeddings[i]
#     )
#     for i in range(len(text_1_chunks))
# ]
# 
# text_2_documents = [
#     Document(
#         metadata=DocumentMetadata(
#             index=i,
#             title=work_2.title,
#             work_id=str(work_2.id),
#             doi=str(work_2.doi),
#             text=text_2_chunks[i]
#         ),
#         id=f"{work_2.id}-{i}",
#         values=text_2_embeddings[i]
#     )
#     for i in range(len(text_2_chunks))
# ]
# print(f"Paper 1: {len(text_1_documents)} documents")
# print(f"Paper 2: {len(text_2_documents)} documents")
# 
# 
# # Create the following lists of data from the 2 documents arrays
# text_chunk_ids = [doc.id for doc in text_1_documents + text_2_documents]
# text_chunk_embeddings = [doc.values for doc in text_1_documents + text_2_documents]
# text_chunk_metadata = [doc.metadata.model_dump(mode="json") for doc in text_1_documents + text_2_documents]
# print(f"Total: {len(text_chunk_ids)} documents.")
# if not len(text_chunk_embeddings) == len(text_chunk_metadata) == len(text_chunk_ids):
#     raise ValueError("Invalid data")
# 
# # Store the embeddings
# embeddings.add(
#     ids=text_chunk_ids,
#     embeddings=text_chunk_embeddings,
#     metadatas=text_chunk_metadata
# )
# print("Embeddings stored")

Paper 1: 42 documents
Paper 2: 66 documents
Total: 108 documents.
Embeddings stored


In [35]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
# Convert the lists to numpy arrays
text_1_vecs = np.array(text_1_embeddings)
text_2_vecs = np.array(text_2_embeddings)

text_vecs_offset = 3

# Ignore the first and last few vectors from each array
text_1_vecs = text_1_vecs[text_vecs_offset:-text_vecs_offset]
text_2_vecs = text_2_vecs[text_vecs_offset:-text_vecs_offset]

In [40]:
# Calculate the cosine similarity matrix
similarity_matrix = cosine_similarity(text_1_vecs, text_2_vecs)

# Find the indices of the 10 most common embeddings
most_common_indices = np.unravel_index(similarity_matrix.argpartition(-10, axis=None)[-10:], similarity_matrix.shape)

# Find the indices of the 10 most different embeddings
most_different_indices = np.unravel_index(similarity_matrix.argpartition(10, axis=None)[:10], similarity_matrix.shape)

# Sort the most common indices based on similarity scores in descending order
most_common_scores = similarity_matrix[most_common_indices]
most_common_sorted_indices = np.argsort(-most_common_scores)
most_common_indices = (most_common_indices[0][most_common_sorted_indices], most_common_indices[1][most_common_sorted_indices])

# Sort the most different indices based on similarity scores in ascending order
most_different_scores = similarity_matrix[most_different_indices]
most_different_sorted_indices = np.argsort(most_different_scores)
most_different_indices = (most_different_indices[0][most_different_sorted_indices], most_different_indices[1][most_different_sorted_indices])

# Print the index pairs and similarity scores for the most common embeddings in descending order
print("10 most common embeddings (in descending order of similarity):")
for i in range(10):
    index_1, index_2 = most_common_indices[0][i], most_common_indices[1][i]
    similarity_score = similarity_matrix[index_1, index_2]
    print(f"Index pair: ({index_1 + text_vecs_offset}, {index_2 + text_vecs_offset}), Similarity score: {similarity_score:.4f}")

# Print the index pairs and similarity scores for the most different embeddings in ascending order
print("\n10 most different embeddings (in ascending order of similarity):")
for i in range(10):
    index_1, index_2 = most_different_indices[0][i], most_different_indices[1][i]
    similarity_score = similarity_matrix[index_1, index_2]
    print(f"Index pair: ({index_1 + text_vecs_offset}, {index_2 + text_vecs_offset}), Similarity score: {similarity_score:.4f}")

10 most common embeddings (in descending order of similarity):
Index pair: (38, 48), Similarity score: 0.8527
Index pair: (38, 46), Similarity score: 0.8487
Index pair: (34, 45), Similarity score: 0.8380
Index pair: (38, 47), Similarity score: 0.8371
Index pair: (38, 62), Similarity score: 0.8338
Index pair: (34, 49), Similarity score: 0.8319
Index pair: (37, 48), Similarity score: 0.8310
Index pair: (36, 47), Similarity score: 0.8282
Index pair: (36, 48), Similarity score: 0.8268
Index pair: (36, 46), Similarity score: 0.8259

10 most different embeddings (in ascending order of similarity):
Index pair: (14, 13), Similarity score: 0.2403
Index pair: (13, 13), Similarity score: 0.2470
Index pair: (13, 56), Similarity score: 0.2544
Index pair: (13, 27), Similarity score: 0.2588
Index pair: (13, 23), Similarity score: 0.2668
Index pair: (15, 56), Similarity score: 0.2676
Index pair: (15, 13), Similarity score: 0.2693
Index pair: (13, 32), Similarity score: 0.2735
Index pair: (33, 23), Sim

In [41]:
# Print the text chunks for the most common embeddings
print("\n10 most common text chunks:")
for i in range(10):
    index_1, index_2 = most_common_indices[0][i], most_common_indices[1][i]
    print(f"\nText 1 chunk: {text_1_chunks[index_1 + text_vecs_offset]}")
    print(f"\nText 2 chunk: {text_2_chunks[index_2 + text_vecs_offset]}")
    print("-"*64 + "\n\n\n")


10 most common text chunks:

Text 1 chunk: Integrating the stress response: lessons for neurodegenerative diseases from C. elegans. Trends Cell Biol. 19, 52–61 (2009). 47. Ben-Zvi, A., Miller, E. A. & Morimoto, R. I. Collapse of proteostasis represents an early molecular event in Caenorhabditis elegans aging. Proc. Natl Acad. Sci. USA 106, 14914–14919 (2009). 48. Vilchez, D. et al. RPN-6 determines C. elegans longevity under proteotoxic stress conditions. Nature 489, 263–268 (2012). 49. Sun, T., Wang, X., Lu, Q., Ren, H. & Zhang, H. CUP-5, the C. elegans ortholog of the mammalian lysosomal channel protein MLN1/TRPML1, is required for proteolytic degradation in autolysosomes. Autophagy 7, 1308–1315 (2011). 50. Wu, Y. et al. PI3P phosphatase activity is required for autophagosome maturation and autolysosome formation. EMBO Rep. 15, 973–981 (2014). 51. Toth, M. L. et al. Longevity pathways converge on autophagy genes to regulate life span in Caenorhabditis elegans. Autophagy 4, 330–338 (

In [42]:
# Print the text chunks for the most different embeddings
print("\n10 most different text chunks:")
for i in range(10):
    index_1, index_2 = most_different_indices[0][i], most_different_indices[1][i]
    print(f"\nText 1 chunk: {text_1_chunks[index_1 + text_vecs_offset]}")
    print(f"\nText 2 chunk: {text_2_chunks[index_2 + text_vecs_offset]}")
    print("-"*64 + "\n\n\n")


10 most different text chunks:

Text 1 chunk: NATURE COMMUNICATIONS | https://doi.org/10.1038/s41467-019-13540-4 ARTICLE NATURE COMMUNICATIONS | (2019) 10:5648 | https://doi.org/10.1038/s41467-019-13540-4 | www.nature.com/naturecommunications 5 100 a b c d e h f g 75 Percent alive Percent alive Percent alive Percent alive Percent alive Percent fluorescence intensity Percent fluorescence intensity Percent alive Aggregates in head 50 25 0 100 75 50 25 0 100 75 50 25 0 100 75 50 25 0 100 75 50 25 0 100 75 50 25 0 0 10 20 WT-CTRL WT-CTRL WT-HS WT-CTRL WT-HS P < 0.0001 P < 0.0001 P < 0.0001 P < 0.0001 P < 0.0001 P = 0.4 P = 0.8 P = 0.0003 P = 0.9 WT WT sqst-1 sqst-1; SQST-1 Neuronal SQST-1 OE Neuronal SQST-1 OE-CTRL Neuronal SQST-1 OE-HS Neuronal Q40 Neuronal Q40 ;SQST-1 OE SQST-1 OE-CTRL SQST-1 OE SQST-1 OE SQST-1 OE WT SQST-1 OE-HS sqst-1; SQST-1ΔUBA SQST-1 OE-CTRL Days 30 0 10 20 Days 30 0 1 2 2 3 3 1 10 20 Days FRAP of neuronal Q40 Neuronal Q40 aggregates Diffuse neuronal Q40 WT WT SQS