In [None]:
import os

# Move two levels up from the directory the kernel started in, so that the
# relative "data/..." paths used below resolve. The start directory is
# remembered the first time this cell runs; re-running the cell therefore
# always lands in the same place instead of climbing two more levels each time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()

os.chdir(os.path.abspath(os.path.join(_NOTEBOOK_START_DIR, "..", "..")))
os.getcwd()

In [None]:
import numpy as np
import pandas as pd
import chromadb

from chromadb.utils import embedding_functions


# Settings

In [None]:
# Persistent ChromaDB store location.
CHROMADB_PATH = "data/chroma"

# Input: parquet file with the publication texts.
TEXTS_FILE = "data/SB_publication_PMC_texts.parquet"

# Output: table of recommendations written at the end of the notebook.
OUTPUT_FILE = "data/SB_publication_PMC_recommendations.csv"

# Read data

In [None]:
data = pd.read_parquet(TEXTS_FILE)

# Fail fast if a column that later cells index by name is missing, instead of
# surfacing a KeyError halfway through the (slow) indexing step.
_required_columns = {"pmc", "title", "abstract", "text", "link"}
_missing_columns = _required_columns - set(data.columns)
assert not _missing_columns, f"{TEXTS_FILE} is missing columns: {sorted(_missing_columns)}"

data.head()

# Create vector database

In [None]:
# Open the on-disk ChromaDB store located at CHROMADB_PATH.
client = chromadb.PersistentClient(CHROMADB_PATH)

In [None]:
# Name of the sentence-transformers model handed to the embedding function.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_MODEL_NAME
)

In [None]:
def chunk_text(text, chunk_size=512, overlap=64):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: The string to split. Non-string values (e.g. ``None`` or NaN
            for a missing document) yield an empty list.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings, each made of words joined by single spaces.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would make range() fail or silently yield nothing.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    if not isinstance(text, str):
        return []

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a chunk reaches the end of the text; a further chunk would
        # contain only words that are already part of this one.
        if start + chunk_size >= len(words):
            break
    return chunks

In [None]:
# get_or_create keeps this cell re-runnable against the persistent store
# (create_collection raises once the collection already exists), and the
# embedding function configured above is passed explicitly so documents are
# embedded with it rather than with the library default.
# NOTE(review): if a "publications" collection was already built with a
# different embedding function, it may need to be deleted first — confirm
# against the installed chromadb version.
collection = client.get_or_create_collection(
    name="publications",
    embedding_function=embedder,
)

In [None]:
# Index every publication: one record for the title, one for the abstract and
# one per chunk of the full text. All records of a publication are sent in a
# single upsert call, so they are embedded as one batch and re-running the
# cell overwrites existing records instead of failing or duplicating them.
for _, row in data.iterrows():
    # Stored as a string so that it matches the string ids used when the
    # collection is queried by pmc later on.
    pmc = str(row["pmc"])

    documents, metadatas, ids = [], [], []

    # Title and abstract are indexed whole; missing or blank values are skipped.
    for field in ("title", "abstract"):
        value = row[field]
        if isinstance(value, str) and value.strip():
            documents.append(value)
            metadatas.append({"pmc": pmc, "type": field})
            ids.append(f"{pmc}_{field}")

    # Full text is indexed in overlapping chunks.
    text = row["text"]
    if isinstance(text, str):
        for j, chunk in enumerate(chunk_text(text)):
            documents.append(chunk)
            metadatas.append({"pmc": pmc, "type": "text_chunk", "chunk_id": j})
            ids.append(f"{pmc}_text_{j}")

    if documents:
        collection.upsert(documents=documents, metadatas=metadatas, ids=ids)

# Find similar articles per PMC

In [None]:
# Get the collection
collection = client.get_collection(name="publications")

# Get all pmc ids (as strings, which is how they are used in the metadata filter)
all_pmcs = data['pmc'].astype(str).tolist()

# Gather all embeddings stored for each pmc. The filter is on the pmc metadata
# only, so title, abstract and text-chunk records are all included.
# NOTE(review): earlier comments said "text_chunks only"; if that is the intent,
# add a {"type": "text_chunk"} condition to the filter — confirm.
pmc_embeddings = {}
for pmc in all_pmcs:
    if pmc in pmc_embeddings:
        # Duplicate pmc in the input: already fetched.
        continue
    results = collection.get(
        where={"pmc": pmc},
        include=["embeddings"]
    )
    # Skip pmcs for which the collection holds no records
    embeddings = results["embeddings"]
    if embeddings is not None and len(embeddings) > 0:
        pmc_embeddings[pmc] = np.array(embeddings)

if not pmc_embeddings:
    raise ValueError(
        "No embeddings found for any pmc; make sure the collection has been "
        "populated and that the 'pmc' metadata matches the string ids in the data."
    )

# Calculate mean embedding for each pmc
pmc_mean_embeddings = {pmc: emb.mean(axis=0) for pmc, emb in pmc_embeddings.items()}

# Stack all mean embeddings for distance calculation (row i belongs to pmc_ids[i])
pmc_ids = list(pmc_mean_embeddings.keys())
mean_emb_matrix = np.stack([pmc_mean_embeddings[pmc] for pmc in pmc_ids])

# Calculate the mean embedding of all pmcs
global_mean_embedding = mean_emb_matrix.mean(axis=0)

# Compute cosine similarity to the global mean embedding
def cosine_similarity(a, b):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector (array-like of numbers), same length as ``a``.

    Returns:
        The dot product of the two vectors after scaling each to unit length.
        If either vector has zero length the similarity is undefined; 0.0 is
        returned instead of NaN so that sorting the results stays well-defined.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        # Dividing by a zero norm would produce NaN (and a runtime warning).
        return 0.0
    return np.dot(a / norm_a, b / norm_b)

# Similarity of every per-pmc mean embedding to the global mean embedding,
# in the same order as the rows of mean_emb_matrix.
similarities = []
for row_embedding in mean_emb_matrix:
    similarities.append(cosine_similarity(global_mean_embedding, row_embedding))

# Rank rows from most to least similar and keep the five best.
descending_order = np.argsort(similarities)[::-1]
top5_indices = descending_order[:5]
top5_pmcs = []
for position in top5_indices:
    top5_pmcs.append(pmc_ids[position])

In [None]:
# Compute the (up to) 5 closest pmcs for each pmc, based on cosine similarity
# of the per-pmc mean embeddings.
top5_similar_pmcs = {}

# Scale every row to unit length once, so a cosine similarity is a plain dot
# product. Rows with zero norm are left as zeros (similarity 0 to everything)
# rather than being divided by zero.
_row_norms = np.linalg.norm(mean_emb_matrix, axis=1, keepdims=True)
_unit_embeddings = mean_emb_matrix / np.where(_row_norms > 0, _row_norms, 1.0)

for i, pmc_id in enumerate(pmc_ids):
    # Similarity of this pmc to every pmc (including itself) in one product
    sims = _unit_embeddings @ _unit_embeddings[i]
    # Push self to the end of the ranking
    sims[i] = float('-inf')
    ranked = np.argsort(sims)[::-1]
    # Drop self explicitly: with five or fewer pmcs the -inf entry would
    # otherwise still make it into the top 5.
    top5_idx = [j for j in ranked if j != i][:5]
    top5_similar_pmcs[pmc_id] = [pmc_ids[j] for j in top5_idx]

In [None]:
# Create a copy of data without 'abstract' and 'text' columns
data_similar = data.drop(columns=['abstract', 'text']).copy()

# Title of each publication keyed by its pmc as a string. The recommended ids
# are strings, so the lookup must not depend on the dtype of data['pmc'].
# For a duplicated pmc the first title wins.
title_by_pmc = (
    data.assign(pmc=data['pmc'].astype(str))
    .drop_duplicates(subset='pmc')
    .set_index('pmc')['title']
    .to_dict()
)

# One output row per (publication, recommended publication) pair
rows = []
for base_row in data_similar.to_dict(orient='records'):
    # Up to 5 recommendations; publications without any produce no rows
    similar_pmcs = top5_similar_pmcs.get(str(base_row['pmc']), [])
    for rank, rec_pmc in enumerate(similar_pmcs[:5], 1):
        rows.append({
            **base_row,
            'recommended_rank': rank,
            'recommended_pmc': rec_pmc,
            # Empty string when the recommended pmc is not present in data
            'recommended_title': title_by_pmc.get(rec_pmc, ""),
        })

# Create the new dataframe
data_similar = pd.DataFrame(rows)

In [None]:
# Attach the link of each recommended publication. The join key is the pmc id
# rather than the title: titles are not guaranteed to be unique, and a join on
# a non-unique column would duplicate rows or attach the wrong link.
link_by_pmc = (
    data[["pmc", "link"]]
    .assign(pmc=data["pmc"].astype(str))
    .drop_duplicates(subset="pmc")
    .rename(columns={"pmc": "recommended_pmc", "link": "recommended_link"})
)
data_similar = (
    data_similar
    # Dropping a link column left by a previous run keeps this cell re-runnable.
    .drop(columns=["recommended_link"], errors="ignore")
    .merge(link_by_pmc, how="left", on="recommended_pmc", validate="many_to_one")
)
data_similar.head()

In [None]:
# Write the recommendation table to OUTPUT_FILE as pipe-delimited UTF-8 text,
# without the dataframe index.
data_similar.to_csv(
    OUTPUT_FILE,
    sep="|",
    encoding="utf-8",
    index=False,
)