In [1]:
!pip install pymongo transformers torch

from pymongo import MongoClient
from transformers import AutoTokenizer, AutoModel
import torch
from kaggle_secrets import UserSecretsClient



In [2]:
# Read the MongoDB connection string from Kaggle's secret store so it is
# never written into the notebook itself.
user_secrets = UserSecretsClient()
uri = user_secrets.get_secret("MongoDB URI")

# Open the "sections" collection inside the "test" database.
client = MongoClient(uri)
db = client.get_database("test")
collection = db.get_collection("sections")

# Sanity check: report how many documents the collection currently holds.
section_count = collection.count_documents({})
print(section_count)

229


In [3]:
# Hugging Face access token, read from Kaggle secrets and handed to both
# from_pretrained calls below.
# NOTE: the token is not validated here; it is passed through as returned.
HF = user_secrets.get_secret("HF Token")

# Single source of truth for the checkpoint id, so the tokenizer and the
# model are always loaded from the same repository.
MODEL_NAME = "mental/mental-bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF)
model = AutoModel.from_pretrained(MODEL_NAME, token=HF)


def compute_embedding(text):
    """Generate an embedding using MentalBERT.

    The text is tokenized with the module-level ``tokenizer`` (truncating
    over-long input), run through the module-level ``model`` without
    gradient tracking, and the token vectors of ``last_hidden_state`` are
    averaged into one vector per input.

    The average is weighted by the attention mask, so padding positions do
    not contribute. For a single string there is no padding and the result
    is the plain mean over all tokens; for a list of strings of different
    lengths this keeps the shorter texts from being diluted by pad tokens.

    Args:
        text: A string, or a list of strings to embed as one batch.

    Returns:
        A flat list of floats for a single text, or a list of such lists
        (one per text) when a batch of several texts is given.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # (batch, tokens) -> (batch, tokens, 1) so it broadcasts over the hidden axis.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)

    summed = (hidden_states * mask).sum(dim=1)
    # clamp guards against division by zero if a row had no attended tokens.
    token_counts = mask.sum(dim=1).clamp(min=1)

    return (summed / token_counts).squeeze().tolist()


tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/639 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Only the text is needed to build an embedding (``_id`` is returned by
# default), so project it explicitly. On a re-run this avoids pulling every
# previously stored embedding vector back over the wire.
sections = collection.find({}, {"section_text": 1})

# Ids of documents that could not be embedded because they carry no usable
# text; they are reported instead of aborting the whole run with a KeyError.
skipped_ids = []

for section in sections:
    content = section.get("section_text")
    if not isinstance(content, str) or not content.strip():
        skipped_ids.append(section["_id"])
        continue

    embedding = compute_embedding(content)

    # Write the vector back onto the same document, leaving other fields as-is.
    collection.update_one(
        {"_id": section["_id"]},
        {"$set": {"embedding": embedding}}
    )

if skipped_ids:
    print(f"Skipped {len(skipped_ids)} section(s) with missing or empty section_text.")
print("Embeddings computed and stored successfully!")


Embeddings computed and stored successfully!


In [5]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_similar_sections(query, collection, top_k=5):
    """Find the most similar sections to a query using cosine similarity.

    The query is embedded with ``compute_embedding`` and compared against
    the ``embedding`` field of every document in ``collection``.

    Documents that have no ``embedding`` field, or whose embedding does not
    have the same length as the query embedding, are skipped rather than
    raising, so a partially embedded collection can still be searched.

    Args:
        query: Text to search for.
        collection: Collection object whose ``find(filter, projection)``
            yields dict-like documents with ``_id``, ``section_text`` and
            ``embedding`` fields.
        top_k: Number of best matches to return (applied as a slice).

    Returns:
        A list of ``(section, similarity_score)`` tuples, sorted by score in
        descending order and cut to ``top_k`` entries. Empty when no document
        has a usable embedding.
    """
    query_embedding = np.array(compute_embedding(query)).reshape(1, -1)
    expected_dim = query_embedding.shape[1]

    projection = {"_id": 1, "section_text": 1, "embedding": 1}
    candidates = []
    for section in collection.find({}, projection):
        stored = section.get("embedding")
        if isinstance(stored, (list, tuple)) and len(stored) == expected_dim:
            candidates.append(section)

    if not candidates:
        return []

    # Score every candidate in a single call instead of one call per document.
    embedding_matrix = np.array([section["embedding"] for section in candidates])
    scores = cosine_similarity(query_embedding, embedding_matrix)[0]

    # Sort by similarity score (higher is better); sorted() is stable, so
    # ties keep the order in which the collection returned them.
    similarities = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)

    return similarities[:top_k]

# Example lookup: retrieve the default number of best-matching sections.
query = "What are the symptoms of anxiety?"
top_results = find_similar_sections(query, collection)

# Print each hit's score (four decimals) followed by the first 100
# characters of its text.
for section, score in top_results:
    preview = section["section_text"][:100]
    print(f"Score: {score:.4f} | Section: {preview}...")


Score: 0.7745 | Section: 266            CHAPTER NINE
2. What are some unique characteristics common to all of the behavioral
...
Score: 0.7732 | Section: 388            CHAPTER THIRTEEN
requirements?  What  cultural  supports  (in  your  familycommunityw...
Score: 0.7711 | Section: 440            CHAPTER FIFTEEN
Clients Experience in Therapy
Most  clients  share  some  degree  of ...
Score: 0.7700 | Section: 110            CHAPTER FIVE
different,  and  what  would  you  be  doing  differently,  if  you  did...
Score: 0.7668 | Section: 66            CHAPTER FOUR
TabLE 4.2     Comparison of Freuds Psychosexual Stages and Eriksons
Psych...
