In [12]:
import pandas as pd
import torch
import chromadb
from chromadb.config import Settings
from pathlib import Path
from tqdm import tqdm
import numpy as np

In [None]:
embedding_path = Path("../embeddings/bio_clincalbert_embeddings.pt")
input_text_path = Path("../embeddings/input_texts.txt")
csv_path = Path("../processed/training_data.csv")
chroma_dir = Path("../vector_db/chroma")

# === Load Data ===
print("Loading embeddings...")
embeddings = torch.load(embedding_path).tolist()

print("Loading input texts...")
with open(input_text_path, "r", encoding="utf-8") as f:
    input_texts = [line.strip() for line in f.readlines() if line.strip()]

print("Loading training CSV...")
df = pd.read_csv(csv_path)

# === Init ChromaDB ===
chroma_dir.mkdir(parents=True, exist_ok=True)

# Updated client initialization
client = chromadb.PersistentClient(path=str(chroma_dir))
collection = client.get_or_create_collection(
    name="patient_embeddings",
    metadata={"hnsw:space": "cosine"}  # Optional: specify similarity metric
)

# === Store into ChromaDB ===
print("Storing into ChromaDB...")
batch_size = 100  # Process in batches for large datasets

for i in tqdm(range(0, len(input_texts), batch_size)):
    batch_texts = input_texts[i:i+batch_size]
    batch_embeddings = embeddings[i:i+batch_size]
    
    # Ensure metadata always has valid values
    batch_metadata = []
    valid_entries = []
    
    for j in range(i, min(i+batch_size, len(df))):
        # Get current index in batch
        idx_in_batch = j - i
        
        # Skip if text or embedding is empty
        if idx_in_batch >= len(batch_texts) or idx_in_batch >= len(batch_embeddings):
            continue
            
        text = batch_texts[idx_in_batch]
        embedding = batch_embeddings[idx_in_batch]
        
        if not text or not isinstance(embedding, (list, np.ndarray)):
            continue
            
        # Prepare metadata with fallback values
        metadata = {
            "unit_no": str(df.iloc[j].get("unit no", "unknown")),
            "name": str(df.iloc[j].get("name", "unknown")),
            "summary": str(df.iloc[j].get("summary", "none"))
        }
        
        # Ensure embedding is a list of floats
        processed_embedding = [float(x) for x in embedding]
        
        valid_entries.append({
            "text": text,
            "embedding": processed_embedding,
            "metadata": metadata,
            "id": f"train_doc_{j}"
        })
    
    if not valid_entries:
        continue
        
    # Prepare batch data
    documents = [entry["text"] for entry in valid_entries]
    embeddings_list = [entry["embedding"] for entry in valid_entries]
    metadatas = [entry["metadata"] for entry in valid_entries]
    ids = [entry["id"] for entry in valid_entries]
    
    # Add to collection
    collection.add(
        documents=documents,
        embeddings=embeddings_list,
        metadatas=metadatas,
        ids=ids
    )

print(f"✅ Stored {collection.count()} documents to ChromaDB at {chroma_dir}")

Loading embeddings...
Loading input texts...
Loading training CSV...
Storing into ChromaDB...


100%|██████████| 705/705 [00:00<00:00, 1302.65it/s]


✅ Stored 800 documents to ChromaDB at ../vector_db/chroma


In [None]:
query_text = "Name: Myra Shah Unit No: 71957719 Admission Date: 2022-06-22 Date Of Birth: 1977-08-01 Sex: F Service: ORTHOPAEDICS Allergies: Penicillins / Amoxicillin / Ultram / hydrocodone / meloxicam / \nomnipague 240 Attending: Dr. Sharma Chief Complaint: left shoulder osteoarthritis/pain Major Surgical Or Invasive Procedure: nan History Of Present Illness: nan Past Medical History: dyslipidemia, heart murmur, OSA (remote hx, resolved w/weight \nloss), migraines, spinal stenosis, vertigo, hypothyroidism, \nGERD, pancreatic cyst, anemia, depression, s/p B/L TKRs, R TSR \n(___), tonsillectomy, L hand ___ digit arthrodesis (___) Social History: nan Family History: Non-contributory Physical Exam: Well appearing in no acute distress  \n Afebrile with stable vital signs  \n Pain well-controlled  \n Respiratory: CTAB  \n Cardiovascular: RRR  \n Gastrointestinal: NT/ND  \n Genitourinary: Voiding independently  \n Neurologic: Intact with no focal deficits  \n Psychiatric: Pleasant, A&O x3  \n Musculoskeletal Upper Extremity:  \n * Incision healing well \n * Scant serosanguinous drainage  \n * ___ strength  \n * SILT, NVI distally  \n * Fingers warm Pertinent Results: nan Medications On Admission: 1. Estrogens Conjugated 0.625 gm VG 1X/WEEK (MO) \n2. FLUoxetine 50 mg PO DAILY \n3. Levothyroxine Sodium 75 mcg PO DAILY \n4. Omeprazole 20 mg PO DAILY \n5. Simvastatin 20 mg PO QPM \n6. Spironolactone 50 mg PO DAILY \n7. Acetaminophen ___ mg PO Q6H:PRN pain \n8. Vitamin D 1000 UNIT PO DAILY \n9. Cyanocobalamin 1000 mcg PO DAILY Brief Hospital Course: The patient was admitted to the orthopedic surgery service and \nwas taken to the operating room for above described procedure. \nPlease see separately dictated operative report for details. The \nsurgery was uncomplicated and the patient tolerated the \nprocedure well. Patient received perioperative IV antibiotics.\n\nPostoperative course was remarkable for the following:\n\nOn POD#0, she was oliguric and was bloused 500cc NS. On POD #1, \nshe was hypotensive and bloused 1L of NS and continued to be \nhypotensive. Later in the day she became hypotensive, short of \nbreath, and hypoxic. A CXR was obtained and was unremarkable. A \nCTA was ordered which was negative for a pulmonary embolism. \nPOD# 2, she continued to have low O2 sats. She was weaned down \non her oxygen and respond well when ambulating with physical \ntherapy, but would de-sat upon laying flat. Medicine was \nconsulted which recommended getting a BNP which was mildly \nelevated, and Tropins which were within normal limits. On POD \n#3, Her oxygen saturation improved. \n\nOtherwise, pain was controlled with a combination of IV and oral \npain medications.  The patient received Aspirin for DVT \nprophylaxis.  Labs were checked throughout the hospital course \nand repleted accordingly. At the time of discharge the patient \nwas tolerating a regular diet and feeling well.  The patient was \nafebrile with stable vital signs.  The patient's hematocrit was \nacceptable and pain was adequately controlled on an oral \nregimen. The operative extremity was neurovascularly intact and \nthe wound was benign. \n\nThe patient's weight-bearing status is non-weight bearing as \ntolerated on the operative extremity.\n\nMs. ___ is discharged to home in stable condition. Discharge Medications: 1. Levothyroxine Sodium 75 mcg PO DAILY \n2. Omeprazole 20 mg PO DAILY \n3. Simvastatin 20 mg PO QPM \n4. Vitamin D 1000 UNIT PO DAILY \n5. Cyanocobalamin 1000 mcg PO DAILY \n6. Estrogens Conjugated 0.625 gm VG 1X/WEEK (MO) \n7. FLUoxetine 50 mg PO DAILY \n8. Docusate Sodium 100 mg PO BID \n9. OxycoDONE (Immediate Release)  ___ mg PO Q4H:PRN pain \nplease no driving or drinking alcohol while taking this \nmedication \n10. Aspirin EC 325 mg PO DAILY \n11. Senna 17.2 mg PO HS \n12. TraMADol 25 mg PO Q6H:PRN pain Discharge Diagnosis: left shoulder osteoarthritis/pain Discharge Condition: Mental Status: Clear and coherent.\nLevel of Consciousness: Alert and interactive.\nActivity Status: Ambulatory - Independent. Discharge Instructions: 1. Please return to the emergency department or notify your \nphysician if you experience any of the following: severe pain \nnot relieved by medication, increased swelling, decreased \nsensation, difficulty with movement, fevers greater than 101.5, \nshaking chills, increasing redness or drainage from the incision \nsite, chest pain, shortness of breath or any other concerns.\n\n2. Please follow up with your primary physician regarding this \nadmission and any new medications and refills. \n\n3. Resume your home medications unless otherwise instructed.\n\n4. You have been given medications for pain control. Please do \nnot drive, operate heavy machinery, or drink alcohol while \ntaking these medications. As your pain decreases, take fewer \ntablets and increase the time between doses. This medication can \ncause constipation, so you should drink plenty of water daily \nand take a stool softener (such as Colace) as needed to prevent \nthis side effect.  Call your surgeons office 3 days before you \nare out of medication so that it can be refilled.  These \nmedications cannot be called into your pharmacy and must be \npicked up in the clinic or mailed to your house.  \n\n5. You may not drive a car until cleared to do so by your \nsurgeon.\n\n6. Please call your surgeon's office to schedule or confirm your Discharge Disposition: Home"
query_embedding = embed_text(query_text)