# ‚úÖ Install dependencies & log environment
This cell installs required libraries (Colab-friendly). It logs versions to `env_multimodal.json` for reproducibility.


In [1]:
# (Run this in Colab/Jupyter; skip install if already set up)
# NOTE: avoid force-uninstall to prevent breaking environment; only install what we need.
!pip install -q transformers sentence-transformers faiss-cpu Pillow langchain pypdf ipywidgets

import sys, platform, json, os, pathlib, datetime
import numpy as np
import torch
import PIL
import sentence_transformers, transformers

env = {
    "timestamp": datetime.datetime.now().isoformat(timespec="seconds"),
    "python": sys.version.split()[0],
    "platform": platform.platform(),
    "numpy": np.__version__,
    "torch": torch.__version__,
    "torch_cuda_available": torch.cuda.is_available(),
    "device": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU",
    "pillow": PIL.__version__,
    "sentence_transformers": sentence_transformers.__version__,
    "transformers": transformers.__version__,
}
print(json.dumps(env, indent=2))
with open("env_multimodal.json","w") as f:
    json.dump(env, f, indent=2)


{
  "timestamp": "2025-09-28T00:51:10",
  "python": "3.12.11",
  "platform": "Linux-6.6.97+-x86_64-with-glibc2.35",
  "numpy": "2.0.2",
  "torch": "2.8.0+cu126",
  "torch_cuda_available": false,
  "device": "CPU",
  "pillow": "11.3.0",
  "sentence_transformers": "5.1.0",
  "transformers": "4.56.1"
}


# üóÇ Prepare folders and upload your project documents and images
Upload: clinical papers, research drafts, notes (.txt/.md/.pdf) and images/charts (.png/.jpg).
If running in Colab, you'll be prompted to upload. In classic Jupyter, drop files into the working folder or use the widget.


In [2]:
from pathlib import Path
import os, sys

IMG_DIR = Path("mm_images"); IMG_DIR.mkdir(exist_ok=True)
TXT_DIR = Path("corpus");    TXT_DIR.mkdir(exist_ok=True)

# Helper to detect environment (Colab or Jupyter)
def running_in_colab():
    return "google.colab" in sys.modules

if running_in_colab():
    # Colab upload
    from google.colab import files
    print("Colab detected. Please upload images (multi-select).")
    up_imgs = files.upload()
    for name, data in (up_imgs or {}).items():
        if name.lower().endswith((".png",".jpg",".jpeg",".webp",".bmp",".tif",".tiff")):
            (IMG_DIR / name).write_bytes(data)
    print("\n(Optional) Upload PDFs/TXTs (or skip):")
    up_txts = files.upload()
    for name, data in (up_txts or {}).items():
        (TXT_DIR / name).write_bytes(data)
else:
    # Jupyter: show an upload widget (ipywidgets) ‚Äî works in JupyterLab/Notebook
    try:
        import ipywidgets as widgets
        from IPython.display import display
        uploaded = {}

        def on_upload_change(change):
            for fname, fobj in uploader.value.items():
                b = fobj['content']
                (IMG_DIR / fname).write_bytes(b)
            print("Saved uploaded images to mm_images/")

        print("If running in classic Jupyter, please upload images and text using the Jupyter file browser or place files into ./mm_images and ./corpus.")
        # Note: For simplicity, we don't create a multi-file widget here; instruct user to upload files manually.
    except Exception:
        print("No interactive upload widget available ‚Äî please copy files into ./mm_images and ./corpus manually.")

print("Saved images:", sorted([p.name for p in IMG_DIR.iterdir() if p.is_file()]))
print("Saved text files:", sorted([p.name for p in TXT_DIR.iterdir() if p.is_file()]))


Colab detected. Please upload images (multi-select).


Saving ablation1.png to ablation1 (1).png
Saving ablation2.png to ablation2 (1).png
Saving ablation4.png to ablation4 (1).png
Saving BDI dialog (1)-1.png to BDI dialog (1)-1 (1).png
Saving comparison.png to comparison (1).png
Saving framework.png to framework (1).png
Saving trust game plot.png to trust game plot (1).png
Saving unnamed (1).png to unnamed (1) (1).png
Saving unnamed (2).png to unnamed (2) (1).png
Saving unnamed.png to unnamed (3).png

(Optional) Upload PDFs/TXTs (or skip):


Saving 1_NeurIPS-2024-mdagents-an-adaptive-collaboration-of-llms-for-medical-decision-making-Paper-Conference.pdf to 1_NeurIPS-2024-mdagents-an-adaptive-collaboration-of-llms-for-medical-decision-making-Paper-Conference (2).pdf
Saving 2_NeurIPS-2024-richelieu-self-evolving-llm-based-agents-for-ai-diplomacy-Paper-Conference.pdf to 2_NeurIPS-2024-richelieu-self-evolving-llm-based-agents-for-ai-diplomacy-Paper-Conference (2).pdf
Saving 3_NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf to 3_NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference (2).pdf
Saved images: ['BDI dialog (1)-1 (1).png', 'BDI dialog (1)-1.png', 'ablation1 (1).png', 'ablation1.png', 'ablation2 (1).png', 'ablation2.png', 'ablation4 (1).png', 'ablation4.png', 'comparison (1).png', 'comparison.png', 'framework (1).png', 'framework.png', 'trust game plot (1).png', 'trust game plot.png', 'unnamed (1) (1).png', 'unnamed (1).png', 'unnamed 

# üìÑ Load & chunk text documents
We use langchain loaders for PDFs and plain text, then split into chunks suitable for embeddings (chunk_size=500, overlap=100).
These chunks will become the text part of the multimodal index.


In [3]:
!pip install -U langchain-community




In [4]:
from langchain_community.document_loaders import PyPDFLoader, TextLoader


In [5]:
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd

docs = []
for p in sorted(TXT_DIR.iterdir()):
    if not p.is_file():
        continue
    ext = p.suffix.lower()
    try:
        if ext == ".pdf":
            docs.extend(PyPDFLoader(str(p)).load())
        elif ext in [".txt", ".md", ".text"]:
            docs.extend(TextLoader(str(p), encoding="utf-8").load())
    except Exception as e:
        print(f"[WARN] Could not read {p.name}: {e}")

# If no user docs uploaded, provide a small medical-themed sample
if not docs:
    sample_text = (
        "This sample clinical note mentions Chart A (patient vitals over time) and Chart B (lab trends). "
        "The project: build agents to summarize medical text, refine research articles, and sanitize PHI."
    )
    (TXT_DIR / "sample_project_note.txt").write_text(sample_text, encoding="utf-8")
    docs.extend(TextLoader(str(TXT_DIR / "sample_project_note.txt"), encoding="utf-8").load())

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents(docs)

text_corpus = pd.DataFrame({
    "doc_id":   [f"doc{i}" for i in range(len(chunks))],
    "source":   [c.metadata.get("source") or c.metadata.get("file_path") or "uploaded" for c in chunks],
    "text":     [c.page_content for c in chunks],
})
print("‚úÖ Text chunks:", len(text_corpus))
display(text_corpus.head(3))


‚úÖ Text chunks: 2132


Unnamed: 0,doc_id,source,text
0,doc0,corpus/1_NeurIPS-2024-mdagents-an-adaptive-col...,MDAgents: An Adaptive Collaboration of LLMs fo...
1,doc1,corpus/1_NeurIPS-2024-mdagents-an-adaptive-col...,Abstract\nFoundation models are becoming valua...
2,doc2,corpus/1_NeurIPS-2024-mdagents-an-adaptive-col...,The assigned solo or group collaboration struc...


# Load Images & Auto-Caption (BLIP)

# üñºÔ∏è Load images & auto-caption (BLIP)
We caption images (charts/figures) using BLIP. If captioning fails, we fall back to filename.
Captions will be indexed alongside text chunks for joint retrieval.


In [6]:
from PIL import Image
import pandas as pd
from transformers import BlipProcessor, BlipForConditionalGeneration

# Collect images
img_rows = []
for p in sorted(IMG_DIR.iterdir()):
    if p.suffix.lower() in [".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tif", ".tiff"]:
        img_rows.append({"image_id": p.stem, "path": str(p), "caption": p.stem})
df_imgs = pd.DataFrame(img_rows)
print("‚úÖ Images found:", len(df_imgs))
display(df_imgs.head(3) if len(df_imgs) else df_imgs)

# Auto-caption with BLIP
CAPTION = True
if CAPTION and len(df_imgs):
    try:
        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
        new_caps = []
        for row in df_imgs.itertuples():
            try:
                img = Image.open(row.path).convert("RGB")
                inputs = processor(img, return_tensors="pt")
                out = caption_model.generate(**inputs, max_new_tokens=32)
                cap = processor.decode(out[0], skip_special_tokens=True)
            except Exception:
                cap = row.caption
            new_caps.append(cap)
        df_imgs["caption"] = new_caps
        print("üìù Auto-captions generated.")
    except Exception as e:
        print("‚ö†Ô∏è Captioning not available, fallback to filenames. Reason:", e)


‚úÖ Images found: 20


Unnamed: 0,image_id,path,caption
0,BDI dialog (1)-1 (1),mm_images/BDI dialog (1)-1 (1).png,BDI dialog (1)-1 (1)
1,BDI dialog (1)-1,mm_images/BDI dialog (1)-1.png,BDI dialog (1)-1
2,ablation1 (1),mm_images/ablation1 (1).png,ablation1 (1)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

üìù Auto-captions generated.


# üîó Merge text chunks and image captions
We produce `all_docs` that contains both text chunks and image-caption items with consistent keys:
- id: doc id or img_{filename}
- text: chunk text or caption
- meta: source / file path / image file


In [7]:
all_docs = []

# Add text chunks
for _, r in text_corpus.iterrows():
    all_docs.append({"id": r["doc_id"], "text": r["text"], "meta": {"source": r["source"], "type": "text"}})

# Add images as short text docs (captions)
for _, r in df_imgs.iterrows():
    all_docs.append({"id": f"img_{r['image_id']}", "text": r.get("caption", r["image_id"]), "meta": {"file": r.get("path"), "type":"image"}})

print("Total items to index (text + images):", len(all_docs))


Total items to index (text + images): 2152


# üß† Embeddings
- `all-MiniLM-L6-v2` for semantic text embeddings (fast, good for text‚Üítext).
- `clip-ViT-B-32` (SentenceTransformers wrapper) for image‚Üîtext cross-search (images encoded as images, texts encoded to CLIP space).
We normalize embeddings for cosine (dot product after normalization).


In [8]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Text embedding model (MiniLM)
st_text = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# CLIP model for joint image/text space
st_clip = SentenceTransformer("clip-ViT-B-32")

# Prepare lists
texts_for_minilm = [d["text"] for d in all_docs]  # used for text->text (minilm)
texts_for_clip   = [d["text"] for d in all_docs]  # used for clip text encoding (for cross search)
image_paths = [d["meta"].get("file") for d in all_docs if d["meta"].get("type")=="image"]

# Encode text with MiniLM
text_vecs_minilm = st_text.encode(texts_for_minilm, batch_size=32, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True)

# Encode texts with CLIP (text side)
text_vecs_clip = st_clip.encode(texts_for_clip, batch_size=32, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True)

# Encode images with CLIP (image side) ‚Äî we need the PIL images for those entries
pil_images = []
for d in all_docs:
    if d["meta"].get("type")=="image":
        try:
            pil_images.append(Image.open(d["meta"]["file"]).convert("RGB"))
        except Exception:
            # placeholder if file not available
            pil_images.append(Image.new("RGB",(224,224), color=(255,255,255)))
img_vecs_clip = st_clip.encode(pil_images, batch_size=16, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True) if len(pil_images) else np.zeros((0, text_vecs_clip.shape[1]), dtype="float32")

print("Shapes: minilm_text:", text_vecs_minilm.shape, "clip_text:", text_vecs_clip.shape, "clip_images:", img_vecs_clip.shape)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/604 [00:00<?, ?B/s]

0_CLIPModel/model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

0_CLIPModel/pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Batches:   0%|          | 0/68 [00:00<?, ?it/s]

Batches:   0%|          | 0/68 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Shapes: minilm_text: (2152, 384) clip_text: (2152, 512) clip_images: (20, 512)


# üîé Retrieval helper functions
Functions:
- topk_cosine: fast dot-product ranking on normalized vectors
- retrieve_text_by_text(query): text->text using MiniLM
- retrieve_images_by_text(query): text->images using CLIP
- retrieve_by_image(image_id): image->(text, images) via CLIP image vector


In [9]:
import numpy as np

def topk_cosine(q_vec, mat, k=5):
    if mat.shape[0] == 0:
        return []
    sims = mat @ q_vec
    idx = np.argsort(-sims)[:k]
    return [(int(i), float(sims[i])) for i in idx]

# encoders
def encode_text_minilm(q: str):
    return st_text.encode([q], convert_to_numpy=True, normalize_embeddings=True)[0]

def encode_text_clip(q: str):
    return st_clip.encode([q], convert_to_numpy=True, normalize_embeddings=True)[0]

def retrieve_text_by_text(query, k=5):
    q = encode_text_minilm(query)
    hits = topk_cosine(q, text_vecs_minilm, k=k)
    return [(all_docs[i]["id"], hits[j][1], all_docs[i]["text"]) for j,(i,_) in enumerate(hits)]

def retrieve_images_by_text(query, k=5):
    if img_vecs_clip.shape[0] == 0: return []
    q = encode_text_clip(query)
    hits = topk_cosine(q, img_vecs_clip, k=k)
    # map hit indices to all_docs image positions
    image_doc_indices = [idx for idx,d in enumerate(all_docs) if d["meta"].get("type")=="image"]
    return [(all_docs[image_doc_indices[i]]["id"], hits[j][1], all_docs[image_doc_indices[i]]["meta"].get("file")) for j,(i,_) in enumerate(hits)]

def retrieve_by_image(image_id, k=5):
    # find index in all_docs
    idx = next((i for i,d in enumerate(all_docs) if d["id"]==f"img_{image_id}"), None)
    if idx is None:
        return [], []
    # compute q_vec relative to img_vecs_clip ordering
    # build mapping of image doc indices to img_vecs indices
    image_doc_indices = [i for i,d in enumerate(all_docs) if d["meta"].get("type")=="image"]
    img_idx_in_imgvecs = image_doc_indices.index(idx)
    q_vec = img_vecs_clip[img_idx_in_imgvecs]
    text_hits = topk_cosine(q_vec, text_vecs_clip, k=k)
    img_hits  = topk_cosine(q_vec, img_vecs_clip,  k=k+1)  # may include itself
    img_hits  = [(i,s) for (i,s) in img_hits if i!=img_idx_in_imgvecs][:k]
    text_pairs = [(all_docs[i]["id"], text_hits[j][1], all_docs[i]["text"]) for j,(i,_) in enumerate(text_hits)]
    img_pairs  = [(all_docs[image_doc_indices[i]]["id"], img_hits[j][1], all_docs[image_doc_indices[i]]["meta"].get("file")) for j,(i,_) in enumerate(img_hits)]
    return text_pairs, img_pairs

# Quick tests (if content exists)
if len(all_docs):
    print("Text->Text sample:", retrieve_text_by_text("summarize medical text", k=3))
    print("Text->Images sample:", retrieve_images_by_text("patient vitals chart", k=3))
    if len(df_imgs):
        print("Image->Docs/Images sample for first image:", retrieve_by_image(df_imgs.iloc[0].image_id, k=3))


Text->Text sample: [('doc165', 0.739983320236206, 'Nidhi Rohatgi, Poonam Hosamani, William Collins, Neera Ahuja, Curtis P. Langlotz, Jason\nHom, Sergios Gatidis, John Pauly, and Akshay S. Chaudhari. Adapted large language models can\noutperform medical experts in clinical text summarization. Nature Medicine, 30(4):1134‚Äì1142,\nFebruary 2024.\n[80] Lei Wang, Chen Ma, Xueyang Feng, Zeyu Zhang, Hao Yang, Jingsen Zhang, Zhiyuan Chen,\nJiakai Tang, Xu Chen, Yankai Lin, Wayne Xin Zhao, Zhewei Wei, and Ji-Rong Wen. A survey'), ('doc554', 0.739983320236206, 'Nidhi Rohatgi, Poonam Hosamani, William Collins, Neera Ahuja, Curtis P. Langlotz, Jason\nHom, Sergios Gatidis, John Pauly, and Akshay S. Chaudhari. Adapted large language models can\noutperform medical experts in clinical text summarization. Nature Medicine, 30(4):1134‚Äì1142,\nFebruary 2024.\n[80] Lei Wang, Chen Ma, Xueyang Feng, Zeyu Zhang, Hao Yang, Jingsen Zhang, Zhiyuan Chen,\nJiakai Tang, Xu Chen, Yankai Lin, Wayne Xin Zhao, Zhewei 

# üß© Assemble RAG-style prompt for the generator
The prompt instructs the generator to:
- Use ONLY the evidence provided,
- Cite sources inline using `[doc_id]` or `[img_id]`,
- If evidence is insufficient, respond with a safe REFUSAL.
This prompt will be passed to the generator for RAG answers.


In [10]:
def assemble_prompt(query, text_hits, image_hits, max_context_chars=4000):
    tbits = []
    for doc_id, score, fulltext in text_hits:
        # fetch the text snippet (first 350 chars)
        row_text = next((d for d in all_docs if d["id"]==doc_id), {}).get("text","")
        tbits.append(f"[{doc_id}] {row_text[:350].replace('\\n',' ')}")
    ibits = []
    for img_id, score, img_path in image_hits:
        row = next((d for d in all_docs if d["id"]==img_id), {})
        ibits.append(f"[{img_id}] {row.get('text','(image caption missing)')}")
    evidence = ""
    if tbits:
        evidence += "Text:\n" + "\n".join(tbits)
    if ibits:
        evidence += ("\n\nImages:\n" + "\n".join(ibits)) if evidence else "\n".join(ibits)
    prompt = (
        "System: You are an expert assistant for clinical research and medical writing. "
        "Answer using ONLY the evidence below and include inline citations in the form [doc_id] or [img_id]. "
        "If the evidence does not support a confident answer, reply exactly: REFUSE: insufficient reliable evidence.\n\n"
        f"Query: {query}\n\nEvidence:\n{evidence}\n\nAnswer:"
    )
    if len(prompt) > max_context_chars:
        # simple trim: keep first N chars of evidence
        prompt = prompt[:max_context_chars]
    return prompt


# ‚öôÔ∏è Generator (RAG answer) ‚Äî using an open generator (distilgpt2)
We use `distilgpt2` here to avoid gated models. You may replace it with a larger instruction-tuned model if available.
The wrapper checks for the presence of at least one inline citation in the generated text; otherwise we mark as refusal.


In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Use an open model ‚Äî distilgpt2
GEN_MODEL = "distilgpt2"
device = 0 if torch.cuda.is_available() else -1

tok = AutoTokenizer.from_pretrained(GEN_MODEL)
model = AutoModelForCausalLM.from_pretrained(GEN_MODEL)
# ensure pad token exists
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

gen_pipe = pipeline("text-generation", model=model, tokenizer=tok, device=device, max_new_tokens=200)

def generate_rag_answer(query, k_text=3, k_images=3):
    t_hits = retrieve_text_by_text(query, k=k_text)
    i_hits = retrieve_images_by_text(query, k=k_images)
    prompt = assemble_prompt(query, t_hits, i_hits)
    # quick guard: if no hits at all, refuse
    if (not t_hits) and (not i_hits):
        return {"text":"REFUSE: insufficient reliable evidence.", "refused":True, "evidence": {"text":t_hits, "images":i_hits}}
    # generate
    out = gen_pipe(prompt, do_sample=False)[0]["generated_text"]
    # enforce presence of a citation token like [doc or [img
    if ("[doc" not in out) and ("[img" not in out):
        # If generator didn't cite, return refusal with evidence
        return {"text":"REFUSE: model did not include required citations.", "refused":True, "raw_output": out, "evidence": {"text":t_hits, "images":i_hits}}
    return {"text": out, "refused": False, "evidence": {"text":t_hits, "images":i_hits}}


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cpu


# üß™ Example queries for your project
These queries are tailored to your project:
- Summarize medical text
- Refine research articles
- Sanitize sensitive healthcare data (PHI)
We show text-only, image-only, and hybrid RAG queries and the generator's output.


In [12]:
queries = [
    "Summarize the key steps to sanitize PHI from a discharge summary.",
    "Which chart shows an upward trend in patient vitals, and what does the text say about it?",
    "Refine the following research claim: 'Multi-agent systems improve clinical note summarization.'"
]

for q in queries:
    print("\n" + "="*80)
    print("Query:", q)
    res = generate_rag_answer(q, k_text=3, k_images=3)
    print("\nEVIDENCE (text ids):", [t[0] for t in res["evidence"]["text"]])
    print("EVIDENCE (image ids):", [i[0] for i in res["evidence"]["images"]])
    print("\nResult (refused?):", res["refused"])
    print(res["text"][:2000])
    print("\n" + "="*80 + "\n")


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Query: Summarize the key steps to sanitize PHI from a discharge summary.

EVIDENCE (text ids): ['doc1216', 'doc969', 'doc507']
EVIDENCE (image ids): ['img_BDI dialog (1)-1 (1)', 'img_BDI dialog (1)-1', 'img_ablation2']

Result (refused?): False
System: You are an expert assistant for clinical research and medical writing. Answer using ONLY the evidence below and include inline citations in the form [doc_id] or [img_id]. If the evidence does not support a confident answer, reply exactly: REFUSE: insufficient reliable evidence.

Query: Summarize the key steps to sanitize PHI from a discharge summary.

Evidence:
Text:
[doc1216] or the supplemental material, provided in the appendix. If you answer [Yes] to a question, in the
justification please point to the section(s) where related material for the question can be found.
IMPORTANT, please:
‚Ä¢ Delete this instruction block, but keep the section heading ‚ÄúNeurIPS paper checklist",
‚Ä¢ Keep the checklist subsection headings, que
[doc969] 

# üìà Preview retrievals across a small query set and save a CSV
This helps produce `trackB_multimodal_preview.csv` for submission and quick checks.


In [13]:
import pandas as pd
rows = []
preview_queries = [
    "Summarize PHI sanitization best practices",
    "Find figure showing lab trend",
    "Which section discusses model validation?",
]
for q in preview_queries:
    t_hits = retrieve_text_by_text(q, 3)
    i_hits = retrieve_images_by_text(q, 3)
    rows.append({
        "query": q,
        "text@3": [d for d,_,_ in t_hits],
        "images@3": [i for i,_,_ in i_hits],
        "n_text": len(t_hits),
        "n_images": len(i_hits),
    })
df_preview = pd.DataFrame(rows)
display(df_preview)
df_preview.to_csv("trackB_multimodal_preview.csv", index=False)
print("Saved: trackB_multimodal_preview.csv")


Unnamed: 0,query,text@3,images@3,n_text,n_images
0,Summarize PHI sanitization best practices,"[doc1256, doc1009, doc764]","[img_BDI dialog (1)-1 (1), img_BDI dialog (1)-...",3,3
1,Find figure showing lab trend,"[img_ablation1 (1), img_ablation1, doc1467]","[img_unnamed (2), img_unnamed (2) (1), img_abl...",3,3
2,Which section discusses model validation?,"[doc1012, doc1259, doc1258]","[img_comparison, img_comparison (1), img_ablat...",3,3


Saved: trackB_multimodal_preview.csv


# üíæ Save run config and counts (trackB_run_config.json)
Records embedding models and counts for reproducibility.


In [14]:
import json, datetime
cfg = {
  "timestamp": datetime.datetime.now().isoformat(timespec="seconds"),
  "project": "Multi-Agent Medical RAG (Track B)",
  "models": {
    "text_embedding": "sentence-transformers/all-MiniLM-L6-v2",
    "clip": "clip-ViT-B-32",
    "image_caption": "Salesforce/blip-image-captioning-base",
    "generator": "distilgpt2"
  },
  "counts": {
    "n_text_chunks": int(len(text_corpus)),
    "n_images": int(len(df_imgs))
  },
  "paths": {"images_dir": str(IMG_DIR), "corpus_dir": str(TXT_DIR)}
}
json.dump(cfg, open("trackB_run_config.json","w"), indent=2)
print("Saved: trackB_run_config.json")


Saved: trackB_run_config.json
