In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install -U medvqa

In [None]:
# Validate the Task 1 model repo and submit it to the medico-2025 competition through the medvqa CLI.
# The medvqa package must already be installed in this environment.
!medvqa validate_and_submit --competition=medico-2025 --task=1 --repo_id=SujataGaihre/Kvasir-VQA-x1-lora_250915-2004

In [None]:
# Reclaim disk space: empty pip's cache, then have conda remove all of its
# cached/unused files (-a) without prompting for confirmation (-y).
!pip cache purge
!conda clean -a -y


In [None]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from peft import PeftModel

# Base checkpoint that the LoRA adapter was trained on top of.
BASE_MODEL = "google/paligemma-3b-pt-224"
# Hugging Face repo holding the fine-tuned LoRA adapter weights.
LORA_MODEL = "SujataGaihre/Kvasir-VQA-x1-lora_250918-2030"

# Single target device; ask_model() sends every input batch to this device too.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The processor is loaded from the base checkpoint.
processor = AutoProcessor.from_pretrained(BASE_MODEL)

# Load the base weights in half precision. `device_map="auto"` is deliberately
# not used: a model dispatched by accelerate may be spread over several GPUs and
# must not be moved with `.to()` afterwards, whereas the inputs are always placed
# on the one `device` chosen above. Explicit placement keeps both in agreement.
model = AutoModelForVision2Seq.from_pretrained(BASE_MODEL, torch_dtype=torch.float16)

# Attach the LoRA adapter, then move the combined model to the target device.
model = PeftModel.from_pretrained(model, LORA_MODEL)
model = model.to(device)

print("✅ Base + LoRA model loaded successfully")


In [None]:
import os

import openai

# Read the key from the OPENAI_API_KEY environment variable so it never has to be
# pasted into (and saved with) the notebook. The original placeholder remains the
# fallback, so behaviour is unchanged when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "ADD_API_KEY")

In [None]:

# --- 3. Helper Function ---
def ask_model(image, question, model, processor, device):
    """
    Run a single VQA query and return the decoded reply.

    The image/question pair is encoded with `processor`, moved to `device`,
    passed to `model.generate` with a budget of 50 new tokens, and the first
    decoded sequence is returned with surrounding whitespace removed.
    """
    encoded = processor(images=image, text=question, return_tensors="pt")
    encoded = encoded.to(device)
    output_ids = model.generate(max_new_tokens=50, **encoded)
    decoded = processor.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0].strip()

# --- 4. Main Pipeline ---
def main():
    """
    Build a Medico-2025 Subtask 2 submission file with templated explanations.

    For each of the 1500 selected validation samples the model answers the
    original question plus three fixed probing questions. The four answers are
    stitched into a template explanation and one JSON object per sample is
    written to the output file (JSON Lines).

    Relies on the notebook globals `model`, `processor` and `device`, and on the
    five-argument `ask_model` defined alongside this function. `OUTPUT_FILE` and
    `VISUALS_DIR` are used when they are already defined in the notebook;
    otherwise the defaults below apply.
    """
    # Function-scope imports: the notebook's shared import cell sits further down,
    # so on a fresh kernel (Restart & Run All) these names would not exist yet.
    import json
    from pathlib import Path

    from datasets import load_dataset, Image as HfImage
    from tqdm import tqdm

    # NOTE(review): the fallbacks only apply when the cell defining OUTPUT_FILE /
    # VISUALS_DIR has not been run; the default file name is an assumption — confirm.
    output_file = globals().get("OUTPUT_FILE", "submission_task2.jsonl")
    visuals_dir = Path(globals().get("VISUALS_DIR", "visuals"))
    num_samples = 1500

    # Fixed probing questions asked for every image (loop-invariant).
    probe_questions = [
        "What is the primary finding or subject in this image?",
        "Where is the finding located in the image (e.g., upper-left, center)?",
        "Describe the visual characteristics of the finding, such as color, shape, and texture."
    ]

    print("🚀 Starting solution for Medico-2025 Subtask 2: Multimodal Explanations")

    # Load official validation set
    print("Loading and preparing the official val_set_task2...")
    ds = load_dataset("SimulaMet/Kvasir-VQA-x1")["test"]
    val_set_task2 = (
        ds.filter(lambda x: x["complexity"] == 1)
          .shuffle(seed=42)
          .select(range(num_samples))
          .add_column("val_id", list(range(num_samples)))
          .remove_columns(["complexity", "answer", "original", "question_class"])
          .cast_column("image", HfImage())
    )
    print(f"✅ Official validation set loaded with {len(val_set_task2)} samples.")

    print(f"Generating explanations and writing to {output_file}...")
    visuals_dir.mkdir(exist_ok=True)

    with open(output_file, "w") as f:
        for item in tqdm(val_set_task2, desc="Generating explanations"):
            val_id = item["val_id"]
            img_id = item["img_id"]
            image = item["image"]
            original_question = item["question"]

            # Step 1: Main answer
            main_answer = ask_model(image, original_question, model, processor, device)

            # Step 2: Probing questions
            probe_answers = [ask_model(image, q, model, processor, device) for q in probe_questions]

            # Step 3: Textual explanation synthesis
            textual_explanation = (
                f"The model's answer '{main_answer}' is justified by the following observations. "
                f"The primary finding identified is '{probe_answers[0]}'. "
                f"This is located in the {probe_answers[1]} region of the endoscopic view. "
                f"Visually, the finding is characterized by '{probe_answers[2]}'. "
                "This combination of features supports the conclusion."
            )

            # Step 4: Format output
            result = {
                "val_id": val_id,
                "img_id": img_id,
                "question": original_question,
                "answer": main_answer,
                "textual_explanation": textual_explanation,
                "visual_explanation": [],   # placeholder for heatmaps/bboxes
                "confidence_score": 0.95    # placeholder score
            }

            f.write(json.dumps(result) + "\n")

    print(f"\n Success! Your submission file '{output_file}' has been created.")
    print("This file is ready to upload to your Hugging Face submission repo.")

# Run the templated-explanation pipeline when this cell is executed
# (the guard holds when the code runs as the top-level script/kernel namespace).
if __name__ == "__main__":
    main()

In [None]:
# Follow-up question bank: each key is a question-class label and each value is
# the list of probing questions asked about the image for that class. The
# pipeline looks classes up with `mapping.get(qc, [])`, so a class missing from
# this table simply produces no follow-up questions.
mapping = {
  "abnormality_presence": ["Where in the image is the abnormality located?", "What color is the abnormality?", "What type of abnormality is present?", "How many abnormalities are visible?", "Is the abnormality easy to detect?", "What is the size of the abnormality?", "Does the abnormality involve a polyp?", "Is there text overlapping the abnormality?", "Is there an instrument near the abnormality?", "Is the abnormality clinically significant?"],
  "landmark_presence": ["Where is the landmark located in the image?", "What color is the landmark?", "Is the landmark easy to detect?", "Is the landmark partially obscured by an instrument?", "Is there a finding near the landmark?", "Is text overlapping the landmark?", "How many landmarks are visible?", "Is this landmark specific to colonoscopy or gastroscopy?", "Does the landmark confirm the procedure type?", "Is this landmark used as a reference for polyp detection?"],
  "instrument_presence": ["Where in the image is the instrument located?", "How many instruments are present?", "What type of instrument is visible?", "Is the instrument interacting with a polyp?", "Is the instrument interacting with a landmark?", "Is the instrument obscuring any abnormality?", "Is the instrument easy to detect?", "Does the instrument suggest ongoing treatment or biopsy?", "What is the orientation of the instrument (left, right, center)?", "Is the instrument metallic or plastic in appearance?"],
  "polyp_removal_status": ["How many polyps are present?", "Where are the remaining polyps located?", "What type of polyp is visible?", "What is the size of the remaining polyp?", "Is an instrument visible near the polyp?", "Does the mucosa appear healed after removal?", "Is there evidence of a resection site?", "Is bleeding visible after removal?", "Is clipping or cauterization present?", "Is the removal complete or partial?"],
  "finding_detectability": ["What is the color of the finding?", "Where in the image is the finding?", "How many findings are present?", "Is the finding small or large?", "Is the finding obscured by an instrument?", "Is the finding obscured by text or artifacts?", "What is the type of finding (polyp, lesion, inflammation)?", "Is the finding clinically relevant?", "Is the finding localized or diffuse?", "Does the finding resemble common abnormalities?"],
  "box_artifact_presence": ["Where in the image is the artifact located?", "What color is the artifact (green or black)?", "Is the artifact covering an abnormality?", "Is the artifact covering an instrument?", "Is the artifact covering a landmark?", "Is the artifact small or large?", "Does the artifact reduce visibility of findings?", "Is the artifact near the image borders?", "Is there text overlapping the artifact?", "Is the artifact affecting detectability of polyps?"],
  "text_presence": ["Where in the image is the text located?", "Is the text overlapping a finding?", "Is the text overlapping a landmark?", "Is the text overlapping an instrument?", "Is the text easy to read?", "Is the text color white or another color?", "Is the text obstructing detectability?", "Is the text medical annotation or timestamp?", "Is the text large or small?", "Is there more than one text element?"],
  "polyp_type": ["Where is the polyp located in the image?", "What is the size of the polyp?", "What color is the polyp?", "How many polyps are visible?", "Is the polyp easy to detect?", "Is the polyp sessile or pedunculated?", "Is there an instrument near the polyp?", "Is the polyp associated with bleeding?", "Is the polyp removal complete?", "Is the polyp adjacent to a landmark?"],
  "procedure_type": ["Is a landmark confirming the procedure visible?", "Are polyps present typical for this procedure type?", "Is the mucosa appearance consistent with the procedure?", "Is there an instrument suggesting this procedure type?", "Is capsule endoscopy text visible?", "Does the lumen size suggest colonoscopy or gastroscopy?", "Is this a lower or upper GI procedure?", "Are abnormality types typical for this procedure?", "Is light reflection consistent with endoscopic imaging?", "Is the orientation of the image typical for this procedure?"],
  "polyp_size": ["Where is the polyp located in the image?", "What type of polyp is it?", "What color is the polyp?", "Is the polyp sessile or pedunculated?", "Is the polyp adjacent to a landmark?", "Is there more than one polyp?", "Is the polyp removal complete?", "Is an instrument present near the polyp?", "Is the polyp easy to detect?", "Is the polyp size clinically significant?"],
  "finding_count": ["How many polyps are present?", "How many abnormalities are present?", "How many landmarks are visible?", "How many instruments are visible?", "How many findings are clinically relevant?", "How many findings are small vs large?", "How many findings are overlapping?", "How many findings are obscured by artifacts?", "How many findings are located in the center?", "How many findings are located near borders?"],
  "polyp_count": ["Where are the polyps located?", "What is the size of each polyp?", "What is the type of each polyp?", "What color are the polyps?", "Are polyps clustered together?", "Are polyps easy to detect?", "Are any polyps overlapping with landmarks?", "Are polyps interacting with instruments?", "Is polyp removal complete?", "Are polyps causing mucosal distortion?"],
  "instrument_location": ["What type of instrument is at this location?", "Is the instrument near an abnormality?", "Is the instrument near a polyp?", "Is the instrument near a landmark?", "Is the instrument in the center of the lumen?", "Is the instrument touching tissue?", "Is the instrument obscuring findings?", "Is the instrument easy to detect?", "Is there more than one instrument?", "Is the instrument metallic or plastic?"],
  "abnormality_location": ["What type of abnormality is at this location?", "What is the size of the abnormality?", "What is the color of the abnormality?", "Is the abnormality easy to detect?", "Is the abnormality overlapping with text?", "Is the abnormality overlapping with instruments?", "Is the abnormality adjacent to a landmark?", "Is the abnormality in the lumen center?", "Are there multiple abnormalities nearby?", "Is the abnormality clinically significant?"],
  "landmark_location": ["What type of landmark is at this location?", "What is the color of the landmark?", "Is the landmark easy to detect?", "Is the landmark partially obscured?", "Is the landmark overlapping with text?", "Is the landmark adjacent to an abnormality?", "Is the landmark in the lumen center?", "Are there multiple landmarks visible?", "Is the landmark used for orientation?", "Is the landmark confirming the procedure type?"],
  "instrument_count": ["What type of instruments are visible?", "Where are the instruments located?", "Are the instruments interacting with polyps?", "Are the instruments interacting with abnormalities?", "Are the instruments obscuring landmarks?", "Are the instruments metallic or plastic?", "Are the instruments easy to detect?", "Are any instruments overlapping?", "Do the instruments suggest biopsy or resection?", "Is the number of instruments typical for this procedure?"],
  "abnormality_color": ["Where is the abnormality located?", "What type of abnormality is this?", "What is the size of the abnormality?", "Is the abnormality easy to detect?", "Are there multiple colors within the abnormality?", "Is the color typical of inflammation or bleeding?", "Does the abnormality color suggest necrosis?", "Is the abnormality overlapping with text?", "Is the abnormality near an instrument?", "Is the abnormality color clinically significant?"],
  "landmark_color": ["Where is the landmark located?", "What type of landmark is this?", "Is the landmark easy to detect?", "Are there multiple colors in the landmark?", "Is the landmark partially obscured by instruments?", "Is the landmark partially obscured by text?", "Is the landmark used for orientation?", "Is the landmark color typical of healthy mucosa?", "Does the landmark color suggest inflammation?", "Is the landmark color clinically relevant?"],
  "finding_presence": ["Where is the finding located?", "What type of finding is this?", "What is the size of the finding?", "What is the color of the finding?", "Is the finding easy to detect?", "How many findings are present?", "Is the finding overlapping with text?", "Is the finding overlapping with instruments?", "Is the finding clinically significant?", "Does the finding suggest further biopsy?"]
}


In [None]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
from peft import PeftModel
from datasets import load_dataset, Dataset, Image as HfImage
from pathlib import Path
from tqdm import tqdm
import json
from openai import OpenAI

import os

# Take the key from the OPENAI_API_KEY environment variable so it is never saved
# inside the notebook; the original placeholder stays as the fallback value.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "ADD_API_KEY"))

# --- File paths ---
# JSON Lines file that main() writes one result object per sample into.
OUTPUT_FILE = "FINAL_prompt_OPENAI_submission_task2.jsonl"
# Directory for visual-explanation artifacts; created up front if missing.
VISUALS_DIR = Path("visuals")
VISUALS_DIR.mkdir(exist_ok=True)

# --- Helper: remove echoed question from answer ---
def clean_answer(question: str, answer: str) -> str:
    """
    Remove an echoed copy of the question from the start of a model answer.

    The comparison is case-insensitive and ignores spaces / question marks
    around the question. When the answer begins with the question, the echoed
    prefix is cut off and leftover separators (whitespace, ``,``, ``.``, ``?``)
    are trimmed from both ends of the remainder. Otherwise the answer is
    returned with surrounding whitespace removed.

    Args:
        question: The prompt that was sent to the model.
        answer: The raw decoded model output.

    Returns:
        The answer text without the echoed question.
    """
    question_core = question.lower().strip(" ?")
    answer_text = answer.strip()

    # An empty core (e.g. question "" or "?") would "match" every answer, so it
    # is treated as "nothing to remove" instead of chopping answer characters.
    if question_core and answer_text.lower().startswith(question_core):
        # Slice by the length of what was actually matched, on the same stripped
        # text the match was made against; slicing by len(question) could cut
        # into the answer when the question carried extra spaces or "?".
        remainder = answer_text[len(question_core):]
        # Models often separate the echoed prompt from the reply with a
        # newline, so whitespace is trimmed together with punctuation.
        return remainder.strip(" \t\r\n,.?")
    return answer_text

# --- Helper: ask model ---
def ask_model(image, question):
    """Query the vision-language model with one image/question pair and return its decoded reply.

    Uses the notebook-level `processor`, `model` and `device`; generation runs
    under `torch.no_grad()` with a budget of 50 new tokens, and the first
    decoded sequence is returned stripped of surrounding whitespace.
    """
    batch = processor(images=image, text=question, return_tensors="pt")
    batch = batch.to(device)
    with torch.no_grad():
        output_ids = model.generate(**batch, max_new_tokens=50)
    decoded = processor.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0].strip()

# --- Helper: summarize with OpenAI ---
def summarize_with_openai(main_answer, mapped_answers):
    """
    Turn the main answer and the follow-up Q/A pairs into a short explanation.

    Sends one chat-completion request (model "gpt-4o-mini", temperature 0.3)
    through the notebook-level `client`.

    Args:
        main_answer: Text describing the main question and its answer.
        mapped_answers: Follow-up observations; interpolated into the prompt as-is.

    Returns:
        The generated explanation with surrounding whitespace removed, or an
        empty string when the response carries no text content.
    """
    prompt = f"""
    You are a medical AI assistant. Your task is to generate a clear and concise textual explanation 
    based on a main answer and several follow-up question-answer pairs. 

    Main Answer: {main_answer}
    Follow-up Observations: {mapped_answers}

    Instructions:
    - Combine all information into a single coherent explanation in natural language. 
    - Avoid repeating the exact questions. Instead, integrate the answers into flowing sentences.
    - Present findings logically (location → type → size → detectability → color → clinical significance).
    - If there are negative findings (e.g., "No evidence of necrosis"), include them naturally.
    - Write the explanation as a short medical note (~3–5 sentences).
    - Keep it precise, professional, and easy to understand.
    """

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful medical summarizer."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3
    )
    # The message content is optional in the API response; guard against None so
    # one empty reply does not abort a long multi-sample run with AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()

# --- Main pipeline ---
def main():
    """
    Run the structured VQA pipeline and write the Subtask 2 submission file.

    For each of the 1500 selected samples: answer the original question, answer
    every follow-up question registered in `mapping` for the sample's question
    classes, have OpenAI condense everything into a textual explanation, and
    append one JSON object to `OUTPUT_FILE` (JSON Lines).

    Relies on the notebook-level `mapping`, `OUTPUT_FILE`, `ask_model`,
    `clean_answer` and `summarize_with_openai`.
    """
    print(" Starting structured VQA pipeline with OpenAI summarization...")

    num_samples = 1500

    # Load dataset
    ds = load_dataset("SimulaMet/Kvasir-VQA-x1")["test"]
    val_set_task2 = (
        ds.filter(lambda x: x["complexity"] == 1)
          .shuffle(seed=42)
          .select(range(num_samples))
          .add_column("val_id", list(range(num_samples)))
          .remove_columns(["complexity", "answer", "original"])
          .cast_column("image", HfImage())
    )
    print(f"Validation set loaded: {len(val_set_task2)} samples")

    with open(OUTPUT_FILE, "w") as f:
        for item in tqdm(val_set_task2, desc="Processing samples"):
            val_id = item["val_id"]
            img_id = item["img_id"]
            image = item["image"]
            question = item["question"]
            question_classes = item["question_class"]

            # Step 1: Main Q&A (cleaned)
            raw_main_answer = ask_model(image, question)
            cleaned_main_answer = clean_answer(question, raw_main_answer)
            # Q/A-formatted text is only context for the summarizer prompt; it
            # must not leak into the submission's "answer" field.
            main_qa = f"Question: {question}, Answer: {cleaned_main_answer}"

            # Step 2: Follow-up mapped Q&A (cleaned)
            mapped_answers = {}
            for qc in question_classes:
                mapped_answers[qc] = []
                # Classes without an entry in `mapping` get no follow-ups.
                for q in mapping.get(qc, []):
                    raw_ans = ask_model(image, q)
                    cleaned_ans = clean_answer(q, raw_ans)
                    mapped_answers[qc].append(f"Question: {q}, Answer: {cleaned_ans}")

            # Step 3: Polished textual explanation via OpenAI
            textual_explanation = summarize_with_openai(main_qa, mapped_answers)

            # Step 4: Save result
            result = {
                "val_id": val_id,
                "img_id": img_id,
                "question": question,
                # The answer alone, not the "Question: ..., Answer: ..." scaffold.
                "answer": cleaned_main_answer,
                "textual_explanation": textual_explanation,
                "visual_explanation": [],   # placeholder, no visual evidence produced
                "confidence_score": 0.95    # placeholder score
            }
            f.write(json.dumps(result) + "\n")

    print(f"\n Done! Submission file saved as '{OUTPUT_FILE}'")

# Run the OpenAI-summarized pipeline when this cell is executed
# (the guard holds when the code runs as the top-level script/kernel namespace).
if __name__ == "__main__":
    main()


In [None]:
pip install -U medvqa


In [None]:
# Validate the Task 2 submission repo and submit it to the medico-2025 competition through the medvqa CLI.
!medvqa validate_and_submit --competition=medico-2025 --task=2 --repo_id=SujataGaihre/Medico2025_Subtask2_TeamNepal