In [None]:
!pip -q install openai bert-score rouge-score pycocoevalcap nltk pillow tqdm

In [None]:
import os
import base64
import json
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from PIL import Image
from io import BytesIO

# OpenAI client (Responses API)
from openai import OpenAI

# Metrics
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bertscore
from pycocoevalcap.cider.cider import Cider

# Ensure basic NLTK resources (BLEU doesn't need tokenizers, but good practice)
nltk.download('punkt', quiet=True)

In [None]:
# Root of the test split; the annotations CSV and every image path listed in it
# are resolved relative to this directory.
DATA_DIR = "/kaggle/input/split-10k-dataset/Split Dataset/test"
CSV_PATH = f"{DATA_DIR}/description_b.csv"  # annotations file; must provide 'file_name' and 'text' columns
IMAGES_BASE = f"{DATA_DIR}"  # prefix joined with each 'file_name' to locate the image on disk

# Model and prompt
MODEL_NAME = "gpt-5"  # change to the exact GPT-5 model available on your account if needed
CAPTION_PROMPT = (
    '''Strictly compose a single, small, concise paragraph of 3 to 4 lines describing this labeled biological diagram. 
    Identify all key biological structures, organelles, molecules, or entities indicated by the labels 
    or annotations. Explicitly reference these labels and explain their spatial relationships, 
    interactions, and functions with precise biological terminology. Ensure the description is succinct, 
    informative, and scientifically accurate, without extra elaboration or multiple paragraphs.'''
)

# Number of rows to randomly sample for a cheaper trial run (e.g. 5);
# None (the current setting) processes every row of the CSV.
NUM_SAMPLES = None

# Outputs
PREDICTIONS_CSV = "predictions.csv"  # per-image file_name / reference / prediction table
SCORES_JSON = "scores.json"  # metric summary written after scoring

In [None]:
# Make the OpenAI key available as OPENAI_API_KEY before building the client.
# A key that is already exported is left untouched; otherwise, when the
# kaggle_secrets package is importable, the key is read from the notebook's
# stored secrets instead of being blanked out.
# NOTE(review): assumes the secret is stored under the label "OPENAI_API_KEY" — confirm.
if not os.environ.get("OPENAI_API_KEY"):
    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        os.environ["OPENAI_API_KEY"] = user_secrets.get_secret("OPENAI_API_KEY")
    except Exception as exc:
        # Not on Kaggle, or the secret is missing/unreadable: fall back to the
        # environment and let the client constructor report a missing key.
        print(f"Kaggle secret lookup skipped ({type(exc).__name__}): {exc}")

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [None]:
# Load the annotations: one row per image with its reference description.
df = pd.read_csv(CSV_PATH)
assert {"file_name", "text"}.issubset(df.columns), "CSV must have 'file_name' and 'text' columns."

if NUM_SAMPLES is not None:
    # Deterministic subsample (fixed seed) to keep API cost down. The size is
    # clamped so a NUM_SAMPLES larger than the dataset uses every row instead
    # of making DataFrame.sample raise ValueError.
    sample_size = min(int(NUM_SAMPLES), len(df))
    df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

In [None]:
def func(n):
    """Return the text after the last '.' in ``n`` (all of ``n`` when it has no '.')."""
    _, _, tail = n.rpartition('.')
    return tail

# Distinct file extensions present in the dataset (displayed as the cell output)
{func(name) for name in df['file_name']}


In [None]:
import base64
from pathlib import Path

def encode_image_as_data_url(image_path: str) -> str:
    """Read an image file and return it as a base64 ``data:`` URL.

    The MIME type is chosen from the lower-cased file suffix; suffixes that
    are not in the table fall back to ``image/jpeg``.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    mime_by_suffix = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.bmp': 'image/bmp',
        '.gif': 'image/gif',
        '.webp': 'image/webp'
    }
    source = Path(image_path)
    mime_type = mime_by_suffix.get(source.suffix.lower(), 'image/jpeg')

    payload = base64.b64encode(source.read_bytes()).decode('utf-8')
    return f"data:{mime_type};base64,{payload}"

def extract_text_from_response(response) -> str:
    """Return the first text block of a Responses API result, stripped.

    Walks ``response.output`` in order and returns the ``text`` of the first
    content block whose ``type`` is ``"output_text"`` or ``"text"``.

    Output items that carry no usable content are skipped: this covers items
    with no ``content`` attribute as well as items whose ``content`` is
    ``None`` or empty, which previously raised ``TypeError`` when iterated.

    Args:
        response: Object exposing an ``output`` sequence of items.

    Returns:
        The stripped text, or ``""`` when no text block is present.
    """
    for item in getattr(response, "output", None) or ():
        blocks = getattr(item, "content", None)
        if not blocks:
            # Attribute missing, None, or an empty list: nothing to read here.
            continue
        for block in blocks:
            if getattr(block, "type", None) in ("output_text", "text"):
                # Guard against a block whose text is explicitly None.
                return (getattr(block, "text", "") or "").strip()
    return ""



def generate_caption_with_openai(image_path: str, prompt: str) -> str:
    """Ask the configured model to describe one image and return its text.

    The image is embedded as a base64 data URL and sent together with
    ``prompt`` as a single user message through ``client.responses.create``.

    Args:
        image_path: Path of the image file to caption.
        prompt: Instruction text sent alongside the image.

    Returns:
        The text extracted from the response by ``extract_text_from_response``
        (an empty string when that helper finds no text block).
    """
    user_message = {
        "role": "user",
        "content": [
            {"type": "input_text", "text": prompt},
            {"type": "input_image", "image_url": encode_image_as_data_url(image_path)},
        ],
    }
    response = client.responses.create(
        model=MODEL_NAME,
        input=[user_message],
        max_output_tokens=2000,
    )
    return extract_text_from_response(response)

In [None]:
import json
import time

PREDICTIONS_CSV = "predictions.csv"
PREDICTIONS_JSONL = "predictions.jsonl"  # one JSON object per line

# Resume support: recover the prediction already stored for each image_id from
# an existing JSONL (a later line for the same id overrides an earlier one).
previous_preds = {}
if os.path.exists(PREDICTIONS_JSONL):
    with open(PREDICTIONS_JSONL, "r", encoding="utf-8") as rj:
        for line in rj:
            if not line.strip():
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue  # skip a truncated or corrupt line
            if isinstance(obj, dict) and obj.get("image_id") is not None:
                previous_preds[obj["image_id"]] = obj.get("pred") or ""
processed = set(previous_preds)

# `records` ends up with one entry per row of `df`, whether the caption was
# produced in this run or restored from the JSONL. Without the restored rows a
# resumed run would write a CSV (and compute metrics) over the new rows only.
records = []
with open(PREDICTIONS_JSONL, "a", encoding="utf-8") as jf:
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Captioning"):
        rel_path = row["file_name"]
        ref_text = str(row["text"]).strip()

        if rel_path in processed:
            # Already captioned earlier: reuse the stored output, no API call.
            records.append({
                "file_name": rel_path,
                "reference": ref_text,
                "prediction": previous_preds[rel_path],
            })
            continue

        img_path = os.path.join(IMAGES_BASE, rel_path)
        if not os.path.exists(img_path):
            # Fall back to DATA_DIR in case IMAGES_BASE points somewhere else.
            img_path_alt = os.path.join(DATA_DIR, rel_path)
            if os.path.exists(img_path_alt):
                img_path = img_path_alt

        pred_text = generate_caption_with_openai(img_path, CAPTION_PROMPT)

        rec = {
            "image_id": rel_path,      # keep a stable key name
            "pred": pred_text,         # the model output
            "reference": ref_text,     # optional: keep reference too
            "ts": time.time(),         # optional: for debugging/resume
        }
        records.append({
            "file_name": rel_path,
            "reference": ref_text,
            "prediction": pred_text,
        })

        # Stream to JSONL immediately so an interrupted run loses nothing
        jf.write(json.dumps(rec, ensure_ascii=False) + "\n")
        jf.flush()  # force write to disk frequently

# After the loop, save/overwrite the CSV snapshot. Explicit columns keep the
# frame well-formed (and the metrics cell working) even when `records` is empty.
pred_df = pd.DataFrame(records, columns=["file_name", "reference", "prediction"])
pred_df.to_csv(PREDICTIONS_CSV, index=False)


In [None]:
# Lowercase and trim both text columns once so every metric below is case-insensitive
references = [str(text).lower().strip() for text in pred_df["reference"].fillna("")]
predictions = [str(text).lower().strip() for text in pred_df["prediction"].fillna("")]

def simple_tokenize(text: str):
    """Split ``text`` into tokens on runs of whitespace."""
    return text.split()

# --- BLEU-1..4 (corpus) ---
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
# corpus_bleu expects, per sample, a list of reference token lists (one reference here)
list_of_refs = [[simple_tokenize(ref)] for ref in references]
hyps = [simple_tokenize(hyp) for hyp in predictions]
smooth = SmoothingFunction().method4

# One n-gram weight tuple per BLEU order, evaluated in order 1..4
bleu_weight_sets = (
    (1.0, 0.0, 0.0, 0.0),
    (0.5, 0.5, 0.0, 0.0),
    (1/3, 1/3, 1/3, 0.0),
    (0.25, 0.25, 0.25, 0.25),
)
bleu1, bleu2, bleu3, bleu4 = [
    corpus_bleu(list_of_refs, hyps, weights=weight_set, smoothing_function=smooth)
    for weight_set in bleu_weight_sets
]

# --- ROUGE-L (average F1) ---
from rouge_score import rouge_scorer
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rougeL_f = [
    rouge.score(ref, hyp)['rougeL'].fmeasure
    for ref, hyp in zip(references, predictions)
]
rougeL = float(sum(rougeL_f) / max(1, len(rougeL_f)))  # max() avoids dividing by zero

# --- CIDEr ---
from pycocoevalcap.cider.cider import Cider
# Both dicts are keyed by sample position, each holding a one-element caption list
gts = {idx: [ref] for idx, ref in enumerate(references)}
res = {idx: [hyp] for idx, hyp in enumerate(predictions)}
cider_scorer = Cider()
cider_score, _ = cider_scorer.compute_score(gts, res)

# --- BERTScore ---
from bert_score import score as bertscore
P, R, F1 = bertscore(predictions, references, lang='en', rescale_with_baseline=True)
bertscore_precision, bertscore_recall, bertscore_f1 = [
    float(per_sample.mean().item()) for per_sample in (P, R, F1)
]


In [None]:
# Assemble the metric summary (all values rounded to 4 decimals), keeping the
# key order BLEU-1..4, ROUGE-L, CIDEr, BERTScore.
bleu_by_name = {"BLEU-1": bleu1, "BLEU-2": bleu2, "BLEU-3": bleu3, "BLEU-4": bleu4}
results = {name: round(value, 4) for name, value in bleu_by_name.items()}
results["ROUGE-L"] = round(rougeL, 4)
results["CIDEr"] = round(float(cider_score), 4)
results["BERTScore"] = {
    "Precision": round(bertscore_precision, 4),
    "Recall": round(bertscore_recall, 4),
    "F1": round(bertscore_f1, 4),
}

# Show the summary in the cell output, then persist the same JSON to disk
summary_text = json.dumps(results, indent=2)
print(summary_text)

with open(SCORES_JSON, "w") as f:
    f.write(summary_text)

print(f"\nSaved predictions to: {PREDICTIONS_CSV}")
print(f"Saved metrics to: {SCORES_JSON}")

In [None]:
# Compute corpus metrics from a JSONL built during inference
# Expects each line: {"image_id": "...", "pred": "...", "reference": "...", ...}

import os
import json

# Metrics
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from pycocoevalcap.cider.cider import Cider
from bert_score import score as bertscore

PREDICTIONS_JSONL = "predictions.jsonl"         # input JSONL with pred/reference
SCORES_JSON = "scores_from_jsonl.json"          # output summary JSON

# 1) Read the JSONL; a later line for the same image_id replaces an earlier one
pred_map = {}  # image_id -> (pred, reference)
with open(PREDICTIONS_JSONL, "r", encoding="utf-8") as f:
    for line in f:
        try:
            obj = json.loads(line)
            image_id = str(obj.get("image_id", "")).strip()
            pred = str(obj.get("pred", ""))
            ref = str(obj.get("reference", ""))
        except Exception:
            # Unparseable or unexpectedly shaped line: ignore it
            continue
        if image_id:
            pred_map[image_id] = (pred, ref)

# 2) Unpack into parallel lists (dict insertion order) and normalize to lowercase
image_ids = list(pred_map)
predictions_raw = [pair[0] for pair in pred_map.values()]
references_raw = [pair[1] for pair in pred_map.values()]

predictions = [text.lower().strip() for text in predictions_raw]
references = [text.lower().strip() for text in references_raw]

# 3) Whitespace tokenizer
def simple_tokenize(text: str):
    """Split ``text`` into tokens on runs of whitespace."""
    return text.split()

# 4) Corpus-level BLEU-1..4, one weight tuple per order
list_of_refs = [[simple_tokenize(ref_text)] for ref_text in references]
hyps = [simple_tokenize(hyp_text) for hyp_text in predictions]
smooth = SmoothingFunction().method4
bleu_weight_sets = (
    (1.0, 0.0, 0.0, 0.0),
    (0.5, 0.5, 0.0, 0.0),
    (1/3, 1/3, 1/3, 0.0),
    (0.25, 0.25, 0.25, 0.25),
)
bleu1, bleu2, bleu3, bleu4 = [
    corpus_bleu(list_of_refs, hyps, weights=weight_set, smoothing_function=smooth)
    for weight_set in bleu_weight_sets
]

# 5) ROUGE-L: mean F1 over (reference, prediction) pairs
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rougeL_vals = []
for ref_text, hyp_text in zip(references, predictions):
    rougeL_vals.append(rouge.score(ref_text, hyp_text)['rougeL'].fmeasure)
rougeL = float(sum(rougeL_vals) / max(1, len(rougeL_vals)))  # max() avoids dividing by zero

# 6) CIDEr over the whole set; dicts are keyed by sample position
gts = {idx: [ref_text] for idx, ref_text in enumerate(references)}
res = {idx: [hyp_text] for idx, hyp_text in enumerate(predictions)}
cider_scorer = Cider()
cider_score, _ = cider_scorer.compute_score(gts, res)

# 7) BERTScore: mean precision / recall / F1
P, R, F1 = bertscore(predictions, references, lang='en', rescale_with_baseline=True)
bertscore_precision, bertscore_recall, bertscore_f1 = [
    float(per_sample.mean().item()) for per_sample in (P, R, F1)
]

# 8) Build the summary (4-decimal rounding), write it to disk, then print it
bleu_by_name = {"BLEU-1": bleu1, "BLEU-2": bleu2, "BLEU-3": bleu3, "BLEU-4": bleu4}
results = {name: round(value, 4) for name, value in bleu_by_name.items()}
results["ROUGE-L"] = round(rougeL, 4)
results["CIDEr"] = round(float(cider_score), 4)
results["BERTScore"] = {
    "Precision": round(bertscore_precision, 4),
    "Recall": round(bertscore_recall, 4),
    "F1": round(bertscore_f1, 4),
}

summary_text = json.dumps(results, indent=2)
with open(SCORES_JSON, "w", encoding="utf-8") as f:
    f.write(summary_text)

print(summary_text)


In [None]:
import os, re, json, csv
import pandas as pd

PREDICTIONS_JSONL = "predictions.jsonl"
CAPTIONS_CSV = "gt.csv"

def extract_image_id(path: str) -> str:
    """Derive a short image id from a file path.

    Directories and the extension are dropped. If the remaining stem contains
    digits, the first run of digits is returned (e.g. "Images/10002.jpeg" ->
    "10002"); otherwise the whole stem is returned.
    """
    stem = os.path.splitext(os.path.basename(str(path)))[0]
    digit_run = re.search(r"\d+", stem)
    if digit_run is None:
        return stem
    return digit_run.group(0)

# Keep the last reference caption seen per image_id. This cell exports the
# ground truth (gt.csv), so it reads the "reference" field, not "pred".
latest = {}
with open(PREDICTIONS_JSONL, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            continue  # skip a truncated or corrupt line
        if not isinstance(obj, dict):
            continue  # valid JSON but not a record object
        img_key = obj.get("image_id", "")
        if not img_key:
            continue
        latest[extract_image_id(img_key)] = obj.get("reference", "")

# Build DataFrame and save as CSV with proper quoting. Explicit columns keep
# sort_values from raising KeyError when the JSONL yielded no rows.
# NOTE(review): this rebinds the notebook-level name `df`; cells that expect
# the annotations frame must be re-run from the loading cell afterwards.
rows = [{"image_id": k, "caption": v} for k, v in latest.items()]
df = pd.DataFrame(rows, columns=["image_id", "caption"]).sort_values("image_id")
df.to_csv(CAPTIONS_CSV, index=False, quoting=csv.QUOTE_MINIMAL, lineterminator="\n")
print(f"Wrote {len(df)} rows to {CAPTIONS_CSV}")
