In [None]:
!pip install -q "transformers>=4.53.2" "tokenizers>=0.20.1" "huggingface-hub>=0.25.2" "accelerate>=0.34.2" "peft>=0.13.0" "bitsandbytes>=0.45.0" "safetensors>=0.4.5" "timm>=1.0.9" "torchvision>=0.20.0" "pillow>=10.4.0"

In [None]:
import os, sys, math, random, json, gc
from dataclasses import dataclass
from typing import List, Dict, Any

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from PIL import Image
import pandas as pd
import numpy as np

from datasets import Dataset, DatasetDict
from transformers import (
    AutoProcessor,
    LlavaForConditionalGeneration,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [None]:
import warnings

# Silence every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Probe CUDA once and reuse the answer for both the report and the device string.
cuda_ok = torch.cuda.is_available()
print("Torch:", torch.__version__)
print("CUDA available:", cuda_ok)
device = "cuda" if cuda_ok else "cpu"

In [None]:
# Base/model/output
# Hub id of the base checkpoint that is loaded for fine-tuning and for inference.
MODEL_ID = "llava-hf/llava-1.5-7b-hf"
# Directory that receives training checkpoints, the saved adapter and the processor files.
# NOTE(review): the folder name mentions "v16-mistral" while MODEL_ID is LLaVA-1.5 — confirm the name is intended.
OUTPUT_DIR = "/kaggle/working/llava-v16-mistral-7b-caption-lora-final"

# Dataset mount and subpaths
DATASET_DIR = "/kaggle/input/split-10k-dataset/Split Dataset"
# CSV with one row per image; later cells require 'file_name' and 'text' columns.
CAPTION_CSV = os.path.join(DATASET_DIR, "train", "description_b.csv")
IMAGES_ROOT = os.path.join(DATASET_DIR, "train")      # root under which CSV paths live
IMAGES_DIR  = os.path.join(IMAGES_ROOT, "Images")     # images directory

# Seed Python, NumPy and PyTorch RNGs with the same value; SEED is reused for the dataframe shuffle.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Create the output directory up front; exist_ok makes re-running this cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [None]:
# -----------------------------
# Read the caption table (CSV columns: file_name, text)
# -----------------------------
df = pd.read_csv(CAPTION_CSV)

# The CSV ships 'file_name' / 'text'; the training code works with 'image' / 'caption'.
_column_map = {"file_name": "image", "text": "caption"}
assert all(col in df.columns for col in _column_map), "CSV must have 'file_name' and 'text' columns."
df = df.rename(columns=_column_map)

def make_path(p: str) -> str:
    """Resolve an image reference from the caption CSV to a path under the dataset.

    Accepted forms (matching is case-insensitive on the prefix, and any number of
    leading "./" segments is ignored):
      - absolute path            -> returned unchanged
      - "train/Images/10000.jpeg" -> joined onto DATASET_DIR
      - "Images/10000.jpeg"       -> joined onto IMAGES_ROOT
      - anything else             -> treated as a bare file name inside IMAGES_DIR

    Args:
        p: Raw CSV cell value; it is stringified and stripped of surrounding whitespace.

    Returns:
        The resolved path string. Existence on disk is not checked here.
    """
    p = str(p).strip()
    if os.path.isabs(p):
        return p

    # Drop only *leading* "./" segments. A blanket str.replace would also delete
    # "./" occurring inside the path (e.g. "Images/a./b.jpeg") and corrupt the name.
    while p.startswith("./"):
        p = p[2:]

    lowered = p.lower()
    # CSV already holds 'train/...': join to the dataset root
    if lowered.startswith("train/"):
        return os.path.join(DATASET_DIR, p)
    # CSV holds 'Images/...': join to the train root
    if lowered.startswith("images/"):
        return os.path.join(IMAGES_ROOT, p)
    # Otherwise treat as a bare filename
    return os.path.join(IMAGES_DIR, os.path.basename(p))

# Resolve every CSV entry to a concrete path
df["image"] = df["image"].apply(make_path)

# Keep only rows whose image file exists on disk
df = df[df["image"].apply(os.path.exists)].reset_index(drop=True)
print("Total samples after filtering:", len(df))

# Fail fast with an actionable message: with fewer than two rows one of the two
# splits below would be empty and training/evaluation would break much later.
if len(df) < 2:
    raise ValueError(
        f"Need at least 2 samples with existing image files to build train/validation splits, "
        f"found {len(df)}. Check DATASET_DIR ({DATASET_DIR!r}) and the paths in {CAPTION_CSV!r}."
    )

# Train/val split: val_frac of the rows when there are at least 100 samples,
# otherwise 10% of the rows but never fewer than one validation row.
val_frac = 0.2
if len(df) >= 100:
    val_size = int(len(df) * val_frac)
else:
    val_size = max(1, int(len(df) * 0.1))

# Shuffle deterministically, then carve the validation rows off the front.
df = df.sample(frac=1.0, random_state=SEED).reset_index(drop=True)
df_val = df.iloc[:val_size].reset_index(drop=True)
df_train = df.iloc[val_size:].reset_index(drop=True)

train_ds = Dataset.from_pandas(df_train)
val_ds = Dataset.from_pandas(df_val)
raw = DatasetDict({"train": train_ds, "validation": val_ds})
print(raw)


In [None]:
# Load the processor that belongs to the base checkpoint.
processor = AutoProcessor.from_pretrained(MODEL_ID)
# Pad on the left so the real tokens of every sample sit at the end of the row.
processor.tokenizer.padding_side = "left"
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if processor.tokenizer.pad_token is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

# Instruction text placed in the user turn of every chat-templated prompt
# (used for building training texts and again at inference time).
INSTRUCTION = """Describe the image in a paragraph"""


def build_texts(caption: str):
    """Render the two chat-template strings needed for one training example.

    Args:
        caption: Reference caption, inserted as the assistant turn.

    Returns:
        A ``(prompt_only_text, full_text)`` tuple. The first string is the user
        turn (image placeholder + INSTRUCTION) rendered with the generation
        prompt appended; the second is the same user turn followed by the
        assistant turn holding ``caption``.
    """
    # NOTE(review): whether the template closes the assistant turn with an EOS
    # token is not visible from this code — confirm against the chat template.

    def _user_turn():
        # Fresh dict per call so the two conversations never share mutable state.
        return {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": INSTRUCTION}]}

    # Prompt only: used downstream to locate where the caption starts.
    prompt_only_text = processor.apply_chat_template(
        [_user_turn()],
        add_generation_prompt=True,
        tokenize=False,
    )

    # Full conversation with the gold caption as the assistant reply.
    assistant_turn = {"role": "assistant", "content": [{"type": "text", "text": caption}]}
    full_text = processor.apply_chat_template(
        [_user_turn(), assistant_turn],
        tokenize=False,
    )
    return prompt_only_text, full_text

def map_build_texts(ex):
    """Dataset.map adapter: add 'prompt' and 'full_text' columns built from ex['caption']."""
    prompt_text, full_text = build_texts(ex["caption"])
    return dict(prompt=prompt_text, full_text=full_text)

# Add the 'prompt' and 'full_text' columns to both splits; the empty
# remove_columns list keeps every original column (including 'image' and 'caption').
train_proc = raw["train"].map(map_build_texts, remove_columns=[])
val_proc   = raw["validation"].map(map_build_texts, remove_columns=[])

# Padding token id, read once here for use by the batch collator defined in the next cell.
pad_id = processor.tokenizer.pad_token_id

In [None]:
def collate_fn(batch):
    """Turn a list of dataset rows into model inputs with caption-only labels.

    Each row must provide 'image' (path to an image file), 'full_text'
    (prompt + caption) and 'prompt' (prompt only).

    The caption length of a sample is measured as the difference between the
    tokenizer-only lengths of 'full_text' and 'prompt'. Both strings are
    tokenized the same way, so anything the processor adds to the final
    sequence in front of the caption (expanded image placeholder tokens, a BOS
    token) cancels out. Comparing a bare-tokenizer prompt length against the
    processor's token count would instead over-estimate the caption length and
    put image/prompt tokens into the loss.

    Args:
        batch: List of dict-like rows as described above.

    Returns:
        The processor output (input ids, attention mask, pixel values, ...)
        with an added 'labels' tensor shaped like 'input_ids': caption token
        ids at caption positions and -100 everywhere else.
    """
    # Load images; the context manager closes each file handle once convert()
    # has produced an independent in-memory copy.
    images = []
    for item in batch:
        with Image.open(item["image"]) as img:
            images.append(img.convert("RGB"))

    full_texts = [item["full_text"] for item in batch]
    prompt_texts = [item["prompt"] for item in batch]

    # Tokenize full inputs (text + images)
    inputs = processor(
        images=images,
        text=full_texts,
        padding=True,
        return_tensors="pt",
    )

    tokenizer = processor.tokenizer

    def _n_text_tokens(text):
        # Plain text-token count: no special tokens, no image expansion.
        return len(tokenizer(text, add_special_tokens=False).input_ids)

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Start fully masked; only caption positions are filled in below.
    labels = torch.full_like(input_ids, -100)

    for i, (full, prompt) in enumerate(zip(full_texts, prompt_texts)):
        assist_len = _n_text_tokens(full) - _n_text_tokens(prompt)
        if assist_len <= 0:
            continue
        # Indices of real (non-padding) tokens; the caption is the tail of them.
        # Using the attention mask keeps this correct for either padding side
        # and when the pad token id coincides with another token id.
        real_positions = attention_mask[i].nonzero(as_tuple=True)[0]
        target_positions = real_positions[-assist_len:]
        labels[i, target_positions] = input_ids[i, target_positions]

    inputs["labels"] = labels
    return inputs

In [None]:
# Use bfloat16 only when a CUDA device with compute-capability major version >= 8
# is present; every other setup falls back to float16.
if torch.cuda.is_available():
    cc_major = torch.cuda.get_device_capability(0)[0]
    bf16 = cc_major >= 8
else:
    bf16 = False

compute_dtype = torch.bfloat16 if bf16 else torch.float16

# 4-bit NF4 quantization settings for the base weights.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the quantized base model and let device_map="auto" decide the placement.
model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# Turn off the generation KV cache on the model config for training.
model.config.use_cache = False

# Prepare for k-bit training and wrap with LoRA
model = prepare_model_for_kbit_training(model)
# LoRA hyper-parameters: rank 16, alpha 32, dropout 0.05, no bias training.
# NOTE(review): target_modules are bare module-name suffixes; confirm whether
# same-named projection layers outside the language model (e.g. in the vision
# tower) are meant to receive adapters too.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)
model = get_peft_model(model, lora_config)
# Enable gradient checkpointing on the wrapped model (also requested again via
# gradient_checkpointing=True in the TrainingArguments below).
model.gradient_checkpointing_enable()

# Training arguments
# - one epoch, per-device batch of 2 with 8 gradient-accumulation steps
# - cosine schedule with 3% warm-up at a peak learning rate of 1e-4
# - log, evaluate and checkpoint every 50 steps, keeping at most 2 checkpoints
# - mixed precision follows the bf16 flag computed when the model was loaded
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=2,
    fp16=not bf16,
    bf16=bf16,
    gradient_checkpointing=True,
    report_to="none",
    # collate_fn reads the raw 'image', 'prompt' and 'full_text' columns, so
    # they have to be passed through to the collator untouched.
    remove_unused_columns=False,  # <-- important
)

In [None]:
# Wire the LoRA-wrapped model, the templated datasets and the custom collator
# (which loads images and builds labels per batch) into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_proc,
    eval_dataset=val_proc,
    data_collator=collate_fn,
)

# Run fine-tuning with the arguments configured above.
trainer.train()

# Save LoRA adapter and processor side by side in OUTPUT_DIR so the inference
# cell can load the adapter from that directory.
trainer.save_model(OUTPUT_DIR)  # saves PEFT adapter
processor.save_pretrained(OUTPUT_DIR)
print("Saved LoRA adapter + processor to:", OUTPUT_DIR)

In [None]:
!pip install -q sacrebleu rouge-score bert-score

In [None]:
# pip installs (run once per session if needed)


import os
import torch
from PIL import Image
from tqdm import tqdm

from transformers import AutoProcessor, LlavaForConditionalGeneration
from peft import PeftModel

# Names expected from earlier cells:
#   MODEL_ID    - base checkpoint id used for training
#   OUTPUT_DIR  - directory holding the saved LoRA adapter
#   df_val      - dataframe with columns: image (path), caption (reference)
#   INSTRUCTION - instruction text used in the user turn
import gc

# Release the training-time objects before loading a second, half-precision
# copy of the base model. The Trainer keeps its own reference to the training
# model, so both names have to go for the memory to be reclaimable. pop() with
# a default keeps this cell runnable in a session where training never ran.
for _stale_name in ("trainer", "model", "base"):
    globals().pop(_stale_name, None)
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16

# Load base + LoRA adapter, and switch to eval mode for generation
base = LlavaForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch_dtype, device_map="auto"
)
model = PeftModel.from_pretrained(base, OUTPUT_DIR).eval()

# Left padding keeps every prompt flush against the generated continuation.
processor = AutoProcessor.from_pretrained(MODEL_ID)
processor.tokenizer.padding_side = "left"

def build_prompt(instruction: str):
    """Render the single-turn chat prompt (image placeholder + instruction) as text.

    The generation prompt is appended so the returned string ends where the
    assistant's reply is expected to begin.
    """
    user_turn = {
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": instruction}],
    }
    return processor.apply_chat_template([user_turn], add_generation_prompt=True, tokenize=False)

def generate_captions(rows, batch_size=4, max_new_tokens=200, temperature=0.2, top_p=0.9):
    """Generate one caption per row in batches and return them with the references.

    Args:
        rows: Sequence of dicts with 'image' (path to an image file) and
            'caption' (reference text).
        batch_size: Number of rows sent through the model per generate() call.
        max_new_tokens: Upper bound on generated tokens per sample.
        temperature: Sampling temperature; a value <= 0 selects greedy decoding.
        top_p: Nucleus-sampling threshold, only used when sampling.

    Returns:
        ``(preds, refs)``: two lists aligned with ``rows`` holding the generated
        captions (whitespace-stripped) and the reference captions.
    """
    preds = []
    refs = []

    # Sampling knobs are only forwarded when sampling is actually on, so greedy
    # decoding does not receive a zero temperature or an unused top_p.
    gen_kwargs = {"max_new_tokens": max_new_tokens}
    if temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        gen_kwargs["do_sample"] = False

    # The prompt is identical for every sample, so render it once.
    prompt = build_prompt(INSTRUCTION)

    for start in tqdm(range(0, len(rows), batch_size)):
        batch = rows[start:start + batch_size]

        # Close each file handle as soon as the RGB copy exists.
        images = []
        for row in batch:
            with Image.open(row["image"]) as img:
                images.append(img.convert("RGB"))

        prompts = [prompt] * len(batch)
        inputs = processor(images=images, text=prompts, return_tensors="pt", padding=True).to(model.device)

        with torch.inference_mode():
            gen_ids = model.generate(**inputs, **gen_kwargs)

        # generate() returns prompt + continuation. Cut the prompt off by token
        # count instead of splitting the decoded text on "ASSISTANT:", which
        # would truncate any caption that happens to contain that marker.
        prompt_len = inputs["input_ids"].shape[1]
        texts = processor.batch_decode(gen_ids[:, prompt_len:], skip_special_tokens=True)

        for text, row in zip(texts, batch):
            preds.append(text.strip())
            refs.append(row["caption"])
    return preds, refs

# Prepare rows from df_val: one {"image": ..., "caption": ...} dict per validation sample
val_rows = df_val[["image", "caption"]].to_dict(orient="records")
# Caption the whole validation split; preds and refs stay aligned with val_rows.
preds, refs = generate_captions(val_rows, batch_size=4)
print(f"Generated {len(preds)} captions.")


In [None]:
from IPython.display import FileLink

In [None]:
# File names (without directories) identify each validation image in the exports.
image_ids = [os.path.basename(r["image"]) for r in val_rows]

# Create DataFrames: predictions only, references only, and both side by side
preds_df = pd.DataFrame({"Image_id": image_ids, "caption": preds})
refs_df  = pd.DataFrame({"Image_id": image_ids, "caption": refs})
both_df  = pd.DataFrame({"Image_id": image_ids, "pred_caption": preds, "ref_caption": refs})

# Save under /kaggle/working so they appear in the Output panel and are easy to download
preds_csv = "/kaggle/working/val_preds.csv"
refs_csv  = "/kaggle/working/val_refs.csv"
both_csv  = "/kaggle/working/val_preds_refs.csv"

exports = [(preds_df, preds_csv), (refs_df, refs_csv), (both_df, both_csv)]

for frame, csv_path in exports:
    frame.to_csv(csv_path, index=False, encoding="utf-8")

print("Saved files:")
for _frame, csv_path in exports:
    print(csv_path)

# Provide clickable links in the notebook. The link target is computed relative
# to the current working directory, so it still points at the saved file when
# the notebook is not running from /kaggle/working.
for _frame, csv_path in exports:
    display(FileLink(os.path.relpath(csv_path)))

In [None]:
# Metrics over the validation predictions: corpus BLEU (SacreBLEU), mean
# ROUGE-L F1, and optionally BERTScore.

# --- BLEU: corpus-level, one reference stream ---
import sacrebleu
bleu = sacrebleu.corpus_bleu(preds, [refs])
print(f"BLEU (SacreBLEU): {bleu.score:.2f}")

# --- ROUGE-L: per-sample F-measure, averaged and reported as a percentage ---
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
r_f1s = []
for pred_text, ref_text in zip(preds, refs):
    # score() takes the reference first and the prediction second.
    r_f1s.append(scorer.score(ref_text, pred_text)["rougeL"].fmeasure)
mean_rouge_l = sum(r_f1s) / len(r_f1s)
print(f"ROUGE-L F1: {100 * mean_rouge_l:.2f}")

# --- BERTScore (optional): flip the flag to skip it ---
use_bertscore = True
if use_bertscore:
    from bert_score import score as bertscore
    P, R, F1 = bertscore(preds, refs, lang="en", rescale_with_baseline=True)
    p_pct, r_pct, f1_pct = (100 * t.mean().item() for t in (P, R, F1))
    print(f"BERTScore P/R/F1: {p_pct:.2f} / {r_pct:.2f} / {f1_pct:.2f}")


In [None]:
# Kaggle notebook cell: bundle everything under /kaggle/working into a single
# archive and expose it as a download link.
%cd /kaggle/working
# -r recurses into sub-directories; -x excludes the archive itself from its own contents.
!zip -r working_dir.zip . -x "working_dir.zip"


FileLink('working_dir.zip')  # click to download


In [None]:
!pip install -q pycocoevalcap
# Fallback if needed:
# pip install -q "git+https://github.com/salaniz/pycocoevalcap.git"

In [None]:
from pycocoevalcap.bleu.bleu import Bleu

# Expected from the inference cell:
#   preds: list[str]  - model-generated captions, length N
#   refs:  list[str]  - reference captions (one per sample), length N

# COCO-style mappings: stringified sample index -> single-element caption list
res = {str(idx): [caption] for idx, caption in enumerate(preds)}
gts = {str(idx): [caption] for idx, caption in enumerate(refs)}

# compute_score returns the four cumulative BLEU-1..4 values first; the second
# return value is not used here.
bleu = Bleu(n=4)
scores, _ = bleu.compute_score(gts, res)
bleu1, bleu2, bleu3, bleu4 = scores
for order, value in enumerate((bleu1, bleu2, bleu3, bleu4), start=1):
    print(f"BLEU-{order}: {value:.2f}")
