In [None]:
!pip install accelerate -q
!pip install bitsandbytes -q
!pip install peft -q
!pip install trl==0.11.4 -q

In [None]:
from datasets import load_dataset

# Both splits are read from the same imagefolder root; only the split name differs.
_BIOLOGY_DATASET_DIR = "/kaggle/input/biology-dataset-modified/Biology-Dataset"
train_dataset, eval_dataset = (
    load_dataset("imagefolder", data_dir=_BIOLOGY_DATASET_DIR, split=split_name)
    for split_name in ("train", "validation")
)

In [None]:
import os, torch, gc
from transformers import AutoModelForVision2Seq, AutoProcessor, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
# Allocator tuning (fragmentation fix) and wandb opt-out (avoid wandb stalls).
# NOTE(review): `torch` is already imported at the top of this cell, so these
# variables are set after the import, not before it — confirm the allocator
# setting still takes effect in this kernel.
os.environ.update(
    {
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True,max_split_size_mb:128",
        "WANDB_DISABLED": "true",
    }
)

# ---------------------------------------------------------------------------
# Free memory left over from earlier cells and select GPU 0
# ---------------------------------------------------------------------------
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.set_device(0)

# ---------------------------------------------------------------------------
# Base model + processor
# ---------------------------------------------------------------------------
model_id = "HuggingFaceTB/SmolVLM-256M-Instruct"
model_load_options = {
    "device_map": {"": 0},           # place the whole model on device 0
    "torch_dtype": torch.float16,    # fp16 for T4; use bfloat16 on A100/L4
    "_attn_implementation": "eager",
    "trust_remote_code": True,
}
model = AutoModelForVision2Seq.from_pretrained(model_id, **model_load_options)
processor = AutoProcessor.from_pretrained(model_id)

# The KV cache is switched off for training to save memory.
model.config.use_cache = False
model.train()

# ---------------------------------------------------------------------------
# LoRA adapter (kept small: rank 8)
# ---------------------------------------------------------------------------
lora_target_modules = [
    'down_proj', 'o_proj', 'k_proj', 'q_proj', 'gate_proj', 'up_proj', 'v_proj',
]
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=lora_target_modules,
    use_dora=False,  # plain LoRA: simpler, less overhead
)
model = get_peft_model(model, peft_config)
model.enable_input_require_grads()

# ---------------------------------------------------------------------------
# Trainer hyper-parameters
# ---------------------------------------------------------------------------
training_options = {
    "output_dir": "./smolvlm-256m-caption-model",
    "num_train_epochs": 1,
    # One sample per step, 16 accumulation steps per optimizer update.
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "learning_rate": 1e-4,
    "weight_decay": 0.01,
    "logging_steps": 25,
    "save_steps": 500,
    "save_total_limit": 1,
    "fp16": True,
    "gradient_checkpointing": True,
    # Keep the raw "image"/"text" columns so the custom collator can read them.
    "remove_unused_columns": False,
    "dataloader_num_workers": 0,
    "dataloader_pin_memory": False,
    "eval_strategy": "no",  # or "epoch"
    "report_to": "none",
    "optim": "adamw_8bit",
}
training_args = TrainingArguments(**training_options)

def smolvlm_collate_fn(examples):
    """Collate image/caption examples into one padded SmolVLM training batch.

    Every example is rendered as a two-turn chat (user: image placeholder plus
    "Describe this image.", assistant: the caption) with the module-level
    ``processor``. All examples are then tokenized and padded together.

    Args:
        examples: Iterable of mappings providing an ``"image"`` entry and a
            ``"text"`` entry (the caption, passed through ``str()``).

    Returns:
        The processor output with an extra ``"labels"`` tensor: a copy of
        ``input_ids`` where padding positions and ``<image>`` placeholder
        positions are set to -100 (the value conventionally ignored by the
        language-model loss), so only text tokens contribute to training.
    """
    tokenizer = processor.tokenizer

    texts, images = [], []
    for ex in examples:
        # Build the chat text for this sample.
        messages = [
            {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]},
            {"role": "assistant", "content": [{"type": "text", "text": str(ex["text"])}]},
        ]
        texts.append(processor.apply_chat_template(messages, tokenize=False))

        img = ex["image"]
        # Only objects exposing `.convert` (e.g. PIL images) are converted.
        if hasattr(img, "convert") and getattr(img, "mode", "") != "RGB":
            img = img.convert("RGB")
        images.append([img])  # one list of images per sample

    # NOTE(review): truncation at 2048 tokens could in principle cut into the
    # image placeholder tokens of a very long sample — confirm samples stay
    # below the limit.
    batch = processor(
        text=texts,
        images=images,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=2048,
    )

    labels = batch["input_ids"].clone()
    labels[labels == tokenizer.pad_token_id] = -100

    # Image placeholder tokens are inputs, not prediction targets: mask them
    # too. The guard skips masking if the tokenizer does not know "<image>".
    image_token_id = tokenizer.convert_tokens_to_ids("<image>")
    if image_token_id is not None and image_token_id != tokenizer.unk_token_id:
        labels[labels == image_token_id] = -100

    batch["labels"] = labels
    return batch

# Wire the LoRA-wrapped model, both dataset splits and the custom collator
# into a Trainer. The validation split is handed over as well, although
# `eval_strategy` is "no" in the training arguments.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=smolvlm_collate_fn,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor.tokenizer,
)


In [None]:
# Run fine-tuning, then write the resulting model into the run's output directory.
trainer.train()
trainer.save_model(training_args.output_dir)

In [None]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
import os

# --- Single-image smoke test of the fine-tuned captioner ---------------------
base_id = "HuggingFaceTB/SmolVLM-256M-Instruct"              # base model
adapter_dir = "/kaggle/working/smolvlm-256m-caption-model/"  # where trainer.save_model() wrote
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(base_id)

# Base weights first, then moved onto the target device.
model = AutoModelForVision2Seq.from_pretrained(
    base_id,
    _attn_implementation="eager",
    trust_remote_code=True,
)
model = model.to(device)

# load_adapter() is called for its effect on `model`; its return value is not used.
model.load_adapter(adapter_dir, adapter_name="default")
if hasattr(model, "set_adapter"):
    model.set_adapter("default")
model.eval()

# One test image and a single-turn prompt.
img_path = "/kaggle/input/biology-dataset-modified/Biology-Dataset/test"
img = os.path.join(img_path, "1.png")
image = Image.open(img).convert("RGB")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[[image]], return_tensors="pt").to(device)

with torch.inference_mode():
    out = model.generate(**inputs, max_new_tokens=200, do_sample=False)

# Decode only the positions after the prompt's length.
prompt_len = inputs.input_ids.shape[1]
caption = processor.batch_decode(out[:, prompt_len:], skip_special_tokens=True)[0]
print(caption)

In [None]:
!pip -q install evaluate sacrebleu rouge-score bert-score pycocoevalcap

In [None]:
import torch
import evaluate
from statistics import mean

# Instantiate each `evaluate` metric a single time; the compute_* helpers in
# this cell reuse these module-level objects.
#   bleu      -> result carries a "bleu" entry
#   rouge     -> result carries a "rougeL" entry (among others)
#   bertscore -> result carries "precision" / "recall" / "f1" lists
bleu_metric, rouge_metric, bertscore_metric = (
    evaluate.load(metric_name) for metric_name in ("bleu", "rouge", "bertscore")
)

# CIDEr from pycocoevalcap (expects dict format)
from pycocoevalcap.cider.cider import Cider

def compute_bleu(preds, refs):
    """Score ``preds`` against ``refs`` with the module-level BLEU metric.

    Args:
        preds: List of predicted strings.
        refs: One entry per prediction; each entry is either a single
            reference string or a collection of reference strings.

    Returns:
        ``{"bleu": value}`` with the metric's "bleu" result multiplied by 100.
    """
    wrapped_refs = []
    for ref in refs:
        # A bare string becomes a one-element reference list.
        wrapped_refs.append([ref] if isinstance(ref, str) else ref)

    result = bleu_metric.compute(predictions=preds, references=wrapped_refs)
    return {"bleu": float(result["bleu"]) * 100.0}

def compute_rougeL(preds, refs):
    """Score ``preds`` against ``refs`` with the module-level ROUGE metric.

    When a reference entry is a list, only its first element is used.

    Returns:
        ``{"rougeL_f": value}`` with the metric's "rougeL" result multiplied
        by 100.
    """
    first_refs = []
    for ref in refs:
        if isinstance(ref, list):
            ref = ref[0]
        first_refs.append(ref)

    result = rouge_metric.compute(predictions=preds, references=first_refs)
    return {"rougeL_f": float(result["rougeL"]) * 100.0}

def compute_bertscore(preds, refs, model_type="roberta-large", lang="en"):
    """Score ``preds`` against ``refs`` with the module-level BERTScore metric.

    When a reference entry is a list, only its first element is used. The
    metric runs on CUDA when available, otherwise on CPU, and is called with
    ``rescale_with_baseline=True``.

    Args:
        preds: List of predicted strings.
        refs: One reference (string, or list whose first item is used) per
            prediction.
        model_type: Forwarded to the metric; also echoed in the result.
        lang: Forwarded to the metric.

    Returns:
        Dict with the mean F1 / precision / recall, each multiplied by 100,
        under "bertscore_f1" / "bertscore_p" / "bertscore_r", plus "model".
    """
    def percent_mean(values):
        return 100.0 * float(mean(values))

    first_refs = [ref[0] if isinstance(ref, list) else ref for ref in refs]

    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    result = bertscore_metric.compute(
        predictions=preds,
        references=first_refs,
        model_type=model_type,
        lang=lang,
        device=device,
        rescale_with_baseline=True,
    )

    return {
        "bertscore_f1": percent_mean(result["f1"]),
        "bertscore_p": percent_mean(result["precision"]),
        "bertscore_r": percent_mean(result["recall"]),
        "model": model_type,
    }

def compute_cider(preds, refs):
    """Score ``preds`` against ``refs`` with pycocoevalcap's ``Cider``.

    Both inputs are converted to dicts keyed by the stringified sample index.
    Each prediction becomes a one-element list, and each reference entry is
    used as-is when it is a list, otherwise wrapped in one.

    Returns:
        ``{"cider": score}`` holding the first value returned by
        ``Cider().compute_score``, unscaled.
        NOTE(review): the score's range / reporting scale is not visible
        here — confirm before comparing with published numbers.
    """
    gts, res = {}, {}
    for idx, hypothesis in enumerate(preds):
        key = str(idx)
        reference = refs[idx]
        gts[key] = reference if isinstance(reference, list) else [reference]
        res[key] = [hypothesis]

    score, _per_sample = Cider().compute_score(gts, res)
    return {"cider": float(score)}

def compute_all(preds, refs, model_type="roberta-large", lang="en"):
    """Run BLEU, ROUGE-L, BERTScore and CIDEr and merge their result dicts.

    ``model_type`` and ``lang`` are forwarded to ``compute_bertscore`` only.
    The metrics run in the order listed; on a key collision the later
    metric's value wins.
    """
    return {
        **compute_bleu(preds, refs),
        **compute_rougeL(preds, refs),
        **compute_bertscore(preds, refs, model_type=model_type, lang=lang),
        **compute_cider(preds, refs),
    }


In [None]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
import os
import csv
from tqdm import tqdm

# --- Reload base model + LoRA adapter for test-set evaluation ----------------
base_id = "HuggingFaceTB/SmolVLM-256M-Instruct"              # base model
adapter_dir = "/kaggle/working/smolvlm-256m-caption-model/"  # where trainer.save_model() wrote
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(base_id)

model = AutoModelForVision2Seq.from_pretrained(
    base_id,
    _attn_implementation="eager",
    trust_remote_code=True,
)
model = model.to(device)

# load_adapter() is called for its effect on `model`; its return value is not used.
model.load_adapter(adapter_dir, adapter_name="default")
if hasattr(model, "set_adapter"):
    model.set_adapter("default")

# Directory holding the test images and their metadata.csv.
TEST_PATH = "/kaggle/input/biology-dataset-modified/Biology-Dataset/test"


def inference(model):
    """Caption every image listed in the test split's ``metadata.csv``.

    Reads ``metadata.csv`` from ``TEST_PATH``, skips its first (header) row,
    and expects every remaining row to hold exactly two fields: an image path
    relative to ``TEST_PATH`` and the reference caption. Each image is
    captioned greedily (``do_sample=False``) with the module-level
    ``processor`` on ``device``.

    Args:
        model: Model exposing ``eval()`` and ``generate()``; it is switched
            to eval mode before generation.

    Returns:
        ``(refs, preds)``: two lists in CSV row order holding the reference
        captions and the generated captions.
    """
    model.eval()
    test_csv = os.path.join(TEST_PATH, "metadata.csv")

    # The prompt is identical for every image, so render it once up front
    # instead of once per row.
    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]}]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

    refs, preds = [], []
    # newline="" is what the csv module requires of file objects so quoted
    # fields with embedded newlines parse correctly; utf-8 is stated explicitly
    # rather than relying on the platform default.
    with open(test_csv, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row (tolerates an empty file)
        for img, caption in tqdm(reader):
            refs.append(caption)
            image = Image.open(os.path.join(TEST_PATH, img)).convert("RGB")

            inputs = processor(text=prompt, images=[[image]], return_tensors="pt").to(device)
            with torch.inference_mode():
                out = model.generate(**inputs, max_new_tokens=300, do_sample=False)

            # Decode only the positions after the prompt's length.
            prompt_len = inputs.input_ids.shape[1]
            gen_caption = processor.batch_decode(out[:, prompt_len:], skip_special_tokens=True)[0]
            preds.append(gen_caption)
    return refs, preds

In [None]:
# Caption the whole test split, then score the generated captions against the
# reference captions read from metadata.csv.
refs, preds = inference(model)
# Note the argument order: predictions first, references second.
scores = compute_all(preds, refs, model_type="roberta-large", lang="en")
print(scores)

In [None]:
import os
import csv

# Pair every image listed in metadata.csv with its generated caption (matched
# by row position against `preds`) and write the pairs to preds.csv.
test_csv = os.path.join(TEST_PATH, "metadata.csv")
predictions = []
# newline="" is what the csv module requires of file objects, for reading and
# writing; utf-8 is stated explicitly rather than relying on the platform default.
with open(test_csv, "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    # Skip the header row. The result is deliberately not bound to `_`, which
    # would shadow IPython's last-output variable.
    next(reader, None)
    for ind, (img, caption) in enumerate(reader):
        predictions.append([img, preds[ind]])

# Without newline="" the writer can emit blank lines between rows on platforms
# that translate "\n" on output.
with open("preds.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["image", "generated_caption"])
    writer.writerows(predictions)

In [None]:
# Hand-picked test images for the qualitative caption check (24 file names,
# relative to the test directory).
image_files = [
    "391.jpeg", "1.png", "137.png", "235.png",
    "378.png", "580.png", "715.png", "817.png",
    "879.png", "2152.png", "2403.jpeg", "2807.jpeg",
    "4075.jpeg", "4099.png", "4444.png", "4531.png",
    "4672.jpeg", "6143.jpeg", "6308.jpeg", "6896.jpeg",
    "7068.png", "7539.png", "8032.png", "8433.jpeg",
]

In [None]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
import os
from tqdm import tqdm

# --- Reload base model + LoRA adapter for the qualitative samples ------------
base_id = "HuggingFaceTB/SmolVLM-256M-Instruct"              # base model
adapter_dir = "/kaggle/working/smolvlm-256m-caption-model/"  # where trainer.save_model() wrote
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(base_id)

model = AutoModelForVision2Seq.from_pretrained(
    base_id,
    _attn_implementation="eager",
    trust_remote_code=True,
)
model = model.to(device)

# load_adapter() is called for its effect on `model`; its return value is not used.
model.load_adapter(adapter_dir, adapter_name="default")
if hasattr(model, "set_adapter"):
    model.set_adapter("default")

model.eval()

# Caption each hand-picked image and collect (file name, caption) pairs in `res`.
res = []

# The image directory and the prompt are the same for every image, so they
# are set up once instead of being rebuilt on every iteration.
# NOTE(review): this prompt adds "Give only one paragraph." and so differs
# from the "Describe this image." prompt used in the training collate
# function — confirm that is intended.
img_path = "/kaggle/input/biology-dataset-modified/Biology-Dataset/test"
messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image. Give only one paragraph."}]}]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

for image_name in tqdm(image_files):
    img = os.path.join(img_path, image_name)
    image = Image.open(img).convert("RGB")

    inputs = processor(text=prompt, images=[[image]], return_tensors="pt").to(device)
    with torch.inference_mode():
        out = model.generate(**inputs, max_new_tokens=200, do_sample=False)

    # Decode only the positions after the prompt's length.
    caption = processor.batch_decode(out[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0]
    res.append((image_name, caption))

In [None]:
# Order the (file name, caption) pairs by each file's position in
# `image_files`, then print one "name: caption" line per image.
def _requested_position(entry):
    return image_files.index(entry[0])

res.sort(key=_requested_position)
for image, caption in res:
    print(f"{image}: {caption}")