In [2]:
import sys
import json
import os
import wandb
import random
import itertools
import gc
from huggingface_hub import whoami, login
from datasets import load_dataset
from pathlib import Path
from tqdm import tqdm
from collections import Counter, defaultdict
from PIL import Image
import torchvision.transforms as T
from google.colab import userdata
from datetime import datetime

In [4]:
# CIDEr-D implementation (simplified)
def cider_d(candidates, mult_references, n=4, sigma=6.0, scale=10.0):
    """Compute a simplified CIDEr-D score for a corpus of candidate sentences.

    Sentences are tokenised with ``str.split()``. Every n-gram of order
    1..``n`` is weighted by ``count * idf``, where the document frequency
    counts how many reference *sets* contain the n-gram. For each
    candidate/reference pair the clipped similarity per n-gram order is
    normalised by the two vector norms and multiplied by a Gaussian penalty
    on the token-length difference.

    Args:
        candidates: Iterable of candidate sentences (strings).
        mult_references: Iterable, aligned with ``candidates``, of lists of
            reference sentences.
        n: Highest n-gram order to use.
        sigma: Standard deviation of the Gaussian length penalty.
        scale: Factor applied to every per-candidate score.

    Returns:
        A tuple ``(corpus_scores, sentence_scores)``:
        ``corpus_scores`` is ``{"cider_d": float}`` (mean over candidates) and
        ``sentence_scores`` is ``{"cider_d": torch.Tensor}`` with one score
        per candidate.

    Raises:
        ValueError: If the number of candidates differs from the number of
            reference sets (previously the extras were silently dropped).
    """
    from collections import Counter
    import numpy as np
    # torch is not imported in the visible import cell; importing it here
    # keeps the function usable on a fresh kernel.
    import torch

    def _cook_sentence(sentence):
        # Count every n-gram of order 1..n in the whitespace-tokenised sentence.
        tokens = sentence.split()
        return Counter(
            tuple(tokens[i:i + k])
            for k in range(1, n + 1)
            for i in range(len(tokens) - k + 1)
        )

    cooked_mrefs = [[_cook_sentence(ref) for ref in refs] for refs in mult_references]
    cooked_cands = [_cook_sentence(cand) for cand in candidates]
    if len(cooked_cands) != len(cooked_mrefs):
        raise ValueError(
            f"Number of candidates ({len(cooked_cands)}) does not match "
            f"number of reference sets ({len(cooked_mrefs)})."
        )

    # Document frequency: in how many reference sets does each n-gram occur.
    doc_frequencies = Counter()
    for refs in cooked_mrefs:
        doc_frequencies.update(set().union(*(ref.keys() for ref in refs)))
    log_n_refs = np.log(float(len(cooked_mrefs)))

    def _tfidf(cooked):
        # Return (per-order tf-idf vectors, per-order L2 norms, token count).
        vec = [Counter() for _ in range(n)]
        sq_norm = [0.0] * n
        length = 0
        for ngram, freq in cooked.items():
            order = len(ngram) - 1
            # max(1, df) keeps the idf finite for n-grams unseen in any reference.
            idf = log_n_refs - np.log(max(1, doc_frequencies.get(ngram, 0)))
            weight = freq * idf
            vec[order][ngram] = weight
            sq_norm[order] += weight ** 2
            if order == 0:
                length += freq  # the unigram counts add up to the token count
        return vec, [np.sqrt(value) for value in sq_norm], length

    scores = []
    for cand, refs in zip(cooked_cands, cooked_mrefs):
        if not refs:
            # No reference to compare against: score 0 instead of dividing by zero.
            scores.append(0.0)
            continue

        cand_vec, cand_norm, cand_len = _tfidf(cand)
        ngrams_scores = []
        for ref in refs:
            ref_vec, ref_norm, ref_len = _tfidf(ref)
            # The Gaussian length penalty is identical for every n-gram order.
            penalty = np.exp(-((cand_len - ref_len) ** 2) / (2 * sigma ** 2))

            sims = []
            for order in range(n):
                sim = 0
                for ngram, weight in cand_vec[order].items():
                    ref_weight = ref_vec[order].get(ngram, 0)
                    # Clip the candidate weight at the reference weight.
                    sim += min(weight, ref_weight) * ref_weight
                if cand_norm[order] != 0 and ref_norm[order] != 0:
                    sim /= cand_norm[order] * ref_norm[order]
                sims.append(sim * penalty)
            ngrams_scores.append(sum(sims) / n)
        scores.append(sum(ngrams_scores) / len(ngrams_scores))

    scores = np.array(scores) * scale
    return {"cider_d": float(scores.mean())}, {"cider_d": torch.tensor(scores)}

In [5]:
## Using Secrets
# Credentials come from the Kaggle secrets store, never from the notebook source.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")
wandb_key = user_secrets.get_secret("wandb_api_key")

# Report exactly which secret is missing, using the names they are stored under.
missing_secrets = [
    name
    for name, value in (("HF_TOKEN", hf_token), ("wandb_api_key", wandb_key))
    if not value
]
if missing_secrets:
    raise ValueError(f"Missing Kaggle secret(s): {', '.join(missing_secrets)}")

## Authentication
login(hf_token)
wandb.login(key=wandb_key)

# Environment variables set for later cells and libraries to read.
os.environ["HF_TOKEN"] = hf_token
os.environ["WANDB_PROJECT"] = "Kvasir-VQA-x1_Subtask1"
os.environ["WANDB_WATCH"] = "all"
os.environ["WANDB_DISABLED"] = "false"
os.environ["WANDB_LOG_MODEL"] = "false"

# Test
HF_USER = whoami()["name"]
print("Logged into HF as:", HF_USER)

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfirojpaudel[0m ([33mfirojpaudel-madan-bhandari-memorial-college[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Logged into HF as: Firoj112


In [6]:
# Data Preparation
"""
What we will do:
1) Cache images locally from SimulaMet-HOST/Kvasir-VQA with augmentation
2) Build VLM-ready JSONL files for train/test splits
3) Inspect dataset balance by question type
"""

# Working directories (all relative to the notebook's current directory)
BASE_DIR = Path("./")
DATA_DIR = BASE_DIR.joinpath("Kvasir-VQA-x1")
IMG_DIR = DATA_DIR.joinpath("images")

# exist_ok=True makes this cell safe to re-run.
DATA_DIR.mkdir(parents=True, exist_ok=True)
IMG_DIR.mkdir(parents=True, exist_ok=True)

print(f"Data dir: {DATA_DIR}")
print(f"Images dir: {IMG_DIR}")

# Augment Function
def augment_image(img):
    """Return a randomly augmented copy of ``img``.

    The steps run in order: rotation within +/-15 degrees, colour jitter,
    a random resized crop to 224x224 covering 80-100% of the area, and a
    random horizontal flip.

    Args:
        img: Input image (a PIL image, per the original docstring).

    Returns:
        The transformed image produced by the composed torchvision pipeline.
    """
    steps = [
        T.RandomRotation(15),
        T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        T.RandomResizedCrop(size=(224, 224), scale=(0.8, 1.0)),
        T.RandomHorizontalFlip(),
    ]
    pipeline = T.Compose(steps)
    return pipeline(img)

Data dir: Kvasir-VQA-x1
Images dir: Kvasir-VQA-x1/images


In [7]:
# 1. Saving unique images locally (images are written as-is; no augmentation is applied here)
print("⏬ Caching images from SimulaMet-HOST/Kvasir-VQA ...")
host = load_dataset("SimulaMet-HOST/Kvasir-VQA", split="raw")
df = host.select_columns(['source', 'question', 'answer', 'img_id']).to_pandas()

# Save one image per unique img_id.
# drop_duplicates keeps the first row of each img_id together with its original
# index label, which is what host[...] is indexed with below. Unlike
# groupby(...).nth(0), this behaves the same on pandas 1.x and 2.x.
first_rows = df.drop_duplicates(subset='img_id', keep='first')
for row_idx, img_id in tqdm(first_rows['img_id'].items(), total=len(first_rows)):
    img_path = IMG_DIR / f"{img_id}.jpg"
    if img_path.exists():
        # Already cached by a previous run.
        continue
    # int(...) turns the numpy integer label into a plain Python index.
    host[int(row_idx)]['image'].save(img_path)

# 2. Create JSONLs for train/test from Kvasir-VQA-x1 (VLM-ready for ms-swift)
print("Creating JSONLs ...")
def write_jsonl(split):
    """Write one split of SimulaMet/Kvasir-VQA-x1 as a chat-style JSONL file.

    Each line is a JSON object with a two-turn ``messages`` list (the user
    question prefixed by ``<image>``, then the assistant answer) and an
    ``images`` list holding the path of the cached image for that row.

    Args:
        split: Name of the dataset split to load (e.g. "train" or "test").

    Returns:
        Path of the written file, ``DATA_DIR / "Kvasir-VQA-x1-<split>.jsonl"``.
    """
    dataset = load_dataset("SimulaMet/Kvasir-VQA-x1", split=split)
    target = DATA_DIR / f"Kvasir-VQA-x1-{split}.jsonl"
    with open(target, "w", encoding="utf-8") as sink:
        for row in dataset:
            question_turn = {"role": "user", "content": f"<image>{row['question']}"}
            answer_turn = {"role": "assistant", "content": row["answer"]}
            image_path = str(IMG_DIR / f"{row['img_id']}.jpg")
            record = {"messages": [question_turn, answer_turn], "images": [image_path]}
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            sink.write(f"{json.dumps(record, ensure_ascii=False)}\n")
    return target

train_jsonl = write_jsonl("train")
test_jsonl = write_jsonl("test")

print("Train JSONL:", train_jsonl)
print("Test JSONL:", test_jsonl)

# Preview the first three training records. The context manager closes the
# file handle, which the previous bare open(...) call left open.
with open(train_jsonl, "r", encoding="utf-8") as preview_file:
    sample_lines = list(itertools.islice(preview_file, 3))

for sample_no, line in enumerate(sample_lines, 1):
    record = json.loads(line)
    print(f"\n--- Sample {sample_no} ---")
    print("messages:", record["messages"])
    print("images:", record["images"])
    # Every record must point at an image that was cached to disk.
    assert Path(record["images"][0]).exists(), "Missing image file!"
print("\nLooks good ✅")

⏬ Caching images from SimulaMet-HOST/Kvasir-VQA ...


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/31 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/31 [00:00<?, ?files/s]

data/00000.parquet:   0%|          | 0.00/26.8M [00:00<?, ?B/s]

data/00001.parquet:   0%|          | 0.00/26.2M [00:00<?, ?B/s]

data/00002.parquet:   0%|          | 0.00/25.5M [00:00<?, ?B/s]

data/00003.parquet:   0%|          | 0.00/18.7M [00:00<?, ?B/s]

data/00004.parquet:   0%|          | 0.00/22.8M [00:00<?, ?B/s]

data/00005.parquet:   0%|          | 0.00/23.9M [00:00<?, ?B/s]

data/00006.parquet:   0%|          | 0.00/21.8M [00:00<?, ?B/s]

data/00007.parquet:   0%|          | 0.00/23.8M [00:00<?, ?B/s]

data/00008.parquet:   0%|          | 0.00/20.2M [00:00<?, ?B/s]

data/00009.parquet:   0%|          | 0.00/5.66M [00:00<?, ?B/s]

data/00010.parquet:   0%|          | 0.00/5.75M [00:00<?, ?B/s]

data/00011.parquet:   0%|          | 0.00/8.13M [00:00<?, ?B/s]

data/00012.parquet:   0%|          | 0.00/6.49M [00:00<?, ?B/s]

data/00013.parquet:   0%|          | 0.00/6.80M [00:00<?, ?B/s]

data/00014.parquet:   0%|          | 0.00/5.89M [00:00<?, ?B/s]

data/00015.parquet:   0%|          | 0.00/4.84M [00:00<?, ?B/s]

data/00016.parquet:   0%|          | 0.00/64.7M [00:00<?, ?B/s]

data/00017.parquet:   0%|          | 0.00/67.5M [00:00<?, ?B/s]

data/00018.parquet:   0%|          | 0.00/68.3M [00:00<?, ?B/s]

data/00019.parquet:   0%|          | 0.00/67.4M [00:00<?, ?B/s]

data/00020.parquet:   0%|          | 0.00/66.4M [00:00<?, ?B/s]

data/00021.parquet:   0%|          | 0.00/68.4M [00:00<?, ?B/s]

data/00022.parquet:   0%|          | 0.00/72.3M [00:00<?, ?B/s]

data/00023.parquet:   0%|          | 0.00/72.6M [00:00<?, ?B/s]

data/00024.parquet:   0%|          | 0.00/111M [00:00<?, ?B/s]

data/00025.parquet:   0%|          | 0.00/305M [00:00<?, ?B/s]

data/00026.parquet:   0%|          | 0.00/87.0M [00:00<?, ?B/s]

data/00027.parquet:   0%|          | 0.00/42.7M [00:00<?, ?B/s]

data/00028.parquet:   0%|          | 0.00/73.9M [00:00<?, ?B/s]

data/00029.parquet:   0%|          | 0.00/60.7M [00:00<?, ?B/s]

data/00030.parquet:   0%|          | 0.00/67.0M [00:00<?, ?B/s]

Generating raw split:   0%|          | 0/58849 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

100%|██████████| 6500/6500 [00:53<00:00, 121.97it/s]


Creating JSONLs ...


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/14.9M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/143594 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/15955 [00:00<?, ? examples/s]

Train JSONL: Kvasir-VQA-x1/Kvasir-VQA-x1-train.jsonl
Test JSONL: Kvasir-VQA-x1/Kvasir-VQA-x1-test.jsonl

--- Sample 1 ---
messages: [{'role': 'user', 'content': '<image>Are there any abnormalities, polyps, or anatomical landmarks visible in the image?'}, {'role': 'assistant', 'content': 'Evidence of oesophagitis is present with no polyps identified and the z-line is visible.'}]
images: ['Kvasir-VQA-x1/images/clb0kvxvm90y4074yf50vf5nq.jpg']

--- Sample 2 ---
messages: [{'role': 'user', 'content': '<image>What procedure is depicted in the image and what type of polyp is observed?'}, {'role': 'assistant', 'content': 'Evidence of a colonoscopy with a paris iia polyp noted'}]
images: ['Kvasir-VQA-x1/images/cl8k2u1r71foz083278j63qnm.jpg']

--- Sample 3 ---
messages: [{'role': 'user', 'content': '<image>Have all polyps been removed, is there any text present, and where is the abnormality located in the image?'}, {'role': 'assistant', 'content': 'Polyps remain present, text is visible, and the

In [8]:
# Count records in JSONL files
def count_records_in_jsonl(jsonl_file):
    """Return the number of lines in ``jsonl_file``.

    The file is opened as UTF-8 text and read line by line, so it is never
    loaded into memory as a whole.

    Args:
        jsonl_file: Path of the file to count.

    Returns:
        Number of lines in the file as an int (0 for an empty file).
    """
    total = 0
    with open(jsonl_file, "r", encoding="utf-8") as handle:
        for _ in handle:
            total += 1
    return total

# Get the count of records in train and test datasets
train_records, test_records = (
    count_records_in_jsonl(path) for path in (train_jsonl, test_jsonl)
)

# Report the size of each full split.
print(f"Number of records in the train dataset: {train_records}")
print(f"Number of records in the test dataset: {test_records}")

Number of records in the train dataset: 143594
Number of records in the test dataset: 15955


In [9]:
# Reducing training and validation time
SAMPLE_SEED = 42  # fixed seed so the sampled subsets are the same on every rerun


def sample_jsonl(src_path, dst_path, k, seed=SAMPLE_SEED):
    """Write a seeded random sample of at most ``k`` lines of ``src_path`` to ``dst_path``.

    Replaces the unseeded ``!shuf -n k`` shell call: as before, the lines are
    drawn without replacement and written in random order, and a source with
    fewer than ``k`` lines is copied in full.

    Args:
        src_path: JSONL file to sample from.
        dst_path: File to write the sampled lines to (overwritten).
        k: Maximum number of lines to keep.
        seed: Seed for the private random generator.

    Returns:
        ``dst_path``, unchanged.
    """
    with open(src_path, "r", encoding="utf-8") as src:
        lines = src.readlines()
    picked = random.Random(seed).sample(lines, min(k, len(lines)))
    with open(dst_path, "w", encoding="utf-8") as dst:
        for line in picked:
            # Guard against a final source line that lacks a trailing newline.
            dst.write(line if line.endswith("\n") else line + "\n")
    return dst_path


TRN_3000_PATH = "Kvasir-VQA-x1/Kvasir-VQA-x1-train-3000.jsonl"
sample_jsonl("Kvasir-VQA-x1/Kvasir-VQA-x1-train.jsonl", TRN_3000_PATH, 3000)

VAL_500_PATH = "Kvasir-VQA-x1/Kvasir-VQA-x1-test-500.jsonl"
sample_jsonl("Kvasir-VQA-x1/Kvasir-VQA-x1-test.jsonl", VAL_500_PATH, 500)

MODEL_NAME = "google/paligemma-3b-pt-224"
# The timestamp suffix gives each run its own Hub repo name (minute resolution).
HUB_MODEL_ID = f"Kvasir-VQA-x1-lora_{datetime.now().strftime('%y%m%d-%H%M')}"

TRAIN_PATH = str(TRN_3000_PATH)
VAL_PATH = str(VAL_500_PATH)  # Use sampled validation set

print("Model:      ", MODEL_NAME)
print("Train file: ", TRAIN_PATH)
print("Valid file: ", VAL_PATH)
print("Hub repo:   ", HUB_MODEL_ID)

print("📝 You can find training logs after the training starts at: https://wandb.ai/home")
print("📌 After each validation stage, the HF repository will be updated with the best model.")
print(f"✅ Model will be available at: https://huggingface.co/{HF_USER}/{HUB_MODEL_ID}")

Model:       google/paligemma-3b-pt-224
Train file:  Kvasir-VQA-x1/Kvasir-VQA-x1-train-3000.jsonl
Valid file:  Kvasir-VQA-x1/Kvasir-VQA-x1-test-500.jsonl
Hub repo:    Kvasir-VQA-x1-lora_251019-1321
📝 You can find training logs after the training starts at: https://wandb.ai/home
📌 After each validation stage, the HF repository will be updated with the best model.
✅ Model will be available at: https://huggingface.co/Firoj112/Kvasir-VQA-x1-lora_251019-1321


In [10]:
# Count records in the shuffled train and validation datasets
train_shuffled_records, val_shuffled_records = map(
    count_records_in_jsonl, (TRN_3000_PATH, VAL_500_PATH)
)

# Print the results
print(f"Number of records in the shuffled train dataset: {train_shuffled_records}")
print(f"Number of records in the shuffled validation dataset: {val_shuffled_records}")

Number of records in the shuffled train dataset: 3000
Number of records in the shuffled validation dataset: 500
