In [None]:
# Remove the spaCy and thinc packages from the runtime before the installs in the next cell.
# NOTE(review): presumably this avoids a dependency conflict with the pinned packages installed below — confirm the reason.
!pip uninstall -y spacy thinc

Found existing installation: spacy 3.8.5
Uninstalling spacy-3.8.5:
  Successfully uninstalled spacy-3.8.5
Found existing installation: thinc 8.3.6
Uninstalling thinc-8.3.6:
  Successfully uninstalled thinc-8.3.6


In [None]:
!pip install --quiet gensim==4.3.3 transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m87.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Imports & Configuration
# Importing libraries
import os
import json
import random
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image
from gensim.models.fasttext import FastText

# Dataset layout on the mounted Google Drive; every path is derived from DATA_ROOT
DATA_ROOT        = "/content/drive/MyDrive/VQA_Project/sample_data"
IMAGES_DIR       = os.path.join(DATA_ROOT, "images",      "train2014")
SUBSET_IDS_TXT   = os.path.join(DATA_ROOT, "subsets",     "train_subset_ids.txt")
QUESTIONS_JSON   = os.path.join(DATA_ROOT, "questions",   "train2014_questions_subset.json")
ANNOTATIONS_JSON = os.path.join(DATA_ROOT, "annotations", "train2014_annotations_subset.json")

# Tunable settings
SEED        = 42      # seed for Python's and PyTorch's random number generators
BATCH_SIZE  = 4       # mini-batch size
NUM_EPOCHS  = 10      # number of training epochs
LR          = 1e-3    # learning rate
TOP_ANSWERS = 1000    # number of most frequent answers kept in the answer vocabulary

# Seed the RNGs and ask cuDNN for deterministic kernels (autotuner switched off)
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.benchmark     = False
torch.backends.cudnn.deterministic = True

# Use the GPU when one is visible, otherwise fall back to the CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Echo the resolved configuration, one item per line
print("\n".join([
    f"  Block 1 complete — DEVICE = {DEVICE}",
    f"  Data root:       {DATA_ROOT}",
    f"  Questions JSON:  {QUESTIONS_JSON}",
    f"  Annotations JSON:{ANNOTATIONS_JSON}",
    f"  Images dir:      {IMAGES_DIR}",
]))


  Block 1 complete — DEVICE = cuda
  Data root:       /content/drive/MyDrive/VQA_Project/sample_data
  Questions JSON:  /content/drive/MyDrive/VQA_Project/sample_data/questions/train2014_questions_subset.json
  Annotations JSON:/content/drive/MyDrive/VQA_Project/sample_data/annotations/train2014_annotations_subset.json
  Images dir:      /content/drive/MyDrive/VQA_Project/sample_data/images/train2014


In [None]:
# Build `records` list & answer vocabulary

import json
from collections import Counter

# Load the subset image-ID list: one integer id per line.
# Blank / whitespace-only lines (e.g. a trailing empty line) are skipped instead of
# crashing int(); int() itself tolerates surrounding whitespace.
with open(SUBSET_IDS_TXT, "r", encoding="utf-8") as f:
    subset_ids = {int(line) for line in f if line.strip()}

# Load questions and annotations. The files are read as UTF-8 explicitly so the
# result does not depend on the platform's default encoding.
with open(QUESTIONS_JSON, "r", encoding="utf-8") as f:
    questions = json.load(f)["questions"]

with open(ANNOTATIONS_JSON, "r", encoding="utf-8") as f:
    annotations = json.load(f)["annotations"]

# question_id -> flat list of every lower-cased answer string found for that question
ans_map = {}
for ann in annotations:
    collected = ans_map.setdefault(ann["question_id"], [])
    collected.extend(entry["answer"].lower() for entry in ann["answers"])

# One record per question whose image is in the subset and which has answers
records = []
for q in questions:
    qid = q["question_id"]
    img_id = q["image_id"]
    if img_id in subset_ids and qid in ans_map:
        records.append(dict(
            # image files are named after the zero-padded 12-digit image id
            image_path=os.path.join(IMAGES_DIR, f"COCO_train2014_{img_id:012d}.jpg"),
            question=q["question"],
            answers=ans_map[qid],
        ))

print(f"✔️  Built {len(records):,} records")

# Answer vocabulary: the TOP_ANSWERS most frequent answers (index = frequency rank),
# followed by a single "<unk>" entry that takes the next free index
answer_counter = Counter()
for rec in records:
    answer_counter.update(rec["answers"])
top_answers = answer_counter.most_common(TOP_ANSWERS)

answer_vocab = dict((pair[0], rank) for rank, pair in enumerate(top_answers))
answer_vocab["<unk>"] = len(answer_vocab)

print(f"✔️  Answer vocab size = {len(answer_vocab)} (top {TOP_ANSWERS} + <unk>)")


✔️  Built 15,000 records
✔️  Answer vocab size = 1001 (top 1000 + <unk>)


In [None]:
# Install the Hugging Face packages used by the cells below:
# transformers is pinned to 4.39.3, accelerate is left unpinned.
!pip install --quiet transformers==4.39.3 accelerate

import torch, random
from transformers import ViltProcessor, ViltForQuestionAnswering
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from collections import Counter
# Report the installed PyTorch build, then select the GPU when available (CPU otherwise)
print("Torch:", torch.__version__)
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


Torch: 2.6.0+cu124


In [None]:
# ViLT processor from the Hugging Face hub. It is called later with both `images=`
# and `text=`, i.e. it prepares the image and tokenizes the question in one step.
# Note that it is loaded from the "dandelin/vilt-b32-mlm" checkpoint name.
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")

# answer string -> class index (an alias of the answer vocabulary, not a copy)
label2id = answer_vocab
# number of answer classes, including "<unk>"
NUM_ANS  = len(label2id)
# class index -> answer string (inverse of label2id)
id2label = {idx: answer for answer, idx in label2id.items()}


In [None]:
# ---- constants shared by DataLoader workers ----
# Token length that the datasets pad / truncate every question to.
# NOTE(review): the earlier remark "ViLT can handle up to 40+" looks optimistic — the
# default ViltConfig reportedly has 40 text positions; confirm before raising this value.
MAX_Q_LEN = 32        # questions are padded/truncated to this many tokens


In [None]:
# Training/validation dataset: processor encoding + one-hot answer target
class ViltVQADataset(Dataset):
    """Map-style dataset over VQA records for ViLT fine-tuning.

    Each record is a dict with ``"image_path"``, ``"question"`` and ``"answers"``.
    An item is the processor's encoding of the (image, question) pair with the
    leading batch dimension squeezed away, plus a ``"labels"`` float vector of
    length ``NUM_ANS`` that is 1.0 at the index of the record's most frequent
    answer (or at the ``"<unk>"`` index when that answer is not in ``label2id``).
    """

    def __init__(self, recs):
        self.recs = recs
        self.resize = transforms.Resize((224, 224))

    def __len__(self):
        return len(self.recs)

    def __getitem__(self, idx):
        rec = self.recs[idx]

        # Load the image as RGB and bring it to a fixed 224x224 size
        rgb = Image.open(rec["image_path"]).convert("RGB")
        img = self.resize(rgb)

        # Tokenize the question (fixed length MAX_Q_LEN) and preprocess the image
        encoded = processor(
            images=img,
            text=rec["question"],
            padding="max_length",
            truncation=True,
            max_length=MAX_Q_LEN,
            return_tensors="pt",
        )
        sample = {}
        for name, tensor in encoded.items():
            sample[name] = tensor.squeeze(0)   # drop the batch dimension of size 1

        # One-hot target for the most frequent answer of this record
        majority, _votes = Counter(rec["answers"]).most_common(1)[0]
        target_idx = label2id.get(majority, label2id["<unk>"])
        one_hot = torch.zeros(NUM_ANS, dtype=torch.float32)
        one_hot[target_idx] = 1.0
        sample["labels"] = one_hot

        return sample


In [None]:
# Dataloader setup for training and validation
from sklearn.model_selection import train_test_split
# 85% / 15% train/validation split. The split is seeded with the notebook-wide SEED
# instead of a separate magic number, so there is a single place to change it.
train_rec, val_rec = train_test_split(records, test_size=0.15, random_state=SEED)

BATCH = 8           # fits on Colab T4; reduce if OOM

# Training loader reshuffles every epoch; the validation loader keeps a fixed order.
train_dl = DataLoader(ViltVQADataset(train_rec), batch_size=BATCH,
                      shuffle=True, num_workers=2, pin_memory=True)
val_dl   = DataLoader(ViltVQADataset(val_rec),   batch_size=BATCH,
                      shuffle=False, num_workers=2, pin_memory=True)
print(f"Train batches: {len(train_dl)}  |  Val batches: {len(val_dl)}")


Train batches: 1594  |  Val batches: 282


In [None]:
# Load the VQA-fine-tuned ViLT checkpoint with this notebook's answer vocabulary.
# ignore_mismatched_sizes=True is required because the label maps passed here give
# the classifier a different number of outputs than the checkpoint's head, so the
# mismatching weights are newly initialized rather than loaded.
model = ViltForQuestionAnswering.from_pretrained(
    "dandelin/vilt-b32-finetuned-vqa",
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
).to(DEVICE)

# AdamW over all model parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# Gradient scaler for mixed precision. torch.cuda.amp.GradScaler() is deprecated and
# emits a FutureWarning; torch.amp.GradScaler("cuda") is the replacement spelling.
scaler    = torch.amp.GradScaler("cuda")
EPOCHS    = 3


Some weights of ViltForQuestionAnswering were not initialized from the model checkpoint at dandelin/vilt-b32-finetuned-vqa and are newly initialized because the shapes did not match:
- classifier.3.weight: found shape torch.Size([3129, 1536]) in the checkpoint and torch.Size([1001, 1536]) in the model instantiated
- classifier.3.bias: found shape torch.Size([3129]) in the checkpoint and torch.Size([1001]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler    = torch.cuda.amp.GradScaler()      # mixed precision


In [None]:
# ViLT fine-tuning with AMP and one-hot labels
from tqdm.auto import tqdm
from torch import amp


EPOCHS  = 7                        # number of fine-tuning epochs run by this cell
scaler  = amp.GradScaler("cuda")   # fresh gradient scaler ("cuda" is the default device)
model.train()                      # put the model in training mode

def val_top1(loader):
    """Return the hard top-1 accuracy of the global ``model`` over ``loader``.

    Every batch must be a dict with a ``"labels"`` entry; the row-wise argmax of
    that entry is used as the ground-truth class index, so one-hot labels are
    expected. All remaining entries are moved to ``DEVICE`` and passed to the
    model as keyword arguments.

    The model is put in eval mode for the pass and is always switched back to
    training mode afterwards — also when the pass is interrupted or raises — so
    that a following training step never runs in eval mode by accident.

    Returns:
        float: fraction of samples whose predicted class equals the ground
        truth; 0.0 when the loader yields no samples.
    """
    model.eval()
    correct = n = 0
    try:
        with torch.no_grad():
            for batch in loader:
                labels = batch.pop("labels")            # keep the one-hot labels on the CPU
                batch  = {k: v.to(DEVICE) for k, v in batch.items()}
                logits = model(**batch).logits          # forward pass only
                pred   = logits.argmax(1).cpu()         # predicted class index
                gt     = labels.argmax(1)               # ground-truth class index
                correct += (pred == gt).sum().item()    # count the correct predictions
                n      += pred.size(0)
    finally:
        model.train()                                   # switch back to training mode
    return correct / n if n else 0.0

# Fine-tuning loop: one pass over train_dl per epoch, then a validation pass
for ep in range(1, EPOCHS + 1):
    tot_loss = 0
    n_samples = 0

    progress = tqdm(train_dl, desc=f"Epoch {ep}/{EPOCHS}")
    for batch in progress:
        batch = {name: tensor.to(DEVICE) for name, tensor in batch.items()}
        bsz = batch["labels"].size(0)

        # Forward pass under autocast; the batch contains "labels", so the model
        # output carries the loss
        with amp.autocast(device_type="cuda"):
            out = model(**batch)
            loss = out.loss

        # Scaled backward pass, optimizer step, scaler update, then drop the gradients
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)

        # Accumulate a sample-weighted loss so the epoch mean is per sample
        tot_loss += loss.item() * bsz
        n_samples += bsz

    # Mean training loss of the epoch and top-1 accuracy on the validation loader
    train_loss = tot_loss / n_samples
    val_acc = val_top1(val_dl)
    print(f"Epoch {ep}:  train-loss {train_loss:.4f}  |  Val Top-1 {val_acc:.4f}")


Epoch 1/7:   0%|          | 0/1594 [00:00<?, ?it/s]

Epoch 1:  train-loss 76.5539  |  Val Top-1 0.2324


Epoch 2/7:   0%|          | 0/1594 [00:00<?, ?it/s]

Epoch 2:  train-loss 6.4841  |  Val Top-1 0.2324


Epoch 3/7:   0%|          | 0/1594 [00:00<?, ?it/s]

Epoch 3:  train-loss 5.0935  |  Val Top-1 0.3418


Epoch 4/7:   0%|          | 0/1594 [00:00<?, ?it/s]

Epoch 4:  train-loss 4.4036  |  Val Top-1 0.3764


Epoch 5/7:   0%|          | 0/1594 [00:00<?, ?it/s]

Epoch 5:  train-loss 4.0078  |  Val Top-1 0.4040


Epoch 6/7:   0%|          | 0/1594 [00:00<?, ?it/s]

Epoch 6:  train-loss 3.6889  |  Val Top-1 0.4142


Epoch 7/7:   0%|          | 0/1594 [00:00<?, ?it/s]

Epoch 7:  train-loss 3.3750  |  Val Top-1 0.4187


In [None]:
# continue ViLT fine-tuning for epochs 8-10
from tqdm.auto import tqdm
from torch import amp

start_ep     = 8                                # first epoch number of this continuation run
EXTRA_EPOCHS = 3                                # how many additional epochs to train
end_ep       = (start_ep - 1) + EXTRA_EPOCHS    # last epoch number (inclusive)

def val_top1(loader):
    """Return the hard top-1 accuracy of the global ``model`` over ``loader``.

    NOTE(review): this re-defines the ``val_top1`` of the previous training
    cell; keep the two definitions in sync or drop one of them.

    Every batch must be a dict with a ``"labels"`` entry; the row-wise argmax of
    that entry is used as the ground-truth class index, so one-hot labels are
    expected. All remaining entries are moved to ``DEVICE`` and passed to the
    model as keyword arguments.

    The model is put in eval mode for the pass and is always switched back to
    training mode afterwards — also when the pass is interrupted or raises — so
    that a following training step never runs in eval mode by accident.

    Returns:
        float: fraction of samples whose predicted class equals the ground
        truth; 0.0 when the loader yields no samples.
    """
    model.eval()
    correct = n = 0
    try:
        with torch.no_grad():
            for batch in loader:
                labels = batch.pop("labels")            # keep the one-hot labels on the CPU
                batch  = {k: v.to(DEVICE) for k, v in batch.items()}
                logits = model(**batch).logits          # forward pass only
                pred   = logits.argmax(1).cpu()         # predicted class index
                gt     = labels.argmax(1)               # ground-truth class index
                correct += (pred == gt).sum().item()    # count the correct predictions
                n      += pred.size(0)
    finally:
        model.train()                                   # switch back to training mode
    return correct / n if n else 0.0

# Continue fine-tuning with the existing model, optimizer and gradient scaler.
# Training mode is set explicitly: this cell must not depend on whichever mode an
# earlier cell happened to leave the model in.
model.train()

for ep in range(start_ep, end_ep + 1):
    tot_loss = n_samples = 0
    for batch in tqdm(train_dl, desc=f"Epoch {ep}/{end_ep}"):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}

        # Forward pass under autocast; the batch contains "labels", so the model
        # output carries the loss
        with amp.autocast(device_type="cuda"):
            out  = model(**batch)
            loss = out.loss

        # Scaled backward pass, optimizer step, scaler update, then drop the gradients
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)

        # Accumulate a sample-weighted loss so the epoch mean is per sample
        tot_loss  += loss.item() * batch["labels"].size(0)
        n_samples += batch["labels"].size(0)

    # Mean training loss of the epoch and top-1 accuracy on the validation loader
    train_loss = tot_loss / n_samples
    val_acc    = val_top1(val_dl)
    print(f"Epoch {ep}:  train-loss {train_loss:.4f}  |  Val Top-1 {val_acc:.4f}")


Epoch 8/10:   0%|          | 0/1594 [00:00<?, ?it/s]

Epoch 8:  train-loss 3.0520  |  Val Top-1 0.4342


Epoch 9/10:   0%|          | 0/1594 [00:00<?, ?it/s]

Epoch 9:  train-loss 2.7312  |  Val Top-1 0.4360


Epoch 10/10:   0%|          | 0/1594 [00:00<?, ?it/s]

Epoch 10:  train-loss 2.4199  |  Val Top-1 0.4413


In [None]:
# Computes Top-1 and VQA-soft accuracy
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
import torch

# helper functions
def maj_answer(ans_list):
    """Return the most frequent answer in ``ans_list``, compared case-insensitively.

    The result is lower-cased. Raises IndexError when ``ans_list`` is empty.
    """
    votes = Counter(answer.lower() for answer in ans_list)
    ranked = votes.most_common(1)
    return ranked[0][0]

def vqa_soft_score(pred, ans_list):
    """Soft score of ``pred`` against the answers in ``ans_list``.

    Counts the answers equal to ``pred`` (case-insensitively) and returns
    ``min(1.0, count / 3.0)``: three or more matches give full credit.
    """
    matches = 0
    for answer in ans_list:
        if pred.lower() == answer.lower():
            matches += 1
    return min(1.0, matches / 3.0)

# Evaluation dataset: processor encoding plus the record's full answer list
class EvalDataset(Dataset):
    """Map-style dataset used for scoring.

    Each item is the processor's encoding of one record's (image, question)
    pair with the batch dimension squeezed away, plus an ``"answers"`` entry
    holding the record's complete answer list (no label vector is built).
    """

    def __init__(self, recs):
        self.recs = recs
        self.resize = transforms.Resize((224, 224))

    def __len__(self):
        return len(self.recs)

    def __getitem__(self, idx):
        rec = self.recs[idx]

        # Load the image as RGB and bring it to a fixed 224x224 size
        rgb = Image.open(rec["image_path"]).convert("RGB")
        img = self.resize(rgb)

        encoded = processor(
            images=img,
            text=rec["question"],
            padding="max_length",
            truncation=True,
            max_length=MAX_Q_LEN,
            return_tensors="pt",
        )
        sample = {}
        for name, tensor in encoded.items():
            sample[name] = tensor.squeeze(0)   # drop the batch dimension of size 1

        sample["answers"] = rec["answers"]     # keep the full answer list for scoring
        return sample

def collate_eval(batch):
    """Collate ``EvalDataset`` samples into one batch dict.

    Only ``pixel_values``, ``input_ids`` and ``attention_mask`` are stacked into
    tensors; any other tensor entries of the samples are dropped. The
    per-sample answer lists are gathered, unpadded, under ``"answers"``.
    """
    merged = {}
    for key in ("pixel_values", "input_ids", "attention_mask"):
        merged[key] = torch.stack([sample[key] for sample in batch])
    merged["answers"] = [sample["answers"] for sample in batch]
    return merged

# Score only the held-out validation split. `records` also contains every sample
# the model was fine-tuned on (train_rec), so evaluating on it mixes training data
# into the reported Top-1 / VQA-soft numbers and inflates them.
eval_dl = DataLoader(EvalDataset(val_rec), batch_size=32,
                     shuffle=False, num_workers=2,
                     collate_fn=collate_eval, pin_memory=True)

# Evaluation routine: hard top-1 and VQA-soft accuracy over a loader
@torch.no_grad()
def evaluate(loader):
    """Score the global ``model`` on ``loader`` and return ``(top1, vqa_soft)``.

    ``loader`` must yield dicts whose ``"answers"`` entry is a list of answer
    lists (one per sample); all other entries are moved to ``DEVICE`` and fed to
    the model. A prediction counts as a hard hit when it equals the majority
    answer of the sample, and contributes ``vqa_soft_score`` to the soft total.
    The model is switched to eval mode and left in that mode on return.
    """
    model.eval()
    hard = soft = n = 0
    for b in tqdm(loader, desc="Eval"):
        inputs = {name: tensor.to(DEVICE) for name, tensor in b.items() if name != "answers"}
        class_ids = model(**inputs).logits.argmax(1).cpu().tolist()
        preds = [id2label[class_id] for class_id in class_ids]   # index -> answer string
        for p, ans_list in zip(preds, b["answers"]):
            if p.lower() == maj_answer(ans_list):
                hard += 1
            soft += vqa_soft_score(p, ans_list)
        n += len(preds)
    return hard / n, soft / n

# Run the evaluation once and report both metrics, one per line
top1, vqa = evaluate(eval_dl)
print(f"\nTop-1 accuracy : {top1:.4f}", f"VQA-soft acc  : {vqa:.4f}", sep="\n")


Eval:   0%|          | 0/469 [00:00<?, ?it/s]


Top-1 accuracy : 0.4976
VQA-soft acc  : 0.5303


In [None]:
# Shows top & bottom question prefixes
import re, pandas as pd
from collections import defaultdict
from tqdm.auto import tqdm
from PIL import Image

# Question category = the first two words of the question
def prefix(question):
    r"""Return the question's first two word tokens, lower-cased and space-joined.

    Tokens are runs of ``\w`` characters. A question with a single token yields
    that token; a question with no word characters at all (empty string,
    punctuation only) yields ``""`` instead of raising IndexError.
    """
    tok = re.findall(r"\w+", question.lower())
    # Slicing never raises, so 0, 1 and 2+ tokens are all handled by the join.
    return " ".join(tok[:2])

# Predicting the answer for a single record using the ViLT model
@torch.no_grad()
def vilt_predict(rec):
    """Return the answer string the global ``model`` predicts for one record.

    ``rec`` must provide ``"image_path"`` and ``"question"``. The image is loaded
    as RGB and resized to 224x224, and the question is truncated to
    ``MAX_Q_LEN`` tokens — the same limit the training and evaluation datasets
    use — so an overly long question cannot produce a longer token sequence than
    the model was fine-tuned with.

    The model is put in eval mode first, so the prediction does not depend on
    which mode an earlier cell left the model in; it stays in eval mode on return.
    """
    model.eval()
    img = Image.open(rec["image_path"]).convert("RGB")
    img = transforms.Resize((224, 224))(img)
    enc = processor(
        images=img,
        text=rec["question"],
        truncation=True,
        max_length=MAX_Q_LEN,
        return_tensors="pt",
    ).to(DEVICE)
    idx = model(**enc).logits.argmax(1).item()   # index of the highest-scoring class
    return id2label[idx]

# Per-prefix tallies: question prefix -> [correct, total]
# NOTE(review): this scores every entry of `records`, which includes the samples in
# the training split — the per-prefix accuracies are not held-out numbers.
stats = defaultdict(lambda: [0, 0])
for rec in tqdm(records, desc="Predict all"):
    is_hit = vilt_predict(rec).lower() == maj_answer(rec["answers"]).lower()
    tally = stats[prefix(rec["question"])]
    tally[1] += 1
    tally[0] += int(is_hit)

# One row per prefix seen at least 20 times (rarer prefixes are ignored)
rows = []
for qtype, (n_correct, n_total) in stats.items():
    if n_total < 20:
        continue
    rows.append({"qtype": qtype, "acc%": 100 * n_correct / n_total, "count": n_total})
df = pd.DataFrame(rows).sort_values("acc%", ascending=False)

# Show the five best and the five worst prefixes, accuracy with one decimal
acc_format = {"acc%": "{:.1f}"}
print("Top-5 question prefixes")
display(df.head(5).style.format(acc_format))
print("\nBottom-5 question prefixes")
display(df.tail(5).style.format(acc_format))


Predict all:   0%|          | 0/15000 [00:10<?, ?it/s]

Top-5 question prefixes


Unnamed: 0,qtype,acc%,count
33,could this,100.0,24
53,do the,95.2,42
43,did the,93.3,30
9,can you,92.5,67
30,is she,92.0,50



Bottom-5 question prefixes


Unnamed: 0,qtype,acc%,count
37,where are,3.1,97
28,what do,0.0,28
29,where was,0.0,34
50,what colors,0.0,29
54,what country,0.0,23
