In [None]:
!pip install -q rouge-score
!pip install nltk rouge

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [2]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt

from rouge_score import rouge_scorer
from transformers import ViTModel, GPT2LMHeadModel, ViTFeatureExtractor, GPT2Tokenizer
from transformers import AutoProcessor, BlipForConditionalGeneration
from transformers import AutoTokenizer

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
import nltk
nltk.download('wordnet')

from collections import defaultdict
from transformers import ViTImageProcessor
import zipfile
import json

from transformers import AutoProcessor, AutoModelForVision2Seq
from rouge import Rouge

[nltk_data] Downloading package wordnet to /root/nltk_data...


### load model from drive

In [3]:
# gdown is the command-line downloader used below for Google Drive files.
!pip install -q gdown

# Google Drive file id of the checkpoint; it is saved locally as my_model.pth,
# the path a later cell passes to torch.load.
file_id = "199sCxfrCK8o9_e_6tlYZU6VUgfkjWTyk"
!gdown --id {file_id} --output my_model.pth


Downloading...
From (original): https://drive.google.com/uc?id=199sCxfrCK8o9_e_6tlYZU6VUgfkjWTyk
From (redirected): https://drive.google.com/uc?id=199sCxfrCK8o9_e_6tlYZU6VUgfkjWTyk&confirm=t&uuid=e4c006f7-a624-4b35-be00-9db7a02b596c
To: /content/my_model.pth
100% 586M/586M [00:10<00:00, 55.7MB/s]


In [4]:
class ImageCaptionModel(nn.Module):
    """
    Image captioning network: a ViT encoder feeding a GPT-2 language-model head.

    The encoder's first-position ([CLS]) hidden state is linearly projected to
    the GPT-2 embedding width and prepended to the caption token embeddings as
    a single prefix position.
    """

    def __init__(self, vit_name="WinKawaks/vit-small-patch16-224", gpt2_name="gpt2"):
        """
        Build the encoder, the decoder and the projection between them.

        Args:
            vit_name: Hub name of the pre-trained ViT checkpoint.
            gpt2_name: Hub name of the pre-trained GPT-2 checkpoint.
        """
        super().__init__()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.encoder = ViTModel.from_pretrained(vit_name)
        self.decoder = GPT2LMHeadModel.from_pretrained(gpt2_name)

        # Maps the ViT hidden width onto the GPT-2 embedding width.
        vit_width = self.encoder.config.hidden_size
        gpt2_width = self.decoder.config.n_embd
        self.encoder_to_decoder = nn.Linear(vit_width, gpt2_width)

    def forward(self, pixel_values, input_ids, labels=None):
        """
        Run the encoder, prepend the image prefix and call the decoder.

        Args:
            pixel_values: Image tensor passed to the ViT encoder.
            input_ids: Caption token IDs.
            labels: Optional caption token labels; when given, the decoder is
                called with them (shifted right by the prefix position).

        Returns:
            Whatever the decoder returns when called with ``return_dict=True``.
        """
        vit_out = self.encoder(pixel_values=pixel_values)

        # First sequence position (CLS) -> decoder width -> (batch, 1, width) prefix.
        image_prefix = self.encoder_to_decoder(vit_out.last_hidden_state[:, 0, :]).unsqueeze(1)
        token_embeds = self.decoder.transformer.wte(input_ids)
        inputs_embeds = torch.cat([image_prefix, token_embeds], dim=1)

        if labels is not None:
            # The prefix position gets label -100 so it does not affect the loss.
            ignore_column = torch.full((input_ids.size(0), 1), -100, device=labels.device)
            labels = torch.cat([ignore_column, labels], dim=1)

        return self.decoder(
            inputs_embeds=inputs_embeds,
            labels=labels,
            return_dict=True,
        )

In [5]:


# Image preprocessor for the same ViT checkpoint name that ImageCaptionModel
# uses as its default encoder.
processor = ViTImageProcessor.from_pretrained("WinKawaks/vit-small-patch16-224")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Instantiate the architecture, then load the saved state dict on top of it.
model = ImageCaptionModel()
# weights_only=True restricts torch.load to tensor data when unpickling.
model.load_state_dict(torch.load("my_model.pth", map_location=device, weights_only=True))
model.to(device)
# Inference mode; as the last expression of the cell this also echoes the
# module repr into the cell output.
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

Using device: cuda


config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/88.2M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-small-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

ImageCaptionModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=384, out_features=1536, bias=True)
            (intermediate

### Load images

In [6]:

# Unpack the dataset archive next to it; the test images are then read from
# the extracted custom_captions_dataset/test folder.
with zipfile.ZipFile("/content/custom_captions_dataset.zip", "r") as zip_ref:
    zip_ref.extractall("/content/")
test_image_dir = "/content/custom_captions_dataset/test"

# Sorted list of .jpg file names (case-insensitive match on the extension).
# Note: (".jpg") is a parenthesised string, not a one-element tuple;
# str.endswith accepts either form.
image_files = sorted([
    file for file in os.listdir(test_image_dir)
    if file.lower().endswith((".jpg"))
])
# print(image_files)
print(len(image_files))


928


### divide into 16 X 16 patches

In [7]:

def divide_into_patches(image, patch_size=16):
    """
    Resize an image to 224x224 and cut it into square patches.

    Args:
        image: Object with a PIL-style ``resize((w, h))`` method whose result
            converts to an (H, W, C) array via ``np.array``.
        patch_size: Side length of each patch in pixels.

    Returns:
        List of patches in row-major order (left to right, then top to
        bottom). Each patch is a slice (view) of one shared pixel array, so
        writing into a patch writes into that array.
    """
    pixels = np.array(image.resize((224, 224)))  # standard ViT input size
    height, width, _ = pixels.shape
    return [
        pixels[top:top + patch_size, left:left + patch_size]
        for top in range(0, height, patch_size)
        for left in range(0, width, patch_size)
    ]


In [8]:
# Map each test image file name to its list of 16x16 patches.
# NOTE(review): all_image_patches is not referenced again in the visible
# cells; the masking cells re-open every image themselves.
all_image_patches = {}

for image_filename in image_files:
    image_path = os.path.join(test_image_dir, image_filename)
    # Force three channels: divide_into_patches unpacks an (H, W, C) shape.
    image = Image.open(image_path).convert("RGB")
    patches = divide_into_patches(image, patch_size=16)
    all_image_patches[image_filename] = patches

print(len(all_image_patches))

928


### Reconstruct images from 16 x 16 patches and build the masked images

In [9]:

def reconstruct_image_from_patches(patches, patch_size=16, image_size=224):
    """
    Tile a row-major list of patches back into one square RGB image.

    Args:
        patches: Sequence of (patch_size, patch_size, 3) arrays, ordered left
            to right and then top to bottom.
        patch_size: Side length of each patch in pixels.
        image_size: Side length of the output image in pixels.

    Returns:
        The result of ``Image.fromarray`` on the assembled uint8 canvas.
    """
    per_side = image_size // patch_size
    canvas = np.zeros((image_size, image_size, 3), dtype=np.uint8)

    for idx in range(per_side * per_side):
        # Flat index -> (row, column) position on the patch grid.
        grid_row, grid_col = divmod(idx, per_side)
        top, left = grid_row * patch_size, grid_col * patch_size
        canvas[top:top + patch_size, left:left + patch_size] = patches[idx]

    return Image.fromarray(canvas)

def mask_patches(patches, percent):
    """
    Return a copy of ``patches`` with a random subset blacked out.

    Args:
        patches: List of numpy arrays (image patches).
        percent: Percentage (0-100) of patches to set to zero.

    Returns:
        A new list of new arrays. The input patches are left untouched, so
        the same list can be masked again at another percentage without the
        masks accumulating.

    Raises:
        ValueError: From ``random.sample`` when ``percent`` asks for more
            patches than exist (or for a negative number of them).
    """
    # Copy every array, not just the list: a shallow list.copy() still points
    # at the caller's arrays (often views into one shared image), so zeroing
    # them in place would modify the caller's data.
    masked = [patch.copy() for patch in patches]
    num_to_mask = int(len(masked) * (percent / 100))
    for idx in random.sample(range(len(masked)), num_to_mask):
        masked[idx][:] = 0  # Black out the patch
    return masked

# Masked images, keyed first by occlusion percentage and then by file name.
# NOTE(review): a later cell rebuilds masked_versions from scratch with
# occlude_image, so the values produced here are replaced when it runs.
masked_versions = {10: {}, 50: {}, 80: {}}

# Apply patch masking
for filename in image_files:
    image_path = os.path.join(test_image_dir, filename)
    image = Image.open(image_path).convert("RGB")
    # One patch list per image; the same list is handed to mask_patches for
    # every occlusion level below.
    patches = divide_into_patches(image)

    for pct in [10, 50, 80]:
        masked_patches = mask_patches(patches, percent=pct)
        masked_image = reconstruct_image_from_patches(masked_patches)
        masked_versions[pct][filename] = masked_image

print("Masked images ready for all occlusion levels.")

Masked images ready for all occlusion levels.


In [10]:


def occlude_image(image_np, mask_percentage):
    """
    Resize an image to 224x224 and black out a random share of its 16x16 patches.

    Args:
        image_np: Image as an array accepted by ``Image.fromarray``.
        mask_percentage: Percentage (0-100) of patches to set to zero.

    Returns:
        The occluded image, as produced by ``Image.fromarray``.
    """
    patch_size = 16
    pixels = np.array(Image.fromarray(image_np).resize((224, 224)))
    height, width, _ = pixels.shape

    # Top-left corner of every patch, in row-major order.
    corners = [
        (top, left)
        for top in range(0, height, patch_size)
        for left in range(0, width, patch_size)
    ]

    num_to_mask = int(len(corners) * (mask_percentage / 100))
    for chosen in random.sample(range(len(corners)), num_to_mask):
        top, left = corners[chosen]
        # Zero the patch directly in the resized copy.
        pixels[top:top + patch_size, left:left + patch_size] = 0

    return Image.fromarray(pixels)

In [11]:
# Build the occluded (10% / 50% / 80% masked) versions of every test image.
test_image_dir = "/content/custom_captions_dataset/test"
# Use the same .jpg filter and sorted order as the image-loading cell. A bare
# os.listdir() would also pick up non-image files and returns names in
# arbitrary order, which makes the random masks depend on directory order.
image_files = sorted(
    name for name in os.listdir(test_image_dir)
    if name.lower().endswith(".jpg")
)

# occlude_image draws from the global `random` state; seed it so the masks
# (and therefore the captions and metrics) are reproducible across runs.
random.seed(42)

# Final dictionary of occluded versions: {percentage: {file name: PIL image}}
masked_versions = {10: {}, 50: {}, 80: {}}

for filename in image_files:
    image_path = os.path.join(test_image_dir, filename)
    # Close the file handle as soon as the pixel data has been converted.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    image_np = np.array(image)

    for pct in masked_versions:
        occluded = occlude_image(image_np, pct)
        masked_versions[pct][filename] = occluded

### Captions from the custom model

In [12]:
def generate_custom_masked_captions(masked_versions,model,device):
    """
    Caption every masked image with the custom ViT + GPT-2 model (greedy decoding).

    Relies on the notebook-level ``processor`` (the ViT image processor) to
    turn each image into ``pixel_values``.

    Args:
        masked_versions: ``{occlusion level: {file name: image}}``.
        model: Object exposing ``encoder``, ``encoder_to_decoder`` and
            ``decoder`` the way ``ImageCaptionModel`` does.
        device: Device for the input tensors; ``None`` selects CUDA when
            available, otherwise CPU.

    Returns:
        ``{str(level): {file name: caption}}``. The same mapping is written
        to ``masked_image_captions_custom_model.json``; string keys make the
        returned value match what a later ``json.load`` of that file yields.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # The tokenizer does not depend on the image: load it once, not per image.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    max_new_tokens = 50
    masked_captions = {}

    with torch.no_grad():
        for pct, images in masked_versions.items():
            print(f"\nGenerating captions for {pct}% masked images...")
            level_captions = {}

            for fname, masked_img in images.items():
                pixel_values = processor(images=masked_img, return_tensors="pt")['pixel_values'].to(device)

                # Image prefix: projected first-position encoder state, shaped
                # (1, 1, width). It is constant while decoding this image.
                enc_out = model.encoder(pixel_values=pixel_values)
                cls_embed = enc_out.last_hidden_state[:, 0, :]
                prefix_embed = model.encoder_to_decoder(cls_embed).unsqueeze(1)

                # Greedy decoding: re-run the decoder on prefix + tokens so far
                # and append the arg-max token until EOS or the length cap.
                tokens = [tokenizer.bos_token_id]
                for _ in range(max_new_tokens):
                    dec_input = torch.tensor([tokens], device=device)
                    embed = model.decoder.transformer.wte(dec_input)
                    full_embed = torch.cat([prefix_embed, embed], dim=1)
                    logits = model.decoder(inputs_embeds=full_embed).logits
                    next_token = torch.argmax(logits[:, -1, :], dim=-1).item()
                    if next_token == tokenizer.eos_token_id:
                        break
                    tokens.append(next_token)

                level_captions[fname] = tokenizer.decode(tokens, skip_special_tokens=True)

            masked_captions[str(pct)] = level_captions

    with open("masked_image_captions_custom_model.json", "w") as f:
        json.dump(masked_captions, f, indent=2)

    # Callers assign the result, so the captions must be returned as well as saved.
    return masked_captions


### Captions from SmolVLM

In [13]:
def generate_smolvlm_masked_captions(masked_versions,device):
    """
    Caption every masked image with SmolVLM-Instruct.

    Args:
        masked_versions: ``{occlusion level: {file name: image}}``.
        device: Device to run on (string or ``torch.device``); ``None``
            selects CUDA when available, otherwise CPU.

    Returns:
        ``{str(level): {file name: caption}}``. The same mapping is written
        to ``masked_image_captions_smolvlm.json``, the file name the
        evaluation cell looks for.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Normalise before testing: `device` may be a string or a torch.device,
    # and comparing a torch.device against the string "cuda" is not a
    # reliable check.
    on_cuda = torch.device(device).type == "cuda"

    # Load SmolVLM model and processor
    model_id = "HuggingFaceTB/SmolVLM-Instruct"
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForVision2Seq.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16 if on_cuda else torch.float32,
        _attn_implementation="eager"
    ).to(device)

    # The chat prompt is identical for every image, so build it once.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What's in this image?"}
            ]
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

    all_captions = {}

    for pct, images in masked_versions.items():
        print(f"\nGenerating captions for {pct}% masked images...")
        pct_captions = {}

        # tqdm reports progress; captions are not printed one by one because
        # that floods the cell output with one line per image.
        for image_name, image in tqdm(images.items()):
            inputs = processor(text=prompt, images=[image], return_tensors="pt").to(device)

            with torch.no_grad():
                outputs = model.generate(**inputs, max_new_tokens=64)

            # generate() returns the prompt tokens followed by the new ones.
            # Decode only the continuation so the chat prompt does not end up
            # inside the caption that is scored against the reference.
            prompt_length = inputs["input_ids"].shape[1]
            new_tokens = outputs[:, prompt_length:]
            caption = processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
            pct_captions[image_name] = caption

        all_captions[str(pct)] = pct_captions

    with open("masked_image_captions_smolvlm.json", "w") as f:
        json.dump(all_captions, f, indent=2)

    return all_captions


In [14]:
# Load ground truth from CSV (assumed format: filename, caption) and index it
# by file name. If a file name appears more than once, dict() keeps the last
# caption for it.
gt_df = pd.read_csv("/content/custom_captions_dataset/test.csv")
ground_truth = dict(zip(gt_df['filename'], gt_df['caption']))

# Initialize metrics: these two objects are read as globals by evaluate().
rouge = Rouge()
# method4 is one of NLTK's BLEU smoothing functions.
smoothie = SmoothingFunction().method4

# Evaluate function
def evaluate(preds, refs):
    """
    Average sentence-level BLEU, ROUGE-L (F-score) and METEOR over the captions.

    Uses the notebook-level ``rouge`` and ``smoothie`` objects. Captions are
    tokenised by whitespace.

    Args:
        preds: ``{file name: predicted caption}``.
        refs: ``{file name: reference caption}``. Predictions whose file name
            is missing here are skipped.

    Returns:
        ``(mean BLEU, mean ROUGE-L F, mean METEOR)``. ``(0.0, 0.0, 0.0)`` when
        no prediction has a reference, instead of dividing by zero.
    """
    bleu_scores, rouge_scores, meteor_scores = [], [], []
    for fname, pred in preds.items():
        if fname not in refs:
            continue
        ref = refs[fname]

        if not pred.strip() or not ref.strip():
            # An empty caption shares nothing with the other side. Score it as
            # zero rather than calling the metrics: rouge raises on an empty
            # hypothesis or reference.
            bleu = rouge_score = meteor = 0.0
        else:
            bleu = sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie)
            rouge_score = rouge.get_scores(pred, ref)[0]['rouge-l']['f']
            meteor = meteor_score([ref.split()], pred.split())

        bleu_scores.append(bleu)
        rouge_scores.append(rouge_score)
        meteor_scores.append(meteor)

    scored = len(bleu_scores)
    if scored == 0:
        return 0.0, 0.0, 0.0

    return (
        sum(bleu_scores) / scored,
        sum(rouge_scores) / scored,
        sum(meteor_scores) / scored
    )

def _load_or_generate_captions(cache_paths, label, generate):
    """Return ``{str(level): {file name: caption}}`` from the first existing cache file, else from ``generate()``."""

    def read_first_existing():
        # Returns (path, captions) for the first readable cache file, or None.
        for path in cache_paths:
            try:
                with open(path, "r") as f:
                    return path, json.load(f)
            except FileNotFoundError:
                continue
        return None

    cached = read_first_existing()
    if cached is not None:
        print(f"Loaded captions from {cached[0]}")
        captions = cached[1]
    else:
        print(f"{cache_paths[0]} not found, generating {label} captions...")
        captions = generate()
        if captions is None:
            # The generator saved its output but returned nothing: read it back.
            cached = read_first_existing()
            if cached is None:
                raise RuntimeError(f"No {label} captions were returned or written to disk")
            captions = cached[1]

    # JSON keys are strings; normalise freshly generated (possibly int) keys too.
    return {str(level): per_image for level, per_image in captions.items()}


# evaluation function for occluded images
def evaluate_on_occluded_images(model, dataloader, device, occlusion_levels=(0, 10, 50, 80)):
    """
    Compare the custom model and SmolVLM on each occlusion level.

    Captions are read from their JSON cache files when present; otherwise they
    are generated from the notebook-level ``masked_versions``.

    Args:
        model: Custom captioning model, used only if captions must be generated.
        dataloader: Despite the name, the reference captions as a
            ``{file name: caption}`` mapping; it is passed to ``evaluate`` as ``refs``.
        device: Device forwarded to the caption generators.
        occlusion_levels: Occlusion percentages to report.

    Returns:
        ``{level: {"Custom Model": {...}, "SmolVLM": {...}}}`` with BLEU,
        ROUGE-L and METEOR per model. Levels for which either model has no
        captions are reported as skipped and left out of the result.
    """
    results = {}

    custom_captions = _load_or_generate_captions(
        ["masked_image_captions_custom_model.json"],
        "custom",
        lambda: generate_custom_masked_captions(masked_versions, model, device),
    )
    # Accept both file names that have been used for the SmolVLM captions.
    smolvlm_captions = _load_or_generate_captions(
        ["masked_image_captions_smolvlm.json", "smolvlm_masked_image_captions.json"],
        "smolvlm",
        lambda: generate_smolvlm_masked_captions(masked_versions, device),
    )

    for pct in occlusion_levels:
        print(f"\n=== {pct}% Occlusion Evaluation ===")
        level_key = str(pct)
        if level_key not in custom_captions or level_key not in smolvlm_captions:
            # e.g. level 0 when the captions were generated from masked images only
            print(f"No captions available for {pct}% occlusion, skipping.")
            continue

        c_bleu, c_rouge, c_meteor = evaluate(custom_captions[level_key], dataloader)
        s_bleu, s_rouge, s_meteor = evaluate(smolvlm_captions[level_key], dataloader)

        print(f"Custom Model - BLEU: {c_bleu:.4f}, ROUGE-L: {c_rouge:.4f}, METEOR: {c_meteor:.4f}")
        print(f"SmolVLM      - BLEU: {s_bleu:.4f}, ROUGE-L: {s_rouge:.4f}, METEOR: {s_meteor:.4f}")

        results[pct] = {  # integer keys, as passed in occlusion_levels
            "Custom Model": {"BLEU": c_bleu, "ROUGE-L": c_rouge, "METEOR": c_meteor},
            "SmolVLM": {"BLEU": s_bleu, "ROUGE-L": s_rouge, "METEOR": s_meteor}
        }

    return results

# Pass the device selected when the model was loaded rather than a hard-coded
# 'cuda', so the cell also runs on a CPU-only runtime.
results = evaluate_on_occluded_images(model, ground_truth, device, [0, 10, 50, 80])


Loaded captions from masked_image_captions_custom_model.json
Loaded captions from masked_image_captions_smolvlm.json

=== 0% Occlusion Evaluation ===
Custom Model - BLEU: 0.0374, ROUGE-L: 0.2809, METEOR: 0.1867
SmolVLM      - BLEU: 0.0230, ROUGE-L: 0.2592, METEOR: 0.1303

=== 10% Occlusion Evaluation ===
Custom Model - BLEU: 0.0313, ROUGE-L: 0.2650, METEOR: 0.1659
SmolVLM      - BLEU: 0.0177, ROUGE-L: 0.2505, METEOR: 0.1150

=== 50% Occlusion Evaluation ===
Custom Model - BLEU: 0.0312, ROUGE-L: 0.2635, METEOR: 0.1661
SmolVLM      - BLEU: 0.0103, ROUGE-L: 0.2173, METEOR: 0.0895

=== 80% Occlusion Evaluation ===
Custom Model - BLEU: 0.0306, ROUGE-L: 0.2520, METEOR: 0.1618
SmolVLM      - BLEU: 0.0042, ROUGE-L: 0.1336, METEOR: 0.0536
