## Library and Dataset installation
- Install necessary libraries

### Download Dataset

In [None]:
import zipfile
import os
import gdown

# ----------------- custom dataset ----------------- #
file_id_1 = "1FMVcFM78XZE1KE1rIkGBpCdcdI58S1LB"
output_1 = "image_caption_dataset.zip"

print("Downloading original image caption dataset...")
gdown.download(f"https://drive.google.com/uc?id={file_id_1}", output_1, quiet=False)

print("Extracting image_caption_dataset.zip...")
with zipfile.ZipFile(output_1, 'r') as zip_ref:
    zip_ref.extractall()

# Delete the ZIP file
os.remove(output_1)
print("Deleted image_caption_dataset.zip")

# ----------------- occluded dataset ----------------- #
file_id_2 = "1GdBV3YjpheJNKhHs7KJVreVNnXZlMSsr"
output_2 = "occluded_datasets.zip"

print("Downloading occluded datasets (10%, 50%, 80%)...")
gdown.download(f"https://drive.google.com/uc?id={file_id_2}", output_2, quiet=False)

print("Extracting occluded_datasets.zip...")
with zipfile.ZipFile(output_2, 'r') as zip_ref:
    zip_ref.extractall()

# Delete the ZIP file
os.remove(output_2)
print("Deleted occluded_datasets.zip")

print("All datasets downloaded, extracted, and cleaned up.")

# ----------------- model weights ----------------- #
file_id = "1aMX9P2Jeb1Rg3AMNZFrNUDodhxIT7Use" 
output_filename = "clip_gpt_image_captioner.pth"  

url = f'https://drive.google.com/uc?id={file_id}'

print(f"Attempting to download file with ID: {file_id}")
print(f"Saving as: {output_filename}")


gdown.download(url, output_filename, quiet=False, fuzzy=True)

file_size = os.path.getsize(output_filename)
print(f"Saved as: '{os.path.abspath(output_filename)}'")


Downloading original image caption dataset...


Downloading...
From (original): https://drive.google.com/uc?id=1FMVcFM78XZE1KE1rIkGBpCdcdI58S1LB
From (redirected): https://drive.google.com/uc?id=1FMVcFM78XZE1KE1rIkGBpCdcdI58S1LB&confirm=t&uuid=c74262ce-157a-404b-b144-77cc52994f09
To: /content/image_caption_dataset.zip
100%|██████████| 286M/286M [00:01<00:00, 147MB/s]


Extracting image_caption_dataset.zip...
Deleted image_caption_dataset.zip
Downloading occluded datasets (10%, 50%, 80%)...


Downloading...
From (original): https://drive.google.com/uc?id=1GdBV3YjpheJNKhHs7KJVreVNnXZlMSsr
From (redirected): https://drive.google.com/uc?id=1GdBV3YjpheJNKhHs7KJVreVNnXZlMSsr&confirm=t&uuid=b0b87dc8-2c4a-4760-819d-2480abd9ddce
To: /content/occluded_datasets.zip
100%|██████████| 503M/503M [00:07<00:00, 64.6MB/s]


Extracting occluded_datasets.zip...
Deleted occluded_datasets.zip
All datasets downloaded, extracted, and cleaned up.


## Necessary classes and function implementations
- To feed data into the model

In [2]:
!pip install evaluate
!pip install rouge_score



In [None]:

import os
import pandas as pd
import numpy as np
import torch
import torchvision.transforms as transforms
from PIL import Image
import random
from tqdm import tqdm
import evaluate
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from transformers import (
    GPT2Tokenizer,
    CLIPModel,
    AutoProcessor,
    AutoModelForImageTextToText,
    GPT2LMHeadModel,
    GPT2Config
)
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns
import time
from typing import List, Dict, Union, Optional

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')
try:
    nltk.data.find('corpora/omw-1.4')
except LookupError:
    nltk.download('omw-1.4')

# --- Configuration ---
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BASE_DATA_DIR = "custom_captions_dataset"
TEST_CSV_PATH = os.path.join(BASE_DATA_DIR, "test.csv")
TEST_IMAGE_DIR = os.path.join(BASE_DATA_DIR, "test")
CUSTOM_MODEL_WEIGHTS_PATH = "clip_gpt_image_captioner.pth"
OUTPUT_DIR = "partb_results"
SMOLVLM_MODEL_NAME = "HuggingFaceTB/SmolVLM-256M-Instruct"
CUSTOM_CLIP_MODEL_NAME="openai/clip-vit-base-patch32"
CUSTOM_GPT2_MODEL_NAME="gpt2"


def occlude_image(image: Image.Image, mask_percentage: int, patch_size=16) -> np.array:
    np_image = np.array(image)
    if len(np_image.shape) != 3: return np_image

    h, w, c = np_image.shape
    patches_h = h // patch_size
    patches_w = w // patch_size
    if patches_h == 0 or patches_w == 0: return np_image

    total_patches = patches_h * patches_w
    num_patches_to_mask = int((mask_percentage / 100.0) * total_patches)
    if num_patches_to_mask == 0 and mask_percentage > 0: num_patches_to_mask = 1

    if num_patches_to_mask > 0:
        all_patch_indices = np.arange(total_patches)
        mask_indices = np.random.choice(all_patch_indices, size=num_patches_to_mask, replace=False)
        occluded_np_image = np_image.copy()
        for idx in mask_indices:
            patch_row, patch_col = idx // patches_w, idx % patches_w
            h_start, w_start = patch_row * patch_size, patch_col * patch_size
            occluded_np_image[h_start:h_start + patch_size, w_start:w_start + patch_size, :] = 0
        return occluded_np_image
    else:
        return np_image


class ImageCaptionTestDataset(Dataset):
    def __init__(self, csv_path, image_dir):
        try:
            self.data = pd.read_csv(csv_path)
            if 'filename' not in self.data.columns or 'caption' not in self.data.columns:
                 raise ValueError("CSV must contain 'filename' and 'caption' columns.")
            self.data.dropna(subset=['filename', 'caption'], inplace=True)
            self.data.reset_index(drop=True, inplace=True)
        except FileNotFoundError:
            print(f"Error: Test CSV file not found at {csv_path}")
            self.data = pd.DataFrame(columns=['filename', 'caption'])
        except ValueError as ve:
             print(f"Error: {ve}")
             self.data = pd.DataFrame(columns=['filename', 'caption'])

        self.image_dir = image_dir

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx): idx = idx.tolist()
        if idx >= len(self.data): raise IndexError("Index out of bounds")

        row = self.data.iloc[idx]
        filename = row["filename"]
        caption = str(row["caption"])
        image_path = os.path.join(self.image_dir, filename)
        try:
             image = Image.open(image_path).convert("RGB")
        except Exception as e:
             print(f"Warning: Error loading image {image_path}: {e}. Returning None.")
             image = None

        return {"image": image, "caption": caption, "filename": filename}


class ImageCaptionModel(nn.Module):
    def __init__(self, clip_model=CUSTOM_CLIP_MODEL_NAME, gpt2_model=CUSTOM_GPT2_MODEL_NAME,
                 freeze_clip=True, freeze_gpt2_partial=True, projection_dim=256, contrastive_weight=1):
        super(ImageCaptionModel, self).__init__()
        self.contrastive_weight = contrastive_weight
        clip = CLIPModel.from_pretrained(clip_model)
        self.encoder = clip.vision_model
        self.encoder_dim = self.encoder.config.hidden_size
        if freeze_clip:
            for param in self.encoder.parameters(): param.requires_grad = False

        gpt2_config = GPT2Config.from_pretrained(gpt2_model)
        gpt2_config.add_cross_attention = True
        self.decoder = GPT2LMHeadModel.from_pretrained(gpt2_model, config=gpt2_config)
        self.decoder_dim = self.decoder.config.hidden_size
        if freeze_gpt2_partial:
            num_layers_to_freeze = len(self.decoder.transformer.h) - 2
            if num_layers_to_freeze > 0:
                for i, block in enumerate(self.decoder.transformer.h):
                    if i < num_layers_to_freeze:
                        for param in block.parameters(): param.requires_grad = False

        self.connect = nn.Sequential(
            nn.Linear(self.encoder_dim, self.decoder_dim * 2), nn.GELU(), nn.Linear(self.decoder_dim * 2, self.decoder_dim)
        )
        self.img_projection = nn.Sequential(
            nn.Linear(self.encoder_dim, projection_dim), nn.ReLU(), nn.Linear(projection_dim, projection_dim)
        )
        self.txt_projection = nn.Sequential(
            nn.Linear(self.decoder_dim, projection_dim), nn.ReLU(), nn.Linear(projection_dim, projection_dim)
        )
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_model)
        if self.tokenizer.pad_token is None:
             self.tokenizer.add_special_tokens({'pad_token': self.tokenizer.eos_token})
             self.decoder.resize_token_embeddings(len(self.tokenizer))

        special_tokens = {'additional_special_tokens': ['<|img|>', '<|caption|>']}
        num_added = self.tokenizer.add_special_tokens(special_tokens)
        if num_added > 0: self.decoder.resize_token_embeddings(len(self.tokenizer))

        self.img_token_id = self.tokenizer.convert_tokens_to_ids("<|img|>")
        self.caption_token_id = self.tokenizer.convert_tokens_to_ids("<|caption|>")
        if self.img_token_id == self.tokenizer.unk_token_id: self.img_token_id = self.tokenizer.eos_token_id
        if self.caption_token_id == self.tokenizer.unk_token_id: self.caption_token_id = self.tokenizer.eos_token_id

    def generate_caption(self, image_tensor):
        self.eval()
        with torch.no_grad():
            encoder_outputs = self.encoder(pixel_values=image_tensor).last_hidden_state
            cls_output = encoder_outputs[:, 0, :]
            img_features = self.connect(cls_output)
            batch_size = img_features.size(0)
            prefix_tokens = torch.tensor([[self.img_token_id, self.caption_token_id]] * batch_size, dtype=torch.long, device=img_features.device)
            generated_ids = self.decoder.generate(
                input_ids=prefix_tokens, encoder_hidden_states=img_features.unsqueeze(1),
                max_length=50 + 2, num_beams=4, early_stopping=True,
                pad_token_id=self.tokenizer.pad_token_id, eos_token_id=self.tokenizer.eos_token_id,
                no_repeat_ngram_size=2,
            )
            prefix_len = prefix_tokens.shape[1]
            decoded_captions = self.tokenizer.batch_decode(generated_ids[:, prefix_len:], skip_special_tokens=True)
            return [caption.strip() for caption in decoded_captions]


class SmolVLMWrapper(nn.Module):
    def __init__(self, model_name: str = SMOLVLM_MODEL_NAME,
                 torch_dtype: torch.dtype = torch.bfloat16,
                 device_map: str = "auto"):
        super().__init__()
        self.model_name = model_name
        self.torch_dtype = torch_dtype if torch.cuda.is_available() else torch.float32
        self.device_map = device_map
        print(f"Initializing SmolVLM Wrapper: {self.model_name} (dtype: {self.torch_dtype})")
        try:
            self.model = AutoModelForImageTextToText.from_pretrained(
                self.model_name, torch_dtype=self.torch_dtype, device_map=self.device_map,
                 _attn_implementation="eager"
            )
            self.processor = AutoProcessor.from_pretrained(self.model_name)
            print("SmolVLM wrapped model and processor loaded.")
        except Exception as e:
            print(f"Error loading wrapped model/processor '{self.model_name}': {e}")
            raise e
        self.eval()

    def forward(self, **kwargs):
        return self.model(**kwargs)

    @torch.no_grad()
    def generate_caption(self, image_input: Union[str, Image.Image],
                         prompt_text: str = "Describe this image in detail.",
                         max_new_tokens: int = 100,
                         do_sample: bool = False, **generate_kwargs) -> str:
        self.eval()
        try:
            if isinstance(image_input, str): image = Image.open(image_input).convert("RGB")
            elif isinstance(image_input, Image.Image): image = image_input.convert("RGB")
            else: raise ValueError("image_input must be a file path (str) or a PIL Image.")
        except Exception as e: return f"Error: Could not load image - {e}"

        messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}]
        prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = self.processor(text=prompt, images=image, return_tensors="pt").to(self.model.device, dtype=self.model.dtype)

        try:
            generated_ids = self.model.generate(
                **inputs, max_new_tokens=max_new_tokens, do_sample=do_sample, **generate_kwargs
            )
            input_len = inputs["input_ids"].shape[1]
            generated_ids = generated_ids[:, input_len:]
            generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
            return generated_text
        except Exception as e: return f"Error: Generation failed - {e}"


# --- Helper Function for Metric Calculation ---
def calculate_metrics(predictions, references):
    bleu = evaluate.load("bleu")
    rouge = evaluate.load("rouge")
    meteor = evaluate.load("meteor")

    valid_indices = [i for i, p in enumerate(predictions) if isinstance(p, str) and p and not p.startswith("ERROR:")]
    if not valid_indices: return {'BLEU': 0, 'ROUGE-L': 0, 'METEOR': 0}

    valid_preds = [predictions[i] for i in valid_indices]
    valid_refs = [references[i] for i in valid_indices]

    try: bleu_score = bleu.compute(predictions=valid_preds, references=valid_refs)["bleu"]
    except: bleu_score = 0
    try: rouge_score = rouge.compute(predictions=valid_preds, references=valid_refs)["rougeL"]
    except: rouge_score = 0
    try: meteor_score_val = meteor.compute(predictions=valid_preds, references=valid_refs)["meteor"]
    except Exception as e: meteor_score_val = 0; print(f"Meteor calc failed: {e}")

    return {'BLEU': bleu_score, 'ROUGE-L': rouge_score, 'METEOR': meteor_score_val}


# --- Combined Evaluation Function (Handles one model type at a time) ---
def evaluate_single_model_on_occluded_images(
    model,                  # The model instance (either ImageCaptionModel or SmolVLMWrapper)
    model_identifier: str,  # A string identifier like "custom_model" or "smolvlm"
    test_csv_path: str,
    image_dir: str,
    output_dir: str,
    device: torch.device,
    occlusion_levels: List[int],
    custom_transform: Optional[transforms.Compose] = None # Only needed for ImageCaptionModel
):
    """
    Evaluates a SINGLE model (Custom or SmolVLM) on occluded images.
    """
    os.makedirs(output_dir, exist_ok=True)
    model.eval()

    is_custom_model = isinstance(model, ImageCaptionModel)
    is_smol_wrapper = isinstance(model, SmolVLMWrapper)

    if not is_custom_model and not is_smol_wrapper:
        raise TypeError("Model must be an instance of ImageCaptionModel or SmolVLMWrapper")
    if is_custom_model and custom_transform is None:
        raise ValueError("custom_transform must be provided for ImageCaptionModel")

    print(f"\n--- Evaluating Model: {model_identifier} ---")

    test_dataset = ImageCaptionTestDataset(csv_path=test_csv_path, image_dir=image_dir)
    def safe_collate(batch):
        batch = [item for item in batch if item["image"] is not None]
        return batch[0] if batch else None
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=safe_collate, num_workers=0)

    results_per_level = {}

    for level in occlusion_levels:
        print(f"Processing Occlusion Level: {level}%")
        level_preds = []
        level_refs = []
        level_filenames = []

        for batch_data in tqdm(test_dataloader, desc=f"Generating ({model_identifier}, {level}%)"):
            if batch_data is None: continue

            pil_image = batch_data["image"]
            ref_caption = batch_data["caption"]
            filename = batch_data["filename"]

            if level > 0:
                occluded_np = occlude_image(pil_image, level)
                processed_image = Image.fromarray(occluded_np)
            else:
                processed_image = pil_image

            # Generate caption based on model type
            if is_custom_model:
                 image_tensor = custom_transform(processed_image).unsqueeze(0).to(device)
                 caption = model.generate_caption(image_tensor)[0]
            elif is_smol_wrapper:
                 caption = model.generate_caption(processed_image)


            level_preds.append(caption)
            level_refs.append([ref_caption])
            level_filenames.append(filename)

        # Calculate metrics for this level
        level_metrics = calculate_metrics(level_preds, level_refs)
        print(f"Metrics @{level}% ({model_identifier}): {level_metrics}")
        results_per_level[level] = {
            'scores': level_metrics,
            'filenames': level_filenames,
            'predictions': level_preds,
            'references': level_refs
        }

        # Save intermediate predictions
        df_level = pd.DataFrame({'filename': level_filenames, 'generated_caption': level_preds})
        df_level.to_csv(os.path.join(output_dir, f"{model_identifier}_captions_{level}.csv"), index=False)

    return results_per_level


# --- Function to combine results and create Part C data ---
def combine_and_analyze_results(
        smolvlm_results: Dict, custom_results: Dict,
        test_csv_path: str, output_dir: str
    ):
    """
    Combines results from both models, calculates changes, saves summary and Part C data.
    """
    print("\n--- Combining Results and Generating Final Outputs ---")
    part_c_data = []
    summary_list = []
    occlusion_levels = sorted(list(smolvlm_results.keys()))

    # --- Process SmolVLM ---
    baseline_smol = smolvlm_results.get(0, {}).get('scores')
    for level in occlusion_levels:
        if level not in smolvlm_results: continue
        data = smolvlm_results[level]
        scores = data['scores']
        changes = {}
        if level > 0 and baseline_smol:
             changes = {m + '_change': scores.get(m, 0) - baseline_smol.get(m, 0) for m in baseline_smol}

        row = {'Model': 'SmolVLM', 'Occlusion Level': level}
        row.update(scores)
        row.update(changes)
        summary_list.append(row)

        # Add to Part C data (excluding errors, excluding baseline level 0)
        if level > 0:
             refs_flat = [r[0] for r in data['references']]
             for orig, gen, fname in zip(refs_flat, data['predictions'], data['filenames']):
                  if isinstance(gen, str) and not gen.startswith("ERROR:"):
                       part_c_data.append({
                          "original_caption": orig, "generated_caption": gen,
                          "perturbation_percentage": level, "model_label": "Model A"
                       })

    # --- Process Custom Model ---
    baseline_custom = custom_results.get(0, {}).get('scores')
    for level in occlusion_levels:
        if level not in custom_results: continue
        data = custom_results[level]
        scores = data['scores']
        changes = {}
        if level > 0 and baseline_custom:
            changes = {m + '_change': scores.get(m, 0) - baseline_custom.get(m, 0) for m in baseline_custom}

        row = {'Model': 'Custom Model', 'Occlusion Level': level}
        row.update(scores)
        row.update(changes)
        summary_list.append(row)

        # Add to Part C data (excluding errors, excluding baseline level 0)
        if level > 0:
            refs_flat = [r[0] for r in data['references']]
            for orig, gen, fname in zip(refs_flat, data['predictions'], data['filenames']):
                 if isinstance(gen, str) and not gen.startswith("ERROR:"):
                      part_c_data.append({
                         "original_caption": orig, "generated_caption": gen,
                         "perturbation_percentage": level, "model_label": "Model B"
                      })

    # --- Save Summary ---
    summary_df = pd.DataFrame(summary_list)
    summary_cols = ['Model', 'Occlusion Level', 'BLEU', 'ROUGE-L', 'METEOR',
                    'BLEU_change', 'ROUGE-L_change', 'METEOR_change']
    summary_df = summary_df[[col for col in summary_cols if col in summary_df.columns]]
    summary_csv_path = os.path.join(output_dir, "partb_evaluation_summary.csv")
    summary_df.to_csv(summary_csv_path, index=False, float_format='%.4f')
    print(f"Saved evaluation summary to {summary_csv_path}")
    print("\nEvaluation Summary:")
    print(summary_df.to_string(float_format='%.4f'))

    # --- Save Part C Data ---
    partc_df = pd.DataFrame(part_c_data)
    partc_csv_path = os.path.join(output_dir, "partc_data.csv")
    partc_df.to_csv(partc_csv_path, index=False)
    print(f"\nSaved Part C data ({len(partc_df)} rows) to {partc_csv_path}")


# --- Main Execution Block ---
if __name__ == "__main__":
    start_time = time.time()
    print(f"Using device: {DEVICE}")
    print(f"Base data directory: {BASE_DATA_DIR}")
    print(f"Output directory: {OUTPUT_DIR}")

    # --- Load Models ---
    print("\nLoading SmolVLM model...")
    smolvlm_dtype = torch.bfloat16 if DEVICE == torch.device("cuda") and torch.cuda.is_bf16_supported() else torch.float16 if DEVICE == torch.device("cuda") else torch.float32
    smolvlm_wrapped = SmolVLMWrapper(
        model_name=SMOLVLM_MODEL_NAME,
        torch_dtype=smolvlm_dtype,
        device_map="auto"
    )

    print("\nLoading Custom model...")
    custom_model = ImageCaptionModel(
        clip_model=CUSTOM_CLIP_MODEL_NAME,
        gpt2_model=CUSTOM_GPT2_MODEL_NAME
    ).to(DEVICE)
    try:
        custom_model.load_state_dict(torch.load(CUSTOM_MODEL_WEIGHTS_PATH, map_location=DEVICE))
        print(f"Loaded custom model weights from {CUSTOM_MODEL_WEIGHTS_PATH}")
    except FileNotFoundError:
        print(f"Error: Custom model weights not found at {CUSTOM_MODEL_WEIGHTS_PATH}. Exiting.")
        exit()
    except Exception as e:
        print(f"Error loading custom model weights: {e}. Exiting.")
        exit()

    custom_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # --- Create DataLoader ---
    print("\nCreating Test DataLoader...")
    def safe_collate(batch):
        batch = [item for item in batch if item["image"] is not None]
        return batch[0] if batch else None
    test_dataset = ImageCaptionTestDataset(csv_path=TEST_CSV_PATH, image_dir=TEST_IMAGE_DIR)
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=safe_collate, num_workers=0)
    print(f"DataLoader created with {len(test_dataset)} samples.")

    # --- Run Evaluation for Each Model ---
    occlusion_levels_to_run = [0, 10, 50, 80]

    custom_eval_results = evaluate_single_model_on_occluded_images(
        model=custom_model,
        model_identifier="custom_model",
        test_csv_path=TEST_CSV_PATH,
        image_dir=TEST_IMAGE_DIR,
        output_dir=OUTPUT_DIR,
        device=DEVICE,
        occlusion_levels=occlusion_levels_to_run,
        custom_transform=custom_transform
    )

    smolvlm_eval_results = evaluate_single_model_on_occluded_images(
        model=smolvlm_wrapped,
        model_identifier="smolvlm",
        test_csv_path=TEST_CSV_PATH,
        image_dir=TEST_IMAGE_DIR,
        output_dir=OUTPUT_DIR,
        device=DEVICE,
        occlusion_levels=occlusion_levels_to_run,
        custom_transform=None
    )



    # --- Combine Results ---
    if smolvlm_eval_results and custom_eval_results:
         combine_and_analyze_results(
             smolvlm_results=smolvlm_eval_results,
             custom_results=custom_eval_results,
             test_csv_path=TEST_CSV_PATH,
             output_dir=OUTPUT_DIR
         )
    else:
         print("\nError: Evaluation failed for one or both models. Cannot combine results.")


    end_time = time.time()
    print(f"\n--- Part B Evaluation Finished ---")
    print(f"Total time taken: {end_time - start_time:.2f} seconds")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Using device: cuda
Base data directory: test_subset_data
Output directory: partb_results

Loading SmolVLM model...
Initializing SmolVLM Wrapper: HuggingFaceTB/SmolVLM-256M-Instruct (dtype: torch.bfloat16)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


SmolVLM wrapped model and processor loaded.

Loading Custom model...


Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.10.crossattention.c_attn.bias', 'transformer.h.10.crossattention.c_attn.weight', 'transformer.h.10.crossattention.c_proj.bias', 'transformer.h.10.cros

Error loading custom model weights: PytorchStreamReader failed reading zip archive: failed finding central directory. Exiting.

Creating Test DataLoader...
DataLoader created with 20 samples.

--- Evaluating Model: custom_model ---
Processing Occlusion Level: 0%


Generating (custom_model, 0%):   0%|          | 0/20 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Generating (custom_model, 0%): 100%|██████████| 20/20 [00:21<00:00,  1.07s/it]
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Metrics @0% (custom_model): {'BLEU': 0.0, 'ROUGE-L': np.float64(0.1784728886560233), 'METEOR': np.float64(0.13413293242690252)}
Processing Occlusion Level: 10%


Generating (custom_model, 10%): 100%|██████████| 20/20 [00:20<00:00,  1.02s/it]
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Metrics @10% (custom_model): {'BLEU': 0.0, 'ROUGE-L': np.float64(0.18002399771529673), 'METEOR': np.float64(0.13207572727185946)}
Processing Occlusion Level: 50%


Generating (custom_model, 50%):  65%|██████▌   | 13/20 [00:17<00:07,  1.10s/it]