In [1]:
!pip uninstall -y transformers
!pip install --no-cache-dir transformers accelerate timm --upgrade
!pip install transformers
!pip install sentencepiece
!pip install torch torchvision
!pip install pillow
!pip install -U transformers
!pip install evaluate
!pip install rouge_score
!pip install nltk

Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Successfully uninstalled transformers-4.51.3
Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m84.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
Successfully installed transformers-4.51.3


In [2]:
import os
import json
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from PIL import Image
from torch import nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

import evaluate

from transformers import (
    AutoProcessor,
    AutoModelForVision2Seq,
    ViTModel,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    ViTImageProcessor,
    get_linear_schedule_with_warmup
)

import zipfile


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Part A.1-Generate SmolVLM captions

In [3]:
def zero_shot_captioning(image_path,model_name):

    if model_name != "SmolVLM":
        raise ValueError("Only SmolVLM is currently supported.")

    # test_image_dir = "/content/custom_captions_dataset/test"
    test_image_dir = image_path

    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # Load model and processor
    model_id = "HuggingFaceTB/SmolVLM-Instruct"
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForVision2Seq.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
        _attn_implementation="eager"
    ).to(DEVICE)

    # Caption generation
    captions = {}
    for image_file in tqdm(os.listdir(test_image_dir)):
        if image_file.lower().endswith((".jpg")):
            image_path = os.path.join(test_image_dir, image_file)
            image = Image.open(image_path).convert("RGB")
            # Prepare message in the chat template style
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image"},
                        {"type": "text", "text": "What's in this image?"}
                    ]
                }
            ]

            prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
            inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)

            # Generate caption
            with torch.no_grad():
                outputs = model.generate(**inputs, max_new_tokens=64)

            caption = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
            captions[image_file] = caption
            print(f" # {image_file} - {caption}")

    # Save results
    with open("smolvlm_generated_captions.json", "w") as f:
        json.dump(captions, f, indent=2)

    return captions

### Part A.2 - Custom Encode Decode Model

In [4]:

class ImageCaptionDataset(Dataset):

    def __init__(self, dataframe, image_dir, tokenizer, processor, max_length=32):
            """ dataframe: DataFrame with 'filename' and 'caption' columns"""
            self.data = dataframe
            self.image_dir = image_dir
            self.tokenizer = tokenizer
            self.processor = processor
            self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_path = os.path.join(self.image_dir, row['filename'])
        caption = row['caption']

        image = Image.open(img_path).convert("RGB")
        pixel_values = self.processor(images=image, return_tensors="pt")['pixel_values'].squeeze(0)

        tokens = self.tokenizer(
            caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        input_ids = tokens.input_ids.squeeze(0)
        attention_mask = tokens.attention_mask.squeeze(0)

        # Labels (same as input_ids for LM training)
        labels = input_ids.clone()
        labels[labels == self.tokenizer.pad_token_id] = -100  # Ignore loss on padding

        return {
            'pixel_values': pixel_values,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }


In [5]:
class ImageCaptionModel(nn.Module):
    """
    Custom Encoder-Decoder Model for Image Captioning using ViT as an encoder.
    """
    def __init__(self, vit_name="WinKawaks/vit-small-patch16-224", gpt2_name="gpt2"):
        """
        Initialize the model.

        Args:
            vit_name: Name of the pre-trained ViT model
            gpt2_name: Name of the pre-trained GPT-2 model
        """
        super().__init__()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.encoder = ViTModel.from_pretrained(vit_name)
        self.decoder = GPT2LMHeadModel.from_pretrained(gpt2_name)

        # Project ViT CLS token output to GPT2 hidden size
        self.encoder_to_decoder = nn.Linear(self.encoder.config.hidden_size, self.decoder.config.n_embd)

    def forward(self, pixel_values, input_ids, labels=None):
        """
        Forward pass.

        Args:
            pixel_values: Image tensor
            input_ids: Text token IDs
            labels: Text token labels

        Returns:
            Model output with loss and logits
        """
        encoder_outputs = self.encoder(pixel_values=pixel_values)
        cls_embedding = encoder_outputs.last_hidden_state[:, 0, :]  # Take CLS token
        projected_embedding = self.encoder_to_decoder(cls_embedding)

        batch_size = input_ids.size(0)
        prefix_embedding = projected_embedding.unsqueeze(1)
        decoder_inputs_embeds = self.decoder.transformer.wte(input_ids)
        decoder_inputs_embeds = torch.cat([prefix_embedding, decoder_inputs_embeds], dim=1)

        # Adding prefix with -100 so that it doesn't affect the loss
        if labels is not None:
            prefix_pad = torch.full((batch_size, 1), -100, device=labels.device)
            labels = torch.cat([prefix_pad, labels], dim=1)

        outputs = self.decoder(
            inputs_embeds=decoder_inputs_embeds,
            labels=labels,
            return_dict=True
        )

        return outputs

## Train Model

In [6]:
def train_model(model, dataloader, optimizer, criterion, device, epochs):
    """
    Train the encoder-decoder model.

    Args:
        model: Custom image captioning model
        dataloader: Training data loader
        optimizer: Optimizer (e.g., Adam)
        criterion: Loss function
        device: Device to use ('cuda' or 'cpu')
        epochs: Number of epochs

    Returns:
        None
    """
    model.to(device)
    model.train()

    total_steps = len(dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=100,
        num_training_steps=total_steps
    )

    for epoch in range(epochs):
        epoch_loss = 0.0
        progress = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}", leave=True)

        for batch in progress:
            images = batch['pixel_values'].to(device)
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            # Forward pass
            outputs = model(pixel_values=images, input_ids=input_ids, labels=labels)
            loss = outputs.loss

            # Backpropagation
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
            optimizer.step()
            scheduler.step()

            epoch_loss += loss.item()
            progress.set_postfix(loss=loss.item())

        avg_loss = epoch_loss / len(dataloader)
        print(f"Epoch {epoch+1} completed. Avg Loss: {avg_loss:.4f}")

### Captions generator

In [7]:
# for single image
def generate_caption(model, image_path, tokenizer, processor, device, max_len=64):
    model.eval()
    model.to(device)  # Ensure the model is on the correct device

    image = Image.open(image_path).convert("RGB")
    pixel_values = processor(images=image, return_tensors="pt")['pixel_values'].to(device)

    with torch.no_grad():
        enc_out = model.encoder(pixel_values=pixel_values)
        cls_embed = enc_out.last_hidden_state[:, 0, :]
        projected_embedding = model.encoder_to_decoder(cls_embed)
        temperature = 0.8
        top_k = 40

        tokens = [tokenizer.bos_token_id]
        for _ in range(max_len):
            dec_input = torch.tensor([tokens], device=device)
            embed = model.decoder.transformer.wte(dec_input)

            # Append the projected embedding as a prefix
            prefix_embed = projected_embedding.unsqueeze(1)
            full_embed = torch.cat([prefix_embed, embed], dim=1)

            logits = model.decoder(inputs_embeds=full_embed).logits
            next_token = torch.argmax(logits[:, -1, :], dim=-1).item()

            if next_token == tokenizer.eos_token_id:
                break

            tokens.append(next_token)

    return tokenizer.decode(tokens, skip_special_tokens=True)

def generate_captions(model, tokenizer, processor):
    captions = {}
    test_dir = "/content/custom_captions_dataset/test"
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for fname in tqdm(os.listdir(test_dir)):
        if fname.lower().endswith(".jpg"):
            path = os.path.join(test_dir, fname)
            cap = generate_caption(model, path, tokenizer, processor, device)  # Pass device separately
            captions[fname] = cap
            print(f"{fname}: {cap}")

    with open("custom_model_generated_captions.json", "w") as f:
        json.dump(captions, f, indent=2)
    return captions



### Part A.3 - Evaluate Model

In [8]:
def evaluate_model(model, dataloader, device):
    """
    Evaluate model performance using BLEU, ROUGE-L, METEOR.

    Args:
        model: Trained model
        dataloader: Test data loader
        device: 'cuda' or 'cpu'

    Returns:
        dict: BLEU, ROUGE-L, METEOR scores for the test set
    """
    # Load evaluation metrics
    bleu = evaluate.load("bleu")
    rouge = evaluate.load("rouge")

    model.to(device)
    model.eval()

    predictions = []
    references = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            images = batch['pixel_values'].to(device)
            input_ids = batch['input_ids']

            # Get the actual captions (references)
            for ids in input_ids:
                ref_caption = tokenizer.decode(ids, skip_special_tokens=True)
                references.append([ref_caption])  # List of lists format for BLEU

            # Generate captions
            if isinstance(model, ImageCaptionModel):
                # For custom model
                encoder_outputs = model.encoder(pixel_values=images)
                cls_embedding = encoder_outputs.last_hidden_state[:, 0, :]
                projected_embedding = model.encoder_to_decoder(cls_embedding)

                for i in range(len(images)):
                    emb = projected_embedding[i:i+1]
                    current_token = torch.tensor([[tokenizer.bos_token_id]]).to(device)
                    generated_tokens = [tokenizer.bos_token_id]

                    # Max length of 64 tokens
                    for _ in range(64):
                        prefix_embedding = emb.unsqueeze(1)
                        decoder_inputs_embeds = model.decoder.transformer.wte(current_token)
                        embeds = torch.cat([prefix_embedding, decoder_inputs_embeds], dim=1)

                        outputs = model.decoder(inputs_embeds=embeds)
                        next_token_logits = outputs.logits[:, -1, :]
                        next_token = torch.argmax(next_token_logits, dim=-1).item()

                        if next_token == tokenizer.eos_token_id:
                            break

                        generated_tokens.append(next_token)
                        current_token = torch.tensor([[next_token]]).to(device)

                    pred_caption = tokenizer.decode(generated_tokens, skip_special_tokens=True)
                    predictions.append(pred_caption)
            else:
                # For SmolVLM model
                for i in range(len(images)):
                    image_tensor = images[i:i+1]
                    image_tensor = (image_tensor + 1) / 2
                    image_tensor = torch.clamp(image_tensor, 0, 1)

                    prompt = processor.apply_chat_template(
                        [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What's in this image?"}]}],
                        add_generation_prompt=True
                    )

                    # Pass image_tensor instead of images[i]
                    inputs = processor(text=prompt, images=[image_tensor.cpu().squeeze(0)], return_tensors="pt").to(device)
                    outputs = model.generate(**inputs, max_new_tokens=64)
                    caption = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()

                    if "Assistant:" in caption:
                        caption = caption.split("Assistant:")[-1].strip()

                    predictions.append(caption)

    # Calculate metrics
    bleu_score = bleu.compute(predictions=predictions, references=references)
    rouge_results = rouge.compute(predictions=predictions, references=[ref[0] for ref in references])

    # For METEOR, tokenize predictions and references before calculating the score
    meteor_scores = []
    for pred, ref in zip(predictions, references):
        # Tokenize the prediction and reference using nltk.word_tokenize
        pred_tokens = nltk.word_tokenize(pred)
        ref_tokens = nltk.word_tokenize(ref[0])

        meteor_score = nltk.translate.meteor_score.meteor_score([ref_tokens], pred_tokens)
        meteor_scores.append(meteor_score)

    meteor_score = sum(meteor_scores) / len(meteor_scores)

    # Compile results
    results = {
        "bleu": bleu_score["bleu"],
        "rouge-l": rouge_results["rougeL"],
        "meteor": meteor_score
    }

    return results

### https://huggingface.co/docs/transformers/model_doc/gpt2

In [9]:
# Unzip the dataset
with zipfile.ZipFile("/content/custom_captions_dataset.zip", "r") as zip_ref:
    zip_ref.extractall("/content/")


In [15]:

print("2. Training custom encoder-decoder model...")

# Load training data
train_df = pd.read_csv("/content/custom_captions_dataset/train.csv")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
image_processor = ViTImageProcessor.from_pretrained("WinKawaks/vit-small-patch16-224")

# Create dataset and dataloader
train_dataset = ImageCaptionDataset(
    train_df,
    "/content/custom_captions_dataset/train/",
    tokenizer,
    image_processor
)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Initialize model, optimizer, and train
device = "cuda" if torch.cuda.is_available() else "cpu"
model = ImageCaptionModel()
optimizer = AdamW(model.parameters(), lr=3e-5)
optimizer = AdamW(
    model.parameters(),
    lr=3e-5,
    weight_decay=0.01
)

train_model(model, train_dataloader, optimizer, None, device, epochs=5)

# Save the trained model
torch.save(model.state_dict(), "custom_image_caption_model.pth")



2. Training custom encoder-decoder model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/88.2M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-small-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Epoch 1/5:   0%|          | 0/358 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Epoch 1/5: 100%|██████████| 358/358 [02:41<00:00,  2.22it/s, loss=1.89]


Epoch 1 completed. Avg Loss: 3.0118


Epoch 2/5: 100%|██████████| 358/358 [02:37<00:00,  2.27it/s, loss=2.21]


Epoch 2 completed. Avg Loss: 2.4479


Epoch 3/5: 100%|██████████| 358/358 [02:38<00:00,  2.26it/s, loss=1.82]


Epoch 3 completed. Avg Loss: 2.3276


Epoch 4/5: 100%|██████████| 358/358 [02:37<00:00,  2.28it/s, loss=2.04]


Epoch 4 completed. Avg Loss: 2.2479


Epoch 5/5: 100%|██████████| 358/358 [02:36<00:00,  2.29it/s, loss=1.97]


Epoch 5 completed. Avg Loss: 2.1973


In [10]:
print("helo")

helo


In [11]:
import evaluate

# 1. Zero-shot Evaluation using SmolVLM
print("1. Generating captions using SmolVLM (zero-shot)")
smolvlm_captions = zero_shot_captioning("/content/custom_captions_dataset/test","SmolVLM")
print("\n2. Training custom encoder-decoder model")
print("\n3. Generating captions using custom model")
custom_captions = generate_captions(model, tokenizer, image_processor)


1. Generating captions using SmolVLM (zero-shot)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

KeyboardInterrupt: 

In [12]:
# For downloading the model file from google drive for testing seperately. not needed if already loaded
!pip install -q gdown

file_id = "199sCxfrCK8o9_e_6tlYZU6VUgfkjWTyk"
!gdown --id {file_id} --output my_model.pth


Downloading...
From (original): https://drive.google.com/uc?id=199sCxfrCK8o9_e_6tlYZU6VUgfkjWTyk
From (redirected): https://drive.google.com/uc?id=199sCxfrCK8o9_e_6tlYZU6VUgfkjWTyk&confirm=t&uuid=481ce57f-8907-4645-9da7-90c88b8d261a
To: /content/my_model.pth
100% 586M/586M [00:11<00:00, 51.3MB/s]


In [13]:
# For loading the model from the downloaded model, not needed if already loaded
from transformers import ViTImageProcessor

processor = ViTImageProcessor.from_pretrained("WinKawaks/vit-small-patch16-224")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
model = ImageCaptionModel()
model.load_state_dict(torch.load("my_model.pth", map_location=device, weights_only=True))
model.to(device)
model.eval()

Using device: cuda


Some weights of ViTModel were not initialized from the model checkpoint at WinKawaks/vit-small-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImageCaptionModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=384, out_features=1536, bias=True)
            (intermediate

In [None]:
# Unzip the dataset for testing ( if done seperately ), not needed if already the training dataset is loaded
import zipfile

with zipfile.ZipFile("custom_captions_dataset.zip", "r") as zip_ref:
    zip_ref.extractall("/content/")

In [14]:
# Load JSON data,, not needed if already loaded
with open("smolvlm_generated_captions.json", "r") as f:
    smolvlm_captions = json.load(f)
with open("custom_model_generated_captions.json", "r") as f:
    custom_captions = json.load(f)

print(len(smolvlm_captions))
print(len(custom_captions))

928
928


In [15]:
# Evaluating the model
import pandas as pd

print("\n4. Evaluating models")
test_df = pd.read_csv("/content/custom_captions_dataset/test.csv")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
image_processor = ViTImageProcessor.from_pretrained("WinKawaks/vit-small-patch16-224")
test_dataset = ImageCaptionDataset(
    test_df,
    "/content/custom_captions_dataset/test/",
    tokenizer,
    image_processor
)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

model_id = "HuggingFaceTB/SmolVLM-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = AutoProcessor.from_pretrained(model_id)

print("\nEvaluating SmolVLM (zero-shot)\n")
smolvlm_model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
    _attn_implementation="eager"
).to(device)

with open("smolvlm_generated_captions.json", "r") as f:
    smolvlm_captions = json.load(f)






4. Evaluating models


chat_template.json:   0%|          | 0.00/429 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/486 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/92.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]


Evaluating SmolVLM (zero-shot)



config.json:   0%|          | 0.00/7.45k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.49G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

In [None]:
# print("\nEvaluating Custom Model")
smolvlm_results = evaluate_model(smolvlm_model, test_dataloader, device)

print("\nEvaluating Custom Model")
custom_results = evaluate_model(model, test_dataloader, device)
print("\n5.Comparing SmolVLM and Custom Model: ")

print("-" * 50)
print(f"                  SmolVLM    Custom Model")
print(f"BLEU:             {smolvlm_results['bleu']:.4f}      {custom_results['bleu']:.4f}")
print(f"ROUGE-L:          {smolvlm_results['rouge-l']:.4f}      {custom_results['rouge-l']:.4f}")
print(f"METEOR:           {smolvlm_results['meteor']:.4f}      {custom_results['meteor']:.4f}")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Evaluating:   0%|          | 0/116 [00:00<?, ?it/s]It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.
Evaluating:  78%|███████▊  | 90/116 [1:43:07<30:05, 69.45s/it]

In [None]:
# Clear all variables and run garbage collection for ram problems
for name in dir():
    if not name.startswith('_'):
        del globals()[name]

import gc
gc.collect()

import torch
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

if 'smolvlm_model' in globals():
    del smolvlm_model
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    gc.collect()
