In [None]:
# Install into the kernel's own environment (%pip rather than !pip) and keep the log quiet.
# transformers is pinned to the version this notebook was last executed with, so a
# re-run does not silently pick up a newer release with different generation behaviour.
%pip install -q transformers==4.57.3
%pip install -q pycocoevalcap

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m870.8 kB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.36.0-py3-none-any.whl (566 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━

In [None]:
# Mount Google Drive at /content/drive/ so the project folder used by the
# following cells (dataset, captions, checkpoints) is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Work from the project folder on Drive; the paths in the configuration cell are relative to it.
%cd /content/drive/MyDrive/DL_project/

/content/drive/MyDrive/DL_project


In [None]:
import os
import pickle

import torch
from torch.utils.data import Dataset, DataLoader

from PIL import Image

from tqdm import tqdm_notebook, tqdm

from torch.optim import AdamW
from transformers import ViTImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
from transformers.utils import logging

#evaluation packages
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.spice.spice import Spice

from collections import defaultdict

In [None]:
# --- Configuration ---

# Dataset locations, relative to the current working directory.
IMG_DIR = './Flicker8k_images'
CAP_DIR = './Flicker8k_captions'

# Hugging Face identifiers of the pretrained encoder, decoder and tokenizer.
ENCODER_ID = 'google/vit-base-patch16-224-in21k'
DECODER_ID = 'gpt2'
TOKENIZER_NAME = 'gpt2'

# Model storage.
SAVED_MODEL_PATH = './saved-model'           # path to load the initial model from
CHECKPOINT_DIR = './image-captioning-model'  # directory where new checkpoints are saved

# Sequence length and batching.
MAX_LEN = 48     # passed to the dataset as the caption padding/truncation length
BATCH_SIZE = 32  # batch size of the DataLoader

# Training hyperparameters (not read by the evaluation cells visible in this notebook).
EPOCHS = 5
LR = 1e-5
gradient_acc_steps = 8

# Prefer the GPU when one is available.
DEVICE = 'cpu'
if torch.cuda.is_available():
    DEVICE = 'cuda'

In [None]:
# Load the test split from CAP_DIR. Downstream cells iterate it as a mapping of
# image filename -> list of captions.
# NOTE: pickle.load can execute arbitrary code; only open pickle files you created or trust.
test_pickle_path = os.path.join(CAP_DIR, 'test_data.pickle')
with open(test_pickle_path, 'rb') as f:
    test_data = pickle.load(f)

In [None]:
class Flickr8kDataset(Dataset):
    """Image/caption samples for a vision encoder-decoder captioning model.

    Args:
        data: mapping of image filename -> list of caption strings.
        tokenizer: caption tokenizer. It is called with ``padding='max_length'``,
            ``truncation=True``, ``max_length`` and ``return_tensors='pt'`` and must
            expose ``pad_token_id``.
        img_processor: image processor. It is called on an RGB PIL image with
            ``return_tensors='pt'`` and must return an object with ``pixel_values``.
        img_dir: directory that contains the image files named by ``data``'s keys.
        max_len: length every caption is padded/truncated to.
        test_data: when True, only the first caption of each image is kept (one
            sample per image); otherwise every (image, caption) pair is a sample.
    """

    def __init__(self, data, tokenizer, img_processor, img_dir, max_len, test_data=False):
        self.tokenizer = tokenizer
        self.processor = img_processor
        self.img_dir = img_dir
        self.max_len = max_len
        self.data = []

        for filename, captions in data.items():
            if test_data:
                # One sample per image: generation only needs the image once.
                self.data.append((filename, captions[0]))
            else:
                # Use all captions for better training.
                for cap in captions:
                    self.data.append((filename, cap))

    def __len__(self):
        """Return the number of (filename, caption) samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the processed image and tokenized caption for sample ``idx``.

        Returns:
            dict with 'pixel_values', 'labels' (token ids with padding positions
            set to -100), 'attention_mask' and 'filename'.
        """
        filename, caption = self.data[idx]

        # --- Image Processing ---
        img_path = os.path.join(self.img_dir, filename)
        img = Image.open(img_path).convert("RGB")
        # The processor returns a batch of one; drop that leading dimension.
        pixel_values = self.processor(img, return_tensors='pt').pixel_values.squeeze(0)

        # --- Caption Processing ---
        tokenized_output = self.tokenizer(
            caption,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        tokens = tokenized_output.input_ids.squeeze(0)
        attention_mask = tokenized_output.attention_mask.squeeze(0)

        # Replace padding ids with -100 in the labels. Use the tokenizer this
        # dataset was built with, not a notebook-level global, so the class works
        # regardless of cell execution order or which tokenizer is passed in.
        labels = tokens.clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'pixel_values': pixel_values,
            'labels': labels,
            'attention_mask': attention_mask,
            'filename': filename,
        }

In [None]:
# Preprocessing objects: the ViT image processor and the GPT-2 tokenizer.
img_processor = ViTImageProcessor.from_pretrained(ENCODER_ID)
cap_tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# Register the '<PAD>' and '<BOS>' special tokens on the tokenizer.
# NOTE(review): the loaded checkpoint presumably was trained with the same two
# tokens added in the same way - confirm against the training notebook.
cap_tokenizer.add_special_tokens({'pad_token': '<PAD>', 'bos_token': '<BOS>'})

# Test split: first caption per image only (test_data=True), fixed order.
test_dataset = Flickr8kDataset(
    test_data,
    cap_tokenizer,
    img_processor,
    IMG_DIR,
    MAX_LEN,
    test_data=True,
)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Load the fine-tuned captioning checkpoint from the checkpoint directory declared
# in the configuration cell instead of repeating that directory as a literal.
MODEL_CHECKPOINT = os.path.join(CHECKPOINT_DIR, 'epoch_decoder_only_baseline_3')
model = VisionEncoderDecoderModel.from_pretrained(MODEL_CHECKPOINT)

# Assign the result so the cell does not end on a bare expression and print the
# full module tree of the model into the notebook output.
model = model.to(DEVICE)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.2, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (inte

In [None]:
def evaluate(model, loader, tokenizer, device, max_len):
    """Generate one caption per image yielded by ``loader``.

    No loss or metric is computed here; the function only runs generation.

    Args:
        model: captioning model exposing ``eval()`` and
            ``generate(pixel_values=..., **generation_kwargs)``.
        loader: iterable of batches; each batch is a dict with a 'pixel_values'
            tensor and a 'filename' sequence of the same batch length.
        tokenizer: tokenizer providing ``bos_token_id``, ``pad_token_id``,
            ``eos_token_id`` and ``batch_decode``.
        device: device the pixel values are moved to before generation.
        max_len: unused; kept so existing calls keep working. The generated
            length is capped by ``max_new_tokens=20`` below.

    Returns:
        tuple ``(filenames, generated_captions)``: two parallel lists covering
        every sample of ``loader`` in iteration order.
    """
    model.eval()
    filenames = []
    generated_captions = []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            pixel_values = batch['pixel_values'].to(device)

            # Nucleus sampling. NOTE: sampling is stochastic, so captions (and the
            # metrics derived from them) vary between runs unless torch is seeded.
            # Beam-search-only flags (length_penalty, early_stopping) are omitted:
            # transformers reports them as invalid and ignores them in this mode.
            generated_ids = model.generate(
                pixel_values=pixel_values,
                max_new_tokens=20,
                decoder_start_token_id=tokenizer.bos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                do_sample=True,
                top_p=0.9,
                temperature=0.7,
                repetition_penalty=3.0,
                min_length=5,
            )

            preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            generated_captions.extend(preds)
            filenames.extend(batch['filename'])

    # Return the accumulated filenames (not the last batch's captions) so the
    # caller can pair every caption with the image it was generated for.
    return filenames, generated_captions


In [None]:
# Run generation over the test loader.
label, caption = evaluate(
    model=model,
    loader=test_loader,
    tokenizer=cap_tokenizer,
    device=DEVICE,
    max_len=MAX_LEN,
)

# Pair the two returned lists and wrap every value in a one-element list,
# giving a mapping of the form {key: [caption]}.
pred = {key: [text] for key, text in zip(label, caption)}

Evaluating:   0%|          | 0/32 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['early_stopping', 'length_penalty']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating: 100%|██████████| 32/32 [00:30<00:00,  1.04it/s]


In [None]:
# Compute captioning metrics: references (gts) vs. generated captions (res).
gts = test_data
res = pred

# The pycocoevalcap scorers require both mappings to have the same keys and
# otherwise stop with a bare assertion; fail early with a readable message.
if gts.keys() != res.keys():
    raise ValueError(
        f"Reference and prediction keys differ: {len(gts)} reference entries vs "
        f"{len(res)} prediction entries. Predictions must be keyed by the same "
        "image filenames as test_data."
    )

# NOTE(review): captions are handed to the scorers as raw strings; the imported
# PTBTokenizer is not applied to either side - confirm whether it should be.

# --------------------------------------------------------
# Evaluate metrics
# --------------------------------------------------------
print("Evaluating metrics...\n")

# Each entry is (scorer, metric name or list of names). SPICE (imported above)
# is left out of this run.
scorers = [
    (Bleu(4), ["BLEU-1", "BLEU-2", "BLEU-3", "BLEU-4"]),
    (Meteor(), "METEOR"),
    (Rouge(), "ROUGE-L"),
    (Cider(), "CIDEr"),
]

final_scores = {}

for scorer, method in scorers:
    # Show a readable label instead of the repr of the BLEU name list.
    method_label = "/".join(method) if isinstance(method, list) else method
    print(f"Computing {method_label}...")

    score, scores = scorer.compute_score(gts, res)

    if isinstance(method, list):  # BLEU returns 4 numbers
        for m, s in zip(method, score):
            final_scores[m] = s
            print(f"{m}: {s:.4f}")
    else:
        final_scores[method] = score
        print(f"{method}: {score:.4f}")

print("\n=== Final Results ===")
for k, v in final_scores.items():
    print(f"{k}: {v:.4f}")

Evaluating metrics...

Computing ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4']...
{'testlen': 19361, 'reflen': 15680, 'guess': [19361, 18361, 17361, 16361], 'correct': [8022, 214, 6, 0]}
ratio: 1.2347576530611457
BLEU-1: 0.4143
BLEU-2: 0.0695
BLEU-3: 0.0119
BLEU-4: 0.0000
Computing METEOR...
METEOR: 0.1640
Computing ROUGE-L...
ROUGE-L: 0.2467
Computing CIDEr...
CIDEr: 0.1170

=== Final Results ===
BLEU-1: 0.4143
BLEU-2: 0.0695
BLEU-3: 0.0119
BLEU-4: 0.0000
METEOR: 0.1640
ROUGE-L: 0.2467
CIDEr: 0.1170
