In [1]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m58.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.36.0-py3-none-any.whl (566 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━

In [2]:
# Mount Google Drive at /content/drive/ so the project folder under MyDrive
# (entered by the following %cd cell) is reachable from the Colab VM.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
# Make the project folder the working directory: every relative path used later
# (./Flicker8k_images, ./Flicker8k_captions, ./image-captioning-model, ...) resolves against it.
%cd /content/drive/MyDrive/DL_project/

/content/drive/MyDrive/DL_project


In [4]:
import os
import pickle

import torch
from torch.utils.data import Dataset, DataLoader

from PIL import Image

from tqdm import tqdm_notebook, tqdm

from torch.optim import AdamW
from transformers import ViTImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
from transformers.utils import logging

In [5]:
# --- Configuration ---

# Data locations, relative to the current working directory.
IMG_DIR = r'./Flicker8k_images'        # image files, looked up by filename
CAP_DIR = r'./Flicker8k_captions'      # pickled train/dev/test caption splits

# Pretrained checkpoints for the two halves of the captioning model.
ENCODER_ID = "google/vit-base-patch16-224-in21k"
DECODER_ID = "gpt2"
TOKENIZER_NAME = "gpt2"

# Model directories.
SAVED_MODEL_PATH = r'./saved-model'            # snapshot of the initial model
CHECKPOINT_DIR = r'./image-captioning-model'   # one sub-directory per saved epoch

# Optimisation hyper-parameters.
MAX_LEN = 48             # captions are padded / truncated to this many tokens
BATCH_SIZE = 32          # samples per DataLoader batch
EPOCHS = 5
LR = 1e-5
gradient_acc_steps = 8   # micro-batches accumulated per optimizer step

# Prefer the GPU whenever one is visible to torch.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [6]:
def _read_split(pickle_name):
    """Unpickle and return one caption split stored under CAP_DIR."""
    # NOTE(review): pickle.load executes arbitrary code embedded in the file;
    # only point this at pickles you produced yourself.
    with open(os.path.join(CAP_DIR, pickle_name), 'rb') as handle:
        return pickle.load(handle)


# Same three splits, loaded in the same order: train, dev, test.
train_data = _read_split('train_data.pickle')
dev_data = _read_split('dev_data.pickle')
test_data = _read_split('test_data.pickle')

In [8]:
class Flickr8kDataset(Dataset):
    """Image/caption pairs prepared for a vision-encoder / text-decoder model.

    Args:
        data: mapping of image filename -> sequence of captions for that image
            (captions are passed straight to ``tokenizer``; presumably strings).
        tokenizer: callable tokenizer exposing ``pad_token_id``; called with
            ``padding='max_length'``, ``truncation=True``, ``max_length`` and
            ``return_tensors='pt'``.
        img_processor: callable image processor whose result exposes ``pixel_values``.
        img_dir: directory that the filenames in ``data`` are relative to.
        max_len: token length every caption is padded / truncated to.
        test_data: when True keep only the first caption of each image;
            otherwise emit one sample per (image, caption) pair.
    """

    def __init__(self, data, tokenizer, img_processor, img_dir, max_len, test_data=False):
        self.tokenizer = tokenizer
        self.processor = img_processor
        self.img_dir = img_dir
        self.max_len = max_len
        self.data = []

        for filename, captions in data.items():
            if not captions:
                # An image without captions yields no sample (and must not crash
                # the first-caption lookup below).
                continue
            if test_data:
                self.data.append((filename, captions[0]))
            else:
                # Use all captions for better training
                self.data.extend((filename, cap) for cap in captions)

    def __len__(self):
        """Number of (filename, caption) samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return one sample as a dict.

        Keys: ``pixel_values`` (processor output with the leading batch axis
        squeezed away), ``labels`` (caption token ids with padding positions
        replaced by -100), ``attention_mask`` (tokenizer mask, batch axis
        squeezed away) and ``filename``.
        """
        filename, caption = self.data[idx]

        # --- Image Processing ---
        img_path = os.path.join(self.img_dir, filename)
        img = Image.open(img_path).convert("RGB")
        pixel_values = self.processor(img, return_tensors='pt').pixel_values.squeeze(0)

        # --- Caption Processing ---
        # NOTE(review): no end-of-sequence token is appended here; confirm the
        # captions or the tokenizer already provide one if the model should learn to stop.
        tokenized_output = self.tokenizer(
            caption,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        tokens = tokenized_output.input_ids.squeeze(0)
        attention_mask = tokenized_output.attention_mask.squeeze(0)

        # Mask padding out of the loss. Use the tokenizer this dataset was built
        # with, not a notebook-level global, so the class works on a fresh kernel
        # and with any tokenizer instance.
        labels = tokens.clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {'pixel_values': pixel_values, 'labels': labels, 'attention_mask': attention_mask, 'filename': filename}

In [9]:
# Image preprocessor matching the encoder checkpoint, and the caption tokenizer.
img_processor = ViTImageProcessor.from_pretrained(ENCODER_ID)
cap_tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
# Register dedicated padding and begin-of-sequence tokens on the tokenizer.
cap_tokenizer.add_special_tokens({'pad_token': '<PAD>', 'bos_token': '<BOS>'})


def _make_split(captions_by_file, first_caption_only, shuffle):
    """Build the dataset and its DataLoader for one split."""
    dataset = Flickr8kDataset(
        captions_by_file,
        cap_tokenizer,
        img_processor,
        IMG_DIR,
        MAX_LEN,
        test_data=first_caption_only,
    )
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=shuffle)
    return dataset, loader


# Train and dev keep every caption; test keeps only the first caption per image.
# Only the training loader shuffles.
train_dataset, train_loader = _make_split(train_data, first_caption_only=False, shuffle=True)
test_dataset, test_loader = _make_split(test_data, first_caption_only=True, shuffle=False)
dev_dataset, dev_loader = _make_split(dev_data, first_caption_only=False, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Build a fresh ViT-encoder / GPT-2-decoder captioning model.
#
# Dropout has to be part of the configs *before* the modules are instantiated:
# assigning to `model.encoder.config` / `model.decoder.config` afterwards does not
# touch the Dropout layers that already exist. The `encoder_` / `decoder_` prefixed
# keyword arguments are forwarded to the respective sub-model configuration.
# GPT-2 names its dropout fields `resid_pdrop`, `embd_pdrop` and `attn_pdrop`
# (not `hidden_dropout_prob` / `attention_probs_dropout_prob`, which are the ViT names).
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    ENCODER_ID,
    DECODER_ID,
    encoder_hidden_dropout_prob=0.2,
    encoder_attention_probs_dropout_prob=0.2,
    decoder_resid_pdrop=0.2,
    decoder_embd_pdrop=0.2,
    decoder_attn_pdrop=0.2,
)

# Special-token ids used for training targets and generation.
model.config.pad_token_id = cap_tokenizer.pad_token_id
model.config.decoder_start_token_id = cap_tokenizer.bos_token_id
model.config.eos_token_id = cap_tokenizer.eos_token_id

# The tokenizer gained new special tokens, so the decoder embedding matrix must grow with it.
model.decoder.resize_token_embeddings(len(cap_tokenizer))

# Assign the result so the cell does not dump the full module repr as its output.
model = model.to(DEVICE)
# model.save_pretrained(SAVED_MODEL_PATH)  # optional: snapshot the untrained model

In [10]:
# Resume from the checkpoint written after epoch 12 (replaces any `model` already in memory).
resume_dir = os.path.join(CHECKPOINT_DIR, 'epoch_12')
model = VisionEncoderDecoderModel.from_pretrained(resume_dir)
model.to(DEVICE)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.2, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (inte

In [11]:
def evaluate(model, loader, tokenizer, device, max_len):
    """Compute the mean loss over ``loader`` and print a few sampled captions.

    Args:
        model: captioning model; called with ``pixel_values``, ``labels`` and
            ``decoder_attention_mask`` and expected to expose ``generate``.
        loader: sized iterable of batches (dicts with ``pixel_values``,
            ``labels``, ``attention_mask`` and ``filename``).
        tokenizer: tokenizer supplying the special-token ids for generation
            and ``batch_decode`` for the generated ids.
        device: device the batch tensors are moved to.
        max_len: currently unused; kept so existing callers keep working.

    Returns:
        Average per-batch loss as a float, or NaN when ``loader`` is empty.
        BLEU is not computed.
    """
    model.eval()
    total_loss = 0.0
    generated_captions = {}

    with torch.no_grad():
        for i, batch in enumerate(tqdm(loader, desc="Evaluating")):
            pixel_values = batch['pixel_values'].to(device)
            labels = batch['labels'].to(device)
            decoder_attention_mask = batch['attention_mask'].to(device)

            outputs = model(
                pixel_values=pixel_values,
                labels=labels,
                decoder_attention_mask=decoder_attention_mask
            )
            total_loss += outputs.loss.item()

            # Captions are only generated for the first batch; they are a
            # qualitative sanity check, not a metric.
            if i == 0:
                generated_ids = model.generate(
                    pixel_values=pixel_values,
                    max_new_tokens=20,
                    # Take the special ids from the tokenizer that was passed in,
                    # not from a notebook-level global.
                    decoder_start_token_id=tokenizer.bos_token_id,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    # Nucleus sampling. `early_stopping` is intentionally not
                    # passed: it only applies to beam search.
                    do_sample=True,
                    top_p=0.9,
                    temperature=0.7,
                    repetition_penalty=3.0,
                    min_length=5,
                )

                preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
                filenames = batch.get('filename') or []
                for filename, pred in zip(filenames, preds):
                    # Keep the first caption produced for each image.
                    generated_captions.setdefault(filename, pred)

    num_batches = len(loader)
    # An empty loader has no meaningful loss; NaN cannot be mistaken for a perfect score.
    avg_loss = total_loss / num_batches if num_batches else float("nan")

    # Print up to 10 example captions
    print("\n--- Example Generated Captions ---")
    for filename, caption in list(generated_captions.items())[:10]:
        print(f"Image: {filename}\nCaption: {caption}\n")
    print("--------------------------------\n")

    return avg_loss


In [12]:
# --- Training & Evaluation Loop ---
# NOTE(review): the config cell sets EPOCHS = 5, but this loop has always run 25 epochs.
# The real count is kept here and reported in the progress message; reconcile the two.
num_epochs = 25
# Set to the number of epochs already trained when resuming from a checkpoint, so the
# epoch numbering, the LR schedule and the checkpoint directories continue instead of
# overwriting epoch_1, epoch_2, ...
start_epoch = 0
lr_drop_epoch = 9  # epochs with index < 9 use LR, later epochs use LR / 2

# Create the optimizer once: rebuilding AdamW every epoch would throw away its
# running moment estimates. Only the learning rate is updated per epoch.
optimizer = AdamW(model.parameters(), lr=LR)
n_batches = len(train_loader)

print(f"Starting training on {DEVICE}...")
for epoch in range(start_epoch, num_epochs):
    lr = LR if epoch < lr_drop_epoch else LR / 2
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    model.train()  # evaluate() leaves the model in eval mode
    total_loss = 0.0

    print(f"Epoch {epoch+1}/{num_epochs}")
    optimizer.zero_grad()
    for batch_idx, batch in enumerate(tqdm(train_loader, desc="Training Batches")):
        pixel_values = batch['pixel_values'].to(DEVICE)
        attn_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        outputs = model(pixel_values=pixel_values, labels=labels, decoder_attention_mask=attn_mask)
        loss = outputs.loss
        total_loss += loss.item()

        # Average over the number of micro-batches actually in this accumulation
        # group, so the final (possibly shorter) group is not under-weighted.
        group_start = batch_idx - batch_idx % gradient_acc_steps
        group_size = min(gradient_acc_steps, n_batches - group_start)
        (loss / group_size).backward()

        # Step at the end of every full group and after the last batch of the epoch.
        is_group_end = (batch_idx + 1) % gradient_acc_steps == 0
        is_last_batch = (batch_idx + 1) == n_batches
        if is_group_end or is_last_batch:
            optimizer.step()
            optimizer.zero_grad()

    avg_train_loss = total_loss / max(n_batches, 1)
    print(f"Train Loss: {avg_train_loss:.4f}")

    # --- Save Model ---
    epoch_output_dir = os.path.join(CHECKPOINT_DIR, f"epoch_{epoch+1}")
    os.makedirs(epoch_output_dir, exist_ok=True)
    model.save_pretrained(epoch_output_dir)
    cap_tokenizer.save_pretrained(epoch_output_dir)
    print(f"Model saved to {epoch_output_dir}")

    # --- Evaluation ---
    avg_dev_loss = evaluate(
        model=model,
        loader=dev_loader,
        tokenizer=cap_tokenizer,
        device=DEVICE,
        max_len=MAX_LEN
    )
    print(f"Dev Loss: {avg_dev_loss:.4f}")

print("Training finished!")

Starting training on cuda...
Epoch 1/5


Training Batches:   0%|          | 0/938 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
Training Batches:   0%|          | 2/938 [01:14<9:44:09, 37.45s/it]


KeyboardInterrupt: 