In [4]:
# [1] Load Flickr8k Dataset using kaggle.json (upload first)
import os
from zipfile import ZipFile

# Upload your kaggle.json file manually in Colab (Files > Upload)
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download Flickr8k from Kaggle
!kaggle datasets download -d adityajn105/flickr8k

# Unzip dataset
!unzip -q flickr8k.zip -d flickr8k_data

# Load image paths and captions

Dataset URL: https://www.kaggle.com/datasets/adityajn105/flickr8k
License(s): CC0-1.0
Downloading flickr8k.zip to /content
 97% 1.01G/1.04G [00:08<00:00, 406MB/s]
100% 1.04G/1.04G [00:08<00:00, 128MB/s]


In [5]:
import os, torch, random
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
from transformers import BlipProcessor, BlipForConditionalGeneration
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# [B] Read Flickr8k captions and sample 2000
captions_path = "flickr8k_data/captions.txt"
images_path = "flickr8k_data/Images"

df = pd.read_csv(captions_path)
df.columns = ['filename', 'caption']
df['filename'] = df['filename'].str.strip()

# Sample only 2000 images
df = df.groupby("filename").first().reset_index().sample(8000, random_state=42).reset_index(drop=True)


In [7]:
# [C] Torch Dataset Class with on-the-fly image loading
class Flickr8kDataset(Dataset):
    def __init__(self, df, processor, root_dir):
        self.df = df
        self.processor = processor
        self.root_dir = root_dir
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor()
        ])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.root_dir, row['filename'])
        caption = row['caption']

        image = Image.open(image_path).convert("RGB")
        image = self.transform(image)
        image = (image * 255).byte()
        image = transforms.ToPILImage()(image)

        pixel_values = self.processor(images=image, return_tensors="pt").pixel_values.squeeze(0)
        tokens = self.processor(text=caption, padding="max_length", truncation=True, max_length=32, return_tensors="pt")

        return {
            "pixel_values": pixel_values,
            "input_ids": tokens.input_ids.squeeze(0),
            "attention_mask": tokens.attention_mask.squeeze(0)
        }


In [8]:
# [D] Load processor and split dataset
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

train_df = df[:7000].reset_index(drop=True)
val_df = df[7000:].reset_index(drop=True)

train_dataset = Flickr8kDataset(train_df, processor, images_path)
val_dataset = Flickr8kDataset(val_df, processor, images_path)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [9]:
# [E] Load model and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [10]:
# [F] Training Loop (2–5 epochs)
from torch.nn.utils.rnn import pad_sequence

def train_one_epoch():
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        pixel_values = batch["pixel_values"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(pixel_values=pixel_values, input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
    return total_loss / len(train_loader)

# Train for 3 epochs (should complete in under 2 hours)
for epoch in range(5):
    print(f"Epoch {epoch+1}")
    loss = train_one_epoch()
    print(f"Train Loss: {loss:.4f}")

100%|██████████| 1750/1750 [19:32<00:00,  1.49it/s]


Train Loss: 0.3962
Epoch 5


100%|██████████| 1750/1750 [19:32<00:00,  1.49it/s]

Train Loss: 0.2714





In [11]:
# [G] Save final model
model.save_pretrained("blip_caption_model")
processor.save_pretrained("blip_caption_model")


[]

In [None]:
import os
from google.colab import files

# Zip the blip_finetuned folder
!zip -r blip_caption_model.zip blip_finetuned

# Download the zip file
files.download('blip_caption_model.zip')

  adding: blip_finetuned/ (stored 0%)
  adding: blip_finetuned/generation_config.json (deflated 28%)
  adding: blip_finetuned/model.safetensors (deflated 7%)
  adding: blip_finetuned/preprocessor_config.json (deflated 48%)
  adding: blip_finetuned/special_tokens_map.json (deflated 80%)
  adding: blip_finetuned/vocab.txt (deflated 53%)
  adding: blip_finetuned/tokenizer.json (deflated 71%)
  adding: blip_finetuned/config.json (deflated 67%)
  adding: blip_finetuned/tokenizer_config.json (deflated 74%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>