In [None]:
!pip install datasets transformers evaluate rouge_score

In [12]:
import os
from transformers import BlipProcessor, BlipForConditionalGeneration
from datasets import load_dataset, DatasetDict
import torch
from PIL import Image
from tqdm import tqdm
import evaluate

In [5]:
# Hugging Face Hub id of the image/caption dataset used throughout the notebook.
DATASET_ID = "tomytjandra/h-and-m-fashion-caption-12k"
ds = load_dataset(DATASET_ID)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/323 [00:00<?, ?B/s]

train-00000-of-00011.parquet:   0%|          | 0.00/478M [00:00<?, ?B/s]

train-00001-of-00011.parquet:   0%|          | 0.00/465M [00:00<?, ?B/s]

train-00002-of-00011.parquet:   0%|          | 0.00/418M [00:00<?, ?B/s]

train-00003-of-00011.parquet:   0%|          | 0.00/335M [00:00<?, ?B/s]

train-00004-of-00011.parquet:   0%|          | 0.00/331M [00:00<?, ?B/s]

train-00005-of-00011.parquet:   0%|          | 0.00/321M [00:00<?, ?B/s]

train-00006-of-00011.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

train-00007-of-00011.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

train-00008-of-00011.parquet:   0%|          | 0.00/319M [00:00<?, ?B/s]

train-00009-of-00011.parquet:   0%|          | 0.00/304M [00:00<?, ?B/s]

train-00010-of-00011.parquet:   0%|          | 0.00/297M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12437 [00:00<?, ? examples/s]

In [14]:
# -------------------------------
# Configuration Parameters
# -------------------------------
# Run on CUDA whenever torch reports a usable GPU.
USE_GPU = torch.cuda.is_available()
# Number of images captioned per batch; lower it if GPU memory runs out.
BATCH_SIZE = 64
# Share of the data held out for testing, expressed in percent (not a fraction).
TEST_SPLIT_PERCENTAGE = 16.6
# Destination file for the generated captions.
OUTPUT_CSV = "captioned_dataset.csv"

In [8]:
# Hold out TEST_SPLIT_PERCENTAGE percent of the original 'train' split;
# the fixed seed keeps the partition identical across runs.
split_dataset = ds['train'].train_test_split(
    test_size=TEST_SPLIT_PERCENTAGE / 100,
    seed=42,
)
print("\nSplit Dataset Structure:", split_dataset, sep="\n")

# Expose the two partitions under their own names for the cells below.
train_dataset, test_dataset = split_dataset['train'], split_dataset['test']


Split Dataset Structure:
DatasetDict({
    train: Dataset({
        features: ['text', 'image'],
        num_rows: 10372
    })
    test: Dataset({
        features: ['text', 'image'],
        num_rows: 2065
    })
})


In [None]:
# One checkpoint id shared by the processor and the model so they always match.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

# Target device: CUDA when USE_GPU is set, CPU otherwise.
device = torch.device("cuda" if USE_GPU else "cpu")

processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
# Load the captioning model and place it on the chosen device in one step.
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT).to(device)
model.eval()  # Switch the model to evaluation mode

In [10]:
# -------------------------------
# Define Caption Generation Function
# -------------------------------
def generate_captions(batch, max_new_tokens=None):
    """Caption a batch of images with the notebook-level BLIP processor/model.

    Written as a callback for ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping with an ``'image'`` key holding the images of one
            batch (presumably PIL images as decoded by ``datasets`` --
            confirm if the dataset schema changes).
        max_new_tokens: Optional cap on the number of tokens generated per
            caption. ``None`` (default) passes nothing extra to
            ``model.generate``, i.e. the model's own generation settings
            apply, exactly as before this parameter existed. Consider
            raising it if captions look truncated relative to the
            references.

    Returns:
        dict with the single key ``'generated_caption'`` mapped to the list
        of decoded caption strings for the batch.
    """
    images = list(batch['image'])

    # Empty batch: return early instead of handing an empty list to the
    # processor and model.
    if not images:
        return {'generated_caption': []}

    # Preprocess the images into tensors and move them to the model's device.
    inputs = processor(images=images, return_tensors="pt", padding=True).to(device)

    # Only forward the length cap when the caller asked for one.
    generation_kwargs = {}
    if max_new_tokens is not None:
        generation_kwargs['max_new_tokens'] = max_new_tokens

    # Inference only: no gradients are needed while generating.
    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Turn generated token ids back into plain text, dropping special tokens.
    captions = processor.batch_decode(outputs, skip_special_tokens=True)

    return {'generated_caption': captions}

In [15]:
# -------------------------------
# Apply Caption Generation to Dataset
# -------------------------------
# Batched mapping: generate_captions receives BATCH_SIZE examples per call.
# The 'image' column is kept on captioned_dataset so it stays available to
# any later cell that wants to inspect images next to their captions.
captioned_dataset = test_dataset.map(
    generate_captions,
    batched=True,
    batch_size=BATCH_SIZE,
    desc="Generating captions"
)

# -------------------------------
# Save the Results
# -------------------------------
# Convert to pandas DataFrame for easier handling
df = captioned_dataset.to_pandas()

# Write only the textual columns. The 'image' column carries image data, not
# text, and serialising it would bloat the CSV without being usable from it.
# errors="ignore" keeps this working if the column was already removed.
df.drop(columns=["image"], errors="ignore").to_csv(OUTPUT_CSV, index=False)
print(f"Captioned dataset saved to {OUTPUT_CSV}")

Generating captions:   0%|          | 0/2065 [00:00<?, ? examples/s]

Captioned dataset saved to captioned_dataset.csv


In [18]:
# Load the three caption-quality metrics, in the order they are reported below.
bleu, meteor, rouge = (
    evaluate.load(metric_id) for metric_id in ("bleu", "meteor", "rouge")
)
# CIDEr is intentionally not loaded: it is better suited to datasets with
# more reference captions per image.

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [19]:
# Captions produced by the model, one per row of df.
predictions = df['generated_caption'].tolist()
# Ground-truth captions; each is wrapped in its own single-element list so
# every prediction is paired with a list of references.
references = [[truth] for truth in df['text'].tolist()]

In [21]:
# Every metric below is scored on the same predictions/references pairing.

# BLEU (the result dict also carries the brevity penalty and length ratio,
# worth checking when generated captions are much shorter than the references)
bleu_result = bleu.compute(references=references, predictions=predictions)
print("\nBLEU Score:", bleu_result)

# METEOR
meteor_result = meteor.compute(references=references, predictions=predictions)
print("METEOR Score:", meteor_result)

# ROUGE (rouge1 / rouge2 / rougeL / rougeLsum)
rouge_result = rouge.compute(references=references, predictions=predictions)
print("ROUGE Scores:", rouge_result)


BLEU Score: {'bleu': 0.0017250019485011887, 'precisions': [0.4454292974058973, 0.06945753129627137, 0.007504103806769327, 0.0006522549385016772], 'brevity_penalty': 0.08744520640499305, 'length_ratio': 0.29097317744154055, 'translation_length': 16923, 'reference_length': 58160}
METEOR Score: {'meteor': 0.09863973024007634}
ROUGE Scores: {'rouge1': 0.20632010298168338, 'rouge2': 0.03242772689635774, 'rougeL': 0.1517829651137922, 'rougeLsum': 0.15167120336601517}
