In [None]:
import requests
import torch
from PIL import Image
from transformers import *
from tqdm import tqdm
# set device to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:

encoder_model = "google/vit-base-patch16-224"

decoder_model = "gpt2"
# load the model
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_model, decoder_model
).to(device)

In [None]:

tokenizer = GPT2TokenizerFast.from_pretrained(decoder_model)

image_processor = ViTImageProcessor.from_pretrained(encoder_model)

In [None]:
if "gpt2" in decoder_model:
  # gpt2 does not have decoder_start_token_id and pad_token_id
  # but has bos_token_id and eos_token_id
  tokenizer.pad_token = tokenizer.eos_token # pad_token_id as eos_token_id
  model.config.eos_token_id = tokenizer.eos_token_id
  model.config.pad_token_id = tokenizer.pad_token_id
  # set decoder_start_token_id as bos_token_id
  model.config.decoder_start_token_id = tokenizer.bos_token_id
else:
  # set the decoder start token id to the CLS token id of the tokenizer
  model.config.decoder_start_token_id = tokenizer.cls_token_id
  # set the pad token id to the pad token id of the tokenizer
  model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
!pip install datasets

In [None]:
import datasets

In [None]:
from datasets import load_dataset
trust_remote_code=True
max_length = 32 # max length of the captions in tokens
coco_dataset_ratio = 2
train_ds = load_dataset("HuggingFaceM4/COCO", split=f"train[:{coco_dataset_ratio}%]")
valid_ds = load_dataset("HuggingFaceM4/COCO", split=f"validation[:{coco_dataset_ratio}%]")
test_ds = load_dataset("HuggingFaceM4/COCO", split="test")
len(train_ds), len(valid_ds), len(test_ds)

(11335, 500, 25010)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np

# remove the images with less than 3 dimensions (possibly grayscale images)
train_ds = train_ds.filter(lambda item: np.array(item["image"]).ndim in [3, 4], num_proc=2)
valid_ds = valid_ds.filter(lambda item: np.array(item["image"]).ndim in [3, 4], num_proc=2)
test_ds = test_ds.filter(lambda item: np.array(item["image"]).ndim in [3, 4], num_proc=2)


In [None]:
  def preprocess(items):
    # preprocess the image
    pixel_values = image_processor(items["image"], return_tensors="pt").pixel_values.to(device)
    # tokenize the caption with truncation and padding
    targets = tokenizer(
        [sentence["raw"] for sentence in items["sentences"]],
        max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
    ).to(device)
    return {'pixel_values': pixel_values, 'labels': targets["input_ids"]}
# using with_transform to preprocess the dataset during training
train_dataset = train_ds.with_transform(preprocess)
valid_dataset = valid_ds.with_transform(preprocess)
test_dataset = test_ds.with_transform(preprocess)


In [None]:
# a function we'll use to collate the batches
def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.stack([x['labels'] for x in batch])
    }

In [None]:
! pip install evaluate

In [None]:
!pip install rouge_score

In [None]:
import evaluate

# load the rouge and bleu metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

def compute_metrics(eval_pred):
  preds = eval_pred.label_ids
  labels = eval_pred.predictions
  # decode the predictions and labels
  pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True)
  labels_str = tokenizer.batch_decode(labels, skip_special_tokens=True)
  # compute the rouge score
  rouge_result = rouge.compute(predictions=pred_str, references=labels_str)
  # multiply by 100 to get the same scale as the rouge score
  rouge_result = {k: round(v * 100, 4) for k, v in rouge_result.items()}
  # compute the bleu score
  bleu_result = bleu.compute(predictions=pred_str, references=labels_str)
  # get the length of the generated captions
  generation_length = bleu_result["translation_length"]
  return {
        **rouge_result,
        "bleu": round(bleu_result["bleu"] * 100, 4),
        "gen_len": bleu_result["translation_length"] / len(preds)
  }

In [None]:
#! pip install transformers torch


In [None]:
num_epochs = 0.5 # number of epochs
batch_size = 16
gradient_accumulation_steps = 4


In [None]:

for item in train_dataset:
  print(item["labels"].shape)
  print(item["pixel_values"].shape)
  break

torch.Size([32])
torch.Size([3, 224, 224])


In [None]:
#!pip install accelerate -U
#!pip install transformers[torch]
#!pip install rouge_score
#!pip install evaluate
#!pip install datasets
#!pip install tqdm


In [None]:
#from transformers import Seq2SeqTrainingArguments

In [None]:
training_args = Seq2SeqTrainingArguments(
    gradient_accumulation_steps=gradient_accumulation_steps,
    fp16=True,
    predict_with_generate=True,             # use generate to calculate the loss
    num_train_epochs=num_epochs,            # number of epochs
    evaluation_strategy="steps",            # evaluate after each eval_steps
    eval_steps=5000,                        # evaluate after each 2000 steps
    logging_steps=5000,                     # log after each 2000 steps
    save_steps=5000,                        # save after each 2000 steps
    per_device_train_batch_size=batch_size, # batch size for training
    per_device_eval_batch_size=batch_size,  # batch size for evaluation
    output_dir="vit-swin-base-224-gpt2-image-captioning", # output directory

)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=image_processor,       # we use the image processor as the tokenizer
    args=training_args,              # pass the training arguments
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=collate_fn,
)


Using auto half precision backend


In [None]:
from torch.utils.data import DataLoader

def get_eval_loader(eval_dataset=None):
  return DataLoader(valid_dataset, collate_fn=collate_fn, batch_size=batch_size)

def get_test_loader(eval_dataset=None):
  return DataLoader(test_dataset, collate_fn=collate_fn, batch_size=batch_size)

# override the get_train_dataloader, get_eval_dataloader and
# get_test_dataloader methods of the trainer
# so that we can properly load the data
trainer.get_train_dataloader = lambda: DataLoader(train_dataset, collate_fn=collate_fn, batch_size=batch_size)
trainer.get_eval_dataloader = get_eval_loader
trainer.get_test_dataloader = get_test_loader

In [None]:
trainer.train()

***** Running training *****
  Num examples = 11,330
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 4
  Total optimization steps = 89
  Number of trainable parameters = 240,337,080


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=89, training_loss=1.541468759601036, metrics={'train_runtime': 16763.4388, 'train_samples_per_second': 0.338, 'train_steps_per_second': 0.005, 'total_flos': 1.0337930409054044e+18, 'train_loss': 1.541468759601036, 'epoch': 0.5021156558533145})

In [None]:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Configuration saved in /content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning/config.json
Configuration saved in /content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning/generation_config.json
Model weights saved in /content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning/model.safetensors
tokenizer config file saved in /content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning/special_tokens_map.json


('/content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning/tokenizer_config.json',
 '/content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning/special_tokens_map.json',
 '/content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning/vocab.json',
 '/content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning/merges.txt',
 '/content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning/added_tokens.json',
 '/content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning/tokenizer.json')

In [None]:
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 500
  Batch size = 16
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

{'eval_loss': 1.1900444030761719,
 'eval_rouge1': 20.6773,
 'eval_rouge2': 2.8465,
 'eval_rougeL': 19.2114,
 'eval_rougeLsum': 19.2055,
 'eval_bleu': 0.9006,
 'eval_gen_len': 11.192,
 'eval_runtime': 1297.6699,
 'eval_samples_per_second': 0.385,
 'eval_steps_per_second': 0.025,
 'epoch': 0.5021156558533145}

In [None]:
image_captioner = pipeline("image-to-text", model="Abdou/vit-swin-base-224-gpt2-image-captioning")
image_captioner.model = image_captioner.model.to(device)

In [None]:
import gc
from torch.utils.data import DataLoader, SequentialSampler

def get_evaluation_metrics(model, dataset, batch_size=16, subset_ratio=0.1):
    model.eval()
    # Use only a subset of the dataset for evaluation
    subset_size = int(len(dataset) * subset_ratio)
    dataset_subset = dataset.select(range(subset_size))

    # Define our dataloader
    dataloader = DataLoader(dataset_subset, collate_fn=collate_fn, batch_size=batch_size)

    # Number of testing steps
    n_test_steps = len(dataloader)

    # Initialize our lists that store the predictions and the labels
    predictions, labels = [], []

    # Initialize the test loss
    test_loss = 0.0

    for batch in tqdm(dataloader, "Evaluating"):
        # Get the batch
        pixel_values = batch["pixel_values"].to(device)
        label_ids = batch["labels"].to(device)

        # Forward pass
        with torch.no_grad():
            outputs = model(pixel_values=pixel_values, labels=label_ids)

        # Get the loss
        loss = outputs.loss
        test_loss += loss.item()

        # Free the GPU memory
        logits = outputs.logits.detach().cpu()

        # Add the predictions to the list
        predictions.extend(logits.argmax(dim=-1).tolist())

        # Add the labels to the list
        labels.extend(label_ids.cpu().tolist())

        # Clear cache to free up memory
        torch.cuda.empty_cache()
        gc.collect()

    # Make the EvalPrediction object that the compute_metrics function expects
    eval_prediction = EvalPrediction(predictions=predictions, label_ids=labels)

    # Compute the metrics
    metrics = compute_metrics(eval_prediction)

    # Add the test_loss to the metrics
    metrics["test_loss"] = test_loss / n_test_steps

    return metrics

# Adjust the batch size and subset ratio as needed
metrics = get_evaluation_metrics(image_captioner.model, test_dataset, batch_size=4, subset_ratio=0.1)


Evaluating: 100%|██████████| 623/623 [52:25<00:00,  5.05s/it]


In [None]:
metrics

{'rouge1': 52.7259,
 'rouge2': 23.8143,
 'rougeL': 51.1198,
 'rougeLsum': 51.0873,
 'bleu': 17.1926,
 'gen_len': 11.199036918138042,
 'test_loss': 0.801543534854442}

In [None]:
from transformers import VisionEncoderDecoderModel, GPT2TokenizerFast

save_directory = '/content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning'

model = VisionEncoderDecoderModel.from_pretrained(save_directory).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(save_directory)


In [None]:
import torch
from PIL import Image
import requests
from transformers import VisionEncoderDecoderModel, GPT2TokenizerFast, ViTImageProcessor

# Load the model and tokenizer

image_processor = ViTImageProcessor.from_pretrained("microsoft/swin-base-patch4-window7-224-in22k")

# Function to generate caption
def generate_caption(image_url):
    # Load and preprocess the image
    image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")
    pixel_values = image_processor(images=image, return_tensors="pt").pixel_values.to(device)

    # Generate the caption
    output_ids = model.generate(pixel_values, max_length=32, num_beams=4)
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return caption

# Test the model with the example image
image_url = "https://upload.wikimedia.org/wikipedia/commons/3/3a/Cat03.jpg"
caption = generate_caption(image_url)
print("Generated Caption:", caption)


loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--microsoft--swin-base-patch4-window7-224-in22k/snapshots/68dc76680a5bf3bdf670669f3025dc9be2e30781/preprocessor_config.json
size should be a dictionary on of the following set of keys: ({'height', 'width'}, {'shortest_edge'}, {'shortest_edge', 'longest_edge'}, {'longest_edge'}), got 224. Converted to {'height': 224, 'width': 224}.
Image processor ViTImageProcessor {
  "_valid_processor_keys": [
    "images",
    "do_resize",
    "size",
    "resample",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "return_tensors",
    "data_format",
    "input_data_format"
  ],
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "ViTImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 3,
  "rescale_factor": 0.003921568627450

Generated Caption: A man standing in front of a mirror.


In [None]:
!pip install huggingface_hub




In [None]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import login

login(token="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")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from transformers import VisionEncoderDecoderModel, GPT2TokenizerFast
from huggingface_hub import HfApi

save_directory = '/content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning'
repo_name = "Rahuljat27/Image-Caption-Generator"
# Save the model and tokenizer locally
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# Push the model to the Hugging Face Hub
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)


Configuration saved in /content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning/config.json
Configuration saved in /content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning/generation_config.json
Model weights saved in /content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning/model.safetensors
tokenizer config file saved in /content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/vit-swin-base-224-gpt2-image-captioning/special_tokens_map.json
Configuration saved in /tmp/tmpkl_i1721/config.json
Configuration saved in /tmp/tmpkl_i1721/generation_config.json
Model weights saved in /tmp/tmpkl_i1721/model.safetensors
Uploading the following files to Rahuljat27/Image-Caption-Generator: README.md,model.safetensors,config.json,generation_config.json


model.safetensors:   0%|          | 0.00/962M [00:00<?, ?B/s]

tokenizer config file saved in /tmp/tmpwvisso7z/tokenizer_config.json
Special tokens file saved in /tmp/tmpwvisso7z/special_tokens_map.json
Uploading the following files to Rahuljat27/Image-Caption-Generator: special_tokens_map.json,README.md,vocab.json,merges.txt,tokenizer_config.json,tokenizer.json


CommitInfo(commit_url='https://huggingface.co/Rahuljat27/Image-Caption-Generator/commit/dc774e602a0c54311fd5845127848542287394d7', commit_message='Upload tokenizer', commit_description='', oid='dc774e602a0c54311fd5845127848542287394d7', pr_url=None, pr_revision=None, pr_num=None)