In [1]:
import os


import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path
from tqdm.auto import tqdm
import multiprocessing as mp
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import io, transforms
from torch.utils.data import Dataset, DataLoader, random_split
import transformers

from transformers import Seq2SeqTrainer ,Seq2SeqTrainingArguments
from transformers import VisionEncoderDecoderModel , ViTFeatureExtractor
from transformers import AutoTokenizer ,  GPT2Config , default_data_collator


if torch.cuda.is_available():    

    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


  from .autonotebook import tqdm as notebook_tqdm


There are 1 GPU(s) available.
We will use the GPU: NVIDIA RTX A4500


## Custom Dataset creation

In [2]:
import torch
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset


class Image_Caption_Dataset(Dataset):
    def __init__(
        self, root_dir, df_image,df_caption, feature_extractor, tokenizer, max_target_length=512
    ):
        self.root_dir = root_dir
        self.df_image = df_image
        self.df_caption = df_caption
        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        self.max_length = max_target_length

    def __len__(self):
        return self.df_caption.shape[0]

    def __getitem__(self, idx):
        # return image
        image_path = self.df_image[idx]
        text = self.df_caption[idx]
        # prepare image
        image = Image.open(self.root_dir + "/" + image_path).convert("RGB")
        pixel_values = self.feature_extractor(image, return_tensors="pt").pixel_values
        # add captions by encoding the input
        captions = self.tokenizer(
            text, padding="max_length", max_length=self.max_length
        ).input_ids
        captions = [
            caption if caption != self.tokenizer.pad_token_id else -100
            for caption in captions
        ]
        encoding = {
            "pixel_values": pixel_values.squeeze(),
            "labels": torch.tensor(captions),
        }
        return encoding




In [3]:
df_train = pd.read_csv("/home/rajib/dl_project/custom_captions_dataset/train.csv")
df_test = pd.read_csv("/home/rajib/dl_project/custom_captions_dataset/test.csv")
df_val = pd.read_csv("/home/rajib/dl_project/custom_captions_dataset/val.csv")

train_df_image = df_train['filename']
train_df_caption = df_train['caption']

test_df_image = df_test['filename']
test_df_caption = df_test['caption']

val_df_image = df_val['filename']
val_df_caption = df_val['caption']

## Feature Extractor and Tokenizer

In [4]:
import pandas as pd
from transformers import (AutoTokenizer, Seq2SeqTrainer,
                          Seq2SeqTrainingArguments, VisionEncoderDecoderModel,
                          ViTFeatureExtractor, default_data_collator)



encoder_checkpoint = "google/vit-base-patch16-224"
decoder_checkpoint = "gpt2"
output_dir = "./image_captioning_checkpoint"
# load feature extractor and tokenizer
feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
tokenizer.pad_token = tokenizer.eos_token





## Dataset object creation

### Train dataset

In [5]:
root_dir_train = "/home/rajib/dl_project/custom_captions_dataset/train"
train_df_image = train_df_image
train_df_caption = train_df_caption
trian_dataset = Image_Caption_Dataset(root_dir_train, train_df_image, train_df_caption,feature_extractor,tokenizer)

In [6]:
len(trian_dataset)

5715

### Val dataset

In [7]:
root_dir_val = "/home/rajib/dl_project/custom_captions_dataset/val"
val_df_image = val_df_image
val_df_caption = val_df_caption
val_dataset = Image_Caption_Dataset(root_dir_val, val_df_image, val_df_caption,feature_extractor,tokenizer)

## Training 

In [8]:

# initialize a vit-bert from a pretrained ViT and a pretrained GPT2 model
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_checkpoint, decoder_checkpoint
)
# set special tokens used for creating the decoder_input_ids from the labels
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size

# set beam search parameters
model.config.eos_token_id = tokenizer.sep_token_id
model.config.max_length = 512
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 5
model.decoder.resize_token_embeddings(len(tokenizer))

# freeze the encoder
for param in model.encoder.parameters():
    param.requires_grad = False


training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=12,
    num_train_epochs=10,
    per_device_eval_batch_size=12,
    overwrite_output_dir=True,
    fp16=True,
    run_name="first_run",
    load_best_model_at_end=True,
    output_dir=output_dir,
    logging_steps=2000,
    save_steps=2000,
    eval_steps=2000,
)


if __name__ == "__main__":
    # instantiate trainer
    trainer = Seq2SeqTrainer(
        model=model,
        tokenizer=feature_extractor,
        args=training_args,
        train_dataset=trian_dataset,
        eval_dataset=val_dataset,
        data_collator=default_data_collator,
    )
    trainer.train()

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.crossattention.c_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_proj.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.ln_cross_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_proj.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.ln_cross_attn.bias', 'h.1.ln_cross_attn.weight', 'h.10.crossattention.c_attn.bias', 'h.10.crossattention.c_attn.

Step,Training Loss,Validation Loss
2000,2.2315,2.360497
4000,1.7529,2.475226


Non-default generation parameters: {'max_length': 512, 'early_stopping': True, 'num_beams': 5, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 512, 'early_stopping': True, 'num_beams': 5, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
There were missing keys in the checkpoint model loaded: ['decoder.lm_head.weight'].


In [9]:

import torch
from PIL import Image
from transformers import (AutoTokenizer, VisionEncoderDecoderModel,
                          ViTFeatureExtractor)

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


encoder_checkpoint = "google/vit-base-patch16-224"
decoder_checkpoint = "gpt2"
model_checkpoint = "/home/rajib/dl_project/image_captioning_checkpoint/checkpoint-2000"
feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)






In [10]:
test_images = test_df_image.tolist()
test_image_path = "/home/rajib/dl_project/custom_captions_dataset/test"
test_images = [os.path.join(test_image_path, image) for image in test_images]
test_caption = test_df_caption.tolist()
test_images.pop(388)
test_images.pop(486-1)
test_images.pop(924-2)
test_caption.pop(388)
test_caption.pop(486-1)
test_caption.pop(924-2)

'This is a black and white photograph of a very old biplane taking off from a flat field. There is some sort of crop growing in the field, but it is indiscernable. The plane itself says "America" on its side. There is a star emblem on its tail as well as underneath the visible wing. The plane is only about 20 feet off the ground. '

In [11]:
def predict(image):
    
    clean_text = lambda x: x.replace("<|endoftext|>", "").split("\n")[0]
    sample = feature_extractor(image, return_tensors="pt").pixel_values.to(device)
    caption_ids = model.generate(sample, max_length=50)[0]
    caption_text = clean_text(tokenizer.decode(caption_ids))
    return caption_text

## Generate Captions for the test set

In [12]:

exception_list = []

generated_caption_list = list()

for i in tqdm(range(len(test_images))):
    x = Image.open(test_images[i])
    generated_caption_list.append(predict(x))
        

100%|██████████| 925/925 [05:00<00:00,  3.08it/s]


### ROUGE L

In [13]:
import evaluate
rouge = evaluate.load('rouge')

results = rouge.compute(predictions = generated_caption_list, references = test_caption)
print(results['rougeL'])

0.28700664117916097


### Converting corpus list to dictionary (metric input format)

In [14]:
idRef, idGen = 0, 0
reference_captions_dict = dict()
for x in test_caption:
    tempRef = [x]
    reference_captions_dict[idRef] = tempRef
    idRef +=1

generated_captions_dict = dict()
for x in generated_caption_list:
    tempGen = [x]
    generated_captions_dict[idGen] = tempGen
    idGen +=1

### CIDEr

In [15]:
from pycocoevalcap.cider.cider import Cider
cider = Cider()
scores, scores_per_image = cider.compute_score(reference_captions_dict, generated_captions_dict)
print("CIDEr scores:")
print(scores)

CIDEr scores:
0.09645651993133295


In [17]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

### SPICE

In [18]:
from pycocoevalcap.spice.spice import Spice
spice = Spice()
scores, detailed_scores = spice.compute_score(reference_captions_dict, generated_captions_dict)
print("SPICE scores:")
print(scores)

Parsing reference captions
Initiating Stanford parsing pipeline
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - TokenizerAnnotator: No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.2 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [0.5 sec].
Loading classifier from edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz ... done [0.2 sec].
Loading classif

SPICE evaluation took: 11.05 s
SPICE scores:
0.15214098576590698
