<a href="https://colab.research.google.com/github/sonalshreya/humor_in_AI/blob/main/TrainBlip2_and_Vit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
import torch
from transformers import (
    VisionEncoderDecoderModel,
    BlipForConditionalGeneration,
    BlipProcessor,
    AutoFeatureExtractor,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)
from datasets import load_dataset
from PIL import Image
import numpy as np

In [None]:
# Load the dataset
dataset = load_dataset("jmhessel/newyorker_caption_contest", 'explanation')

# Preprocess the dataset (first 200 rows)
def preprocess_dataset(examples):
    # Convert image to PIL Image
    images = examples['image']
    captions = examples['image_description']
    return {'images': images, 'captions': captions}

In [None]:
# Prepare dataset
dataset_train = dataset['train'].select(range(500))
processed_dataset = dataset_train.map(preprocess_dataset, remove_columns=dataset_train.column_names)

# BLIP-2 Fine-tuning

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False,num_items_in_batch=None):
        # Extract inputs
        pixel_values = inputs.get('pixel_values')
        input_ids = inputs.get('input_ids')
        attention_mask = inputs.get('attention_mask')

        # Forward pass
        outputs = model(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=input_ids  # Use input_ids as labels for language modeling loss
        )

        # The model returns a dictionary with loss
        loss = outputs.loss

        return (loss, outputs) if return_outputs else loss


In [None]:
def prepare_blip2_model():
    # Load pre-trained BLIP-2 model
    model_name = "Salesforce/blip-image-captioning-large"
    model = BlipForConditionalGeneration.from_pretrained(model_name)
    processor = BlipProcessor.from_pretrained(model_name)

    # Custom dataset class
    class CaptionDataset(torch.utils.data.Dataset):
        def __init__(self, dataset, processor):
            self.dataset = dataset
            self.processor = processor

        def __len__(self):
            return len(self.dataset)

        def __getitem__(self, idx):
            # Process each item individually
            image = self.dataset[idx]['image']
            caption = self.dataset[idx]['image_description']

            # Prepare inputs
            try:
                encoding = self.processor(
                    images=image,
                    text=caption,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=50
                )
            except Exception as e:
                print(f"Error processing item {idx}: {e}")
                # Return dummy data or skip
                return {
                    'pixel_values': torch.zeros(3, 224, 224),
                    'input_ids': torch.zeros(50, dtype=torch.long),
                    'attention_mask': torch.zeros(50, dtype=torch.long)
                }

            return {
                'pixel_values': encoding['pixel_values'].squeeze(),
                'input_ids': encoding['input_ids'].squeeze(),
                'attention_mask': encoding['attention_mask'].squeeze()
            }

    # Create dataset
    train_dataset = CaptionDataset(dataset_train, processor)

    # Prepare training arguments
    training_args = TrainingArguments(
        output_dir="./blip2_finetuned",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        report_to="none"  # Disable wandb
    )

    # Custom data collator with padding
    def data_collator(features):
        # Filter out any None or invalid features
        features = [f for f in features if f is not None and 'pixel_values' in f]

        if not features:
            raise ValueError("No valid features after filtering")

        pixel_values = torch.stack([f['pixel_values'] for f in features])
        input_ids = torch.nn.utils.rnn.pad_sequence(
            [f['input_ids'] for f in features],
            batch_first=True,
            padding_value=processor.tokenizer.pad_token_id
        )
        attention_mask = torch.nn.utils.rnn.pad_sequence(
            [f['attention_mask'] for f in features],
            batch_first=True,
            padding_value=0
        )

        return {
            'pixel_values': pixel_values,
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }

    # Prepare custom trainer
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator
    )

    # Fine-tune the model
    trainer.train()

    return model, processor

In [None]:
class CustomViTTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # Extract inputs
        pixel_values = inputs.get('pixel_values')
        labels = inputs.get('labels')

        # Forward pass
        outputs = model(
            pixel_values=pixel_values,
            labels=labels
        )

        # The model returns a dictionary with loss
        loss = outputs.loss

        return (loss, outputs) if return_outputs else loss


In [None]:
def prepare_vit_model():
    # Load pre-trained ViT model
    model_name = "nlpconnect/vit-gpt2-image-captioning"
    model = VisionEncoderDecoderModel.from_pretrained(model_name)
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Custom dataset class
    class CaptionDataset(torch.utils.data.Dataset):
        def __init__(self, dataset, feature_extractor, tokenizer):
            self.dataset = dataset
            self.feature_extractor = feature_extractor
            self.tokenizer = tokenizer

        def __len__(self):
            return len(self.dataset)

        def __getitem__(self, idx):
            # Process each item individually
            try:
                # Process image
                pixel_values = self.feature_extractor(
                    self.dataset[idx]['image'],
                    return_tensors="pt"
                ).pixel_values.squeeze()

                # Process caption
                caption = self.dataset[idx]['image_description']
                labels = self.tokenizer(
                    caption,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=50
                ).input_ids.squeeze()

                return {
                    'pixel_values': pixel_values,
                    'labels': labels
                }
            except Exception as e:
                print(f"Error processing item {idx}: {e}")
                # Return dummy data or skip
                return {
                    'pixel_values': torch.zeros(3, 224, 224),
                    'labels': torch.zeros(50, dtype=torch.long)
                }
    # Create dataset
    train_dataset = CaptionDataset(dataset_train, feature_extractor, tokenizer)

    # Prepare training arguments
    training_args = TrainingArguments(
        output_dir="./vit_finetuned",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        report_to="none"  # Disable wandb
    )

    # Custom data collator with padding
    def data_collator(features):
        # Filter out any None or invalid features
        features = [f for f in features if f is not None and 'pixel_values' in f]

        if not features:
            raise ValueError("No valid features after filtering")

        pixel_values = torch.stack([f['pixel_values'] for f in features])
        labels = torch.nn.utils.rnn.pad_sequence(
            [f['labels'] for f in features],
            batch_first=True,
            padding_value=tokenizer.pad_token_id
        )

        return {
            'pixel_values': pixel_values,
            'labels': labels
        }

    # Prepare custom trainer
    trainer = CustomViTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator
    )

    # Fine-tune the model
    trainer.train()

    return model, feature_extractor, tokenizer

In [None]:
# Fine-tune both models
blip2_model, blip2_processor = prepare_blip2_model()

config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Step,Training Loss
10,5.4374
20,5.4115
30,5.8494
40,5.1935
50,5.3082
60,5.6104
70,5.2048
80,4.9305
90,5.1055
100,4.5261


In [None]:
# Save the fine-tuned models
blip2_model.save_pretrained("./blip2_finetuned_model")
blip2_processor.save_pretrained("./blip2_finetuned_processor")

[]

In [None]:
eg_image = dataset[2]['image']
image_desc=dataset[2]['image_description']
#print(eg_image)
print(image_desc)

There are two men in suits standing at the door of an office both looking at a desk with a medieval sword stuck into the middle of the desk. One man is talking.


In [None]:
# Load the fine-tuned BLIP-2 model and processor
blip2_finetuned_model = BlipForConditionalGeneration.from_pretrained("./blip2_finetuned_model")
blip2_finetuned_processor = BlipProcessor.from_pretrained("./blip2_finetuned_processor")


In [None]:
# Preprocess Image
def preprocess_image(image):
    """Preprocess a PIL image for BLIP-2."""
    if image.mode != "RGB":
        image = image.convert("RGB")
    return blip2_finetuned_processor(images=image, return_tensors="pt")

# Generate Caption
def generate_image_description(image):
    """Generate a description for an image using BLIP-2."""
    inputs = preprocess_image(image)
    pixel_values = inputs["pixel_values"]
    #text_input = blip_processor.tokenizer("Describe the contents of the image:", return_tensors="pt").input_ids
    inputs = blip2_finetuned_processor(images=image, text="Describe this image in detail:", return_tensors="pt")

    with torch.no_grad():
        output_ids = blip2_finetuned_model.generate(
            **inputs,
            max_length=100,
            num_beams=4,  # Use beam search
            early_stopping=True
        )
    caption = blip2_finetuned_processor.tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption

# Example usage
cartoon_image = eg_image  # Image from your dataset
description = generate_image_description(cartoon_image)
print("Image Description:", description)

Image Description: describe this image in detail : a man is standing in an office with a sword stuck into his desk, and another man looks at him angrily


In [None]:
vit_model, vit_feature_extractor, vit_tokenizer = prepare_vit_model()

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.46.2"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_rang

Error processing item 142: Unsupported number of image dimensions: 2
Error processing item 56: Unsupported number of image dimensions: 2
Error processing item 198: Unsupported number of image dimensions: 2
Error processing item 77: Unsupported number of image dimensions: 2
Error processing item 50: Unsupported number of image dimensions: 2
Error processing item 55: Unsupported number of image dimensions: 2
Error processing item 4: Unsupported number of image dimensions: 2
Error processing item 69: Unsupported number of image dimensions: 2


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss
10,1.7014
20,0.9797
30,0.8927
40,0.8249
50,0.5206
60,0.4118
70,0.2908
80,0.186
90,0.0409
100,0.2821


Error processing item 174: Unsupported number of image dimensions: 2
Error processing item 47: Unsupported number of image dimensions: 2
Error processing item 88: Unsupported number of image dimensions: 2
Error processing item 114: Unsupported number of image dimensions: 2
Error processing item 58: Unsupported number of image dimensions: 2
Error processing item 171: Unsupported number of image dimensions: 2
Error processing item 155: Unsupported number of image dimensions: 2
Error processing item 67: Unsupported number of image dimensions: 2
Error processing item 195: Unsupported number of image dimensions: 2
Error processing item 162: Unsupported number of image dimensions: 2
Error processing item 111: Unsupported number of image dimensions: 2
Error processing item 7: Unsupported number of image dimensions: 2
Error processing item 89: Unsupported number of image dimensions: 2
Error processing item 76: Unsupported number of image dimensions: 2
Error processing item 115: Unsupported num

Evaluating BLIP2 model

In [None]:
!pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=df926072cef73a353411b8f533eb58e19d6b8eddcfeb8baa5cb470cce0d037ce
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from PIL import Image
import numpy as np
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
import nltk

In [None]:
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)

In [None]:
try:
    # Try downloading punkt_tab
    nltk.download('punkt_tab', quiet=True)
except:
    pass

In [None]:
class ImageDescriptionEvaluator:
    def __init__(self, ground_truth):
        """
        Initialize evaluator with ground truth description

        Args:
            ground_truth (str): Manually annotated description of the image
        """
        self.ground_truth = ground_truth

        # Initialize evaluation metrics
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def generate_description_vit(self, image):
        """
        Generate description using Vision Transformer model

        Args:
            image (PIL.Image): Input image

        Returns:
            str: Generated description
        """

        image = self.preprocess_image(image)
        # Load ViT model
        model_name = "nlpconnect/vit-gpt2-image-captioning"
        model = VisionEncoderDecoderModel.from_pretrained(model_name)
        feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Preprocess image
        inputs = feature_extractor(image, return_tensors="pt")

        # Generate description
        outputs = model.generate(**inputs, max_length=50)
        description = tokenizer.decode(outputs[0], skip_special_tokens=True)

        return description

    def preprocess_image(self, image):
        """
        Ensure image is in correct format for models

        Args:
            image (PIL.Image or numpy.ndarray): Input image

        Returns:
            PIL.Image: Preprocessed image
        """
        # If image is a numpy array, convert to PIL Image
        if isinstance(image, np.ndarray):
            if image.ndim == 2:
                # Grayscale to RGB
                image = Image.fromarray(image).convert('RGB')
            elif image.ndim == 3:
                # Check if it's already in RGB format
                if image.shape[2] == 3:
                    image = Image.fromarray(image.astype('uint8'), 'RGB')
                else:
                    raise ValueError(f"Unexpected image shape: {image.shape}")

        # If it's not already a PIL Image, convert
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        # Ensure RGB
        if image.mode != 'RGB':
            image = image.convert('RGB')

        return image

    # def generate_description_blip2(self, image):
    #     """
    #     Generate description using BLIP-2 model

    #     Args:
    #         image (PIL.Image): Input image

    #     Returns:
    #         str: Generated description
    #     """

    #     image = self.preprocess_image(image)
    #     # Load BLIP-2 model
    #     model_name = "Salesforce/blip-image-captioning-large"
    #     model = BlipForConditionalGeneration.from_pretrained(model_name)
    #     processor = BlipProcessor.from_pretrained(model_name)

    #     # Preprocess image
    #     inputs = processor(image, return_tensors="pt")
    #     #inputs = processor(images=image, text="Describe this image in detail:", return_tensors="pt")

    #     with torch.no_grad():
    #         outputs = model.generate(
    #             **inputs,
    #             max_length=100,
    #             num_beams=4,  # Use beam search
    #             early_stopping=True
    #         )
    #     # Generate description
    #     #outputs = model.generate(**inputs, max_length=50)
    #     description = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)

        return description

    def evaluate_description(self, generated_description):
        """
        Evaluate generated description against ground truth

        Args:
            generated_description (str): Description to evaluate

        Returns:
            dict: Evaluation metrics
        """
        # Tokenize descriptions
        ground_truth_tokens = word_tokenize(self.ground_truth.lower())
        generated_tokens = word_tokenize(generated_description.lower())

        # Calculate ROUGE scores
        rouge_scores = self.rouge_scorer.score(self.ground_truth, generated_description)

        # Calculate BLEU score
        bleu_score = sentence_bleu([ground_truth_tokens], generated_tokens)

        # Calculate unique word coverage
        unique_gt_words = set(ground_truth_tokens)
        unique_gen_words = set(generated_tokens)
        word_coverage = len(unique_gen_words.intersection(unique_gt_words)) / len(unique_gt_words)

        return {
            'ROUGE-1': rouge_scores['rouge1'].fmeasure,
            'ROUGE-2': rouge_scores['rouge2'].fmeasure,
            'ROUGE-L': rouge_scores['rougeL'].fmeasure,
            'BLEU Score': bleu_score,
            'Word Coverage': word_coverage
        }

    def compare_models(self, image):
        """
        Compare ViT and BLIP-2 models for image description

        Args:
            image (PIL.Image): Input image

        Returns:
            dict: Comparative evaluation results
        """
        # Generate descriptions
        #vit_description = self.generate_description_vit(image)
        blip2_description = description

        # Evaluate descriptions
        #vit_metrics = self.evaluate_description(vit_description)
        blip2_metrics = self.evaluate_description(blip2_description)

        return {
            #'ViT Description': vit_description,
            'BLIP-2 Description': blip2_description,
            #'ViT Metrics': vit_metrics,
            'BLIP-2 Metrics': blip2_metrics
        }

In [None]:
# Example usage
# Replace with your actual ground truth description
ground_truth = "There are two men in suits standing at the door of an office both looking at a desk with a medieval sword stuck into the middle of the desk. One man is talking."

# Load your cartoon image
# Assuming cartoon_image is your PIL Image object
evaluator = ImageDescriptionEvaluator(ground_truth)
comparison_results = evaluator.compare_models(cartoon_image)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
# Print results
print("Comparison Results:")
#print("\nViT Description:", comparison_results['ViT Description'])
print("\nBLIP-2 Description:", comparison_results['BLIP-2 Description'])


Comparison Results:

BLIP-2 Description: describe this image in detail : a man is standing in an office with a sword stuck into his desk, and another man looks at him angrily


In [None]:
print("\nBLIP-2 Metrics:")
for metric, value in comparison_results['BLIP-2 Metrics'].items():
    print(f"{metric}: {value}")


BLIP-2 Metrics:
ROUGE-1: 0.5084745762711863
ROUGE-2: 0.21052631578947367
ROUGE-L: 0.3728813559322034
BLEU Score: 2.3236098643893214e-78
Word Coverage: 0.4642857142857143


After fine tning BLIPI2->
BLIP-2 Metrics:
ROUGE-1: 0.5084745762711863
ROUGE-2: 0.21052631578947367
ROUGE-L: 0.3728813559322034
BLEU Score: 2.3236098643893214e-78
Word Coverage: 0.4642857142857143