<a href="https://colab.research.google.com/github/shivamkkas/Image_classification_using_CNN/blob/main/Image_captioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image

class ImageCaptioner:
    def __init__(self, model_name="microsoft/git-base-coco"):
        """
        Initialize image captioning model using pre-trained transformer

        Args:
            model_name (str): Hugging Face model identifier
        """
        # Check for GPU availability
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load pre-trained processor and model
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(self.device)

    def caption_image(self, image_path, max_new_tokens=50):
        """
        Generate caption for a given image

        Args:
            image_path (str): Path to the input image
            max_new_tokens (int): Maximum number of tokens to generate

        Returns:
            str: Generated image caption
        """
        try:
            # Open and process the image
            image = Image.open(image_path)

            # Prepare inputs for the model
            inputs = self.processor(images=image, return_tensors="pt").to(self.device)

            # Generate caption
            output = self.model.generate(**inputs, max_new_tokens=max_new_tokens)

            # Decode the generated tokens
            caption = self.processor.decode(output[0], skip_special_tokens=True)

            return caption

        except Exception as e:
            print(f"Error processing image: {e}")
            return None

    def batch_caption_images(self, image_paths, max_new_tokens=50):
        captions = []
        for path in image_paths:
            caption = self.caption_image(path, max_new_tokens)
            captions.append(caption)
        return captions

    def compare_models(self, models=None):
        """
        Compare different pre-trained image captioning models

        Args:
            models (list): List of model names to compare
        """
        if models is None:
            models = [
                "microsoft/git-base-coco",
                "microsoft/git-large-coco",
                "nlpconnect/vit-gpt2-image-captioning"
            ]

        print("Model Comparison:")
        for model_name in models:
            try:
                # Load model and processor
                processor = AutoProcessor.from_pretrained(model_name)
                model = AutoModelForCausalLM.from_pretrained(model_name)

                # Print model details
                print(f"\nModel: {model_name}")
                print(f"Total Parameters: {sum(p.numel() for p in model.parameters()):,}")
                print(f"Trainable Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

            except Exception as e:
                print(f"Error loading {model_name}: {e}")

def main():
    # Initialize the image captioner
    captioner = ImageCaptioner()

    # Optional: Compare available models
    captioner.compare_models()

    # Example usage with a single image
    image_path = "/content/download.jpg"
    caption = captioner.caption_image(image_path)

    if caption:
        print("\nGenerated Caption:")
        print(caption)

    # Batch processing example
    # image_paths = ["image1.jpg", "image2.jpg", "image3.jpg"]
    # captions = captioner.batch_caption_images(image_paths)
    # for path, caption in zip(image_paths, captions):
    #     print(f"{path}: {caption}")

if __name__ == '__main__':
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/707M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Model Comparison:

Model: microsoft/git-base-coco
Total Parameters: 176,619,066
Trainable Parameters: 176,619,066


preprocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]


Model: microsoft/git-large-coco
Total Parameters: 394,196,026
Trainable Parameters: 394,196,026


preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

Error loading nlpconnect/vit-gpt2-image-captioning: Unrecognized configuration class <class 'transformers.models.vision_encoder_decoder.configuration_vision_encoder_decoder.VisionEncoderDecoderConfig'> for this kind of AutoModel: AutoModelForCausalLM.
Model type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CohereConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, DbrxConfig, ElectraConfig, ErnieConfig, FalconConfig, FalconMambaConfig, FuyuConfig, GemmaConfig, Gemma2Config, GitConfig, GlmConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, GraniteConfig, GraniteMoeConfig, JambaConfig, JetMoeConfig, LlamaConfig, MambaConfig, Mamba2Config, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MllamaConfig, MoshiConfig, MptConfi

In [2]:
captioner = ImageCaptioner()

# Optional: Compare available models
captioner.compare_models()


Model Comparison:

Model: microsoft/git-base-coco
Total Parameters: 176,619,066
Trainable Parameters: 176,619,066

Model: microsoft/git-large-coco
Total Parameters: 394,196,026
Trainable Parameters: 394,196,026
Error loading nlpconnect/vit-gpt2-image-captioning: Unrecognized configuration class <class 'transformers.models.vision_encoder_decoder.configuration_vision_encoder_decoder.VisionEncoderDecoderConfig'> for this kind of AutoModel: AutoModelForCausalLM.
Model type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CohereConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, DbrxConfig, ElectraConfig, ErnieConfig, FalconConfig, FalconMambaConfig, FuyuConfig, GemmaConfig, Gemma2Config, GitConfig, GlmConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, GraniteConfig

In [3]:
image_path = "/content/images.jpg"
caption = captioner.caption_image(image_path)
if caption:
    print("\nGenerated Caption:")
    print(caption)


Generated Caption:
a family in a house with a baby and a man
