# 🖼️ Image Caption Generator using BLIP
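This Colab notebook lets you upload an image and generate a human-like caption with Salesforce's BLIP model, loaded through Hugging Face Transformers. The caption is then paraphrased, translated into a language of your choice, and read aloud with text-to-speech.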

In [1]:
# 🔧 Install dependencies
!pip install transformers torch torchvision pillow gradio

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
# 📦 Import required libraries
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
import io

In [3]:
!pip install -q gTTS


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/98.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [10]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The processor and the captioning model are loaded from the same BLIP
# checkpoint; the model is then moved onto the selected device.
processor = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-base')
model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base')
model = model.to(device)


In [None]:
# --- Disabled experiment: BLIP-2 captioning ---------------------------------
# Kept as `#` comments (not inside a triple-quoted string) so that this cell
# always parses and running it is a no-op.
# NOTE(review): confirm that the checkpoint id below exists on the Hugging Face
# Hub before re-enabling this cell.
#
# from transformers import Blip2Processor, Blip2ForConditionalGeneration
# import torch
#
# device = "cuda" if torch.cuda.is_available() else "cpu"
#
# # Load BLIP-2 FLAN-T5-BASE model (lightweight)
# processor_blip2 = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-base")
# model_blip2 = Blip2ForConditionalGeneration.from_pretrained(
#     "Salesforce/blip2-flan-t5-base",
#     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
# ).to(device)


In [20]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer/model pair used by polish_grammar().
# NOTE(review): judging by its name and the "paraphrase:" prompt used with it,
# this checkpoint is a paraphraser rather than a grammar corrector - confirm
# that is the intended behaviour.
grammar_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
grammar_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws")
grammar_model = grammar_model.to(device)

def polish_grammar(text):
    """Rewrite ``text`` with the paraphrase model and return the result.

    Args:
        text: English sentence to polish (in this notebook, a BLIP caption).

    Returns:
        str: The decoded first beam-search result, or ``""`` when ``text`` is
        empty, whitespace-only or ``None``.
    """
    # With nothing to rewrite, the model would only see the bare prompt and
    # invent a sentence; skip generation entirely instead.
    if not text or not text.strip():
        return ""

    input_text = f"paraphrase: {text} </s>"
    input_ids = grammar_tokenizer.encode(
        input_text, return_tensors="pt", max_length=512, truncation=True
    ).to(device)
    outputs = grammar_model.generate(
        input_ids,
        max_length=64,
        num_beams=5,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
    # Only one sequence is requested, so the first entry is the answer.
    return grammar_tokenizer.decode(outputs[0], skip_special_tokens=True)



In [12]:
from transformers import MarianMTModel, MarianTokenizer

# English -> target-language translation checkpoints, keyed by the lowercase
# language name that load_translator() / translate_caption() accept.
translation_models = dict(
    telugu="Helsinki-NLP/opus-mt-en-te",
    hindi="Helsinki-NLP/opus-mt-en-hi",
    spanish="Helsinki-NLP/opus-mt-en-es",
    french="Helsinki-NLP/opus-mt-en-fr",
)

# Already-loaded (tokenizer, model) pairs, keyed by checkpoint name, so that
# repeated translations do not reload the checkpoint on every call.
_translator_cache = {}

# Load translation model for the selected language
def load_translator(lang_code="telugu"):
    """Return the ``(tokenizer, model)`` pair for translating English to ``lang_code``.

    Args:
        lang_code: A key of ``translation_models`` (e.g. ``"telugu"``).

    Returns:
        tuple: ``(MarianTokenizer, MarianMTModel)`` for the mapped checkpoint.
        The same objects are returned on later calls for the same checkpoint.

    Raises:
        ValueError: If ``lang_code`` is not a key of ``translation_models``.
    """
    model_name = translation_models.get(lang_code)
    if not model_name:
        raise ValueError("Unsupported language code.")

    if model_name not in _translator_cache:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        _translator_cache[model_name] = (tokenizer, model)
    return _translator_cache[model_name]

# Translate English caption to selected language
def translate_caption(text, lang_code="telugu"):
    """Translate ``text`` from English with the checkpoint mapped to ``lang_code``.

    The tokenizer and model are obtained from ``load_translator(lang_code)``
    on every call; any error it raises propagates to the caller.

    Returns:
        str: The first generated sequence, decoded without special tokens.
    """
    mt_tokenizer, mt_model = load_translator(lang_code)
    encoded = mt_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    generated = mt_model.generate(**encoded)
    first_sequence = generated[0]
    return mt_tokenizer.decode(first_sequence, skip_special_tokens=True)



In [13]:
def generate_caption(image):
    """Generate an English caption for ``image`` with the BLIP model.

    Args:
        image: Image handed to ``processor(images=...)``; this notebook passes
            PIL images.

    Returns:
        str: The decoded caption with immediately repeated words collapsed.
    """
    inputs = processor(images=image, return_tensors="pt").to(device)
    output = model.generate(**inputs)
    caption = processor.decode(output[0], skip_special_tokens=True)

    # 🧹 Collapse stuttered words ("a a man" -> "a man"), case-insensitively.
    # Only *adjacent* repeats are dropped: removing every later occurrence of
    # a word would also delete legitimate repeats such as the second "a" in
    # "a man sitting on a bench".
    cleaned = []
    for word in caption.split():
        if not cleaned or cleaned[-1].lower() != word.lower():
            cleaned.append(word)

    return " ".join(cleaned)



In [21]:
# 📤 Upload image
from google.colab import files
uploaded = files.upload()

# Fail loudly when nothing was uploaded; otherwise `image` would be undefined
# or, worse, silently left over from an earlier run of this cell.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose an image.")

# Load uploaded image. When several files are uploaded, the last one is used.
filename = list(uploaded.keys())[-1]
image = Image.open(io.BytesIO(uploaded[filename])).convert('RGB')

Saving istockphoto-185922048-612x612.jpg to istockphoto-185922048-612x612 (1).jpg


In [15]:
from gtts import gTTS

def generate_tts(text, lang="en", filename="caption.mp3"):
    """Synthesize ``text`` to speech with gTTS and save it under ``/tmp``.

    Args:
        text: Text to be spoken.
        lang: Language code forwarded to gTTS.
        filename: File name (not a full path) of the audio file to write.

    Returns:
        str: Path of the saved file, ``/tmp/<filename>``. An existing file with
        the same name is overwritten.
    """
    output_path = f"/tmp/{filename}"
    gTTS(text=text, lang=lang).save(output_path)
    return output_path


In [16]:
!pip install -q deep-translator


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [17]:
from deep_translator import GoogleTranslator

def translate_caption_google(text, target_lang="te"):
    """Translate ``text`` into ``target_lang`` with deep-translator's GoogleTranslator.

    The source language is auto-detected (``source='auto'``).

    Args:
        text: Text to translate.
        target_lang: Target language code, e.g. ``"te"``.

    Returns:
        Whatever ``GoogleTranslator.translate`` returns for ``text``.
    """
    translator = GoogleTranslator(source='auto', target=target_lang)
    return translator.translate(text)


In [22]:
# Run the full pipeline on the uploaded image: caption -> polish -> translate.
caption = generate_caption(image)
caption_polished = polish_grammar(caption)
caption_translated = translate_caption_google(caption_polished, target_lang="te")  # te = Telugu

print("📷 Raw Caption:", caption)
print("✅ Polished Caption:", caption_polished)
print("🌐 Translated (Telugu):", caption_translated)
# Render the image inline in the notebook. Image.show() hands the picture to an
# external viewer, which shows nothing in a hosted notebook such as Colab.
display(image)


📷 Raw Caption: two boys fighting each other
✅ Polished Caption: Two boys fighting each other
🌐 Translated (Telugu): ఇద్దరు కుర్రాళ్ళు ఒకరితో ఒకరు పోరాడుతున్నారు


In [23]:
# (Optional) 🌐 Gradio UI preview
import gradio as gr
from PIL import Image

def caption_image(image, language):
    """Gradio callback: caption an image, polish, translate and narrate it.

    Args:
        image: Image supplied by the Gradio image input (``None`` when the
            user submitted without uploading anything).
        language: Display name chosen in the dropdown ("Telugu", "Hindi",
            "French" or "Spanish"); ``None``/empty when nothing is selected.

    Returns:
        tuple: ``(raw caption, polished caption, translated caption,
        path to the narration audio file)``. The translated caption is ``""``
        when no language was selected.

    Raises:
        ValueError: If ``image`` is ``None``.
        KeyError: If ``language`` is a non-empty value with no known code.
    """
    if image is None:
        raise ValueError("No image provided; upload an image first.")

    caption = generate_caption(image)
    polished = polish_grammar(caption)

    lang_codes = {
        "Telugu": "te",
        "Hindi": "hi",
        "French": "fr",
        "Spanish": "es"
    }

    # Translate polished caption to selected language (for text display only).
    # With no selection there is no code to look up, so skip translation
    # rather than failing the whole request with a KeyError.
    if language:
        translated = translate_caption_google(polished, target_lang=lang_codes[language])
    else:
        translated = ""

    # Always generate English voice narration from the polished English caption
    audio_path = generate_tts(polished, lang="en", filename="caption.mp3")

    return caption, polished, translated, audio_path



# Gradio UI
import gradio as gr

demo = gr.Interface(
    fn=caption_image,
    inputs=[
        gr.Image(type="pil", label="Upload an Image"),
        # A default selection means the callback never receives None for the
        # language when the user submits without touching the dropdown.
        gr.Dropdown(
            ["Telugu", "Hindi", "French", "Spanish"],
            value="Telugu",
            label="Select Translation Language"
        )
    ],
    # One output component per element of the tuple returned by caption_image.
    outputs=[
        gr.Text(label="Raw Caption"),
        gr.Text(label="Polished Caption"),
        gr.Text(label="Translated Caption"),
        gr.Audio(label="Voice Narration (TTS)")
    ],
    title="🖼️ AI Image Captioning with 🎤 Voice Narration",
    description="Upload an image to get a caption with grammar polishing, translation, and voice narration."
)

demo.launch()



It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7235281bdb79af6352.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


