In [None]:
# -*- coding: utf-8 -*-
"""Agentic_AI_Image_Alt_Text.ipynb
"""

    

In [None]:
# Install necessary libraries
!pip install crewai langchain-community ollama langchain pillow transformers torch

import os
from crewai import Agent, Task, Crew, Process
from langchain_community.llms import Ollama
from PIL import Image
import io
import base64
from transformers import BlipProcessor, BlipForConditionalGeneration

# Ensure you have Ollama running locally with the Llama 3.2 model available.
# Pull it with `ollama pull llama3.2`. Ollama model names have the form
# `model:tag`, so `llama3:2` would refer to tag `2` of `llama3`, not Llama 3.2.

# Set up the Ollama LLM shared by all agents
ollama_llm = Ollama(model="llama3.2")

# Load the BLIP captioning processor and model (used by describe_image)
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")



In [None]:
def describe_image(image_path):
    """Generate a caption for an image using the BLIP captioning model.

    Relies on the module-level ``processor`` and ``model`` objects.

    Args:
        image_path: Path to the image file, passed to ``PIL.Image.open``.

    Returns:
        str: The decoded caption. If any step fails, no exception is raised;
        the string ``"Error describing image: <exception>"`` is returned
        instead, so callers should check for that prefix before treating the
        result as a caption.
    """
    try:
        # Open through a context manager so the underlying file handle is
        # released deterministically; convert() returns an independent image
        # that stays usable after the file is closed.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")
        inputs = processor(images=image, return_tensors="pt")
        outputs = model.generate(**inputs)
        # generate() returns a batch; only one image was passed in.
        return processor.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        # Deliberate best-effort: report the failure as text instead of raising.
        return f"Error describing image: {e}"



In [None]:
# Agents
# Every agent uses the same LLM and has verbose=True, so those keyword
# arguments are kept in one place and unpacked into each constructor call.
_shared_agent_options = {"llm": ollama_llm, "verbose": True}

# Describes what is in the image.
image_analyzer = Agent(
    role="Image Context Analyzer",
    goal="Analyze images and understand their context to generate accurate alternative text.",
    backstory="You are an expert in image recognition and understanding visual content.",
    **_shared_agent_options,
)

# Turns the analysis into alternative text.
alt_text_generator = Agent(
    role="Alternative Text Generator",
    goal="Generate concise and descriptive alternative text for images based on their context.",
    backstory="You are a skilled writer with a focus on accessibility and clear communication.",
    **_shared_agent_options,
)

# Produces suggestions for a human reviewer when alt text is hard to generate.
review_agent = Agent(
    role="Human Review Suggestion Agent",
    goal="When alt text is hard to generate, provide clear and concise suggestions for a human to review and complete.",
    backstory="You are an expert in knowing when a human is needed in the loop for complex tasks.",
    **_shared_agent_options,
)



In [None]:
# Tasks
# `{image_description}` is a placeholder that CrewAI fills in from the `inputs`
# dict passed to `crew.kickoff()`; this is how the BLIP caption reaches the
# first agent. Every Task also declares `expected_output`, which CrewAI requires.
analyze_image_task = Task(
    description=(
        "Analyze the image and describe its main elements and context. "
        "An automatically generated caption of the image is: {image_description}"
    ),
    expected_output="A description of the main elements of the image and the context in which they appear.",
    agent=image_analyzer,
)

generate_alt_text_task = Task(
    description="Based on the image analysis, generate a concise and descriptive alternative text for the image. If the image is complex or you are unsure, provide a suggestion for human review.",
    expected_output="A concise, descriptive alternative text for the image, or a note that human review is needed.",
    agent=alt_text_generator,
)

review_task = Task(
    description="If the Alt text generation agent is unable to create a good alt text, generate suggestions for a human to review and complete the alt text.",
    expected_output="Clear and concise suggestions for a human reviewer to complete the alt text, or a confirmation that the generated alt text needs no review.",
    agent=review_agent,
    # Explicitly feed the alt-text task's output into the review.
    context=[generate_alt_text_task],
)

# Crew
crew = Crew(
    agents=[image_analyzer, alt_text_generator, review_agent],
    tasks=[analyze_image_task, generate_alt_text_task, review_task],
    process=Process.sequential,  # You can also use Process.hierarchical
)

# Example usage (replace with actual image path)
# Upload your image to Colab's file storage and change the path.
image_path = "your_image.jpg"  # Replace with the path to your uploaded image

image_description = describe_image(image_path)

if image_description.startswith("Error describing image:"):
    # describe_image reports failures as text; do not hand an error message
    # to the agents as if it were a caption of the image.
    print(image_description)
else:
    # Task.context expects a list of Task objects, so the caption is passed
    # through kickoff inputs and interpolated into the task description.
    result = crew.kickoff(inputs={"image_description": image_description})

    print("Result:")
    print(result)