In [None]:
import ollama
import base64
import requests
from io import BytesIO

In [None]:
def pull_model_if_needed(model_name):
    """Make sure `model_name` is available in Ollama, pulling it only if needed.

    The local Ollama server is first asked to describe the model via
    `ollama.show`. Only when that lookup fails with `ollama.ResponseError`
    (the model is not installed) is `ollama.pull` called. This keeps the
    notebook usable when the model is already present, without requiring a
    pull on every run.

    Args:
        model_name: Name of the Ollama model, e.g. "llava".

    Returns:
        True if the model is already present or was pulled successfully,
        False if any error occurred (the error is printed).
    """
    try:
        try:
            ollama.show(model_name)
        except ollama.ResponseError:
            # The server could not describe the model, so fetch it.
            ollama.pull(model=model_name)
            print(f"Model '{model_name}' pulled successfully/already existed.")
        else:
            print(f"Model '{model_name}' is already available locally; skipping pull.")
        return True
    except Exception as e:
        # Covers connection failures to the server as well as pull errors.
        print(f"Error pulling model '{model_name}': {e}")
        return False

In [None]:
def encode_image_from_url(image_url, timeout=30):
    """Download an image and return its bytes as a base64 string.

    The request carries a User-Agent header and a timeout so that an
    unresponsive host cannot block the notebook forever.

    Args:
        image_url: URL of the image to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to `requests.get`. Defaults to 30.

    Returns:
        The base64-encoded response body as a UTF-8 string, or None if the
        request failed (the error is printed).
    """
    headers = {'User-Agent': 'NameOfApplication/1.0 (youremail@emaildomain.com)'}  # ***Replace with your own info***
    try:
        response = requests.get(image_url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes
        # response.content is already bytes, so it can be encoded directly.
        return base64.b64encode(response.content).decode('utf-8')
    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"Error fetching image from URL '{image_url}': {e}")
        return None

In [None]:
def encode_image_from_path(image_path):
    """Read a local image file and return its contents as a base64 string.

    Args:
        image_path: Path of the file to read in binary mode.

    Returns:
        The base64-encoded file contents as a UTF-8 string, or None when the
        file is missing or cannot be read (a message is printed either way).
    """
    encoded = None
    try:
        with open(image_path, "rb") as handle:
            raw_bytes = handle.read()
        encoded = base64.b64encode(raw_bytes).decode('utf-8')
    except FileNotFoundError:
        # Reported separately so a wrong path is easy to spot.
        print(f"Error: Image file not found at '{image_path}'")
    except Exception as err:
        print(f"Error encoding local image '{image_path}': {err}")
    return encoded

In [None]:
def query_multimodal_model(model_name, question, images):
    """Ask an Ollama model a question about one or more images.

    Args:
        model_name: Name of the model passed to `ollama.generate`.
        question: Prompt text sent to the model.
        images: List of base64-encoded images, forwarded as the `images`
            argument of `ollama.generate`.

    Returns:
        The 'response' field of the reply with surrounding whitespace
        stripped, or None if anything went wrong (the error is printed).
    """
    request = {
        "model": model_name,
        "prompt": question,
        "images": images,  # list of base64 encoded images
    }
    try:
        reply = ollama.generate(**request)
        answer_text = reply['response']
        return answer_text.strip()
    except Exception as err:
        print(f"Error querying model '{model_name}': {err}")
        return None

In [None]:
print("Starting experiment...pulling models as required...")
multimodal_model = "llava"  # Or "llama3.2-vision" or another multimodal model you have access to on ollama
pull_result = pull_model_if_needed(multimodal_model)

if pull_result is not True:
    # Without the model there is nothing to run.
    print("Exiting due to model pulling failure.")
else:
    # --- Experiment Setup ---
    # Each entry says where the image lives ("url" or "local") and its location.
    image_sources = [
        {"type": "url", "path": "https://upload.wikimedia.org/wikipedia/commons/4/4a/Yawning_cat_portrait_%288423278464%29.jpg"},
        {"type": "url", "path": "https://upload.wikimedia.org/wikipedia/commons/3/30/Blausen_0389_EyeAnatomy_02.png"},
        {"type": "local", "path": r"test1.jpg"},  # Replace with your actual local image path
        {"type": "local", "path": r"test2.jpg"},  # Replace with your actual local image with text
        {"type": "url", "path": "https://upload.wikimedia.org/wikipedia/commons/0/05/William_Frederick_Yeames_-_And_when_did_you_last_see_your_father%3F_-_Google_Art_Project.jpg"}, # Complex image
        {"type": "url", "path": "https://upload.wikimedia.org/wikipedia/commons/0/0f/Piet_Mondriaan%2C_1939-1942_-_Composition_10.jpg"} # Abstract image
    ]

    # Every question is asked about every image that could be encoded.
    questions = [
        "Describe this image.",
        "What objects are in this picture?",
        "Is there text in this image? If so, what does it say?",
        "What is the main subject of this image?",
        "What colors are prominent in this image?"
    ]

    # Maps a source "type" to the helper that produces its base64 payload.
    encoders = {
        "url": encode_image_from_url,
        "local": encode_image_from_path,
    }

    # --- Run the Experiment ---
    print(f"\n--- Exploring Multimodal Capabilities of {multimodal_model} ---")

    for image_info in image_sources:
        print(f"\n--- Processing Image: {image_info['path']} ---")

        # An unrecognised type has no encoder and is treated like a failed encode.
        encoder = encoders.get(image_info["type"])
        encoded_image = encoder(image_info["path"]) if encoder is not None else None

        if not encoded_image:
            print("Skipping this image due to encoding failure.\n")
            continue

        for question in questions:
            response = query_multimodal_model(multimodal_model, question, [encoded_image])
            if not response:
                # Failed queries have already been reported by the helper.
                continue
            print(f"Question: {question}")
            print(f"Answer: {response}\n")

    print("\n--- Experiment Finished ---")
    print("Observe the accuracy of the descriptions, object identification, and text reading capabilities.")
    print("Consider how the model handles different types of images (simple, diagrams, local, with text, complex, abstract).")