In [1]:
import ollama
import rir_api
import os
from dotenv import load_dotenv

def process_with_vlm(segmented_image_path):
    """Describe a segmented image using retrieved context and the LLaVA model.

    The function first asks the ``rir_api`` service what is in the image,
    takes the ``'context'`` field of that response, embeds it in a detailed
    analysis prompt, and sends the prompt together with the image to the
    local ``llava`` model through ``ollama.chat``.

    Args:
        segmented_image_path: Filesystem path of the image to analyse. It is
            used both to build a ``file://`` URL for ``rir_api`` and as the
            image attachment for ollama.

    Returns:
        The text content of the model's reply
        (``vlm_response['message']['content']``).

    Raises:
        KeyError: If the ``rir_api`` response has no ``'context'`` entry.
    """
    load_dotenv()
    # NOTE(review): os.getenv returns None when OPENAI_API_KEY is unset and
    # that None is handed to RIR_API unchanged — confirm RIR_API tolerates it.
    openai_api_key = os.getenv('OPENAI_API_KEY')

    system_text = "You are an advanced visual analysis assistant. Your role is to analyze images, identify key elements, and describe the scene in detail."

    api = rir_api.RIR_API(openai_api_key)
    # NOTE(review): this only forms a well-formed file URL when the path is
    # absolute — confirm what callers pass.
    image_url = f"file://{segmented_image_path}"
    query_text = "What is in this image?"

    response = api.query_with_image(image_url, query_text)
    additional_context = response['context']

    text = (
        f"Based on the image and additional context: {additional_context}, analyze the image in detail. "
        "Describe all visible objects, their colors, sizes, shapes, positions, and any unique features. "
        "Explain the relationships between the objects, their potential functions, and the context of the scene. "
        "If there are any patterns, textures, or materials, describe them as well. "
        "Additionally, infer the possible purpose or scenario depicted in the image, considering cultural, environmental, or functional aspects. "
        "Provide any relevant background information or assumptions that could be drawn based on the visual elements."
    )

    # The system block is closed with "<</SYS>>"; the previous "<<\SYS>>" was
    # an invalid escape sequence and not the delimiter the template expects.
    prompt = f"<s>[INST]<<SYS>>\n{system_text}<</SYS>>\n{text}[/INST]"

    vlm_response = ollama.chat(
        model='llava',
        messages=[
            {
                'role': 'user',
                'content': prompt,
                # ollama reads attachments from the 'images' key as a list;
                # a singular 'image' key is not a recognised message field,
                # so the model would answer without seeing the picture.
                'images': [segmented_image_path],
            }
        ]
    )

    return vlm_response['message']['content']


 The image shows a person standing indoors, looking out of a window. The individual is wearing a white shirt and dark pants. They are holding a camera, suggesting they may be preparing to take a photograph.

The room has a white wall on the left side, and there appears to be a small object with red and white colors near the person's feet. The flooring is not clearly visible due to the angle of the photo. On the right side, there is another object that looks like a light-colored box or package.

The window the person is looking through has no visible curtains or blinds. Outside the window, there are trees and what appears to be a clear sky, indicating it might be daytime with good weather conditions. The presence of natural light suggests that the room is well lit.

Based on the available information, it can be inferred that the person could be capturing a photograph of the outdoor scenery or an interesting subject outside the window. The red and white object near the person's feet may 