In [10]:
import openai
from PIL import Image
import base64
import io
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if any) before reading the key, so
# the lookup below sees them. The result is None when the variable is unset.
load_dotenv()

api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# Hand the key read from the environment to the module-level OpenAI client,
# which every `openai.chat.completions.create` call below goes through.
openai.api_key = api_key

def encode_image(image_path):
    """Read a file in binary mode and return its contents as base64 text.

    Args:
        image_path: Path of the file to read.

    Returns:
        The base64 encoding of the file's bytes, decoded to a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def router_llm(input_data):
    """Ask GPT-3.5-turbo which downstream model should handle the input.

    Args:
        input_data: A plain string prompt, a dict, or any other object.
            A dict that has an "image" key is described to the router as
            "Image with prompt: <text>" (using its optional "text" entry);
            anything that is neither a str nor such a dict is stringified.

    Returns:
        "vision" if the router's reply contains the word "vision",
        otherwise "text". An empty or missing reply also yields "text".
    """
    if isinstance(input_data, str):
        input_desc = input_data
    elif isinstance(input_data, dict) and "image" in input_data:
        input_desc = f"Image with prompt: {input_data.get('text', '')}"
    else:
        input_desc = str(input_data)

    router_prompt = f"""
You are a router LLM. Decide which model to use for the following input.

Available models:
1. Text LLM: Handles pure text input.
2. Vision LLM: Handles images or multimodal input (image + text).

Input: {input_desc}

Which model should be used? Reply with only 'text' or 'vision'.
"""
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": router_prompt}],
        max_tokens=5,  # the expected answer is a single word
        temperature=0  # keep the routing decision as stable as possible
    )
    # The SDK types message.content as optional; treat a missing reply as
    # empty text so routing falls back to "text" instead of raising.
    choice = (response.choices[0].message.content or "").strip().lower()
    if "vision" in choice:
        return "vision"
    return "text"

def text_llm(prompt):
    """Send a text-only prompt to GPT-3.5-turbo and return its reply.

    Args:
        prompt: The user message to send.

    Returns:
        The reply text with surrounding whitespace stripped, or an empty
        string when the response carries no text content.
    """
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=256,
        temperature=0.7
    )
    # message.content is optional in the SDK; avoid AttributeError on None.
    return (response.choices[0].message.content or "").strip()

def vision_llm(image_path, prompt):
    """Send an image plus a text prompt to GPT-4o and return its reply.

    The image is embedded in the request as a base64 data URL. Its MIME
    type is guessed from the file name; when the guess is missing or is
    not an image type, "image/jpeg" is used.

    Args:
        image_path: Path of the image file to send.
        prompt: Text instruction that accompanies the image.

    Returns:
        The reply text with surrounding whitespace stripped, or an empty
        string when the response carries no text content.
    """
    import mimetypes  # local import: only this function needs it

    base64_image = encode_image(image_path)

    # Label the payload with the file's real type rather than always
    # claiming JPEG; fall back to JPEG when the extension is unknown.
    guessed_type, _ = mimetypes.guess_type(str(image_path))
    if not guessed_type or not guessed_type.startswith("image/"):
        guessed_type = "image/jpeg"

    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:{guessed_type};base64,{base64_image}"}}
            ]}
        ],
        max_tokens=256,
        temperature=0.7
    )
    # message.content is optional in the SDK; avoid AttributeError on None.
    return (response.choices[0].message.content or "").strip()

def main_router(input_data):
    """Route the input to the text or vision model and return the reply.

    Args:
        input_data: Either a plain string prompt or a dict with an
            optional "image" entry (image path) and an optional "text"
            entry (prompt). Any other object is stringified and used as
            the prompt.

    Returns:
        The chosen model's reply, or "Unable to route input." if the
        router returns something other than "text" or "vision".
    """
    route = router_llm(input_data)

    # Normalise the input once so every branch below is safe for both
    # string and dict inputs.
    if isinstance(input_data, dict):
        image_path = input_data.get("image")
        prompt = input_data.get("text", "")
    else:
        image_path = None
        prompt = input_data if isinstance(input_data, str) else str(input_data)

    if route == "vision":
        if image_path:
            print("[Router] Using Vision LLM")
            return vision_llm(image_path, prompt)
        # The router can answer "vision" for text that merely mentions an
        # image. With no image to send, use the text model rather than
        # failing on a missing "image" entry.
        print("[Router] Vision requested but no image supplied; using Text LLM")
        return text_llm(prompt)
    if route == "text":
        print("[Router] Using Text LLM")
        return text_llm(prompt)
    return "Unable to route input."

In [16]:
if __name__ == "__main__":
    # Each entry pairs the heading to print with the payload to route:
    # first a text-only question, then an image with an instruction.
    demo_cases = (
        ("TEXT TEST:", "What is the capital of Jordan?"),
        ("\nIMAGE TEST:", {"image": "cat.jpg", "text": "Describe this image."}),
    )
    for heading, payload in demo_cases:
        print(heading)
        result = main_router(payload)
        print("Result:", result)

TEXT TEST:
[Router] Using Text LLM
Result: The capital of Jordan is Amman.

IMAGE TEST:
[Router] Using Vision LLM
Result: The image shows a ginger tabby cat sitting on a paved surface. The cat has a rich orange coat with distinct stripes and is looking directly at the camera. In the background, there is a blurred wooden fence and some greenery. The setting appears to be outdoors.
