## Setup Data

In [1]:
from llama_hub.web.simple_web.base import SimpleWebPageReader

# Blog post that serves as the text knowledge source for this notebook.
url = "https://openai.com/blog/new-models-and-developer-products-announced-at-devday"

# Web page loader; html_to_text=True presumably yields plain text instead of
# raw HTML — confirm against the llama_hub reader documentation.
reader = SimpleWebPageReader(html_to_text=True)

In [2]:
# Download the page at `url` through the reader; `documents` is what gets indexed later.
documents = reader.load_data(urls=[url])

In [3]:
# print(documents[0].get_content())

In [4]:
# Image input for the multimodal agent, wrapped as an ImageDocument.
# NOTE(review): the path is relative — presumably resolved against the
# notebook's working directory; confirm the file exists there before running.
from llama_index.schema import ImageDocument

image_document = ImageDocument(image_path="other_images/openai/dev_day.png")

## Setup Tools

In [5]:
from llama_index.llms import OpenAI
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.tools import QueryEngineTool, ToolMetadata

In [6]:
# Text LLM (gpt-3.5-turbo, temperature 0) handed to the service context below.
llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
# Only the LLM is overridden; every other setting comes from from_defaults().
service_context = ServiceContext.from_defaults(llm=llm)

In [7]:
# Build a vector index over the loaded web page documents using `service_context`.
vector_index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [18]:
# Expose the vector index to the agent as a query tool. The agent refers to the
# tool by `name` in its "Action:" lines; `description` is the text supplied to
# describe what the tool is for.
query_tool = QueryEngineTool(
    query_engine=vector_index.as_query_engine(),
    metadata=ToolMetadata(
        name="vector_tool",
        description="Useful to lookup new features announced by OpenAI",
    ),
)

## Setup Agent

In [39]:
from llama_index.agent.react_multimodal.step import MultimodalReActAgentWorker

In [40]:
from llama_index.agent import AgentRunner
from llama_index.multi_modal_llms import MultiModalLLM, OpenAIMultiModal
from llama_index.agent import Task

# Multimodal LLM handed to the agent worker below.
# NOTE(review): max_new_tokens=1000 presumably caps the length of each completion — confirm.
mm_llm = OpenAIMultiModal(model="gpt-4-vision-preview", max_new_tokens=1000)

# Build a multimodal ReAct agent worker that can call `query_tool`, then wrap it
# in an AgentRunner, which provides the create_task / run_step /
# finalize_response calls used in the rest of the notebook.
react_step_engine = MultimodalReActAgentWorker.from_tools(
    [query_tool], 
    multi_modal_llm=mm_llm, 
    verbose=True
)
agent = AgentRunner(react_step_engine)

In [41]:
# Create the agent task: a text prompt plus the image, which is passed through
# `extra_state` under the "image_docs" key.
task = agent.create_task(
    # Alternative, image-centric prompt kept for experimentation:
    # "The photo shows some new features released by OpenAI. Can you pinpoint the features in the photo and give more details using relevant tools?",
    "Tell me more about code_interpreter and how it's used",
    extra_state={"image_docs": [image_document]}
)

In [42]:
# mm_response = mm_llm.complete(
#     "Can you pinpoint the features in the photo and explain them?",
#     [image_document]
# )

In [43]:
# message_dicts = mm_llm._get_multi_modal_chat_messages(
#     prompt="Can you pinpoint the features in the photo and explain them?",
#     role="user",
#     image_documents=[image_document]
# )

In [44]:
# print(message_dicts[0])

In [45]:
# print(str(mm_response))

In [46]:
def execute_step(agent: AgentRunner, task: Task):
    """Advance ``task`` by exactly one agent step.

    Args:
        agent: Runner whose ``run_step`` / ``finalize_response`` are called
            with the task's id.
        task: Task to advance; only ``task.task_id`` is read.

    Returns:
        The finalized response (also printed) if the step reported
        ``is_last``; otherwise ``None``.
    """
    outcome = agent.run_step(task.task_id)
    if not outcome.is_last:
        # More steps remain; signal that to the caller with None.
        return None

    final = agent.finalize_response(task.task_id)
    print(f"> Agent finished: {final!s}")
    return final

def execute_steps(agent: AgentRunner, task: Task):
    """Run ``task`` to completion by calling ``execute_step`` repeatedly.

    Loops until ``execute_step`` returns something other than ``None`` and
    returns that value.
    """
    while True:
        result = execute_step(agent, task)
        if result is not None:
            return result

In [47]:
# Run a single step so the intermediate Thought / Action / Observation output
# can be inspected; use the commented-out call to run the task to completion.
# execute_steps(agent, task)
execute_step(agent, task)

Thought: The image shows a code snippet in a code editor within the Playground interface. The code is written in Python and appears to be calculating the time it takes for light to travel from Earth to the Moon. The user is asking about the "code_interpreter" tool, which is likely a feature within the Playground interface that allows users to run code snippets and see the output. I need to use a tool to look up the latest information on the "code_interpreter" feature and how it is used.
Action: vector_tool
Action Input: {'input': 'code_interpreter feature OpenAI Playground'}
[0m[1;3;34mObservation: The Assistants API, which is part of OpenAI's platform, includes a feature called Code Interpreter. This feature allows developers to write and run Python code within a sandboxed execution environment. The Code Interpreter can generate graphs and charts, process files with diverse data and formatting, and solve challenging code and math problems iteratively. It provides a co

In [None]:
# Advance the same task by one more step.
execute_step(agent, task)

In [24]:
# Inspect one message from the chat history stored under "input_chat" in the
# task's extra_state.
# NOTE(review): index 3 is hard-coded; in the recorded run it was a user-role
# "Observation" message — confirm the position before relying on it.
print(task.extra_state["input_chat"][3])

user: Observation: The latest features released by OpenAI include the GPT-4 Turbo model, the Assistants API, and multimodal capabilities such as vision, image creation (DALL·E 3), and text-to-speech (TTS). These features were announced in a recent blog post and will be rolled out to OpenAI customers starting at 1pm PT today.
