In [1]:
# %reload_ext autoreload
# %autoreload 0

# %set_env CUDA_VISIBLE_DEVICES=7
# import sys; sys.path.append('/future/u/okhattab/repos/public/stanfordnlp/dspy')

import dspy
from dspy.predict import Predict
import dsp
from dsp.primitives.vision import Image
from dsp.modules.gpt4vision import GPT4Vision
from pydantic import BaseModel, ConfigDict

  from .autonotebook import tqdm as notebook_tqdm


### 1) Configure the VLM

In [2]:
gpt = GPT4Vision()
dspy.settings.configure(lm=gpt)

### 2) Ask questions about the scene

In [3]:
#Define a simple signature for basic question answering
class QASignature(dspy.Signature):
    "Basic visual question answering"
    question: str = dspy.InputField(desc="The question to ask")
    image: Image = dspy.InputField(desc="The image to ask the question about")
    answer: str = dspy.OutputField(desc="The answer to the question")

gpt = GPT4Vision()
dspy.settings.configure(lm=gpt, num_generations=1)
#Pass signature to ChainOfThought module
generate_answer = dspy.Predict(QASignature)

# Call the predictor on a particular input alongside a hint.
question='What do you see?'
image= Image('gripper_mid_to_grasp.png', encoding='png')
pred = generate_answer(question=question, image=image)


print(f"Question: {question}")
print(f"Predicted Answer: {pred.answer}")



 prompt to LM
('Basic visual question answering\n\n---\n\nFollow the following format.\n\nQuestion: The question to ask\nImage: The image to ask the question about\nAnswer: The answer to the question\n\n---\n\nQuestion: What do you see?', Image(base64=iVBORw0KGg..., encoding=png, size=(1078, 1138)))
Question: What do you see?
Predicted Answer: Answer: I see a robotic arm positioned over a large white sheet of paper on a wooden floor. There is a small object with yellow and blue colors on the paper. The robot appears to be equipped with sensors or cameras, and there are some electronic components visible on its body. There's also a black speaker mounted on the wall.


### 3) Get a high level plan to complete the task.

In [4]:

class RobotBrain(dspy.Signature):
    """Imagine you have a robot that can move in 3D space and has a gripper."""
    task:str = dspy.InputField(desc="Task description for the robot")
    image: Image = dspy.InputField(desc="Image the robot sees")
    steps:list[str] = dspy.OutputField(desc="Plan to complete task")

dspy.settings.configure(lm=gpt, num_generations=1)
#Pass signature to ChainOfThought module
generate_answer = dspy.Predict(RobotBrain)

# Call the predictor on a particular input alongside a hint.
task='What are the next 3 action should the robot take to pick up the block?'
image= Image('gripper_mid_to_grasp.png', encoding='png')
hint = "The robot has only one gripper and can move in 3D space."
pred = generate_answer(task=task, image=image, hint=hint)


print(f"Question: {task}")
print(f"Predicted Answer: {pred.steps}")
gpt.inspect_history(n=10)



 prompt to LM
('Imagine you have a robot that can move in 3D space and has a gripper.\n\n---\n\nFollow the following format.\n\nTask: Task description for the robot\nImage: Image the robot sees\nSteps: Plan to complete task\n\n---\n\nTask: What are the next 3 action should the robot take to pick up the block?', Image(base64=iVBORw0KGg..., encoding=png, size=(1078, 1138)))
Question: What are the next 3 action should the robot take to pick up the block?
Predicted Answer: Task: Pick up the block

Image: The robot sees a block on a white surface, with the gripper positioned above and to the side of the block.

Steps:
1. Move the gripper directly above the block while maintaining a safe distance to avoid any collision.
2. Lower the gripper until it is just above the block, ensuring proper alignment for grasping.
3. Open the gripper, position it around the block, and then close the gripper to securely grasp the block.




Basic visual question answering

---

Follow the following format.



### 4) Execute the plan

In [5]:

class RobotMover(dspy.Signature):
    """Imagine you have a robot that can move in 3D space and has a gripper."""
    task:str = dspy.InputField(desc="Task description for the robot")
    image: Image = dspy.InputField(desc="Image the robot sees")
    actions:list[dict] = dspy.OutputField(desc="[{x:?, y:?, z:?, roll:?, pitch:?, yaw:?, gripper_open_or_close:?},...]")


gpt = GPT4Vision(max_tokens=1000)
dspy.settings.configure(lm=gpt, num_generations=1)
generate_answer = dspy.Predict(RobotMover, max_tokens=1000)
task = 'What are the next three actions that the robot should take to pick up the block?'
image = Image('gripper_mid_to_grasp.png', encoding='jpeg')
hint = "Usually, the depth is underestimated. Make sure the output is a valid python list of dictionaries."
actions = generate_answer(task=task, image=image, hint=hint)
print(f"task: {task}")
print(f"Predicted Answer: {actions.actions}")
  




 prompt to LM
('Imagine you have a robot that can move in 3D space and has a gripper.\n\n---\n\nFollow the following format.\n\nTask: Task description for the robot\nImage: Image the robot sees\nActions: [{x:?, y:?, z:?, roll:?, pitch:?, yaw:?, gripper_open_or_close:?},...]\n\n---\n\nTask: What are the next three actions that the robot should take to pick up the block?', Image(base64=iVBORw0KGg..., encoding=png, size=(1078, 1138)))
task: What are the next three actions that the robot should take to pick up the block?
Predicted Answer: Task: Pick up the block

Image: The robot sees a block on a white surface, positioned at a certain distance directly in front of it.

Actions:
- {x:0, y:1, z:0, roll:0, pitch:0, yaw:0, gripper_open_or_close:open} // Move to above the block and ensure the gripper is open
- {x:0, y:1, z:-1, roll:0, pitch:0, yaw:0, gripper_open_or_close:open} // Lower the gripper to the block's height
- {x:0, y:1, z:-1, roll:0, pitch:0, yaw:0, gripper_open_or_close:close} 

## 

In [6]:
SYSTEM_PROMPT = """Example json structure of output you should follow:
{"actions":[
{x:0, y:1, z:0, roll:0, pitch:0, yaw:0, gripper_open_or_close:1}, 
{x:0, y:1, z:-1, roll:0, pitch:0, yaw:0, gripper_open_or_close:0},
]}"""


class RobotMover(dspy.Signature):
    """Imagine that you have a robot that can move in 3D space and has a gripper."""
    task:str = dspy.InputField(desc="Task description for the robot")
    image: Image = dspy.InputField(desc="Image the robot sees")
    actions:list[dict] = dspy.OutputField(desc="[{x:<meters>, y:<meters>, z:<meters>, roll:<radians>, pitch:<radians>, yaw:<radians>, gripper_open_or_close:<0 or 1>},...]")

generate_answer = dspy.Predict(RobotMover)
gpt = GPT4Vision(max_tokens=1000)
task = 'What are the next three actions that the robot should take to pick up the block?'

hint = "Usually, the depth is underestimated. Make sure the output is a valid python list of dictionaries."
actions = generate_answer(task=task, image=image, hint=hint, response_format={'type': 'json'})
print(f"task: {task}")
print(f"Predicted Answer: {actions.actions}")



 prompt to LM
('Imagine that you have a robot that can move in 3D space and has a gripper.\n\n---\n\nFollow the following format.\n\nTask: Task description for the robot\nImage: Image the robot sees\nActions: [{x:<meters>, y:<meters>, z:<meters>, roll:<radians>, pitch:<radians>, yaw:<radians>, gripper_open_or_close:<0 or 1>},...]\n\n---\n\nTask: What are the next three actions that the robot should take to pick up the block?', Image(base64=/9j/4AAQSk..., encoding=jpeg, size=(1078, 1138)))
task: What are the next three actions that the robot should take to pick up the block?
Predicted Answer: Task: Pick up the block

Image: The robot sees a block on a white surface, with the gripper positioned above and to the side of the block.

Actions: 
[
  {x:0.2, y:0, z:-0.2, roll:0, pitch:0, yaw:0, gripper_open_or_close:1},
  {x:0, y:0, z:-0.05, roll:0, pitch:0, yaw:0, gripper_open_or_close:1},
  {x:0, y:0, z:0, roll:0, pitch:0, yaw:0, gripper_open_or_close:0}
]

(Note: The x, y, z coordinates a

In [7]:
gpt.inspect_history(n=10)

In [9]:
from pydantic import BaseModel, ConfigDict
class PoseGrasp(BaseModel):
    x: float
    y: float
    z: float
    roll: float
    pitch: float
    yaw: float
    gripper_open_or_close: int


class TypedRobotMover(dspy.Signature):
    """Imagine that you have a robot that can move in 3D space and has a gripper."""
    task:str = dspy.InputField(desc="Task description for the robot")
    image: Image = dspy.InputField(desc="Image the robot sees")
    actions:list[PoseGrasp] = dspy.OutputField(desc="{actions: [{x:<meters>, y:<meters>, z:<meters>, roll:<radians>, pitch:<radians>, yaw:<radians>, gripper_open_or_close:<0 or 1>},...{}]}")

generate_answer = dspy.TypedPredictor(TypedRobotMover)
gpt = GPT4Vision(max_tokens=1000)
task = 'What are the next three actions that the robot should take to pick up the block?'
hint = "Usually the depth is underestimated. Make sure the output is a valid python list of dictionaries."
actions = generate_answer(task=task, image=image, hint=hint)
print(f"task: {task}")
print(f"Predicted Answer: {actions.actions}")



 prompt to LM
Imagine that you have a robot that can move in 3D space and has a gripper.

---

Follow the following format.

Task: Task description for the robot
Image: Image the robot sees
Actions: {actions: [{x:<meters>, y:<meters>, z:<meters>, roll:<radians>, pitch:<radians>, yaw:<radians>, gripper_open_or_close:<0 or 1>},...{}]}. Respond with a single JSON object. JSON Schema: {"$defs": {"PoseGrasp": {"properties": {"x": {"title": "X", "type": "number"}, "y": {"title": "Y", "type": "number"}, "z": {"title": "Z", "type": "number"}, "roll": {"title": "Roll", "type": "number"}, "pitch": {"title": "Pitch", "type": "number"}, "yaw": {"title": "Yaw", "type": "number"}, "gripper_open_or_close": {"title": "Gripper Open Or Close", "type": "integer"}}, "required": ["x", "y", "z", "roll", "pitch", "yaw", "gripper_open_or_close"], "title": "PoseGrasp", "type": "object"}}, "properties": {"value": {"items": {"$ref": "#/$defs/PoseGrasp"}, "title": "Value", "type": "array"}}, "required": ["value