<a href="https://colab.research.google.com/github/sapan-ostic/openvla/blob/neha_dev/OpenVLA_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Ask the user to upload the input image through Colab's upload helper.
# The result is kept in `uploaded`; the next cell looks entries up by key
# (file name) and wraps the stored value in BytesIO to decode the image.
from google.colab import files
uploaded = files.upload()


In [None]:
from io import BytesIO
from PIL import Image
import matplotlib.pyplot as plt

# Decode the first uploaded file (whatever its name) into a PIL image.
# `uploaded` comes from the upload cell and maps file name -> raw file bytes.
if not uploaded:
    # Fail with an actionable message instead of an opaque IndexError when the
    # upload dialog was cancelled or the upload cell has not produced any file.
    raise ValueError("No file was uploaded; re-run the upload cell and choose an image.")

image_name = next(iter(uploaded))
img = Image.open(BytesIO(uploaded[image_name]))

# Preview the image so a wrong upload is noticed before running inference.
plt.imshow(img)
plt.show()


In [None]:
# Install minimal dependencies (`torch`, `transformers`, `timm`, `tokenizers`, ...)
!pip install -r https://raw.githubusercontent.com/openvla/openvla/main/requirements-min.txt

# Install build-essential for compilation tools
!apt-get update && apt-get install -y build-essential

# The previous attempt to install flash_attn failed and it's not compatible with the current GPU.
# Therefore, we remove the installation command.
# !pip install flash_attn --no-build-isolation

from transformers import AutoModelForVision2Seq, AutoProcessor
from PIL import Image

import torch

In [None]:
import torch

# Load Processor & VLA
MODEL_ID = "openvla/openvla-7b"
DEVICE = "cuda:0"

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
vla = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    # attn_implementation="flash_attention_2" is left out:
    # FlashAttention is not supported on this GPU.
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).to(DEVICE)

# Grab image input & format prompt.
# `img` is the PIL image decoded from the uploaded file in the preview cell.
instruction = "put the carrot on the plate"
# This must be an f-string: without the `f` prefix the literal text
# "{instruction}" is sent to the model instead of the task description.
prompt = f"In: What action should the robot take to {instruction}?\nOut:"

# Predict Action (7-DoF; un-normalize for BridgeData V2)
inputs = processor(prompt, img).to(DEVICE, dtype=torch.bfloat16)
action = vla.predict_action(**inputs, unnorm_key="bridge_orig", do_sample=False)

print(f"Instruction: {instruction}")
print(f"Predicted Action (7-DoF): {action}")
# Execute...
# robot.act(action, ...)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Action vector recorded from an earlier run. It is only a fallback so this
# cell can still be run on its own; when the inference cell has been executed
# in this kernel, the freshly predicted `action` is plotted instead of these
# stale numbers.
reference_action = np.array([-2.08787322e-04, -1.00362692e-04, -7.25652626e-03,
                             4.96987302e-03, 1.66358845e-02, -6.67113405e-05, 9.96078431e-01])
action = np.asarray(globals().get("action", reference_action), dtype=float).ravel()

labels = ['Rel X', 'Rel Y', 'Rel Z', 'Rel Roll', 'Rel Pitch', 'Rel Yaw', 'Gripper']

# One value per label is required; fail early with a readable message rather
# than with a shape-mismatch error raised from inside matplotlib.
if action.shape != (len(labels),):
    raise ValueError(
        f"Expected a {len(labels)}-element action vector, got shape {action.shape}"
    )

fig = plt.figure(figsize=(14, 5))

# Plot 1: Bar chart of all 7 dimensions
# (translation in blue, rotation in salmon, gripper in green).
ax1 = fig.add_subplot(1, 2, 1)
colors = ['skyblue'] * 3 + ['salmon'] * 3 + ['lightgreen']
ax1.bar(labels, action, color=colors)
ax1.set_title('Action Vector Magnitudes')
ax1.set_ylabel('Un-normalized Value')
ax1.grid(axis='y', linestyle='--', alpha=0.7)

# Plot 2: 3D Visualization of the Translation Vector (X, Y, Z)
ax2 = fig.add_subplot(1, 2, 2, projection='3d')

# Plotting the vector from origin (0,0,0) to (action[0], action[1], action[2])
ax2.quiver(0, 0, 0, action[0], action[1], action[2],
           color='blue', length=1.0, arrow_length_ratio=0.3)

# Symmetric axis limits at 1.5x the largest translation component so the arrow
# stays inside the box; fall back to a small fixed range for an all-zero move.
max_translation = float(np.abs(action[:3]).max())
limit = max_translation * 1.5 if max_translation > 0 else 0.01
ax2.set_xlim([-limit, limit])
ax2.set_ylim([-limit, limit])
ax2.set_zlim([-limit, limit])

ax2.set_xlabel('X (Forward)')
ax2.set_ylabel('Y (Left/Right)')
ax2.set_zlabel('Z (Up/Down)')
ax2.set_title('Predicted 3D Movement Direction')

plt.tight_layout()
plt.show()