In [1]:
import argparse
import json
import h5py
import imageio
import numpy as np
import os
from copy import deepcopy

import torch

import robomimic
import robomimic.utils.file_utils as FileUtils
import robomimic.utils.torch_utils as TorchUtils
import robomimic.utils.tensor_utils as TensorUtils
import robomimic.utils.obs_utils as ObsUtils
from robomimic.envs.env_base import EnvBase
from robomimic.algo import RolloutPolicy

from robomimic.envs.env_gym import EnvGym
import robosuite as suite

from robosuite.wrappers import GymWrapper
from robosuite.controllers import load_controller_config
import matplotlib.pyplot as plt

import urllib.request

from diffusion_policy.policy.diffusion_transformer_lowdim_policy import DiffusionTransformerLowdimPolicy
import hydra
from omegaconf import OmegaConf

from diffusion_policy.policy.diffusion_transformer_hybrid_image_policy import DiffusionTransformerHybridImagePolicy
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
from diffusion_policy.model.diffusion.ema_model import EMAModel

from diffusion_policy.env_runner.base_image_runner import BaseImageRunner
import os
import json
import cv2


# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


  from .autonotebook import tqdm as notebook_tqdm




### Load checkpoint

In [2]:
# Move up to the repository root at most once. A bare `cd ..` climbs one more
# directory every time this cell is re-run, which silently breaks the relative
# paths ("ckpts/...", "config/...") used by the cells below. The "config"
# directory is used as the marker that we are already at the root.
if not os.path.isdir("config"):
    os.chdir("..")
# Show the resulting working directory, as the `cd` magic did.
print(os.getcwd())

/home/sen/Desktop/awe


In [3]:
# Checkpoint location, relative to the current working directory.
ckpt_path = "ckpts/latest.ckpt"

# Validate *before* loading: checking after torch.load() can never catch a
# missing file, and an explicit raise (unlike `assert`) is not stripped when
# Python runs with -O.
if not os.path.isfile(ckpt_path):
    raise FileNotFoundError(f"Checkpoint not found: {os.path.abspath(ckpt_path)}")

# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
ckpt = torch.load(ckpt_path, map_location=device)

### Load config

In [4]:

# Load the YAML configuration using OmegaConf.
cfg = OmegaConf.load("config/waypoint_image_can_ph_diffusion_policy_transformer.yaml")

# Register the "eval" resolver before resolving interpolations (presumably the
# YAML contains ${eval:...} expressions -- confirm against the config file).
# NOTE(review): this resolver runs Python's eval() on strings taken from the
# config, so only use it with trusted configuration files.
OmegaConf.register_new_resolver("eval", eval, replace=True)

# Convert to plain Python containers, replacing interpolations such as
# ${horizon} with their actual values.
cfg = OmegaConf.to_container(cfg, resolve=True)

policy_cfg = cfg["policy"]
scheduler_cfg = policy_cfg["noise_scheduler"]

# Create the noise scheduler from the resolved config values.
noise_scheduler = DDPMScheduler(
    num_train_timesteps=scheduler_cfg["num_train_timesteps"],
    beta_start=scheduler_cfg["beta_start"],
    beta_end=scheduler_cfg["beta_end"],
    beta_schedule=scheduler_cfg["beta_schedule"],
    variance_type=scheduler_cfg["variance_type"],
    clip_sample=scheduler_cfg["clip_sample"],
    prediction_type=scheduler_cfg["prediction_type"],
)

# Initialize the policy. Numeric settings are cast explicitly so the
# constructor receives ints/floats regardless of how the config stored them.
policy = DiffusionTransformerHybridImagePolicy(
    shape_meta=policy_cfg["shape_meta"],
    noise_scheduler=noise_scheduler,
    horizon=int(policy_cfg["horizon"]),
    n_action_steps=int(policy_cfg["n_action_steps"]),
    n_obs_steps=int(policy_cfg["n_obs_steps"]),
    num_inference_steps=int(policy_cfg["num_inference_steps"]),
    crop_shape=tuple(map(int, policy_cfg["crop_shape"])),
    obs_encoder_group_norm=policy_cfg["obs_encoder_group_norm"],
    eval_fixed_crop=policy_cfg["eval_fixed_crop"],
    n_layer=int(policy_cfg["n_layer"]),
    n_cond_layers=int(policy_cfg["n_cond_layers"]),
    n_head=int(policy_cfg["n_head"]),
    n_emb=int(policy_cfg["n_emb"]),
    p_drop_emb=float(policy_cfg["p_drop_emb"]),
    p_drop_attn=float(policy_cfg["p_drop_attn"]),
    causal_attn=policy_cfg["causal_attn"],
    time_as_cond=policy_cfg["time_as_cond"],
    obs_as_cond=policy_cfg["obs_as_cond"],
)

# Reuse the checkpoint already loaded in the "Load checkpoint" cell instead of
# deserialising the same file a second time (which doubles load time and keeps
# two copies of the weights in memory). The old names are kept as aliases.
checkpoint_path = ckpt_path
checkpoint = ckpt
policy.load_state_dict(checkpoint["state_dicts"]["ema_model"])

# Move to the device selected in the import cell and switch to evaluation mode.
policy.to(device)
policy.eval()

print("Model is initialized and ready for evaluation.")




using obs modality: low_dim with keys: ['robot0_eef_pos', 'robot0_eef_quat', 'robot0_gripper_qpos']
using obs modality: rgb with keys: ['robot0_eye_in_hand_image', 'agentview_image']
using obs modality: depth with keys: []
using obs modality: scan with keys: []




Model is initialized and ready for evaluation.


### Test model: Pass fake input through model

In [5]:
# Disabled smoke test. The code below is wrapped in a string literal, so none
# of it executes; the notebook only echoes the string back as the cell result.
# If the surrounding triple quotes are removed, it builds random tensors for
# every entry of shape_meta["obs"], passes them to policy.predict_action and
# prints the shapes of the returned "action" / "action_pred" tensors.
"""
# Define batch size and number of observation steps
batch_size = 30  
n_obs_steps = policy.n_obs_steps  # Extract from model

# Define the observation structure
shape_meta = policy_cfg["shape_meta"]

# Generate Fake Observations (WITHOUT "obs" wrapper)
fake_obs = {
    key: torch.randn(batch_size, n_obs_steps, *attr["shape"]).to(policy.device)
    for key, attr in shape_meta["obs"].items()
}

# Use the correct structure (no "obs" key)
obs_dict = fake_obs

# Run the model on fake data
result = policy.predict_action(obs_dict)

# Print the output
print("Fake Data Passed to Model.")
print("Action Shape:", result["action"].shape)
print("Predicted Action Shape:", result["action_pred"].shape)
print("Predicted Action:", result["action"])

"""

'\n# Define batch size and number of observation steps\nbatch_size = 30  \nn_obs_steps = policy.n_obs_steps  # Extract from model\n\n# Define the observation structure\nshape_meta = policy_cfg["shape_meta"]\n\n# Generate Fake Observations (WITHOUT "obs" wrapper)\nfake_obs = {\n    key: torch.randn(batch_size, n_obs_steps, *attr["shape"]).to(policy.device)\n    for key, attr in shape_meta["obs"].items()\n}\n\n# Use the correct structure (no "obs" key)\nobs_dict = fake_obs\n\n# Run the model on fake data\nresult = policy.predict_action(obs_dict)\n\n# Print the output\nprint("Fake Data Passed to Model.")\nprint("Action Shape:", result["action"].shape)\nprint("Predicted Action Shape:", result["action_pred"].shape)\nprint("Predicted Action:", result["action"])\n\n'

In [6]:
# Disabled env-runner construction. The code is wrapped in a string literal,
# so nothing here executes and no `env_runner` is created by this cell.
"""
# Load the environment runner from the config
cfg["task"]["env_runner"]["n_envs"] = 1  # Set num_envs to 1
env_runner = hydra.utils.instantiate(cfg["task"]["env_runner"], output_dir="rollout_outputs")
"""

'\n# Load the environment runner from the config\ncfg["task"]["env_runner"]["n_envs"] = 1  # Set num_envs to 1\nenv_runner = hydra.utils.instantiate(cfg["task"]["env_runner"], output_dir="rollout_outputs")\n'

### Visualise the environment view without taking an action

In [7]:
# Disabled observation preview. The code is wrapped in a string literal, so it
# does not execute. If enabled, it needs an existing `env_runner`: it resets
# the environment, prints the observation keys and, when "agentview_image" is
# present, shows the image at index [0, -1] after moving channels last.
"""
obs = env_runner.env.reset()
print("Observation keys:", obs.keys())

# Check if images are available
if "agentview_image" in obs:

    img = obs["agentview_image"][0, -1]  # Extract agent view image
    print(img.shape)
    img = np.transpose(img, (1, 2, 0))  # Convert from (C, H, W) to (H, W, C)
    
    plt.imshow(img)
    plt.axis("off")
    plt.title("Agent View Image from Observation")
    plt.show()
"""

'\nobs = env_runner.env.reset()\nprint("Observation keys:", obs.keys())\n\n# Check if images are available\nif "agentview_image" in obs:\n\n    img = obs["agentview_image"][0, -1]  # Extract agent view image\n    print(img.shape)\n    img = np.transpose(img, (1, 2, 0))  # Convert from (C, H, W) to (H, W, C)\n    \n    plt.imshow(img)\n    plt.axis("off")\n    plt.title("Agent View Image from Observation")\n    plt.show()\n'

### Visualise Rollout

In [None]:
# Override the runner settings from the config: a single environment and one
# visualised test episode.
runner_cfg = cfg["task"]["env_runner"]
runner_cfg.update({"n_envs": 1, "n_test_vis": 1})

# Build the runner, then run the rollout and save the video.
env_runner = hydra.utils.instantiate(runner_cfg, output_dir="rollout_outputs")
env_runner.run(policy)


env_meta {'env_name': 'PickPlaceCan', 'type': 1, 'env_kwargs': {'has_renderer': False, 'has_offscreen_renderer': True, 'ignore_done': True, 'use_object_obs': False, 'use_camera_obs': True, 'control_freq': 20, 'controller_configs': {'type': 'OSC_POSE', 'input_max': 1, 'input_min': -1, 'output_max': [0.05, 0.05, 0.05, 0.5, 0.5, 0.5], 'output_min': [-0.05, -0.05, -0.05, -0.5, -0.5, -0.5], 'kp': 150, 'damping': 1, 'impedance_mode': 'fixed', 'kp_limits': [0, 300], 'damping_limits': [0, 10], 'position_limits': None, 'orientation_limits': None, 'uncouple_pos_ori': True, 'control_delta': False, 'interpolation': None, 'ramp_ratio': 0.2}, 'robots': ['Panda'], 'camera_depths': False, 'camera_heights': 84, 'camera_widths': 84, 'reward_shaping': False, 'camera_names': ['agentview', 'robot0_eye_in_hand'], 'render_gpu_device_id': 0}}
Created environment with name PickPlaceCan
Action size is 7
Created environment with name PickPlaceCan
Action size is 7
Processed image shape: (3, 84, 84)
Processed imag

Eval PickPlaceCanImage 1/50:   0%|          | 0/200 [00:00<?, ?it/s]

Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)


Eval PickPlaceCanImage 1/50:   4%|▍         | 8/200 [00:01<00:42,  4.48it/s]

Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)


Eval PickPlaceCanImage 1/50:   8%|▊         | 16/200 [00:03<00:41,  4.44it/s]

Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)


Eval PickPlaceCanImage 1/50:  12%|█▏        | 24/200 [00:05<00:39,  4.51it/s]

Processed image shape: (3, 84, 84)
Processed image shape: (3, 84, 84)


KeyboardInterrupt: 

Eval PickPlaceCanImage 1/50:  12%|█▏        | 24/200 [00:19<00:39,  4.51it/s]