In [None]:
from huggingface_hub import snapshot_download
snapshot_download(
    repo_id="IPEC-COMMUNITY/spatialvla-4b-224-pt",
    local_dir="checkpoints/spatialvla-4b",
    local_dir_use_symlinks=False,   # реальные файлы, не симлинки
    resume_download=True            # докачает прерванное
)

from huggingface_hub import snapshot_download
snapshot_download(
    repo_id="openvla/openvla-7b",
    local_dir="checkpoints/openvla-7b",
    local_dir_use_symlinks=False,   # реальные файлы, без ссылок
    resume_download=True            # можно докачивать при обрыве
)

In [None]:
import torch, transformers, flash_attn
print("Torch:", torch.__version__, "| CUDA:", torch.version.cuda)
print("Transformers:", transformers.__version__)
from flash_attn import flash_attn_qkvpacked_func
print("Flash-Attn OK")

In [None]:
from transformers import AutoProcessor
proc = AutoProcessor.from_pretrained("checkpoints/spatialvla-4b")
print("Processor OK ✔︎")

### OpenVLA

In [None]:
import simpler_env
from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict
from simpler_env.policies.openvla.openvla_model import OpenVLAInference
import numpy as np
import mediapy

# 1. строим окружение
env = simpler_env.make("google_robot_open_drawer")   # или widowx_* задачи
obs, _ = env.reset()
instruction = env.get_language_instruction()

# 2. создаём policy‑обёртку
policy = OpenVLAInference(
        saved_model_path="checkpoints/openvla-7b",
        policy_setup="google_robot")                  # или 'widowx_bridge' google_robot
policy.reset(instruction)

done, trunc = False, False
frames = []
while not (done or trunc):
    image = get_image_from_maniskill2_obs_dict(env, obs)      # uint8 H×W×3
    _, act = policy.step(image, instruction)
    frames.append(image)
    # переводим dict → вектор, SimplerEnv ожидает [dx,dy,dz, dR(3), grip]
    vec = np.concatenate([act["world_vector"], act["rot_axangle"], act["gripper"]])
#     trajectory.append(vec)
    obs, _, done, trunc, info = env.step(vec)
    print(done, trunc)

mediapy.show_video(frames, fps=10)

### SpatialVLA

In [None]:
# export TORCH_DYNAMO_DISABLE=1        # Linux/mac
# # или в Python перед импортом PyTorch
# import os; os.environ["TORCH_DYNAMO_DISABLE"] = "1"
import torch, os
torch._dynamo.config.suppress_errors = True   # не падать, а тихо откатываться
torch._dynamo.disable()                       # полностью выключить граф-банкирование
# или переменная окружения
os.environ["TORCH_COMPILE"] = "0"


import simpler_env
from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict


import transformers
import types

# в новых версиях осталась make_flat_list_of_images; используем её
from transformers.image_utils import make_flat_list_of_images

import transformers.models.paligemma.processing_paligemma as pp

if not hasattr(pp, "make_batched_images"):
    # создаём совместимую обёртку
    def make_batched_images(images):
        """
        Fallback-реализация: просто разворачивает вложенные списки
        в плоский и возвращает его (то же поведение).
        """
        return make_flat_list_of_images(images)

    pp.make_batched_images = make_batched_images
# ---------------------------------------------------------------

from simpler_env.policies.spatialvla.spatialvla_model import SpatialVLAInference
import numpy as np
import mediapy

# 1. строим окружение
env = simpler_env.make("google_robot_open_drawer")   # или widowx_* задачи
obs, _ = env.reset()
instruction = env.get_language_instruction()

# 2. создаём policy‑обёртку
policy = SpatialVLAInference(
        # saved_model_path="checkpoints/spatialvla-4b",
        policy_setup="google_robot")                  # или 'widowx_bridge' google_robot
policy.reset(instruction)

done, trunc = False, False
frames = []
while not (done or trunc):
    image = get_image_from_maniskill2_obs_dict(env, obs)      # uint8 H×W×3
    _, act = policy.step(image, instruction)
    frames.append(image)
    # переводим dict → вектор, SimplerEnv ожидает [dx,dy,dz, dR(3), grip]
    vec = np.concatenate([act["world_vector"], act["rot_axangle"], act["gripper"]])
#     trajectory.append(vec)
    obs, _, done, trunc, info = env.step(vec)
    print(done, trunc)

mediapy.show_video(frames, fps=10)

  from .autonotebook import tqdm as notebook_tqdm


In [1]:

import os
os.environ["TORCH_COMPILE"] = "0"
os.environ["TORCHDYNAMO_DISABLE"] = "1"

import torch
torch._dynamo.reset()
torch._dynamo.config.suppress_errors = True
torch._dynamo.disable()

import simpler_env
from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict
import transformers
import numpy as np
import mediapy
from transformers.image_utils import make_flat_list_of_images
import transformers.models.paligemma.processing_paligemma as pp


if not hasattr(pp, "make_batched_images"):
    pp.make_batched_images = lambda images: make_flat_list_of_images(images)

from simpler_env.policies.spatialvla.spatialvla_model import SpatialVLAInference

class PatchedSpatialVLAInference(SpatialVLAInference):
    @torch.inference_mode()
    @torch.autocast(device_type='cuda', enabled=False)
    def step(self, image, instruction):
        with torch.no_grad(), torch.cuda.amp.autocast(enabled=False):
            return super().step(image, instruction)

# Класс постпроцессора
class SpatialPostProcessor:
    def __init__(self, env):
        self.env = env
        self.prev_actions = []
        self.max_speed = 0.25
        self.safe_height = 0.1

    def _get_environment_constraints(self):
        return {
            "collisions": self.env.get_physics().get_collision_info(),
            "objects": [{
                "position": obj.position,
                "bounds": obj.bounding_box,
                "material": obj.material_properties
            } for obj in self.env.scene.objects]
        }

    def _apply_physics_constraints(self, action, constraints):
        if len(self.prev_actions) >= 2:
            prev_speed = np.linalg.norm(self.prev_actions[-1][:3] - self.prev_actions[-2][:3])
            if prev_speed > self.max_speed:
                action[:3] *= self.max_speed / prev_speed
        
        for collision in constraints["collisions"]:
            if collision["type"] == "robot-object":
                action[:3] = self._avoid_collision(action[:3], collision["point"])
        
        target_obj = next((obj for obj in constraints["objects"] if obj.get("is_target", False)), None)
        if target_obj and target_obj["material"].get("fragility", 0) > 0.7:
            action[-1] = 0.5
        
        return action

    def _apply_spatial_consistency(self, action):
        current_pos = self.env.get_robot_state()["end_effector_pos"]
        if current_pos[2] - action[2] < self.safe_height:
            action[2] = current_pos[2] + self.safe_height
        
        workspace_limits = self.env.get_workspace_bounds()
        action[:3] = np.clip(action[:3], workspace_limits[0], workspace_limits[1])
        return action

    def process(self, raw_action):
        constraints = self._get_environment_constraints()
        processed_action = self._apply_spatial_consistency(raw_action.copy())
        processed_action = self._apply_physics_constraints(processed_action, constraints)
        self.prev_actions = [*self.prev_actions[-4:], processed_action]
        return processed_action

if __name__ == "__main__":
    # Инициализация окружения
    env = simpler_env.make("google_robot_open_drawer")
    obs, _ = env.reset()
    instruction = env.get_language_instruction()

    # Инициализация модели и постпроцессора
    policy = PatchedSpatialVLAInference(policy_setup="google_robot")
    policy.reset(instruction)
    post_processor = SpatialPostProcessor(env)

    # Главный цикл выполнения
    done, trunc = False, False
    frames = []
    
    while not (done or trunc):
        with torch._dynamo.disable():  # Локальное отключение для критической секции
            image = get_image_from_maniskill2_obs_dict(env, obs)
            _, act = policy.step(image, instruction)
        
        raw_vec = np.concatenate([act["world_vector"], act["rot_axangle"], act["gripper"]])
        processed_vec = post_processor.process(raw_vec)
        
        frames.append(image)
        obs, _, done, trunc, info = env.step(processed_vec)
        print(f"Step status: Done={done}, Truncated={trunc}")

    mediapy.show_video(frames, fps=10)

  from .autonotebook import tqdm as notebook_tqdm
2025-04-25 02:30:38.437869: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-25 02:30:38.437924: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-25 02:30:38.439461: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-25 02:30:38.446388: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[20

*** policy_setup: google_robot, unnorm_key: fractal20220817_data/0.1.0 ***


Some kwargs in processor config are unused and will not have any effect: action_config, action_chunk_size, statistics, intrinsic_config, bin_policy, num_obs_steps, obs_delta. 


Add 0 TRANSLATION TOKENS, tokenizer vocab size 257152 / 265347
Add 0 ROTATION TOKENS to tokenizer, tokenizer vocab size 257152 / 265347
Add 0 GRIPPER TOKENS to tokenizer, tokenizer vocab size 257152 / 265347


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.09it/s]


RuntimeError: torch._dynamo.optimize(...) is used with a context manager. Please refer to https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html to use torch._dynamo.optimize(...) as an annotation/decorator. 

In [None]:
import transformers, inspect
from transformers.models.paligemma import processing_paligemma as pp
print("Transformers", transformers.__version__)
print("make_batched_images →", "OK" if hasattr(pp, "make_batched_images") else "MISSING")

In [None]:
import simpler_env, numpy as np
from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict
from simpler_env.policies.openvla.openvla_model import OpenVLAInference
from mani_skill2.utils.record_utils import save_episode_as_gif

# --- 1. создаём среду ---------------------------------------------------------
env = simpler_env.make(
        "google_robot_pick_coke_can",
        render_mode="rgb_array"     # важно: нам нужны кадры
)
obs, _ = env.reset()
instruction = env.get_language_instruction()

# --- 2. инициализируем политику ----------------------------------------------
policy = OpenVLAInference("checkpoints/openvla-7b", policy_setup="google_robot")
policy.reset(instruction)

trajectory = []          # здесь будем копить actions

# --- 3. роллаут --------------------------------------------------------------
done = trunc = False
while not (done or trunc):
    image = get_image_from_maniskill2_obs_dict(env, obs)
    _, act = policy.step(image, instruction)

    vec = np.concatenate([act["world_vector"], act["rot_axangle"], act["gripper"]])
    trajectory.append(vec)                       # <= сохраняем действие

    obs, _, done, trunc, _ = env.step(vec)

# --- 4. сохраняем gif --------------------------------------------------------
save_episode_as_gif(
        env,                                # среда (её reset происходит внутри)
        actions=np.stack(trajectory),       # N×8 numpy-массив
        gif_path="pick_coke_can.gif",
        camera_names=["base"],              # можно ['base', 'hand', …]
        fps=20
)

env.close()
print("Гифка сохранена: pick_coke_can.gif")


In [None]:
import torch, sys
print("Torch", torch.__version__, "CUDA", torch.version.cuda)


In [None]:
import ctypes, importlib.util, sys
candidates = ("flash_attn", "bitsandbytes", "xformers")
for name in candidates:
    try:
        mod = importlib.import_module(name)
    except Exception as e:
        print(f"[{name}] → {type(e).__name__}: {e}")


In [None]:
import pkg_resources, sys
for pkg in ("numpy","transformers"):
    try:
        print(pkg, pkg_resources.get_distribution(pkg).version)
    except: pass

In [None]:
import importlib.util, transformers, numpy, sys, os
print("Transformers:", transformers.__version__,
      " →", os.path.dirname(importlib.util.find_spec("transformers").origin))
print("NumPy       :", numpy.__version__,
      " →", os.path.dirname(importlib.util.find_spec("numpy").origin))


In [None]:
import mani_skill2, mani_skill2_real2sim, pkg_resources
print("mani_skill2           :", mani_skill2.__version__)
print("mani_skill2_real2sim  :", mani_skill2_real2sim.__version__)

In [1]:
import transformers, inspect
from transformers.models.paligemma import processing_paligemma as pp
print("Transformers", transformers.__version__)
print("make_batched_images →", "OK" if hasattr(pp, "make_batched_images") else "MISSING")
import torch, os
torch._dynamo.config.suppress_errors = True   # не падать, а тихо откатываться
torch._dynamo.disable()                       # полностью выключить граф-банкирование
# или переменная окружения
os.environ["TORCH_COMPILE"] = "0"


import simpler_env
from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict


import transformers
import types

# в новых версиях осталась make_flat_list_of_images; используем её
from transformers.image_utils import make_flat_list_of_images

import transformers.models.paligemma.processing_paligemma as pp

if not hasattr(pp, "make_batched_images"):
    # создаём совместимую обёртку
    def make_batched_images(images):
        """
        Fallback-реализация: просто разворачивает вложенные списки
        в плоский и возвращает его (то же поведение).
        """
        return make_flat_list_of_images(images)

    pp.make_batched_images = make_batched_images
# ---------------------------------------------------------------

from simpler_env.policies.spatialvla.spatialvla_model import SpatialVLAInference
import numpy as np
import mediapy
# ---------- Отключаем torch.compile жёстко ----------
import torch, functools
torch.compile = functools.wraps(torch.compile)(lambda m, *a, **kw: m)

# ---------- Дальше обычный код ----------
import simpler_env
from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict
from simpler_env.policies.spatialvla.spatialvla_model import SpatialVLAInference
import numpy as np, mediapy

frames = [[], []]
for i in range(2):
    env = simpler_env.make("widowx_put_eggplant_in_basket") #widowx_put_eggplant_in_basket google_robot_open_drawer
    obs, _ = env.reset(seed=np.random.randint(0, 9999))
    instruction = env.get_language_instruction()

    policy = SpatialVLAInference("checkpoints/spatialvla-4b", policy_setup="widowx_bridge", action_ensemble_temp=2.01) #widowx_bridge google_robot
    policy.reset(instruction)

    frames[i], done, trunc = [], False, False
    while not (done or trunc):
        img = get_image_from_maniskill2_obs_dict(env, obs)
        _, act = policy.step(img, instruction)
        frames[i].append(img)
        vec = np.concatenate([act["world_vector"], act["rot_axangle"], act["gripper"]])
        obs, _, done, trunc, _ = env.step(vec)


# mediapy.show_video(frames1, fps=10)
# mediapy.show_video(frames2, fps=10)

  from .autonotebook import tqdm as notebook_tqdm


Transformers 4.49.0
make_batched_images → MISSING


2025-04-25 04:05:51.955309: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-25 04:05:51.955378: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-25 04:05:51.956932: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-25 04:05:51.964014: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


KeyboardInterrupt: 

In [3]:
mediapy.show_video(frames[0], fps=10)
mediapy.show_video(frames[1], fps=10)

0
This browser does not support the video tag.


0
This browser does not support the video tag.


Улучшенный SpatialVLA(старый)

In [2]:
# Полное отключение Torch Dynamo и компиляции
import os
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["TORCH_COMPILE"] = "0"

import numpy as np
import torch
import cv2 as cv
from typing import Optional
from transforms3d.euler import euler2axangle
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

# ----- Патч для make_batched_images -----
import transformers
from transformers.models.paligemma import processing_paligemma as pp
if not hasattr(pp, 'make_batched_images'):
    from transformers.image_utils import make_flat_list_of_images
    def make_batched_images(images):
        return make_flat_list_of_images(images)
    pp.make_batched_images = make_batched_images

# ---------- SpatialVLA ----------
from simpler_env.policies.spatialvla.spatialvla_model import SpatialVLAInference

# ---------- ECoTInference ----------
class EcoTInference:
    def __init__(
        self,
        saved_model_path: str,
        unnorm_key: Optional[str] = None,
        policy_setup: str = "widowx_bridge",
        image_size: list[int] = [224, 224],
        action_scale: float = 1.0,
    ) -> None:
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        self.policy_setup = policy_setup
        self.unnorm_key = (
            unnorm_key
            or ("bridge_orig" if policy_setup == "widowx_bridge" else "fractal20220817_data")
        )
        print(f"*** ECoT policy_setup: {policy_setup}, unnorm_key: {self.unnorm_key} ***")
        self.processor = AutoProcessor.from_pretrained(saved_model_path, trust_remote_code=True)
        self.model = AutoModelForVision2Seq.from_pretrained(
            saved_model_path,
            attn_implementation="sdpa",
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        ).cuda()
        self.image_size = image_size
        self.action_scale = action_scale
        self.sticky_repeat = 1 if policy_setup == "widowx_bridge" else 15
        self.reset(None)

    def reset(self, task_description: Optional[str]) -> None:
        self.task_description = task_description
        self.prev_grip = None
        self.grip_count = 0
        self.sticky_on = False
        self.sticky_val = 0.0

    def _resize(self, img: np.ndarray) -> np.ndarray:
        return cv.resize(img, tuple(self.image_size), interpolation=cv.INTER_AREA)

    def step(self, image: np.ndarray, task: str) -> tuple[dict, dict]:
        if task != self.task_description:
            self.reset(task)
        img = self._resize(image)
        pil = Image.fromarray(img)
        inputs = self.processor(task, pil).to("cuda", torch.bfloat16)
        out = self.model.predict_action(
            **inputs, unnorm_key=self.unnorm_key, do_sample=False
        )
        # handle tuple output
        raw_tensor = out[0] if isinstance(out, tuple) else out
        # to numpy
        raw_np = (
            raw_tensor.detach().cpu().numpy()
            if torch.is_tensor(raw_tensor)
            else np.array(raw_tensor)
        )
        # parse components
        wv, rd, gr = raw_np[:3], raw_np[3:6], raw_np[6]
        ax, ang = euler2axangle(*rd)
        rot = (ax * ang) * self.action_scale
        # gripper logic
        if self.prev_grip is not None:
            rel = self.prev_grip - gr
            if abs(rel) > 0.5 and not self.sticky_on:
                self.sticky_on = True
                self.sticky_val = rel
            if self.sticky_on:
                self.grip_count += 1
                rel = self.sticky_val
            if self.grip_count >= self.sticky_repeat:
                self.sticky_on = False
                self.grip_count = 0
            grip = rel
        else:
            grip = 2 * (gr > 0.5) - 1
        self.prev_grip = gr
        raw_act = {"world_vector": wv, "rotation_delta": rd, "open_gripper": gr}
        act = {
            "world_vector": wv * self.action_scale,
            "rot_axangle": rot,
            "gripper": grip,
            "terminate_episode": np.array([0.0]),
        }
        return raw_act, act

# ---------- CombinedSpatialECoTInference ----------
class CombinedSpatialECoTInference:
    def __init__(
        self,
        spatial_cp: str,
        ecot_cp: str,
        setup: str = "widowx_bridge",
        temp: float = 2.0,
        size: list[int] = [224, 224],
        scale: float = 1.0,
    ):
        self.spatial = SpatialVLAInference(
            spatial_cp,
            policy_setup=setup,
            action_ensemble_temp=temp,
        )
        self.ecot = EcoTInference(
            ecot_cp,
            policy_setup=setup,
            image_size=size,
            action_scale=scale,
        )
        self.task = None

    def reset(self, task: str):
        self.task = task
        self.spatial.reset(task)
        self.ecot.reset(task)

    def step(self, img: np.ndarray, task: Optional[str] = None):
        if task and task != self.task:
            self.reset(task)
        _, sa = self.spatial.step(img, self.task)
        feat = (
            f"EMBODIED FEATURES: pos={sa['world_vector'].tolist()}, "
            f"rot={sa.get('rot_axangle', [0,0,0]).tolist()}"
        )
        prompt = f"{feat}\n{self.task}"
        re, ea = self.ecot.step(img, prompt)
        final = {
            "world_vector": ea["world_vector"],
            "rot_axangle": ea["rot_axangle"],
            "gripper": np.atleast_1d(ea["gripper"]),
            "terminate_episode": np.array([0.0]),
        }
        return re, final

if __name__ == '__main__':
    import gymnasium as gym
    import mediapy
    import numpy as np
    from simpler_env import ENVIRONMENT_MAP

    # Создание среды без глубины (rgb only) с рендером в массив
    env_name, env_kwargs = ENVIRONMENT_MAP['widowx_put_eggplant_in_basket']
    env = gym.make(
        env_name,
        obs_mode='rgb',
        render_mode='rgb_array',      # включаем возврат кадра
        prepackaged_config=True,
        **env_kwargs
    )
    obs, _ = env.reset(seed=0)
    # Получение инструкции
    try:
        task = env.unwrapped.get_language_instruction()
    except Exception:
        task = env.get_language_instruction()

    policy = CombinedSpatialECoTInference(
        spatial_cp='checkpoints/spatialvla-4b',
        ecot_cp='Embodied-CoT/ecot-openvla-7b-bridge',
    )
    policy.reset(task)

    def fetch_image(obs_dict, camera_name="base_camera"):
        # 1) nested obs["image"][camera_name]["rgb"]
        try:
            return obs_dict["image"][camera_name]["rgb"]
        except (KeyError, TypeError):
            pass
        # 2) obs["image"] — сам numpy-массив
        if isinstance(obs_dict.get("image"), np.ndarray):
            return obs_dict["image"]
        # 3) obs["image"] — dict без camera_name
        cand = obs_dict.get("image")
        if isinstance(cand, dict):
            for v in cand.values():
                if isinstance(v, dict) and "rgb" in v:
                    return v["rgb"]
        # 4) fallback через render()
        img = env.render()  # теперь гарантированно работает
        if isinstance(img, tuple):
            img = img[0]
        return img

    frames = []
    done = False
    while not done:
        img = fetch_image(obs)
        _, action = policy.step(img)
        frames.append(img)
        vec = np.concatenate([
            action['world_vector'],
            action['rot_axangle'],
            action['gripper'],
        ])
        try:
            obs, _, done, _, _ = env.step(vec)
        except ValueError:
            obs, _, done, _ = env.step(vec)

    mediapy.show_video(frames, fps=10)

[2025-04-25 03:43:46.441] [svulkan2] [error] GLFW error: X11: The DISPLAY environment variable is missing


*** policy_setup: widowx_bridge, unnorm_key: bridge_orig/1.0.0 ***


KeyboardInterrupt: 

Улучшенный SpatialVLA

In [2]:
import os
import numpy as np
import mediapy
import torch
import transformers
from typing import List, Tuple

# Конфигурация окружения
os.environ["TORCH_COMPILE"] = "0"
torch._dynamo.config.suppress_errors = True
torch._dynamo.disable()

# Инициализация обработки изображений
def setup_image_processing():
    from transformers.image_utils import make_flat_list_of_images
    import transformers.models.paligemma.processing_paligemma as pp
    
    if not hasattr(pp, "make_batched_images"):
        pp.make_batched_images = lambda images: make_flat_list_of_images(images)
    return pp

# Класс для управления политикой
class VLAPolicyController:
    def __init__(self, model_path: str, policy_setup: str, temp: float = 2.01):
        from simpler_env.policies.spatialvla.spatialvla_model import SpatialVLAInference
        
        self.policy = SpatialVLAInference(
            model_path, 
            policy_setup=policy_setup,
            action_ensemble_temp=temp
        )
        self.action_history = []
        
    def reset(self, instruction: str):
        self.policy.reset(instruction)
        self.action_history.clear()
        
    def get_action(self, image: np.ndarray, instruction: str) -> np.ndarray:
        _, act = self.policy.step(image, instruction)
        return self._process_action(act)
        
    def _process_action(self, action: dict) -> np.ndarray:
        vec = np.concatenate([
            action["world_vector"], 
            action["rot_axangle"], 
            action["gripper"]
        ])
        # Сглаживание действий
        self.action_history.append(vec)
        return np.mean(self.action_history[-3:], axis=0) if len(self.action_history) > 2 else vec

# Функция выполнения эпизода
def run_episode(env, policy_controller: VLAPolicyController, 
               max_steps: int = 100) -> Tuple[List[np.ndarray], bool]:
    frames = []
    obs, _ = env.reset(seed=np.random.randint(0, 10000))
    instruction = env.get_language_instruction()
    policy_controller.reset(instruction)
    
    done, trunc = False, False
    for _ in range(max_steps):
        img = get_image_from_maniskill2_obs_dict(env, obs)
        action = policy_controller.get_action(img, instruction)
        obs, _, done, trunc, _ = env.step(action)
        frames.append(img)
        
        if done or trunc:
            break
            
    return frames, done

# Основной исполняемый код
if __name__ == "__main__":
    # Инициализация компонентов
    pp = setup_image_processing()
    task_configs = [
        ("widowx_put_eggplant_in_basket", "widowx_bridge")
        #("google_robot_open_drawer", "google_robot")
    ]
    
    results = {}
    for task_name, policy_setup in task_configs:
        env = simpler_env.make(task_name)
        controller = VLAPolicyController(
            "checkpoints/spatialvla-4b", 
            policy_setup,
            temp=2.01
        )
        
        # Выполнение 3 попыток для каждой задачи
        attempts = []
        for _ in range(3):
            frames, success = run_episode(env, controller)
            attempts.append((frames, success))
        env.close()
        
        results[task_name] = attempts
    
    # Визуализация результатов
    for task_name, attempts in results.items():
        print(f"\nResults for {task_name}:")
        for i, (frames, success) in enumerate(attempts):
            status = "SUCCESS" if success else "FAILED"
            mediapy.show_video(frames, fps=10, title=f"{task_name} - Attempt {i+1} ({status})")



*** policy_setup: widowx_bridge, unnorm_key: bridge_orig/1.0.0 ***


Some kwargs in processor config are unused and will not have any effect: obs_delta, statistics, action_config, intrinsic_config, action_chunk_size, num_obs_steps, bin_policy. 


Add 0 TRANSLATION TOKENS, tokenizer vocab size 257152 / 265347
Add 0 ROTATION TOKENS to tokenizer, tokenizer vocab size 257152 / 265347
Add 0 GRIPPER TOKENS to tokenizer, tokenizer vocab size 257152 / 265347


KeyboardInterrupt: 

ECoT+SpatialVLA 
(ТОЧНО РАБОТАЕТ НО ДОЛГО ВОТ ЕГО ПРЯМ НАДО ЗАПУСТИТЬ)

In [1]:
import os
import numpy as np
import mediapy
import torch
import transformers
from typing import List, Tuple
from PIL import Image
# Конфигурация окружения
os.environ["TORCH_COMPILE"] = "0"
torch._dynamo.config.suppress_errors = True
torch._dynamo.disable()
import transformers, inspect
from transformers.models.paligemma import processing_paligemma as pp
print("Transformers", transformers.__version__)
print("make_batched_images →", "OK" if hasattr(pp, "make_batched_images") else "MISSING")
import torch, os
torch._dynamo.config.suppress_errors = True   # не падать, а тихо откатываться
torch._dynamo.disable()                       # полностью выключить граф-банкирование
# или переменная окружения
os.environ["TORCH_COMPILE"] = "0"


import simpler_env
from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict


import transformers
import types

# в новых версиях осталась make_flat_list_of_images; используем её
from transformers.image_utils import make_flat_list_of_images

import transformers.models.paligemma.processing_paligemma as pp

if not hasattr(pp, "make_batched_images"):
    # создаём совместимую обёртку
    def make_batched_images(images):
        """
        Fallback-реализация: просто разворачивает вложенные списки
        в плоский и возвращает его (то же поведение).
        """
        return make_flat_list_of_images(images)

    pp.make_batched_images = make_batched_images
# ---------------------------------------------------------------

from simpler_env.policies.spatialvla.spatialvla_model import SpatialVLAInference
import numpy as np
import mediapy
# ---------- Отключаем torch.compile жёстко ----------
import torch, functools
torch.compile = functools.wraps(torch.compile)(lambda m, *a, **kw: m)

# ---------- Дальше обычный код ----------
import simpler_env
from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict
from simpler_env.policies.spatialvla.spatialvla_model import SpatialVLAInference
import numpy as np, mediapy

# Инициализация ECoT
from transformers import AutoProcessor, AutoModelForVision2Seq

class ECoTReasoner:
    def __init__(self, device="cuda:0"):
        self.device = device
        self.processor = AutoProcessor.from_pretrained(
            "Embodied-CoT/ecot-openvla-7b-bridge", 
            trust_remote_code=True
        )
        self.model = AutoModelForVision2Seq.from_pretrained(
            "Embodied-CoT/ecot-openvla-7b-bridge",
            torch_dtype=torch.bfloat16,
            trust_remote_code=True
        ).to(device).eval()
    
    def generate_reasoning(self, image: np.ndarray, prompt: str) -> str:
        processed_image = Image.fromarray(image)
        inputs = self.processor(
            prompt, 
            processed_image,
            return_tensors="pt"
        ).to(self.device, dtype=torch.bfloat16)
        
        generated_ids = self.model.generate(**inputs, max_new_tokens=100)
        return self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
def setup_image_processing():
    from transformers.image_utils import make_flat_list_of_images
    import transformers.models.paligemma.processing_paligemma as pp
    
    if not hasattr(pp, "make_batched_images"):
        pp.make_batched_images = lambda images: make_flat_list_of_images(images)
    return pp
# Остальной код с изменениями
class VLAPolicyController:
    def __init__(self, model_path: str, policy_setup: str, temp: float = 2.01):
        from simpler_env.policies.spatialvla.spatialvla_model import SpatialVLAInference
        self.ecot = ECoTReasoner()  # Инициализация ECoT
        self.policy = SpatialVLAInference(
            model_path, 
            policy_setup=policy_setup,
            action_ensemble_temp=temp
        )
        self.action_history = []
        self.current_reasoning = ""  # Добавляем новое состояние

    def reset(self, instruction: str):
        """Важно сохранить этот метод!"""
        self.policy.reset(instruction)
        self.action_history.clear()
        self.current_reasoning = ""  # Сброс состояния рассуждений
    def _process_action(self, action: dict) -> np.ndarray:
        vec = np.concatenate([
            action["world_vector"], 
            action["rot_axangle"], 
            action["gripper"]
        ])
        # Сглаживание действий
        self.action_history.append(vec)
        return np.mean(self.action_history[-3:], axis=0) if len(self.action_history) > 2 else vec
        
    def get_action(self, image: np.ndarray, instruction: str) -> np.ndarray:
        # Генерация цепочки рассуждений
        self.current_reasoning = self.ecot.generate_reasoning(image, instruction)
        
        # Получение действия с учетом рассуждений
        _, act = self.policy.step(image, self.current_reasoning)
        return self._process_action(act)
def run_episode(env, policy_controller: VLAPolicyController, 
               max_steps: int = 100) -> Tuple[List[np.ndarray], bool]:
    frames = []
    obs, _ = env.reset(seed=np.random.randint(0, 10000))
    instruction = env.get_language_instruction()
    policy_controller.reset(instruction)
    
    done, trunc = False, False
    for _ in range(max_steps):
        img = get_image_from_maniskill2_obs_dict(env, obs)
        action = policy_controller.get_action(img, instruction)
        obs, _, done, trunc, _ = env.step(action)
        frames.append(img)
        
        if done or trunc:
            break
            
    return frames, done

# Основной исполняемый код
if __name__ == "__main__":
    # Инициализация компонентов
    pp = setup_image_processing()
    task_configs = [
        ("widowx_put_eggplant_in_basket", "widowx_bridge")
        #("google_robot_open_drawer", "google_robot")
    ]
    
    results = {}
    for task_name, policy_setup in task_configs:
        env = simpler_env.make(task_name)
        controller = VLAPolicyController(
            "checkpoints/spatialvla-4b", 
            policy_setup,
            temp=2.01
        )
        
        # Выполнение 3 попыток для каждой задачи
        attempts = []
        for _ in range(10):
            frames, success = run_episode(env, controller)
            attempts.append((frames, success))
        env.close()
        
        results[task_name] = attempts
    
    for task_name, attempts in results.items():
        print(f"\nResults for {task_name}:")
        for i, (frames, success) in enumerate(attempts):
            status = "SUCCESS" if success else "FAILED"
            mediapy.show_video(frames, fps=10, title=f"{task_name} - Attempt {i+1} ({status})")

  from .autonotebook import tqdm as notebook_tqdm


Transformers 4.49.0
make_batched_images → MISSING


2025-04-25 09:59:17.537501: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-25 09:59:17.537615: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-25 09:59:17.540191: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-25 09:59:17.552642: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2025-04-25 09:59:19.540] [svulkan2] [error] GLFW err

*** policy_setup: widowx_bridge, unnorm_key: bridge_orig/1.0.0 ***


Some kwargs in processor config are unused and will not have any effect: bin_policy, obs_delta, action_chunk_size, num_obs_steps, intrinsic_config, action_config, statistics. 


Add 0 TRANSLATION TOKENS, tokenizer vocab size 257152 / 265347
Add 0 ROTATION TOKENS to tokenizer, tokenizer vocab size 257152 / 265347
Add 0 GRIPPER TOKENS to tokenizer, tokenizer vocab size 257152 / 265347


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.65it/s]
  logger.warn(
  logger.warn(



Results for widowx_put_eggplant_in_basket:


0
widowx_put_eggplant_in_basket - Attempt 1 (FAILED)  This browser does not support the video tag.


0
widowx_put_eggplant_in_basket - Attempt 2 (FAILED)  This browser does not support the video tag.


0
widowx_put_eggplant_in_basket - Attempt 3 (FAILED)  This browser does not support the video tag.


0
widowx_put_eggplant_in_basket - Attempt 4 (FAILED)  This browser does not support the video tag.


0
widowx_put_eggplant_in_basket - Attempt 5 (FAILED)  This browser does not support the video tag.


0
widowx_put_eggplant_in_basket - Attempt 6 (FAILED)  This browser does not support the video tag.


0
widowx_put_eggplant_in_basket - Attempt 7 (SUCCESS)  This browser does not support the video tag.


0
widowx_put_eggplant_in_basket - Attempt 8 (FAILED)  This browser does not support the video tag.


0
widowx_put_eggplant_in_basket - Attempt 9 (FAILED)  This browser does not support the video tag.


0
widowx_put_eggplant_in_basket - Attempt 10 (SUCCESS)  This browser does not support the video tag.


In [None]:
import os
# 1) Отключаем PyTorch-динамку и JIT-компиляцию одним env-вариантом
os.environ["TORCH_COMPILE"] = "0"

import numpy as np
import mediapy
import gymnasium as gym
from PIL import Image
from typing import List, Tuple

# 2) Импортируем и патчим make_batched_images до transformers.load
import transformers
from transformers.image_utils import make_flat_list_of_images
import transformers.models.paligemma.processing_paligemma as pp
if not hasattr(pp, "make_batched_images"):
    pp.make_batched_images = lambda images: make_flat_list_of_images(images)

# 3) Импорт SpatialVLA и ECoT
import torch
from simpler_env.policies.spatialvla.spatialvla_model import SpatialVLAInference
from transformers import AutoProcessor, AutoModelForVision2Seq
from transforms3d.euler import euler2axangle

# 4) Утилита: безопасно получить RGB-кадр из obs
def get_image(obs, env, camera_name="base_camera"):
    try:
        return obs["image"][camera_name]["rgb"]
    except Exception:
        img = obs.get("image")
        if isinstance(img, np.ndarray):
            return img
        if isinstance(img, dict):
            for v in img.values():
                if isinstance(v, dict) and "rgb" in v:
                    return v["rgb"]
    # fallback: env.render
    ret = env.render()
    return ret[0] if isinstance(ret, tuple) else ret

# 5) Планировщик ECoT — один раз на reset
class ECoTPlanner:
    def __init__(self, model_path: str, unnorm_key: str, device="cuda"):
        self.device = torch.device(device)
        # use_fast=True для ускорения токенизации
        self.processor = AutoProcessor.from_pretrained(
            model_path, trust_remote_code=True, use_fast=True
        )
        self.model = AutoModelForVision2Seq.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True
        ).to(self.device).eval()
        
    def plan(self, image: np.ndarray, instruction: str) -> str:
        img = Image.fromarray(image)
        inputs = self.processor(instruction, img, return_tensors="pt")
        inputs = {
            k: v.to(self.device, dtype=torch.bfloat16) if v.dtype == torch.float else v.to(self.device)
            for k, v in inputs.items()
        }
        with torch.inference_mode():
            gen = self.model.generate(
                **inputs,
                max_new_tokens=10,
                do_sample=False,
                use_cache=True
            )
        return self.processor.batch_decode(gen, skip_special_tokens=True)[0]

# 6) Контроллер: один CoT-план + fast SpatialVLA
class FastCombinedController:
    def __init__(self,
                 spatial_ckpt: str,
                 ecot_ckpt: str,
                 policy_setup: str,
                 ecot_unnorm_key: str,
                 spatial_temp: float = 2.0):
        self.spatial = SpatialVLAInference(
            spatial_ckpt,
            policy_setup=policy_setup,
            action_ensemble_temp=spatial_temp
        )
        self.planner = ECoTPlanner(ecot_ckpt, ecot_unnorm_key)
        self._reset_state()

    def _reset_state(self):
        self.plan_text = None
        self.task = None
        self.action_hist: List[np.ndarray] = []

    def reset(self, image: np.ndarray, instruction: str):
        # 1) генерируем короткий план
        self.plan_text = self.planner.plan(image, instruction)
        self.task = instruction
        # 2) reset SpatialVLA с этим планом
        self.spatial.reset(self.plan_text)
        self.action_hist.clear()

    def get_action(self, image: np.ndarray) -> np.ndarray:
        # Сразу в inference_mode (SpatialVLAInference внутри уже .step без grad)
        _, act = self.spatial.step(image, self.plan_text)
        vec = np.concatenate([act["world_vector"], act["rot_axangle"], act["gripper"]])
        # Простое сглаживание последних 3 шагов
        self.action_hist.append(vec)
        if len(self.action_hist) > 3:
            return np.mean(self.action_hist[-3:], axis=0)
        return vec

# 7) Эпизод
def run_episode(env, controller: FastCombinedController,
                max_steps: int = 100) -> Tuple[List[np.ndarray], bool]:
    frames = []
    obs, _ = env.reset(seed=np.random.randint(0, 10000))
    img0 = get_image(obs, env)
    instr = env.get_language_instruction()
    controller.reset(img0, instr)

    done = False
    for _ in range(max_steps):
        img = get_image(obs, env)
        action = controller.get_action(img)
        obs, _, done, trunc, _ = env.step(action)
        frames.append(img)
        if done or trunc:
            break
    return frames, done

# 8) Main: правильное gym.make
if __name__ == "__main__":
    from simpler_env import ENVIRONMENT_MAP

    tasks = [
        ("widowx_put_eggplant_in_basket", "widowx_bridge", "bridge_orig"),
        # ("google_robot_open_drawer", "google_robot", "fractal20220817_data")
    ]

    for task_name, setup, unnorm in tasks:
        env_name, env_kwargs = ENVIRONMENT_MAP[task_name]
        # убираем prepackaged_config из env_kwargs, чтобы не дублировать
        base_kwargs = {k: v for k, v in env_kwargs.items() if k != "prepackaged_config"}

        env = gym.make(
            env_name,
            obs_mode="rgb",
            render_mode="rgb_array",
            prepackaged_config=True,
            **base_kwargs
        )

        controller = FastCombinedController(
            spatial_ckpt="checkpoints/spatialvla-4b",
            ecot_ckpt="Embodied-CoT/ecot-openvla-7b-bridge",
            policy_setup=setup,
            ecot_unnorm_key=unnorm
        )

        successes = 0
        for ep in range(5):
            frames, ok = run_episode(env, controller)
            successes += int(ok)
            mediapy.show_video(
                frames, fps=10,
                title=f"{task_name} EP{ep+1} - {'OK' if ok else 'FAIL'}"
            )

        print(f"{task_name} success rate: {successes}/5")
        env.close()

  from .autonotebook import tqdm as notebook_tqdm
2025-04-25 11:03:20.522324: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-25 11:03:20.522379: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-25 11:03:20.524014: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-25 11:03:20.530516: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[20

*** policy_setup: widowx_bridge, unnorm_key: bridge_orig/1.0.0 ***


Some kwargs in processor config are unused and will not have any effect: bin_policy, num_obs_steps, statistics, action_config, action_chunk_size, intrinsic_config, obs_delta. 


Add 0 TRANSLATION TOKENS, tokenizer vocab size 257152 / 265347
Add 0 ROTATION TOKENS to tokenizer, tokenizer vocab size 257152 / 265347
Add 0 GRIPPER TOKENS to tokenizer, tokenizer vocab size 257152 / 265347


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.01it/s]
`use_fast` is set to `True` but the image processor class does not have a fast version.  Falling back to the slow version.
Expected `transformers==4.40.1` and `tokenizers==0.19.1` but got `transformers==4.49.0` and `tokenizers==0.21.1`; there might be inference-time regressions due to dependency changes. If in doubt, pleaseuse the above versions.
Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  4.13it/s]
  logger.warn(


Unsupported: dynamic shape operator: aten.nonzero.default; to enable, set torch._dynamo.config.capture_dynamic_output_shape_ops = True

from user code:
   File "/root/.cache/huggingface/modules/transformers_modules/spatialvla-4b/modeling_spatialvla.py", line 367, in forward
    inputs_embeds[spatial_selected] = inputs_embeds[spatial_selected] * 0.0 + self.spatial_embed_tokens(input_ids[spatial_selected] - self.config.action_token_begin_idx)

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information


You can suppress this exception and fall back to eager by setting:
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True


SpatialVLA + ECoT
(старый 20минут)

In [4]:
# Полное отключение Torch Dynamo и компиляции
import os
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["TORCH_COMPILE"] = "0"

import numpy as np
import torch
import cv2 as cv
from typing import Optional
from transforms3d.euler import euler2axangle
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

# ----- Патч для make_batched_images -----
import transformers
from transformers.models.paligemma import processing_paligemma as pp
if not hasattr(pp, 'make_batched_images'):
    from transformers.image_utils import make_flat_list_of_images
    def make_batched_images(images):
        return make_flat_list_of_images(images)
    pp.make_batched_images = make_batched_images

# ---------- SpatialVLA ----------
from simpler_env.policies.spatialvla.spatialvla_model import SpatialVLAInference

# ---------- ECoTInference ----------
class EcoTInference:
    def __init__(
        self,
        saved_model_path: str,
        unnorm_key: Optional[str] = None,
        policy_setup: str = "widowx_bridge",
        image_size: list[int] = [224, 224],
        action_scale: float = 1.0,
    ) -> None:
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        self.policy_setup = policy_setup
        self.unnorm_key = (
            unnorm_key
            or ("bridge_orig" if policy_setup == "widowx_bridge" else "fractal20220817_data")
        )
        print(f"*** ECoT policy_setup: {policy_setup}, unnorm_key: {self.unnorm_key} ***")
        self.processor = AutoProcessor.from_pretrained(saved_model_path, trust_remote_code=True)
        self.model = AutoModelForVision2Seq.from_pretrained(
            saved_model_path,
            attn_implementation="sdpa",
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        ).cuda()
        self.image_size = image_size
        self.action_scale = action_scale
        self.sticky_repeat = 1 if policy_setup == "widowx_bridge" else 15
        self.reset(None)

    def reset(self, task_description: Optional[str]) -> None:
        self.task_description = task_description
        self.prev_grip = None
        self.grip_count = 0
        self.sticky_on = False
        self.sticky_val = 0.0

    def _resize(self, img: np.ndarray) -> np.ndarray:
        return cv.resize(img, tuple(self.image_size), interpolation=cv.INTER_AREA)

    def step(self, image: np.ndarray, task: str) -> tuple[dict, dict]:
        if task != self.task_description:
            self.reset(task)
        img = self._resize(image)
        pil = Image.fromarray(img)
        inputs = self.processor(task, pil).to("cuda", torch.bfloat16)
        out = self.model.predict_action(
            **inputs, unnorm_key=self.unnorm_key, do_sample=False
        )
        # handle tuple output
        raw_tensor = out[0] if isinstance(out, tuple) else out
        # to numpy
        raw_np = (
            raw_tensor.detach().cpu().numpy()
            if torch.is_tensor(raw_tensor)
            else np.array(raw_tensor)
        )
        # parse components
        wv, rd, gr = raw_np[:3], raw_np[3:6], raw_np[6]
        ax, ang = euler2axangle(*rd)
        rot = (ax * ang) * self.action_scale
        # gripper logic
        if self.prev_grip is not None:
            rel = self.prev_grip - gr
            if abs(rel) > 0.5 and not self.sticky_on:
                self.sticky_on = True
                self.sticky_val = rel
            if self.sticky_on:
                self.grip_count += 1
                rel = self.sticky_val
            if self.grip_count >= self.sticky_repeat:
                self.sticky_on = False
                self.grip_count = 0
            grip = rel
        else:
            grip = 2 * (gr > 0.5) - 1
        self.prev_grip = gr
        raw_act = {"world_vector": wv, "rotation_delta": rd, "open_gripper": gr}
        act = {
            "world_vector": wv * self.action_scale,
            "rot_axangle": rot,
            "gripper": grip,
            "terminate_episode": np.array([0.0]),
        }
        return raw_act, act

# ---------- CombinedSpatialECoTInference ----------
class CombinedSpatialECoTInference:
    def __init__(
        self,
        spatial_cp: str,
        ecot_cp: str,
        setup: str = "widowx_bridge",
        temp: float = 2.0,
        size: list[int] = [224, 224],
        scale: float = 1.0,
    ):
        self.spatial = SpatialVLAInference(
            spatial_cp,
            policy_setup=setup,
            action_ensemble_temp=temp,
        )
        self.ecot = EcoTInference(
            ecot_cp,
            policy_setup=setup,
            image_size=size,
            action_scale=scale,
        )
        self.task = None

    def reset(self, task: str):
        self.task = task
        self.spatial.reset(task)
        self.ecot.reset(task)

    def step(self, img: np.ndarray, task: Optional[str] = None):
        if task and task != self.task:
            self.reset(task)
        _, sa = self.spatial.step(img, self.task)
        feat = (
            f"EMBODIED FEATURES: pos={sa['world_vector'].tolist()}, "
            f"rot={sa.get('rot_axangle', [0,0,0]).tolist()}"
        )
        prompt = f"{feat}\n{self.task}"
        re, ea = self.ecot.step(img, prompt)
        final = {
            "world_vector": ea["world_vector"],
            "rot_axangle": ea["rot_axangle"],
            "gripper": np.atleast_1d(ea["gripper"]),
            "terminate_episode": np.array([0.0]),
        }
        return re, final

if __name__ == '__main__':
    import gymnasium as gym
    import mediapy
    import numpy as np
    from simpler_env import ENVIRONMENT_MAP

    # Создание среды без глубины (rgb only) с рендером в массив
    env_name, env_kwargs = ENVIRONMENT_MAP['widowx_put_eggplant_in_basket']
    env = gym.make(
        env_name,
        obs_mode='rgb',
        render_mode='rgb_array',      # включаем возврат кадра
        prepackaged_config=True,
        **env_kwargs
    )
    obs, _ = env.reset(seed=0)
    # Получение инструкции
    try:
        task = env.unwrapped.get_language_instruction()
    except Exception:
        task = env.get_language_instruction()

    policy = CombinedSpatialECoTInference(
        spatial_cp='checkpoints/spatialvla-4b',
        ecot_cp='Embodied-CoT/ecot-openvla-7b-bridge',
    )
    policy.reset(task)

    def fetch_image(obs_dict, camera_name="base_camera"):
        # 1) nested obs["image"][camera_name]["rgb"]
        try:
            return obs_dict["image"][camera_name]["rgb"]
        except (KeyError, TypeError):
            pass
        # 2) obs["image"] — сам numpy-массив
        if isinstance(obs_dict.get("image"), np.ndarray):
            return obs_dict["image"]
        # 3) obs["image"] — dict без camera_name
        cand = obs_dict.get("image")
        if isinstance(cand, dict):
            for v in cand.values():
                if isinstance(v, dict) and "rgb" in v:
                    return v["rgb"]
        # 4) fallback через render()
        img = env.render()  # теперь гарантированно работает
        if isinstance(img, tuple):
            img = img[0]
        return img

    frames = []
    done = False
    while not done:
        img = fetch_image(obs)
        _, action = policy.step(img)
        frames.append(img)
        vec = np.concatenate([
            action['world_vector'],
            action['rot_axangle'],
            action['gripper'],
        ])
        try:
            obs, _, done, _, _ = env.step(vec)
        except ValueError:
            obs, _, done, _ = env.step(vec)

    mediapy.show_video(frames, fps=10)

TypeError: gymnasium.envs.registration.make() got multiple values for keyword argument 'prepackaged_config'

In [None]:
mediapy.show_video(frames, fps=10)


conda create -n openvla python=3.10 -y
conda activate openvla

#### базовые и PyTorch
conda install -c conda-forge numpy=1.24.4 scipy=1.10.1 \
              typing_extensions=4.9 pytorch=2.3.1 torchvision=0.18.1 \
              pytorch-cuda=12.1 -y

#### остальные pip-пакеты
pip install flash-attn==2.7.4.post1+cu12torch2.3cxx11abiFALSE \
            transformers==4.49.2 tokenizers==0.21.2 safetensors \
            huggingface_hub mediapy imageio moviepy pandas gymnasium \
            pyglet glfw pyquaternion opencv-python-headless

#### субмодули SimplerEnv
git clone https://github.com/DelinQu/SimplerEnv-OpenVLA.git --recurse-submodules
cd SimplerEnv-OpenVLA
pip install -e ManiSkill2_real2sim
pip install -e .

#### TensorFlow-блок ставьте только если нужен RT-1 / Octo:
#### pip install tensorflow==2.16.1 tensorflow_hub==0.16.1 tensorflow_datasets==4.9.4 ...



pip install transformers==4.49.0 tokenizers==0.21.1 safetensors>=0.4.2             huggingface_hub>=0.23 mediapy==1.1.7 imageio==2.34.1 moviepy==1.0.3

###############################################################################
# 0. Подготовка (обновите conda-forge index, git lfs) -------------------------
###############################################################################
conda update -n base -c conda-forge conda -y
conda install -c conda-forge git-lfs -y
git lfs install              # один раз на системе

###############################################################################
# 1. Создаём чистую среду ------------------------------------------------------
###############################################################################
conda create -n openvla python=3.10 -y
conda activate openvla

###############################################################################
# 2. Ставим ядро PyTorch 2.3.1 с CUDA 12.1 через conda -------------------------
###############################################################################
conda install -c nvidia -c pytorch \
              pytorch=2.3.1 torchvision=0.18.1 \
              pytorch-cuda=12.1 -y

###############################################################################
# 3. Базовые численные библиотеки ---------------------------------------------
###############################################################################
conda install -c conda-forge numpy=1.24.4 scipy=1.10.1 \
                              pandas=2.2.2 typing_extensions=4.9 -y

###############################################################################
# 4. Python-пакеты через pip ---------------------------------------------------
###############################################################################
pip install --upgrade pip

# Hugging Face стек
pip install transformers==4.49.2 tokenizers==0.21.2 safetensors>=0.4.2 \
            huggingface_hub>=0.23 mediapy==1.1.7 imageio==2.34.1 moviepy==1.0.3

# Gymnasium + опции для ManiSkill / SimplerEnv
pip install gymnasium==0.29.1 pyglet==2.0.10 glfw==2.6.2 \
            opencv-python-headless==4.10.0.82 pyquaternion==0.9.9 pybullet==3.2.6

###############################################################################
# 5. Flash-Attention 2 (готовый wheel под Torch 2.3 + cu121) -------------------
###############################################################################
pip install --no-cache-dir \
  https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/\
flash_attn-2.7.4.post1%2Bcu12torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl

###############################################################################
# 6. Клонируем SimplerEnv-OpenVLA с ManiSkill2_real2sim ------------------------
###############################################################################
git clone https://github.com/DelinQu/SimplerEnv-OpenVLA.git --recurse-submodules
cd SimplerEnv-OpenVLA

pip install ruckig==0.12.1 --only-binary ruckig

# движок ManiSkill2 (чуть патчённый) + сам SimplerEnv
pip install -e ManiSkill2_real2sim
pip install -e .

cd ..

###############################################################################
# 7. (опц.) TensorFlow-блок для RT-1 / Octo (≈ 3 ГБ) ---------------------------
###############################################################################
# pip install tensorflow==2.16.1 tensorflow_hub==0.16.1 tensorflow_datasets==4.9.4 \
#             tf_agents==0.19.0 rlds==0.1.8 dm-reverb[tensorflow]==0.14.0 \
#             apache_beam==2.54.0 tfds-nightly==4.9.4.dev202403220044 \
#             tfp-nightly==0.25.0.dev20240322

###############################################################################
# 8. Скачиваем чек-пойнт OpenVLA-7B (≈ 14 GB) ----------------------------------
###############################################################################


git lfs clone https://huggingface.co/openvla/openvla-7b checkpoints/openvla-7b
############# Или ############# (В коде)
from huggingface_hub import snapshot_download
snapshot_download(
    repo_id="openvla/openvla-7b",
    local_dir="checkpoints/openvla-7b",
    local_dir_use_symlinks=False,   # реальные файлы, без ссылок
    resume_download=True            # можно докачивать при обрыве
)
#############     #############

pip install timm
pip install 'accelerate>=0.26.0'

###############################################################################
# 9. Проверка ------------------------------------------------------------------
###############################################################################
python - <<'PY'
import torch, transformers, flash_attn
print("Torch:", torch.__version__, "| CUDA:", torch.version.cuda)
print("Transformers:", transformers.__version__)
from flash_attn import flash_attn_qkvpacked_func
print("Flash-Attn OK")
PY
