### Install Dependencies

In [1]:
!pip install scipy ftfy regex tqdm torch git+https://github.com/openai/CLIP.git einops pyrender==0.1.45 trimesh==3.9.34 pycollada==0.6

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-evdcknr_
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-evdcknr_
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting scipy
  Using cached scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting ftfy
  Using cached ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting regex
  Using cached regex-2024.11.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting torch
  Using cached torch-2.7.0-cp39-cp39-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting einops
  Using cached einops-0.8.1-py3-none-any.whl.metadata (13 kB)
Collecting pyrender==0.1.45
  Usin

### Clone Repo and Setup

Clone [https://github.com/peract/peract_colab.git](https://github.com/peract/peract_colab.git).   

This repo contains barebones code from [`ARM`](https://github.com/stepjam/ARM), [`YARR`](https://github.com/stepjam/YARR), [`PyRep`](https://github.com/stepjam/PyRep), [`RLBench`](https://github.com/stepjam/RLBench) to get started with  PerAct without the actual [V-REP](https://www.coppeliarobotics.com/) simulator.

The repo also contains a pre-generated RLBench dataset of 10 expert demonstrations for the `open_drawer` task. This task has three variations: "open the top drawer", "open the middle drawer", and "open the bottom drawer".



In [2]:
!git clone https://github.com/peract/peract_colab.git

fatal: destination path 'peract_colab' already exists and is not an empty directory.


If you forked this repo, you might want to pull the latest changes.

In [3]:
!cd peract_colab && git pull origin master

From https://github.com/peract/peract_colab
 * branch              master     -> FETCH_HEAD
Already up to date.


Set `PYOPENGL_PLATFORM=egl` for pyrender visualizations

In [None]:
import numpy as np
np.bool = np.bool_ # bad trick to fix numpy version issue :(

import os
import sys
import shutil
import pickle

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Point the process at X display ":0" and select the EGL platform for PyOpenGL
# (needed for the pyrender visualizations mentioned above).
os.environ["DISPLAY"] = ":0"
os.environ["PYOPENGL_PLATFORM"] = "egl"


import matplotlib.pyplot as plt
import numpy as np
import sys
np.set_printoptions(threshold=sys.maxsize)
import PIL
from PIL import Image

import sys
sys.path.append("..")

from mast3r.mast3r.model import AsymmetricMASt3R
from mast3r.dust3r.model import AsymmetricCroCo3DStereo
from mast3r.mast3r.fast_nn import fast_reciprocal_NNs

import mast3r.mast3r.utils.path_to_dust3r
from mast3r.dust3r.inference import inference
from mast3r.dust3r.image_pairs import make_pairs
from mast3r.dust3r.utils.device import to_numpy
from mast3r.dust3r.demo import get_3D_model_from_scene
from mast3r.dust3r.cloud_opt import global_aligner, GlobalAlignerMode

ModuleNotFoundError: No module named 'torch'

In [None]:
import torchvision.transforms as tvf
from PIL import Image
import numpy as np
import torch

# Image preprocessing pipeline: convert to a tensor, then normalize each of the
# three channels with mean 0.5 and std 0.5 (i.e. x -> (x - 0.5) / 0.5).
# `rgb()` below applies the inverse mapping (x * 0.5 + 0.5) for display.
ImgNorm = tvf.Compose([
    tvf.ToTensor(),
    tvf.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])


def rgb(ftensor, true_shape=None):
    """Turn a (possibly normalized) image into a float array in [0, 1].

    Args:
        ftensor: a ``torch.Tensor``, a NumPy array, or a list of those.
            Lists are processed element by element. A 3-D input whose first
            axis has length 3, or a 4-D input whose second axis has length 3,
            is treated as channel-first and moved to channel-last.
        true_shape: optional ``(H, W)`` pair; when given, the array is sliced
            to ``[:H, :W]`` along its first two axes (after any transpose).

    Returns:
        An array clipped to [0, 1] (or a list of such arrays). ``uint8`` input
        is divided by 255; anything else is mapped with ``x * 0.5 + 0.5``.
    """
    if isinstance(ftensor, list):
        return [rgb(item, true_shape=true_shape) for item in ftensor]

    # Tensors are detached and brought to host memory as NumPy arrays.
    array = ftensor.detach().cpu().numpy() if isinstance(ftensor, torch.Tensor) else ftensor

    # Channel-first -> channel-last.
    if array.ndim == 3 and array.shape[0] == 3:
        array = array.transpose(1, 2, 0)
    elif array.ndim == 4 and array.shape[1] == 3:
        array = array.transpose(0, 2, 3, 1)

    if true_shape is not None:
        height, width = true_shape
        array = array[:height, :width]

    if array.dtype == np.uint8:
        scaled = np.float32(array) / 255
    else:
        # Undo a mean=0.5 / std=0.5 normalization.
        scaled = (array * 0.5) + 0.5
    return scaled.clip(min=0, max=1)


def _resize_pil_image(img, long_edge_size):
    """Resize a PIL image so that its longer edge becomes ``long_edge_size``.

    Both edges are scaled by the same factor (rounded to the nearest integer).
    LANCZOS resampling is used when shrinking, BICUBIC otherwise.
    """
    longest = max(img.size)
    if longest > long_edge_size:
        resample = Image.LANCZOS
    else:
        resample = Image.BICUBIC
    target = tuple(
        map(lambda edge: int(round(edge * long_edge_size / longest)), img.size)
    )
    return img.resize(target, resample)

def load_images_from_loaded(images, size, square_ok=False, verbose=True):
    """
    Process a list of already loaded images (as PIL.Image or NumPy arrays)
    and convert them to normalized tensors for DUSt3R.

    Args:
        images: list of ``PIL.Image.Image`` or ``np.ndarray``. Arrays may be
            channel-last ``(H, W, C)`` / ``(H, W)``, or channel-first
            ``(3, H, W)`` / ``(4, H, W)``; channel-first arrays are transposed
            to channel-last before conversion (consistent with ``rgb()``,
            which also accepts channel-first input).
        size: target size in pixels. With ``224`` the short side is resized to
            224 and a centered square is cropped. Otherwise the long side is
            resized to ``size`` and a centered crop with a width that is a
            multiple of 16 is taken.
        square_ok: when False and the resized image is square (non-224 path),
            the crop height is reduced to 3/4 of the width.
        verbose: print progress information.

    Returns:
        A list with one dict per image containing ``img`` (normalized tensor
        with a leading batch axis), ``true_shape`` (``[[H, W]]`` as int32),
        ``idx`` and ``instance``.

    Raises:
        TypeError: an element is neither a PIL image nor a NumPy array.
        AssertionError: ``images`` is empty.
    """
    if verbose:
        print(f'>> Processing {len(images)} already-loaded images')

    imgs = []
    for idx, img in enumerate(images):
        if isinstance(img, np.ndarray):
            # Image.fromarray expects channel-last data. Only transpose when
            # the array cannot be a valid channel-last image (last axis is not
            # a channel count), so previously accepted inputs are unchanged.
            if img.ndim == 3 and img.shape[0] in (3, 4) and img.shape[2] not in (1, 2, 3, 4):
                img = np.ascontiguousarray(img.transpose(1, 2, 0))
            img = Image.fromarray(img)
        elif not isinstance(img, Image.Image):
            raise TypeError(f"Unsupported image type at index {idx}: {type(img)}")

        img = img.convert('RGB')  # ensure RGB
        W1, H1 = img.size

        # Resize
        if size == 224:
            # Scale the long edge by the aspect ratio so the short edge becomes 224.
            img = _resize_pil_image(img, round(size * max(W1/H1, H1/W1)))
        else:
            img = _resize_pil_image(img, size)

        # Center crop
        W, H = img.size
        cx, cy = W // 2, H // 2
        if size == 224:
            half = min(cx, cy)
            img = img.crop((cx - half, cy - half, cx + half, cy + half))
        else:
            # Half-extents are multiples of 8, so full extents are multiples of 16.
            halfw, halfh = ((2 * cx) // 16) * 8, ((2 * cy) // 16) * 8
            if not square_ok and W == H:
                halfh = int(3 * halfw / 4)
            img = img.crop((cx - halfw, cy - halfh, cx + halfw, cy + halfh))

        W2, H2 = img.size
        if verbose:
            print(f' - processed image {idx} with resolution {W1}x{H1} --> {W2}x{H2}')

        imgs.append(dict(
            img=ImgNorm(img)[None],
            true_shape=np.int32([img.size[::-1]]),  # PIL size is (W, H); store (H, W)
            idx=idx,
            instance=str(idx)
        ))

    assert imgs, 'No valid images were processed'
    if verbose:
        print(f' (Successfully processed {len(imgs)} images)')
    return imgs


NameError: name 'tvf' is not defined

Define some constants and settings variables.

The `BATCH_SIZE` is 1 to fit the model on a single GPU. But you can play around with the voxel sizes and Transformer layers to increase this.  

In the paper, we use `NUM_LATENTS=2048` by default, but smaller latents like `512` are also fine (see Appendix G).

In [5]:
# Constants
CAMERAS = [
    'front',
    'left_shoulder',
    'right_shoulder',
    'wrist',
]
# 128x128 - if you want to use higher voxel resolutions like 200^3,
# you might want to regenerate the dataset with larger images
IMAGE_SIZE = 128
DATA_FOLDER = 'peract_colab/data'
EPISODES_FOLDER = 'colab_dataset/open_drawer/all_variations/episodes'


Add `peract_colab` to the system path and make a directory for storing the replay buffer.  For now, we will store the replay buffer on disk to avoid memory issues with putting everything in RAM.

In [6]:
# Make the cloned repo importable. The guard keeps the cell idempotent:
# re-running it does not append duplicate entries to sys.path.
if 'peract_colab' not in sys.path:
    sys.path.append('peract_colab')
# Location of the bundled demonstration episodes inside the cloned repo.
data_path = os.path.join(DATA_FOLDER, EPISODES_FOLDER)

In [7]:
from rlbench.utils import get_stored_demo
from rlbench.backend.utils import extract_obs

In [8]:
# Root folder containing one sub-folder per task; the relative path is resolved
# against the current working directory.
# NOTE(review): this location is specific to one machine's layout — adjust before running elsewhere.
TEST_DATA_PATH = os.path.abspath("../../../DL2/rlbench - 0B2LlLwoO3nfZfkFqMEhXWkxBdjJNNndGYl9uUDQwS1pfNkNHSzFDNGwzd1NnTmlpZXR1bVE/test/")

In [9]:
# One entry per task folder. os.listdir returns entries in arbitrary order and
# also lists stray files, so sort for reproducible runs and keep directories only.
task_dirs = sorted(
    entry for entry in os.listdir(TEST_DATA_PATH)
    if os.path.isdir(os.path.join(TEST_DATA_PATH, entry))
)


In [None]:
# --- MASt3R inference / global-alignment settings ---
device = 'cuda'
schedule = 'cosine'  # schedule passed to compute_global_alignment
lr = 0.01            # learning rate for the global alignment
niter = 300          # number of global-alignment iterations
model_name = "naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric"
model = AsymmetricMASt3R.from_pretrained(model_name).to(device)

# Figures are written here, one file per (task, episode, time step, camera),
# so later iterations no longer overwrite earlier results.
DEPTH_FIG_DIR = 'depth_figures'
os.makedirs(DEPTH_FIG_DIR, exist_ok=True)

# Loop through all the tasks:
for task_dir in task_dirs:
    print(f"Current task: {task_dir}")
    task_path = os.path.join(TEST_DATA_PATH, task_dir)
    # Same for every episode of the task, so compute it once.
    episode_path = os.path.join(task_path, 'all_variations/episodes')
    episodes = os.listdir(episode_path)

    # Loop through all the episodes:
    for episode_idx in range(len(episodes)):
        print(f"Current episode: {episode_idx}")
        demo = get_stored_demo(episode_path, episode_idx)

        # Loop through time steps:
        for ts in range(len(demo)):
            obs_dict = extract_obs(demo._observations[ts], CAMERAS, t=ts)

            # One RGB view per camera, in CAMERAS order.
            images = load_images_from_loaded(
                [obs_dict[f'{camera}_rgb'] for camera in CAMERAS],
                size=512, square_ok=True)

            pairs = make_pairs(images, scene_graph='complete', prefilter=None, symmetrize=True)
            output = inference(pairs, model, device, batch_size=1, verbose=False)

            mode = GlobalAlignerMode.PointCloudOptimizer  # if len(images) > 2 else GlobalAlignerMode.PairViewer
            scene = global_aligner(output, device=device, mode=mode, verbose=False)
            if mode == GlobalAlignerMode.PointCloudOptimizer:
                loss = scene.compute_global_alignment(init='mst', niter=niter, schedule=schedule, lr=lr)

            # Normalize all views by the same (global) maximum depth.
            depths = to_numpy(scene.get_depthmaps())
            depths_max = max(d.max() for d in depths)
            depths = [d / depths_max for d in depths]
            # Print a short summary instead of dumping the full array.
            print(f"  ts {ts}: normalized depth range of first view "
                  f"[{depths[0].min():.3f}, {depths[0].max():.3f}]")

            # Plot and save one depth map per camera. The depth maps are plotted
            # directly: rgb() is meant for un-normalizing [-1, 1] images and only
            # applied an affine rescale here, which imshow's autoscaling hides.
            # NOTE(review): assumes the scene returns depth maps in input-image
            # (i.e. CAMERAS) order — confirm.
            for camera, depth_map in zip(CAMERAS, depths):
                depth_img = Image.fromarray(np.asarray(depth_map, dtype=np.float32))
                depth_img = depth_img.resize((IMAGE_SIZE, IMAGE_SIZE), Image.LANCZOS)

                fig, ax = plt.subplots()
                ax.imshow(np.asarray(depth_img))
                ax.set_title(f"{task_dir} | episode {episode_idx} | ts {ts} | {camera} depth")
                fig.savefig(os.path.join(
                    DEPTH_FIG_DIR,
                    f"{task_dir}_ep{episode_idx}_ts{ts:04d}_{camera}.pdf"))
                plt.show()
                # Release the figure; otherwise figures accumulate across the loops.
                plt.close(fig)