In [7]:
import os
import shutil
from pathlib import Path

import numpy as np
import torch
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

Load the dataset that you want to clean up by removing all the zero actions

In [9]:
# Identifier and on-disk location of the source dataset to be cleaned.
# `root_dataset_dir` is kept as a plain string because a later cell
# concatenates it with `+` to locate the videos folder.
dataset_name = 'test_dataset'
root_dataset_dir = '/fast_storage/qnoens/data/lerobot/eval_task1B_distract'

# Open the source dataset and keep a handle on its metadata
# (episode lengths, feature names, ...) for the conversion below.
lerobot_dataset = LeRobotDataset(
    repo_id=dataset_name,
    root=root_dataset_dir,
)
meta = lerobot_dataset.meta

# Show a summary of what was loaded.
print(meta)

Resolving data files:   0%|          | 0/52 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/52 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

LeRobotDatasetMetadata({
    Repository ID: 'test_dataset',
    Total episodes: '52',
    Total frames: '8342',
    Features: '['next.reward', 'next.success', 'seed', 'timestamp', 'wrist_image_original', 'scene_image_original', 'wrist_image', 'scene_image', 'state', 'robot_pose', 'gripper_state', 'joints', 'action', 'frame_index', 'episode_index', 'index', 'task_index']',
})',



Decide on the path where you want the new dataset to end up

In [10]:
# Destination folder of the cleaned dataset.
root_dir = Path('/fast_storage/qnoens/data/lerobot/init_1B_distract')

# The new dataset stores videos only if the source dataset has a `videos` folder.
source_videos_dir = Path(root_dataset_dir) / 'videos'
use_videos = source_videos_dir.exists()

#### Create the new dataset. 

**Note:** If the path you gave already exists, it will be deleted!

In [11]:
# Start from a clean slate: LeRobotDataset.create is given a fresh target
# folder, so anything already at `root_dir` is removed first.
if root_dir.exists():
    print('Root directory already exists. Deleting it.')
    # Remove the tree from Python instead of shelling out to `rm -rf`:
    # no quoting problems with spaces/special characters in the path,
    # works on every platform, and raises if the deletion fails.
    shutil.rmtree(root_dir)

# Create an empty dataset that mirrors the source dataset's frame rate and
# feature schema; frames are added to it episode by episode further below.
new_dataset = LeRobotDataset.create(
    repo_id=dataset_name,
    fps=lerobot_dataset.fps,
    root=root_dir,
    features=lerobot_dataset.features,
    use_videos=use_videos,
    image_writer_processes=0,
    image_writer_threads=16,
)

In [13]:
# Indices of the source episodes that must NOT be copied to the new dataset.
# Written as contiguous groups so the removed ranges are easy to read.
delete_list = [
    *range(5, 10),    # episodes 5-9
    *range(15, 25),   # episodes 15-24
    *range(30, 45),   # episodes 30-44
    50,
    51,
]

# Number of episodes that will be dropped.
num_deleted = len(delete_list)
print(num_deleted)

32


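We now go through the source dataset episode by episode and, within each episode, frame by frame to build the new dataset. Episodes listed in `delete_list` are skipped. For every remaining frame, the action is rebuilt from the next frame's first six joint positions and gripper state (the last frame of an episode reuses its own values).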

**Note:** The relative gripper actions are also made binary here which makes it immediately compatible with OpenVLA

In [14]:
# Copy the source dataset into `new_dataset` episode by episode, frame by frame.
#
# * Episodes whose index is in `delete_list` are skipped entirely.
# * For every kept frame the stored action is rebuilt from the NEXT frame of
#   the same episode: its first six joint values followed by its gripper
#   state. The last frame of an episode has no successor and therefore uses
#   its own joints/gripper values.
# * Every frame gets the same language instruction.
episodes_to_skip = set(delete_list)  # constant-time membership test per episode
num_kept_episodes = sum(
    ep not in episodes_to_skip for ep in range(lerobot_dataset.num_episodes)
)

# All episodes of this dataset share one task description.
instruction = "clean up the table"

episode_count = 0
# Global index of the first frame of the upcoming episode. Kept as a running
# offset so the episode lengths are not re-summed for every episode.
next_start_idx = 0
for ep_idx in range(lerobot_dataset.num_episodes):
    episode_meta = meta.episodes[ep_idx]
    episode_length = episode_meta['length']

    # The offset must advance for skipped episodes too, otherwise the frame
    # indices of all later episodes would be shifted.
    start_idx = next_start_idx
    next_start_idx += episode_length
    end_idx = start_idx + episode_length

    if ep_idx in episodes_to_skip:
        print(f"Removing episode {ep_idx} from dataset")
        continue

    # Frame fetched as look-ahead in the previous iteration. Reusing it as the
    # current frame means each source frame is read from the dataset only once.
    prefetched_frame = None
    for frame_idx in range(start_idx, end_idx):
        if prefetched_frame is not None:
            orig_frame = prefetched_frame
        else:
            orig_frame = lerobot_dataset[frame_idx]

        if frame_idx + 1 < end_idx:
            # Look one frame ahead within the same episode.
            prefetched_frame = lerobot_dataset[frame_idx + 1]
            target_frame = prefetched_frame
        else:
            # Last frame of the episode: no successor, fall back to itself.
            prefetched_frame = None
            target_frame = orig_frame

        # Slice first 6 elements of joint positions
        next_action = target_frame['joints'][:6]

        # Ensure next_gripper is a tensor of shape (1,)
        next_gripper = target_frame['gripper_state']
        if next_gripper.dim() == 0:
            next_gripper = next_gripper.unsqueeze(0)

        # Final action tensor: shape (7,)
        next_action = torch.cat([next_action, next_gripper])

        # Sanity check on the action layout (6 joint values + 1 gripper value)
        assert next_action.shape == (7,), f"Expected (7,), got {next_action.shape}"

        # Create new frame.
        # Scalars get a leading axis via unsqueeze(0); images have their first
        # axis moved last (presumably CHW -> HWC, as expected by add_frame --
        # TODO confirm against the feature schema).
        frame = {
            'next.reward': orig_frame['next.reward'].unsqueeze(0),
            'next.success': orig_frame['next.success'].unsqueeze(0),
            'seed': orig_frame['seed'].unsqueeze(0),
            'wrist_image_original': orig_frame['wrist_image_original'].permute(1, 2, 0),
            'scene_image_original': orig_frame['scene_image_original'].permute(1, 2, 0),
            'wrist_image': orig_frame['wrist_image'].permute(1, 2, 0),
            'scene_image': orig_frame['scene_image'].permute(1, 2, 0),
            'state': torch.cat([orig_frame['joints'], orig_frame['gripper_state'].unsqueeze(0)]),
            'robot_pose': orig_frame['robot_pose'],
            'gripper_state': orig_frame['gripper_state'].unsqueeze(0),
            'joints': orig_frame['joints'],
            'action': next_action,
            'task': instruction
        }

        # Add frame to new dataset
        new_dataset.add_frame(frame)

    # Add episode to new dataset
    new_dataset.save_episode()
    print(f"Processed episode {episode_count + 1}/{num_kept_episodes}")
    episode_count += 1




Map:   0%|          | 0/943 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 1/20


Map:   0%|          | 0/278 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 2/20


Map:   0%|          | 0/219 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 3/20


Map:   0%|          | 0/416 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 4/20


Map:   0%|          | 0/476 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 5/20
Removing episode 5 from dataset
Removing episode 6 from dataset
Removing episode 7 from dataset
Removing episode 8 from dataset
Removing episode 9 from dataset


Map:   0%|          | 0/604 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 6/20


Map:   0%|          | 0/379 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 7/20


Map:   0%|          | 0/411 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 8/20


Map:   0%|          | 0/486 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 9/20


Map:   0%|          | 0/409 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 10/20
Removing episode 15 from dataset
Removing episode 16 from dataset
Removing episode 17 from dataset
Removing episode 18 from dataset
Removing episode 19 from dataset
Removing episode 20 from dataset
Removing episode 21 from dataset
Removing episode 22 from dataset
Removing episode 23 from dataset
Removing episode 24 from dataset


Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 11/20


Map:   0%|          | 0/154 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 12/20


Map:   0%|          | 0/139 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 13/20


Map:   0%|          | 0/632 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 14/20


Map:   0%|          | 0/282 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 15/20
Removing episode 30 from dataset
Removing episode 31 from dataset
Removing episode 32 from dataset
Removing episode 33 from dataset
Removing episode 34 from dataset
Removing episode 35 from dataset
Removing episode 36 from dataset
Removing episode 37 from dataset
Removing episode 38 from dataset
Removing episode 39 from dataset
Removing episode 40 from dataset
Removing episode 41 from dataset
Removing episode 42 from dataset
Removing episode 43 from dataset
Removing episode 44 from dataset


Map:   0%|          | 0/528 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 16/20


Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 17/20


Map:   0%|          | 0/152 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 18/20


Map:   0%|          | 0/587 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 19/20


Map:   0%|          | 0/499 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processed episode 20/20
Removing episode 50 from dataset
Removing episode 51 from dataset
