### Pip Install

In [None]:
# install the package
#%pip install --upgrade mani_skill
# install a version of torch that is compatible with your system
#%pip install torch torchvision torchaudio numpy diffusers

### Imports

In [1]:
import math
from typing import Union
import h5py
from tqdm import tqdm
import numpy as np
import os

from mani_skill.utils import common
from mani_skill.utils.io_utils import load_json
from mani_skill.utils.common import flatten_state_dict
import mani_skill.envs

import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.utils.data import IterableDataset, Dataset
from torch.utils.data import DataLoader
from typing import Tuple, Sequence, Dict, Union, Optional
import numpy as np
import math
import torch
import torch.nn as nn
import collections
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
from diffusers.training_utils import EMAModel
from diffusers.optimization import get_scheduler
from tqdm.auto import tqdm

# env import
import gymnasium
from gymnasium import spaces
import os


### Dataset code

In [11]:
# loads h5 data into memory for faster access
def load_h5_data(data):
    out = dict()
    for k in data.keys():
        if isinstance(data[k], h5py.Dataset):
            out[k] = data[k][:]
        else:
            out[k] = load_h5_data(data[k])
    return out


def create_sample_indices(episode_ends: np.ndarray, sequence_length: int, pad_before: int = 0, pad_after: int = 0):
    # Currently uses truncated as episode ends which is the end of the episode and not the end of the trajectory
    # TODO: What to use as episode ends?
    indices = list()
    episode_length = 0
    episode_index = 1 # Start 1 for human readability
    for i in range(len(episode_ends)):
        episode_length += 1
        if episode_ends[i]:
            start_idx = 0 if i <= 0 else i - episode_length + 1
            min_start = -pad_before
            max_start = episode_length - sequence_length + pad_after

            # Create indices for each possible sequence in the episode
            for idx in range(min_start, max_start + 1):
                buffer_start_idx = max(idx, 0) + start_idx
                buffer_end_idx = min(idx + sequence_length, episode_length) + start_idx
                start_offset = buffer_start_idx - (idx + start_idx)
                end_offset = (idx + sequence_length + start_idx) - buffer_end_idx
                sample_start_idx = 0 + start_offset
                sample_end_idx = sequence_length - end_offset
                indices.append([buffer_start_idx, buffer_end_idx, sample_start_idx, sample_end_idx])
            episode_length = 0
            episode_index += 1
    return np.array(indices)


def sample_sequence(train_data, sequence_length, buffer_start_idx, buffer_end_idx, sample_start_idx, sample_end_idx):
    result = dict()
    for key, input_arr in train_data.items():
        sample = input_arr[buffer_start_idx:buffer_end_idx]
        data = sample
        if (sample_start_idx > 0) or (sample_end_idx < sequence_length):
            if isinstance(input_arr, torch.Tensor):
                data = torch.zeros((sequence_length,) + input_arr.shape[1:], dtype=input_arr.dtype)
            else:
                data = np.zeros(shape=(sequence_length,) + input_arr.shape[1:], dtype=input_arr.dtype)
            if sample_start_idx > 0:
                data[:sample_start_idx] = sample[0]
            if sample_end_idx < sequence_length:
                data[sample_end_idx:] = sample[-1]
            data[sample_start_idx:sample_end_idx] = sample
        result[key] = data
    return result

def remove_np_uint16(x: Union[np.ndarray, dict]):
            if isinstance(x, dict):
                for k in x.keys():
                    x[k] = remove_np_uint16(x[k])
                return x
            else:
                if x.dtype == np.uint16:
                    return x.astype(np.int32)
                return x

def convert_observation(obs):
    return np.concatenate(list(obs.values()), axis=-1)

def get_observations(obs):
    print(obs["extra"].keys())
    #print(obs["agent"].keys())
    #print(obs["extra"]["tcp_pose"].shape)
    #print(obs["extra"]["obj_pose"].shape)
    #print(obs["extra"]["goal_pos"].shape)
    # is_grasped_reshaped = np.reshape(obs["extra"]["is_grasped"], (len(obs["extra"]["is_grasped"]), 1))
    return dict(
        tcp_pose=obs["extra"]["tcp_pose"],
        obj_pose=obs["extra"]["charger_pose"],
        goal_pos=obs["extra"]["goal_pose"],
        # is_grasped=is_grasped_reshaped,
        #tcp_to_obj_pos=obs["extra"]["tcp_to_obj_pos"],
        #obj_to_goal_pos=obs["extra"]["obj_to_goal_pos"],
        qpos=obs["agent"]["qpos"],
        qvel=obs["agent"]["qvel"],
    )

def get_data_stats(data):
    data = data.reshape(-1,data.shape[-1])
    stats = {
        'min': np.min(data, axis=0),
        'max': np.max(data, axis=0)
    }
    return stats

def normalize_data(data, stats):
    # nomalize to [0,1]
    ndata = (data - stats['min']) / (stats['max'] - stats['min'])
    # normalize to [-1, 1]
    ndata = ndata * 2 - 1
    return ndata

class StateNormalDataset(Dataset):
    """
    A general torch Dataset you can drop in and use immediately with just about any trajectory .h5 data generated from ManiSkill.
    This class simply is a simple starter code to load trajectory data easily, but does not do any data transformation or anything
    advanced. We recommend you to copy this code directly and modify it for more advanced use cases

    Args:
        dataset_file (str): path to the .h5 file containing the data you want to load
        load_count (int): the number of trajectories from the dataset to load into memory. If -1, will load all into memory
        success_only (bool): whether to skip trajectories that are not successful in the end. Default is false
        device: The location to save data to. If None will store as numpy (the default), otherwise will move data to that device
    """

    def __init__(
        self, dataset_file: str, pred_horizon: int, obs_horizon: int, action_horizon:int, load_count=-1, success_only: bool = False, normalize: bool = False, device=None
    ) -> None:
        self.dataset_file = dataset_file
        self.pred_horizon = pred_horizon
        self.obs_horizon = obs_horizon
        self.action_horizon = action_horizon
        self.normalize = normalize
        self.device = device
        self.data = h5py.File(dataset_file, "r")
        json_path = dataset_file.replace(".h5", ".json")
        self.json_data = load_json(json_path)
        self.episodes = self.json_data["episodes"]
        self.env_info = self.json_data["env_info"]
        self.env_id = self.env_info["env_id"]
        self.env_kwargs = self.env_info["env_kwargs"]
        self.is_pointcloud = dataset_file.find("pointcloud") != -1

        self.obs = None
        self.actions = []
        self.terminated = []
        self.truncated = []
        self.success, self.fail, self.rewards = None, None, None
        if load_count == -1:
            load_count = len(self.episodes)
        for eps_id in tqdm(range(load_count), desc="Loading Episodes", colour="green"):
            eps = self.episodes[eps_id]
            if success_only:
                assert (
                    "success" in eps
                ), "episodes in this dataset do not have the success attribute, cannot load dataset with success_only=True"
                if not eps["success"]:
                    continue
            trajectory = self.data[f"traj_{eps['episode_id']}"]
            trajectory = load_h5_data(trajectory)
            eps_len = len(trajectory["actions"])

            # exclude the final observation as most learning workflows do not use it
            obs = common.index_dict_array(trajectory["obs"], slice(eps_len))
            if eps_id == 0:
                self.obs = obs
            else:
                self.obs = common.append_dict_array(self.obs, obs)

            self.actions.append(trajectory["actions"])
            self.terminated.append(trajectory["terminated"])
            self.truncated.append(trajectory["truncated"])

            # handle data that might optionally be in the trajectory
            if "rewards" in trajectory:
                if self.rewards is None:
                    self.rewards = [trajectory["rewards"]]
                else:
                    self.rewards.append(trajectory["rewards"])
            if "success" in trajectory:
                if self.success is None:
                    self.success = [trajectory["success"]]
                else:
                    self.success.append(trajectory["success"])
            if "fail" in trajectory:
                if self.fail is None:
                    self.fail = [trajectory["fail"]]
                else:
                    self.fail.append(trajectory["fail"])

        self.actions = np.vstack(self.actions)
        self.terminated = np.concatenate(self.terminated)
        self.truncated = np.concatenate(self.truncated)

        self.truncated = np.zeros(self.actions.shape[0], dtype=bool)
        self.truncated[-1] = True

        if self.rewards is not None:
            self.rewards = np.concatenate(self.rewards)
        if self.success is not None:
            self.success = np.concatenate(self.success)
        if self.fail is not None:
            self.fail = np.concatenate(self.fail)

        def remove_np_uint16(x: Union[np.ndarray, dict]):
            if isinstance(x, dict):
                for k in x.keys():
                    x[k] = remove_np_uint16(x[k])
                return x
            else:
                if x.dtype == np.uint16:
                    return x.astype(np.int32)
                return x

        # uint16 dtype is used to conserve disk space and memory
        # you can optimize this dataset code to keep it as uint16 and process that
        # dtype of data yourself. for simplicity we simply cast to a int32 so
        # it can automatically be converted to torch tensors without complaint
        self.obs = remove_np_uint16(self.obs)

        if device is not None:
            self.actions = common.to_tensor(self.actions, device=device)
            self.obs = common.to_tensor(self.obs, device=device)
            self.terminated = common.to_tensor(self.terminated, device=device)
            self.truncated = common.to_tensor(self.truncated, device=device)
            if self.rewards is not None:
                self.rewards = common.to_tensor(self.rewards, device=device)
            if self.success is not None:
                self.success = common.to_tensor(self.terminated, device=device)
            if self.fail is not None:
                self.fail = common.to_tensor(self.truncated, device=device)



        # Added code for diffusion policy
        obs_dict = get_observations(self.obs)
        train_data = dict(
                        obs=convert_observation(obs_dict),
                        actions=self.actions,
                        )

         # Initialize index lists and stat dicts
        self.indices = create_sample_indices(
            episode_ends=self.truncated,
            sequence_length=self.pred_horizon,
            pad_before=self.obs_horizon - 1,
            pad_after=self.action_horizon - 1
        )

        stats = dict()
        normalized_train_data = dict()
        for key, data in train_data.items():
            stats[key] = get_data_stats(data)
            normalized_train_data[key] = normalize_data(data, stats[key])

        self.normalized_train_data = normalized_train_data
        self.stats = stats

    def __len__(self):
        # all possible sequenzes of the dataset
        return len(self.indices)

    def __getitem__(self, idx):
        # Change data to fit diffusion policy
        buffer_start_idx, buffer_end_idx, sample_start_idx, sample_end_idx = self.indices[idx]


        sampled = sample_sequence(
            train_data=self.normalized_train_data,
            sequence_length=self.pred_horizon,
            buffer_start_idx=buffer_start_idx,
            buffer_end_idx=buffer_end_idx,
            sample_start_idx=sample_start_idx,
            sample_end_idx=sample_end_idx
        )

        # discard unused observations in the sequence
        for k in sampled.keys():
            if k != "actions":
                # discard unused observations in the sequence
                sampled[k] = sampled[k][:self.obs_horizon,:]
        sampled[k] = common.to_tensor(sampled[k], device=self.device)

        return sampled

### Network code

In [12]:
class SinusoidalPosEmb(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
        emb = x[:, None] * emb[None, :]
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb


class Downsample1d(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.conv = nn.Conv1d(dim, dim, 3, 2, 1)

    def forward(self, x):
        return self.conv(x)

class Upsample1d(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.conv = nn.ConvTranspose1d(dim, dim, 4, 2, 1)

    def forward(self, x):
        return self.conv(x)


class Conv1dBlock(nn.Module):
    '''
        Conv1d --> GroupNorm --> Mish
    '''

    def __init__(self, inp_channels, out_channels, kernel_size, n_groups=8):
        super().__init__()

        self.block = nn.Sequential(
            nn.Conv1d(inp_channels, out_channels, kernel_size, padding=kernel_size // 2),
            nn.GroupNorm(n_groups, out_channels),
            nn.Mish(),
        )

    def forward(self, x):
        return self.block(x)


class ConditionalResidualBlock1D(nn.Module):
    def __init__(self,
            in_channels,
            out_channels,
            cond_dim,
            kernel_size=3,
            n_groups=8):
        super().__init__()

        self.blocks = nn.ModuleList([
            Conv1dBlock(in_channels, out_channels, kernel_size, n_groups=n_groups),
            Conv1dBlock(out_channels, out_channels, kernel_size, n_groups=n_groups),
        ])

        # FiLM modulation https://arxiv.org/abs/1709.07871
        # predicts per-channel scale and bias
        cond_channels = out_channels * 2
        self.out_channels = out_channels
        self.cond_encoder = nn.Sequential(
            nn.Mish(),
            nn.Linear(cond_dim, cond_channels),
            nn.Unflatten(-1, (-1, 1))
        )

        # make sure dimensions compatible
        self.residual_conv = nn.Conv1d(in_channels, out_channels, 1) \
            if in_channels != out_channels else nn.Identity()

    def forward(self, x, cond):
        '''
            x : [ batch_size x in_channels x horizon ]
            cond : [ batch_size x cond_dim]

            returns:
            out : [ batch_size x out_channels x horizon ]
        '''
        out = self.blocks[0](x)
        embed = self.cond_encoder(cond)

        embed = embed.reshape(
            embed.shape[0], 2, self.out_channels, 1)
        scale = embed[:,0,...]
        bias = embed[:,1,...]
        out = scale * out + bias

        out = self.blocks[1](out)
        out = out + self.residual_conv(x)
        return out


class ConditionalUnet1D(nn.Module):
    def __init__(self,
        input_dim,
        global_cond_dim,
        diffusion_step_embed_dim=256,
        down_dims=[256,512,1024],
        kernel_size=5,
        n_groups=8
        ):
        """
        input_dim: Dim of actions.
        global_cond_dim: Dim of global conditioning applied with FiLM
          in addition to diffusion step embedding. This is usually obs_horizon * obs_dim
        diffusion_step_embed_dim: Size of positional encoding for diffusion iteration k
        down_dims: Channel size for each UNet level.
          The length of this array determines numebr of levels.
        kernel_size: Conv kernel size
        n_groups: Number of groups for GroupNorm
        """

        super().__init__()
        all_dims = [input_dim] + list(down_dims)
        start_dim = down_dims[0]

        dsed = diffusion_step_embed_dim
        diffusion_step_encoder = nn.Sequential(
            SinusoidalPosEmb(dsed),
            nn.Linear(dsed, dsed * 4),
            nn.Mish(),
            nn.Linear(dsed * 4, dsed),
        )
        cond_dim = dsed + global_cond_dim

        in_out = list(zip(all_dims[:-1], all_dims[1:]))
        mid_dim = all_dims[-1]
        self.mid_modules = nn.ModuleList([
            ConditionalResidualBlock1D(
                mid_dim, mid_dim, cond_dim=cond_dim,
                kernel_size=kernel_size, n_groups=n_groups
            ),
            ConditionalResidualBlock1D(
                mid_dim, mid_dim, cond_dim=cond_dim,
                kernel_size=kernel_size, n_groups=n_groups
            ),
        ])

        down_modules = nn.ModuleList([])
        for ind, (dim_in, dim_out) in enumerate(in_out):
            is_last = ind >= (len(in_out) - 1)
            down_modules.append(nn.ModuleList([
                ConditionalResidualBlock1D(
                    dim_in, dim_out, cond_dim=cond_dim,
                    kernel_size=kernel_size, n_groups=n_groups),
                ConditionalResidualBlock1D(
                    dim_out, dim_out, cond_dim=cond_dim,
                    kernel_size=kernel_size, n_groups=n_groups),
                Downsample1d(dim_out) if not is_last else nn.Identity()
            ]))

        up_modules = nn.ModuleList([])
        for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
            is_last = ind >= (len(in_out) - 1)
            up_modules.append(nn.ModuleList([
                ConditionalResidualBlock1D(
                    dim_out*2, dim_in, cond_dim=cond_dim,
                    kernel_size=kernel_size, n_groups=n_groups),
                ConditionalResidualBlock1D(
                    dim_in, dim_in, cond_dim=cond_dim,
                    kernel_size=kernel_size, n_groups=n_groups),
                Upsample1d(dim_in) if not is_last else nn.Identity()
            ]))

        final_conv = nn.Sequential(
            Conv1dBlock(start_dim, start_dim, kernel_size=kernel_size),
            nn.Conv1d(start_dim, input_dim, 1),
        )

        self.diffusion_step_encoder = diffusion_step_encoder
        self.up_modules = up_modules
        self.down_modules = down_modules
        self.final_conv = final_conv

        print("number of parameters: {:e}".format(
            sum(p.numel() for p in self.parameters()))
        )

    def forward(self,
            sample: torch.Tensor,
            timestep: Union[torch.Tensor, float, int],
            global_cond=None):
        """
        x: (B,T,input_dim)
        timestep: (B,) or int, diffusion step
        global_cond: (B,global_cond_dim)
        output: (B,T,input_dim)
        """
        # (B,T,C)
        sample = sample.moveaxis(-1,-2)
        # (B,C,T)

        # 1. time
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
            timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
        elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)
        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        timesteps = timesteps.expand(sample.shape[0])

        global_feature = self.diffusion_step_encoder(timesteps)

        if global_cond is not None:
            global_feature = torch.cat([
                global_feature, global_cond
            ], axis=-1)

        x = sample
        h = []
        for idx, (resnet, resnet2, downsample) in enumerate(self.down_modules):
            x = resnet(x, global_feature)
            x = resnet2(x, global_feature)
            h.append(x)
            x = downsample(x)

        for mid_module in self.mid_modules:
            x = mid_module(x, global_feature)

        for idx, (resnet, resnet2, upsample) in enumerate(self.up_modules):
            x = torch.cat((x, h.pop()), dim=1)
            x = resnet(x, global_feature)
            x = resnet2(x, global_feature)
            x = upsample(x)

        x = self.final_conv(x)

        # (B,C,T)
        x = x.moveaxis(-1,-2)
        # (B,T,C)
        return x

### Setup

In [13]:
#from google.colab import drive
#drive.mount('/content/drive')

In [14]:
import os
#os.chdir("/content/drive/MyDrive/Master Uni Freiburg/DLL/Data")
#!ls

In [15]:
# download demonstration data from Google Drive
dataset_path = '../../Data/Training/Generated/PlugCharger-v1/motionplanning/Data1000.state_dict.pd_joint_pos.h5'
# dataset_path = '../../Data/staterawdata.state_dict.pd_ee_delta_pos.h5'

# parameters
pred_horizon = 16
obs_horizon = 2
action_horizon = 8
#|o|o|                             observations: 2
#| |a|a|a|a|a|a|a|a|               actions executed: 8
#|p|p|p|p|p|p|p|p|p|p|p|p|p|p|p|p| actions predicted: 16

# create dataset from file
dataset = StateNormalDataset(
    dataset_file=dataset_path,
    pred_horizon=pred_horizon,
    obs_horizon=obs_horizon,
    action_horizon=action_horizon,
    device=None
)

stats = dataset.stats

# create dataloader
dataloader = DataLoader(
    dataset,
    batch_size=128,
    num_workers=1,
    # don't kill worker process afte each epoch
    persistent_workers=True,
    shuffle= True
)

# visualize data in batch
# TODO: normalize data in dataset
batch = next(iter(dataloader))
print(batch.keys())
print("observations:", batch['obs'].shape, batch['obs'].dtype)
print("actions:", batch['actions'].shape, batch['actions'].dtype)


# observation and action dimensions corrsponding to the dataset
obs_dim = batch['obs'].shape[-1]
action_dim = batch['actions'].shape[-1]
print("obs_dim:", obs_dim)
print("action_dim:", action_dim)

# create network object
noise_pred_net = ConditionalUnet1D(
    input_dim=action_dim,
    global_cond_dim=obs_dim*obs_horizon
)

# example inputs
noised_action = torch.randn((1, pred_horizon, action_dim))
obs = torch.zeros((1, obs_horizon, obs_dim))
diffusion_iter = torch.zeros((1,))

# the noise prediction network
# takes noisy action, diffusion iteration and observation as input
# predicts the noise added to action
noise = noise_pred_net(
    sample=noised_action,
    timestep=diffusion_iter,
    global_cond=obs.flatten(start_dim=1))

# illustration of removing noise
# the actual noise removal is performed by NoiseScheduler
# and is dependent on the diffusion noise schedule
denoised_action = noised_action - noise

# for this demo, we use DDPMScheduler with 100 diffusion iterations
num_diffusion_iters = 100
noise_scheduler = DDPMScheduler(
    num_train_timesteps=num_diffusion_iters,
    # the choise of beta schedule has big impact on performance
    # we found squared cosine works the best
    beta_schedule='squaredcos_cap_v2',
    # clip output to [-1,1] to improve stability
    clip_sample=True,
    # our network predicts noise (instead of denoised action)
    prediction_type='epsilon'
)

# device transfer
device = torch.device('cuda')
_ = noise_pred_net.to(device)

Loading Episodes:   0%|          | 0/720 [00:00<?, ?it/s]

dict_keys(['tcp_pose', 'charger_pose', 'receptacle_pose', 'goal_pose'])


  ndata = (data - stats['min']) / (stats['max'] - stats['min'])


dict_keys(['obs', 'actions'])
observations: torch.Size([128, 2, 39]) torch.float32
actions: torch.Size([128, 16, 8]) torch.float32
obs_dim: 39
action_dim: 8
number of parameters: 6.633882e+07


### Training

In [16]:
num_epochs = 100

# Exponential Moving Average
# accelerates training and improves stability
# holds a copy of the model weights
ema = EMAModel(
    parameters=noise_pred_net.parameters(),
    power=0.75)

# Standard ADAM optimizer
# Note that EMA parametesr are not optimized
optimizer = torch.optim.AdamW(
    params=noise_pred_net.parameters(),
    lr=1e-4, weight_decay=1e-6)

# Cosine LR schedule with linear warmup
lr_scheduler = get_scheduler(
    name='cosine',
    optimizer=optimizer,
    num_warmup_steps=500,
    num_training_steps=len(dataloader) * num_epochs
)

with tqdm(range(num_epochs), desc='Epoch') as tglobal:
    # epoch loop
    for epoch_idx in tglobal:
        epoch_loss = list()
        # batch loop
        with tqdm(dataloader, desc='Batch', leave=False) as tepoch:
            for nbatch in tepoch:
                # data normalized in dataset
                # device transfer
                nobs = nbatch['obs'].to(device)
                naction = nbatch['actions'].to(device)
                B = nobs.shape[0]

                # observation as FiLM conditioning
                # (B, obs_horizon, obs_dim)
                obs_cond = nobs[:,:obs_horizon,:]
                # (B, obs_horizon * obs_dim)
                obs_cond = obs_cond.flatten(start_dim=1)

                # sample noise to add to actions
                noise = torch.randn(naction.shape, device=device)

                # sample a diffusion iteration for each data point
                timesteps = torch.randint(
                    0, noise_scheduler.config.num_train_timesteps,
                    (B,), device=device
                ).long()

                # add noise to the clean images according to the noise magnitude at each diffusion iteration
                # (this is the forward diffusion process)
                noisy_actions = noise_scheduler.add_noise(
                    naction, noise, timesteps)
                
                print(noisy_actions)


                # predict the noise residual
                noise_pred = noise_pred_net(
                    noisy_actions, timesteps, global_cond=obs_cond)

                # L2 loss
                loss = nn.functional.mse_loss(noise_pred, noise)

                # optimize
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                # step lr scheduler every batch
                # this is different from standard pytorch behavior
                lr_scheduler.step()

                # update Exponential Moving Average of the model weights
                ema.step(noise_pred_net.parameters())

                # logging
                loss_cpu = loss.item()
                epoch_loss.append(loss_cpu)
                tepoch.set_postfix(loss=loss_cpu)
        tglobal.set_postfix(loss=np.mean(epoch_loss))

# Weights of the EMA model
# is used for inference
ema_noise_pred_net = noise_pred_net
ema.copy_to(ema_noise_pred_net.parameters())

Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Batch:   0%|          | 0/1441 [00:00<?, ?it/s]

tensor([[[-9.8265e-01, -2.1042e-01, -6.4496e-01,  ..., -3.5160e-01,
           1.4287e-01, -9.5627e-01],
         [-9.7674e-01,  1.0168e+00,  3.3215e-01,  ..., -3.5215e-01,
          -8.0541e-01,  5.5816e-01],
         [ 1.0191e-01, -1.8964e+00, -1.3148e+00,  ...,  9.8624e-02,
           4.7517e-01, -2.2924e+00],
         ...,
         [ 3.2210e-01,  7.9912e-01,  6.0580e-01,  ...,  8.8027e-01,
           3.8346e-01,  5.0980e-01],
         [-4.0149e-01,  5.4420e-01, -2.9375e-01,  ...,  1.8248e-01,
           7.3711e-02,  1.7998e-02],
         [-4.3680e-01,  8.5426e-01,  7.9379e-02,  ..., -6.8749e-02,
           8.5825e-01, -9.5877e-01]],

        [[ 7.8155e-01,  7.4314e-01,  3.7388e-01,  ..., -9.0428e-01,
           5.4365e-01,  7.9521e-01],
         [-3.8412e-01,  3.0190e-01,  1.5412e+00,  ...,  1.8265e-01,
           8.3158e-01, -4.9446e-01],
         [-7.6659e-01, -1.7868e-01, -3.1328e-01,  ...,  2.3217e-01,
           1.1381e+00, -1.5941e-01],
         ...,
         [-3.6088e-01,  1

  return F.conv_transpose1d(
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


tensor([[[-6.5937e-01,  1.1679e-01,  7.4546e-01,  ..., -4.0181e-01,
          -1.7475e-01, -1.1698e-01],
         [-1.5574e+00, -2.2980e-01, -1.7534e-01,  ..., -4.6482e-02,
          -8.9223e-02,  8.4778e-01],
         [-1.4350e-01, -9.8118e-01,  2.1116e-01,  ..., -7.0590e-01,
          -1.0745e+00,  1.0227e-01],
         ...,
         [ 8.9181e-02, -4.4836e-01, -8.2918e-02,  ...,  3.6698e-01,
          -1.8202e+00, -1.0386e+00],
         [-1.2307e+00, -1.5392e+00, -8.2797e-01,  ..., -1.1298e+00,
           8.1596e-01, -1.9355e+00],
         [-3.4988e-01,  1.2305e+00, -1.9062e+00,  ...,  7.1309e-01,
           2.0039e+00,  7.3974e-01]],

        [[-7.8653e-01, -1.9650e-01,  1.8044e+00,  ...,  1.3695e+00,
          -1.7231e-01, -1.3508e+00],
         [ 3.1543e-01, -4.6058e-01, -1.3085e+00,  ...,  1.4781e+00,
           8.2084e-01,  2.9133e-01],
         [ 1.0062e+00, -2.2619e-01,  5.2612e-01,  ..., -3.6992e-01,
          -1.0542e+00, -1.0335e+00],
         ...,
         [ 2.1742e-01,  5

KeyboardInterrupt: 

### Saving model

In [None]:
model_path = "../../Data/Checkpoints/model.pt"
torch.save({
    'model_state_dict': ema_noise_pred_net.state_dict(),
    'ema_model_state_dict': ema.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'lr_scheduler_state_dict': lr_scheduler.state_dict(),
    'epoch': epoch_idx,
    'loss': loss, # Save the current epoch
}, model_path)

In [21]:
model_path = '../../Data/Checkpoints/model_plugcharger_state_dict_pd_joint_pos.pt'
state_dict = torch.load(model_path, map_location='cuda')
ema_noise_pred_net = noise_pred_net
ema_noise_pred_net.load_state_dict(state_dict["model_state_dict"] )
print('Pretrained weights loaded.')

Pretrained weights loaded.


In [22]:
import gymnasium as gym
# limit enviornment interaction to 200 steps before termination
result_path = "../../Data/Results/videos"

env_id = "PlugCharger-v1"
obs_mode = "state_dict"
control_mode = "pd_joint_pos"
env = gym.make(env_id, obs_mode=obs_mode, control_mode=control_mode, render_mode='rgb_array')

max_steps = 400

# reset
obs, info = env.reset()
print(env.action_space)
obs = get_observations(obs)
obs = convert_observation(obs)

# save observations
obs_deque = collections.deque([obs] * obs_horizon, maxlen=obs_horizon)

# save visualization
imgs = []
rewards = []
done = False
step_idx = 0

num_episodes = 50

Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)
dict_keys(['tcp_pose', 'charger_pose', 'receptacle_pose', 'goal_pose'])


In [23]:
def unnormalize_data(ndata, stats):
    ndata = (ndata + 1) / 2
    data = ndata * (stats['max'] - stats['min']) + stats['min']
    return data

with tqdm(total=max_steps, desc="Eval") as pbar:
    while not done:
        B = 1
        # stack the last obs_horizon (2) number of observations
        obs_seq = np.stack(obs_deque)
        # normalize observation
        #TODO: normalize observation
        #nobs = normalize_data(obs_seq, stats=stats['obs'])
        nobs = normalize_data(obs_seq, stats=stats['obs'])
        # device transfer
        nobs = torch.from_numpy(nobs).to(device, dtype=torch.float32)

        # infer action
        with torch.no_grad():
            # reshape observation to (B,obs_horizon*obs_dim)
            obs_cond = nobs.unsqueeze(0).flatten(start_dim=1)

            # initialize action from Guassian noise
            noisy_action = torch.randn(
                (B, pred_horizon, action_dim), device=device)
            naction = noisy_action

            # init scheduler
            noise_scheduler.set_timesteps(num_diffusion_iters)

            for k in noise_scheduler.timesteps:
                # predict noise
                noise_pred = ema_noise_pred_net(
                    sample=naction,
                    timestep=k,
                    global_cond=obs_cond
                )

                # inverse diffusion step (remove noise)
                naction = noise_scheduler.step(
                    model_output=noise_pred,
                    timestep=k,
                    sample=naction
                ).prev_sample

        # unnormalize action
        naction = naction.detach().to('cpu').numpy()
        # (B, pred_horizon, action_dim)
        naction = naction[0]
        #TODO: unnormalize action
        #action_pred = unnormalize_data(naction, stats=stats['action'])
        action_pred = unnormalize_data(naction, stats=stats['actions'])


        # only take action_horizon number of actions
        start = obs_horizon - 1
        end = start + action_horizon
        action = action_pred[start:end,:]

        # execute action_horizon number of steps
        # without replanning
        print(len(action))
        for i in range(len(action)):
            # stepping env
            print(action[i].shape)
            obs, reward, done, _, info = env.step(action[i])

            # process observation
            # From the observation dictionary, we concatenate all the observations
            # as done in the training data
            obs = get_observations(obs)
            obs = convert_observation(obs)

            # save observations
            obs_deque.append(obs)

            # and reward/vis
            rewards.append(reward)
            imgs.append(env.render())

            # update progress bar
            step_idx += 1
            pbar.update(1)
            pbar.set_postfix(reward=reward)
            if step_idx > max_steps:
                print("Step idx: ", step_idx)
                done = True
            if done:
                break
# print out the maximum target coverage
print('Score: ', max(rewards))

Eval:   0%|          | 0/400 [00:00<?, ?it/s]

  ndata = (data - stats['min']) / (stats['max'] - stats['min'])


8
(8,)


ValueError: ('Action cannot be NaN. Environment received:', tensor([[nan, nan, nan, nan, nan, nan, nan, nan]]))

In [131]:
mean_success = 0 
mean_reward = 0
for episode in range(num_episodes):
    # Reset environment and variables
    obs, info = env.reset()
    print(env.action_space)
    obs = get_observations(obs)
    obs = convert_observation(obs)
    obs_deque = collections.deque([obs] * obs_horizon, maxlen=obs_horizon)
    imgs = []
    rewards = []
    done = False
    step_idx = 0
    unsuccessful = False
    with tqdm(total=max_steps, desc="Eval") as pbar:
        while not done:
            B = 1
            # stack the last obs_horizon (2) number of observations
            obs_seq = np.stack(obs_deque)
            # normalize observation
            #TODO: normalize observation
            #nobs = normalize_data(obs_seq, stats=stats['obs'])
            nobs = normalize_data(obs_seq, stats=stats['obs'])
            # device transfer
            nobs = torch.from_numpy(nobs).to(device, dtype=torch.float32)

            # infer action
            with torch.no_grad():
                # reshape observation to (B,obs_horizon*obs_dim)
                obs_cond = nobs.unsqueeze(0).flatten(start_dim=1)

                # initialize action from Guassian noise
                noisy_action = torch.randn(
                    (B, pred_horizon, action_dim), device=device)
                naction = noisy_action

                # init scheduler
                noise_scheduler.set_timesteps(num_diffusion_iters)

                for k in noise_scheduler.timesteps:
                    # predict noise
                    noise_pred = ema_noise_pred_net(
                        sample=naction,
                        timestep=k,
                        global_cond=obs_cond
                    )

                    # inverse diffusion step (remove noise)
                    naction = noise_scheduler.step(
                        model_output=noise_pred,
                        timestep=k,
                        sample=naction
                    ).prev_sample

            # unnormalize action
            naction = naction.detach().to('cpu').numpy()
            # (B, pred_horizon, action_dim)
            naction = naction[0]
            #TODO: unnormalize action
            #action_pred = unnormalize_data(naction, stats=stats['action'])
            action_pred = unnormalize_data(naction, stats=stats['actions'])


            # only take action_horizon number of actions
            start = obs_horizon - 1
            end = start + action_horizon
            action = action_pred[start:end,:]

            # execute action_horizon number of steps
            # without replanning
            for i in range(len(action)):
                # stepping env
                obs, reward, done, _, info = env.step(action[i])

                # process observation
                # From the observation dictionary, we concatenate all the observations
                # as done in the training data
                obs = get_observations(obs)
                obs = convert_observation(obs)

                # save observations
                obs_deque.append(obs)

                # and reward/vis
                rewards.append(reward)
                imgs.append(env.render())

                # update progress bar
                step_idx += 1
                pbar.update(1)
                pbar.set_postfix(reward=reward)
                if step_idx > max_steps:
                    print("Step idx: ", step_idx)
                    done = True
                    unsuccessful = True
                if done:
                    break
    # print out the maximum target coverage
    print(f'Episode {episode + 1} Score: ', max(rewards))
    if not unsuccessful:
        mean_success += 1
    mean_reward += max(rewards)

print("Reward: ", mean_reward / num_episodes)
print("Success: ", mean_success/num_episodes)

Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 1 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 2 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Step idx:  401
Episode 3 Score:  tensor([0.5208])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 4 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 5 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Step idx:  401
Episode 6 Score:  tensor([0.5256])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 7 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 8 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 9 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 10 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 11 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 12 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 13 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 14 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Step idx:  401
Episode 15 Score:  tensor([0.5654])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 16 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 17 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 18 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 19 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Step idx:  401
Episode 20 Score:  tensor([0.5159])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Step idx:  401
Episode 21 Score:  tensor([0.5472])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 22 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Step idx:  401
Episode 23 Score:  tensor([0.5398])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Step idx:  401
Episode 24 Score:  tensor([0.5297])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 25 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 26 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 27 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 28 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Step idx:  401
Episode 29 Score:  tensor([0.5673])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 30 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 31 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 32 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Step idx:  401
Episode 33 Score:  tensor([0.5729])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 34 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Step idx:  401
Episode 35 Score:  tensor([0.5466])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Step idx:  401
Episode 36 Score:  tensor([0.5609])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 37 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Step idx:  401
Episode 38 Score:  tensor([0.5475])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Step idx:  401
Episode 39 Score:  tensor([0.5703])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 40 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 41 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 42 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 43 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Step idx:  401
Episode 44 Score:  tensor([0.5695])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 45 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Step idx:  401
Episode 46 Score:  tensor([0.5354])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 47 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 48 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 49 Score:  tensor([1.])
Box([-2.8973 -1.7628 -2.8973 -3.0718 -2.8973 -0.0175 -2.8973 -1.    ], [ 2.8973  1.7628  2.8973 -0.0698  2.8973  3.7525  2.8973  1.    ], (8,), float32)


Eval:   0%|          | 0/400 [00:00<?, ?it/s]

Episode 50 Score:  tensor([1.])
Reward:  tensor([0.8643])
Success:  0.7


In [None]:
from PIL import Image
from IPython.display import display, Image as IPImage
import io

print("Image shape:", imgs[0].shape)  # Print shape to check if it's (H, W, 3) for RGB
print("Image dtype:", imgs[0].dtype)  # Should be uint8

images = [Image.fromarray(img.squeeze(0).cpu().numpy()) for img in imgs]

# Save to a bytes buffer
buffer = io.BytesIO()
images[0].save(buffer, format='GIF', save_all=True, append_images=images[1:], optimize=False, duration=50, loop=0)
buffer.seek(0)

# Save to a file
with open('../../Data/animation.gif', 'wb') as f:
    f.write(buffer.getvalue())

# Display the GIF (optional)
display(IPImage(data=buffer.getvalue()))