In [1]:
import h5py
import numpy as np
import tqdm
from scipy.spatial.transform import Rotation as R

In [2]:
# HDF5 file that every cell below opens read-only.
DATASET_PATH = "dataset.h5"

# Camera parameters; create_distance_map reads "height", "width" and "focalLength".
INTRINSICS = {
    "focalLength": 1.0,
    "width": 384.0,
    "height": 256.0,
    "zRange": [0.01, 2.0],
}

# Index along the first axis of cam_poses / depths / seg_ids, i.e. which view is used.
VIEW_IDX = 3

In [3]:
# Peek at the first datapoint group and list each entry's name and shape.
with h5py.File(DATASET_PATH, "r") as dataset:
    first_key = next(iter(dataset.keys()))
    first_datapoint = dataset[first_key]
    for name, entry in first_datapoint.items():
        print(f"{name:<12} {entry.shape}")

cam_poses    (4, 7)
depths       (4, 256, 384)
feasibles    (7, 8)
final_poses  (7, 8, 7)
images       (4, 256, 384, 3)
obj_ids      (7,)
poses        (7, 7)
seg_ids      (4, 256, 384)
sizes        (7, 3)
target_poses (7, 7)


In [4]:
def create_distance_map(target_poses, cam_pose, depth, intrinsics=None):
    """Compute per-pixel Euclidean distances from the scene points to each target position.

    Every pixel of ``depth`` is unprojected into camera space (the depth value
    is used as the camera-space z coordinate), transformed into the world
    frame with ``cam_pose`` and compared with the first three components of
    each row of ``target_poses``.

    Args:
        target_poses: Array-like of shape (N, >=3); only ``[:, :3]`` is used
            as the target position. May be empty.
        cam_pose: Array-like whose first three entries are the camera
            position and whose remaining four entries are a quaternion. The
            quaternion is rolled by -1 before being passed to SciPy (which
            expects scalar-last), so it is interpreted as scalar-first
            (w, x, y, z).
        depth: Depth image of shape (height, width).
        intrinsics: Optional mapping providing "height", "width" and
            "focalLength". Defaults to the module-level ``INTRINSICS``.

    Returns:
        ``np.float32`` array of shape (N, height, width) with the distance
        from each unprojected pixel to each target position. For empty
        ``target_poses`` an array of shape (0, height, width) is returned.
    """
    if intrinsics is None:
        intrinsics = INTRINSICS

    h, w = int(intrinsics["height"]), int(intrinsics["width"])
    f = intrinsics["focalLength"]

    # np.stack on an empty list raises, so handle "no targets" explicitly.
    targets = np.asarray(target_poses)
    if targets.size == 0:
        return np.zeros((0, h, w), dtype=np.float32)

    cam_pose = np.asarray(cam_pose)
    Y, X = np.meshgrid(np.arange(h), np.arange(w), indexing="ij")

    # Unproject to camera space: pixel coordinates are centred and scaled by
    # half the image size, then multiplied by depth / focal length.
    x = (X - w / 2) / (w / 2) * depth / f
    y = (Y - h / 2) / (h / 2) * depth / f
    pts_cam = np.stack([x, y, depth], axis=-1)

    # Transform to world space (quaternion reordered from w-first to w-last).
    rot = R.from_quat(np.roll(cam_pose[3:], -1)).as_matrix()
    pts_world = (rot @ pts_cam.reshape(-1, 3).T).T.reshape(h, w, 3) + cam_pose[:3]

    # Broadcast (1, h, w, 3) against (N, 1, 1, 3) to get all distances at once.
    offsets = pts_world[None, :, :, :] - targets[:, None, None, :3]
    return np.linalg.norm(offsets, axis=-1).astype(np.float32)

In [6]:
# Gather the selected view's depth map and the flattened target-distance maps
# of every datapoint. All of them are kept in memory until the concatenation.
with h5py.File(DATASET_PATH, "r") as f:
    depths = []
    dists = []

    for key in tqdm.tqdm(f.keys()):
        datapoint = f[key]
        view_cam_pose = datapoint["cam_poses"][VIEW_IDX][()]
        view_depth = datapoint["depths"][VIEW_IDX][()]
        view_dists = create_distance_map(datapoint["target_poses"][()], view_cam_pose, view_depth)

        depths.append(view_depth)
        dists.append(view_dists.flatten())

# Merge the per-datapoint arrays so global statistics can be taken over them.
depths = np.concatenate(depths)
dists = np.concatenate(dists)

100%|██████████| 10996/10996 [09:46<00:00, 18.75it/s]


In [7]:
# Dataset-wide bounds; depth_min, depth_max and dist_max are reused for normalisation below.
depth_min, depth_max = depths.min(), depths.max()
dist_max = dists.max()

print(f"depth:    [{depth_min:.4f}, {depth_max:.4f}], mean={depths.mean():.4f}")
print(f"distance: [{dists.min():.4f}, {dist_max:.4f}], mean={dists.mean():.4f}")

depth:    [0.5387, 1.2218], mean=0.8247
distance: [0.0002, 2.3817], mean=0.7369


In [8]:
# Write one group per (datapoint, object) pair into "combined-dataset.h5".
with h5py.File(DATASET_PATH, "r") as src, h5py.File("combined-dataset.h5", "w") as dst:
    # Carry over the source file's attributes, then record the normalisation constants.
    dst.attrs.update(src.attrs)
    for attr_name, attr_value in (
        ("depth_min", depth_min),
        ("depth_max", depth_max),
        ("dist_max", dist_max),
    ):
        dst.attrs[attr_name] = attr_value

    depth_span = depth_max - depth_min

    for key in tqdm.tqdm(src.keys()):
        datapoint = src[key]

        obj_ids = datapoint["obj_ids"][()]
        cam_pose = datapoint["cam_poses"][VIEW_IDX][()]
        seg = datapoint["seg_ids"][VIEW_IDX]
        poses = datapoint["poses"][()]
        target_poses = datapoint["target_poses"][()]
        feasibles = datapoint["feasibles"][()]
        depth_raw = datapoint["depths"][VIEW_IDX][()]

        # Min-max scale the depth image with the dataset-wide bounds.
        depth = (depth_raw - depth_min) / depth_span

        # Distance maps use the unscaled depth; they are then divided by
        # dist_max and clipped to [0, 1].
        dist_maps = create_distance_map(target_poses, cam_pose, depth_raw)
        dist_maps = np.clip(dist_maps / dist_max, 0.0, 1.0)

        # Positional offset from each object's current pose to its target pose.
        target_offsets = target_poses[:, :3] - poses[:, :3]

        for i, obj_id in enumerate(obj_ids):
            sample = dst.create_group(f"{key}_obj_{i}")
            fields = {
                "depth": depth,
                "mask": (seg == obj_id).astype(np.float32),
                "dist_map": dist_maps[i],
                "quat": poses[i][3:],
                # NOTE(review): this stores the whole target pose row, not just
                # its first three components - confirm that is intended.
                "target_pos": target_poses[i],
                "target_offset": target_offsets[i],
                "feasibles": feasibles[i],
            }
            for field_name, field_data in fields.items():
                sample.create_dataset(field_name, data=field_data)

100%|██████████| 10996/10996 [07:18<00:00, 25.07it/s]
