In [2]:
import importlib
import condorgmm
import condorgmm.data
import matplotlib.pyplot as plt
import warp as wp
import numpy as np
import scipy.stats
import condorgmm.warp_gmm as warp_gmm

In [3]:
# Initialise condorgmm's rr logging before any rr_log_* / rr_set_time call.
# NOTE(review): "low_frame_rate" is presumably the recording/session name — confirm.
condorgmm.rr_init("low_frame_rate")

In [14]:
# Scene id of the YCB test video used throughout the notebook.
scene = 48
video = condorgmm.data.YCBTestVideo(scene)
# First frame of the video.
frame = video[0]

# Dataset directory; the model files are located relative to it below.
ycb_dir = video.ycb_dir

import trimesh
def vertices_and_colors_from_obj_file(path, scale=1.0):
    """Load a mesh file and return its vertices and per-vertex RGB colours.

    This is deliberately a plain module-level function. Decorating it with
    ``@staticmethod`` outside a class body has no benefit and, on
    Python < 3.10, makes the resulting name uncallable
    (``TypeError: 'staticmethod' object is not callable``).

    Args:
        path: Path of the mesh file, forwarded to ``trimesh.load_mesh``
            (loaded with ``process=False, validate=False``).
        scale: Factor multiplied into the vertex coordinates.

    Returns:
        A ``(vertices, vertex_colors)`` tuple: the vertex array multiplied by
        ``scale`` and the first three colour channels divided by 255.
    """
    trimesh_mesh = trimesh.load_mesh(path, process=False, validate=False)
    vertices = np.array(trimesh_mesh.vertices)

    visual = trimesh_mesh.visual
    # Any visual that is not already a ColorVisuals is converted with
    # ``to_color()`` so that ``vertex_colors`` is available.
    if not isinstance(visual, trimesh.visual.color.ColorVisuals):
        visual = visual.to_color()
    # Drop the fourth (alpha) channel and rescale 0..255 -> 0..1.
    vertex_colors = np.array(visual.vertex_colors)[..., :3] / 255.0

    return vertices * scale, vertex_colors

import os
# One (vertices, colours) pair per object id of the first frame. The model
# file for id `k` is `obj_<k + 1 right-justified to 6 chars with zeros>.ply`
# under `../models` relative to the dataset directory; vertices are scaled by
# 0.001 (presumably mm -> m — TODO confirm).
model_paths = (
    os.path.join(ycb_dir, f"../models/obj_{str(object_id + 1).rjust(6, '0')}.ply")
    for object_id in video[0].object_ids
)
meshes = [
    vertices_and_colors_from_obj_file(model_path, scale=0.001)
    for model_path in model_paths
]

In [5]:
# Number of pose hypotheses scored per coarse-to-fine step.
num_poses = 20000

# Coarse-to-fine schedule, one (sigma, kappa) pair per refinement step:
#   sigma - standard deviation of the zero-mean Gaussian translation delta
#   kappa - concentration of the von Mises-Fisher quaternion delta
# Later steps use a smaller sigma and a larger kappa, i.e. tighter proposals.
c2f_schedule_params = (
    (0.04, 1000.0),
    (0.02, 1500.0),
    (0.01, 3000.0),
    (0.005, 4000.0),
)

# A single seeded generator drives every draw so the pose proposals (and hence
# everything computed from them) are reproducible across kernel restarts.
c2f_seed = 0
c2f_rng = np.random.default_rng(c2f_seed)

# Mean direction of the quaternion perturbation; also written into row 0 below.
identity_quaternion = np.array([0.0, 0.0, 0.0, 1.0])

# When set, hypothesis 0 of every step is the zero translation paired with
# `identity_quaternion`, so the unperturbed pose is always among the candidates.
include_identity = True

c2f_schedule = []
for sigma, kappa in c2f_schedule_params:
    position_deltas = c2f_rng.normal(0.0, sigma, size=(num_poses, 3))
    quaternion_deltas = scipy.stats.vonmises_fisher(
        mu=identity_quaternion,
        kappa=kappa,
    ).rvs(num_poses, random_state=c2f_rng)

    if include_identity:
        position_deltas[0, :] = 0.0
        quaternion_deltas[0, :] = identity_quaternion

    # Each row is 3 position components followed by 4 quaternion components,
    # reinterpreted by Warp as one `wp.transform`.
    pose_deltas = wp.array(
        np.hstack((position_deltas, quaternion_deltas)), dtype=wp.transform
    )
    c2f_schedule.append(pose_deltas)

Warp 1.6.0 initialized:
   CUDA Toolkit 12.8, Driver 12.4
   Devices:
     "cpu"      : "x86_64"
     "cuda:0"   : "NVIDIA L4" (22 GiB, sm_89, mempool enabled)
   Kernel cache:
     /home/nishadgothoskar/.cache/warp/1.6.0


In [8]:
# Only every `frame_stride`-th frame of the video is processed; log those
# frames up front, each at its own timestep.
frame_stride = 50
timesteps = range(0, len(video), frame_stride)
for T in timesteps:
    condorgmm.rr_set_time(T)
    condorgmm.rr_log_frame(video[T])


In [18]:
# Import once so the submodule is loaded, reload it so that edits to the
# kernel source are picked up, then re-bind `inference_step` to the reloaded
# definition.
from condorgmm.warp_gmm.enumeration_kernels import inference_step
importlib.reload(warp_gmm.enumeration_kernels)
from condorgmm.warp_gmm.enumeration_kernels import inference_step
condorgmm.rr_init("low_frame_rate")
frame = video[0]

# Index (into the frame's object lists) of the object to track.
object_index = 3
current_object_index = object_index
spatial_means = meshes[object_index][0]
# Mesh colours are stored in 0..1; the GMM colours are kept in 0..255.
rgb_means = meshes[object_index][1] * 255.0
initial_object_pose_in_camera_frame = (condorgmm.Pose(frame.camera_pose).inv() @ condorgmm.Pose(frame.object_poses[object_index]))

# Project every mesh vertex into the first frame: (x / z, y / z) scaled by
# intrinsics[0:2] and shifted by intrinsics[2:4].
transformed_points = initial_object_pose_in_camera_frame.apply(spatial_means)
proj_pixel_coords = (
    (transformed_points[:, :2] / transformed_points[:, 2:3])
    * np.array([frame.intrinsics[0], frame.intrinsics[1]])
    + np.array([frame.intrinsics[2], frame.intrinsics[3]])
)
rounded_pixel_coordinates = np.floor(proj_pixel_coords).astype(np.int32)

# Vertices can project outside the image. Without this mask a negative index
# silently wraps around to the opposite image border and a too-large index
# raises IndexError. Out-of-image lookups are redirected to pixel (0, 0) and
# excluded from `matching` below (same scheme as the per-frame update in the
# tracking loop).
valid = (
    (rounded_pixel_coordinates[:, 0] >= 0)
    & (rounded_pixel_coordinates[:, 0] < frame.rgb.shape[1])
    & (rounded_pixel_coordinates[:, 1] >= 0)
    & (rounded_pixel_coordinates[:, 1] < frame.rgb.shape[0])
)
rounded_pixel_coordinates = rounded_pixel_coordinates * valid[:, None]
associated_rgb = frame.rgb[rounded_pixel_coordinates[:, 1], rounded_pixel_coordinates[:, 0]]
associated_depth = frame.depth[rounded_pixel_coordinates[:, 1], rounded_pixel_coordinates[:, 0]]
# A vertex "matches" when the observed depth at its pixel is within 0.01 of
# the vertex's own depth, i.e. the vertex is plausibly the visible surface.
matching = (np.abs(associated_depth - transformed_points[:, 2]) < 0.01) & valid

# spatial_means = spatial_means[matching]
# Matching vertices take the observed image colour instead of the mesh colour.
rgb_means[matching,:] = associated_rgb[matching,:]

# mask = frame.masks[object_index]
# spatial_means = condorgmm.xyz_from_depth_image(frame.depth, *frame.intrinsics)[mask]
# rgb_means = frame.rgb[mask]
# spatial_means = initial_object_pose_in_camera_frame.inv().apply(spatial_means)

# indices = np.random.choice(
#     len(spatial_means),
#     (min(len(spatial_means), 5000),),
#     replace=False,
# )
# spatial_means = spatial_means[indices]
# rgb_means = rgb_means[indices]

# One Gaussian per mesh vertex, all with the same log spatial scale
# log(0.0005) on each of the three axes.
gmm = warp_gmm.gmm_warp_from_numpy(
    spatial_means.astype(np.float32),
    rgb_means.astype(np.float32),
    object_posquats=initial_object_pose_in_camera_frame.posquat[None, ...].astype(np.float32),
    log_spatial_scales=np.log(0.0005 * np.ones((spatial_means.shape[0], 3), dtype=np.float32))
)
print(gmm.object_posquats.numpy()[0])


# Work buffers handed to `inference_step`, sized per pose hypothesis and per
# (pose hypothesis, vertex) pair.
num_vertices = gmm.spatial_means.shape[0]
pose_hypotheses = wp.empty(num_poses, dtype=wp.transform)
pixel_coordinates = wp.zeros((num_poses, num_vertices), dtype=wp.vec2i)
corresponding_rgbd_per_pose_and_vertex = wp.empty(
    (num_poses, num_vertices), dtype=wp.vec4
)
scores_per_pose_and_vertex = wp.empty((num_poses, num_vertices), dtype=float)
scores_per_pose = wp.zeros(num_poses, dtype=float)


# Log the initial state at timestep 0: the frame, the GMM and the
# ground-truth object pose in the camera frame (already computed above).
T = 0
condorgmm.rr_set_time(T)
condorgmm.rr_log_frame(video[T])
warp_gmm.rr_log_gmm_warp(gmm, "gmm_warp", size_scalar=1.5)
condorgmm.rr_log_pose(initial_object_pose_in_camera_frame, "gt_pose")


[ 0.05233599 -0.0130313   0.8615899   0.0024441   0.69188577  0.26218858
 -0.6727148 ]


In [10]:
# Track the object through the subsampled frames. For each frame: refine the
# pose with every step of the coarse-to-fine schedule, refresh the GMM colours
# from the image, then log and record the resulting pose.
inferred_object_poses = []
for T in timesteps:
    frame = video[T]
    frame_warp = frame.as_warp()

    for pose_deltas in c2f_schedule:
        inference_step(
            gmm.object_posquats,
            pose_deltas,
            gmm.spatial_means,
            gmm.rgb_means,
            frame.intrinsics[0],
            frame.intrinsics[1],
            frame.intrinsics[2],
            frame.intrinsics[3],
            frame_warp.rgb,
            frame_warp.depth,
            # These inputs are empty memory that will be filled by the kernels.
            pose_hypotheses,
            pixel_coordinates,
            corresponding_rgbd_per_pose_and_vertex,
            scores_per_pose_and_vertex,
            scores_per_pose,
        )

    # Take ONE snapshot of the refined pose for this frame. The explicit copy
    # matters: Warp documents `array.numpy()` as aliasing the array's memory
    # for CPU arrays, so without it every entry appended to
    # `inferred_object_poses` could end up tracking the live pose buffer. It
    # also replaces three separate `.numpy()` conversions per frame.
    object_posquat = gmm.object_posquats.numpy()[0].copy()
    object_pose = condorgmm.Pose(object_posquat)

    # Project the GMM means into the current frame with the refined pose.
    transformed_points = object_pose.apply(gmm.spatial_means.numpy())
    proj_pixel_coords = (
        (transformed_points[:, :2] / transformed_points[:, 2:3])
        * np.array([frame.intrinsics[0], frame.intrinsics[1]])
        + np.array([frame.intrinsics[2], frame.intrinsics[3]])
    )
    rounded_pixel_coordinates = np.floor(proj_pixel_coords).astype(np.int32)
    valid = (
        (rounded_pixel_coordinates[:, 0] >= 0)
        & (rounded_pixel_coordinates[:, 0] < frame.rgb.shape[1])
        & (rounded_pixel_coordinates[:, 1] >= 0)
        & (rounded_pixel_coordinates[:, 1] < frame.rgb.shape[0])
    )
    # Redirect out-of-image lookups to pixel (0, 0); they are excluded again
    # through `valid` below.
    rounded_pixel_coordinates = rounded_pixel_coordinates * valid[:, None]
    associated_rgb = frame.rgb[rounded_pixel_coordinates[:, 1], rounded_pixel_coordinates[:, 0]]
    associated_depth = frame.depth[rounded_pixel_coordinates[:, 1], rounded_pixel_coordinates[:, 0]]
    # Only means whose depth agrees with the observed depth to within 0.005
    # take the observed colour.
    matching = np.abs(associated_depth - transformed_points[:, 2]) < 0.005
    matching_and_valid = matching & valid
    rgb_means = gmm.rgb_means.numpy()
    rgb_means[matching_and_valid,:] = associated_rgb[matching_and_valid,:]
    gmm.rgb_means = wp.array(rgb_means, dtype=wp.vec3)

    print(object_posquat)
    condorgmm.rr_set_time(T)
    warp_gmm.rr_log_gmm_warp(gmm, "gmm_warp", size_scalar=1.5)
    condorgmm.rr_log_pose(object_posquat, "inferred_pose")
    inferred_object_poses.append(object_posquat)
    condorgmm.rr_log_pose(condorgmm.Pose(frame.camera_pose).inv() @ condorgmm.Pose(frame.object_poses[object_index]), "gt_pose")

Module condorgmm.warp_gmm.enumeration_kernels 0c9b6a8 load on device 'cuda:0' took 0.62 ms  (cached)
[ 0.05233599 -0.0130313   0.8615899   0.0024441   0.69188577  0.26218858
 -0.6727148 ]
[ 0.05947617 -0.0148075   0.85370094  0.01354977  0.7096188   0.22939762
 -0.6660587 ]
[ 0.0822545  -0.01263825  0.86141527  0.01214595  0.71618855  0.22575256
 -0.66027415]
[ 0.12117092 -0.05650976  0.8675858  -0.00810349  0.67433     0.24813344
 -0.6954444 ]
[ 0.15147673 -0.03209625  0.8722536  -0.02203601  0.6450719   0.24393643
 -0.7238035 ]
[ 0.12623484 -0.01215548  0.8902563  -0.01633698  0.6442113   0.23155552
 -0.72877085]
[ 0.09787653 -0.00536741  0.9058726  -0.0149751   0.6563901   0.22773404
 -0.7190724 ]
[ 0.10354301 -0.00155372  0.9089102  -0.01543457  0.62037855  0.23615047
 -0.74774677]
[ 0.09856848 -0.00557951  0.91248703 -0.02945836  0.63146317  0.24518913
 -0.7350299 ]
[ 0.08783374  0.00138029  0.9152598  -0.00177905  0.62677664  0.22979517
 -0.74454165]
[ 0.09440718 -0.00562259  0.9

In [11]:
# Resolve the tracked object's id (taken from the first frame), then its mesh
# and its name, which labels this object's rows in the metrics tables below.
object_id = video[0].object_ids[object_index]
object_mesh = video.get_object_mesh_from_id(object_id)
object_name = video.get_object_name_from_id(object_id)
print(object_name)

051_large_clamp


In [22]:
# This cell previously read `predicted_poses`, `gt_poses` and `vertices`
# without any earlier cell defining them (and before `condorgmm.eval.metrics`
# was imported), so it raised NameError on a fresh top-to-bottom run. It now
# builds everything it needs from the tracking results.
import condorgmm.eval.metrics

# Wrap the raw posquats recorded by the tracking loop; entries that are
# already `Pose` objects are passed through unchanged.
predicted_poses = [
    pose if isinstance(pose, condorgmm.Pose) else condorgmm.Pose(pose)
    for pose in inferred_object_poses
]

# Ground-truth object pose in the camera frame for every evaluated timestep.
gt_poses = [
    condorgmm.Pose(video[t].camera_pose).inv()
    @ condorgmm.Pose(video[t].object_poses[current_object_index])
    for t in timesteps
]

# Mesh vertices of the tracked object, passed to the metrics helper.
vertices = video.get_object_mesh_from_id(
    video[0].object_ids[current_object_index]
).vertices

results_df = condorgmm.eval.metrics.create_empty_results_dataframe()
condorgmm.eval.metrics.add_object_tracking_metrics_to_results_dataframe(
    results_df,
    scene,
    "condorgmm",
    object_name,
    predicted_poses,
    gt_poses,
    vertices,
)

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [21]:
import condorgmm.eval.metrics

In [22]:
import condorgmm.eval.fp_loader
# Load the stored tracking results for this scene to compare against. The
# split is identified by the name of the dataset directory.
# NOTE(review): frame_rate=50 equals the stride used for `timesteps` —
# presumably the two must agree; confirm.
split_name = ycb_dir.name
fp_loader = condorgmm.eval.fp_loader.YCBVTrackingResultLoader(
    frame_rate=50,
    split=split_name,
)
fp_df = fp_loader.get_dataframe(scene)


In [24]:
# Ground-truth object pose in the camera frame for every evaluated timestep.
gt_poses = [
    condorgmm.Pose(video[t].camera_pose).inv()
    @ condorgmm.Pose(video[t].object_poses[current_object_index])
    for t in timesteps
]

# Mesh vertices of the tracked object, passed to the metrics helper.
vertices = video.get_object_mesh_from_id(
    video[0].object_ids[current_object_index]
).vertices

# Convert the recorded posquats to `Pose` objects. The name is rebound to its
# own converted contents, so entries that are already `Pose` objects are kept
# as they are; this makes re-running the cell safe instead of wrapping a
# `Pose` in another `Pose`.
inferred_object_poses = [
    pose if isinstance(pose, condorgmm.Pose) else condorgmm.Pose(pose)
    for pose in inferred_object_poses
]

# Collect this method's tracking metrics for the scene/object in a fresh
# results dataframe.
results_df = condorgmm.eval.metrics.create_empty_results_dataframe()
condorgmm.eval.metrics.add_object_tracking_metrics_to_results_dataframe(
    results_df,
    scene,
    "condorgmm",
    object_name,
    inferred_object_poses,
    gt_poses,
    vertices,
)


In [29]:
import pandas as pd
# Stack the loaded baseline rows and this notebook's rows, then reduce the
# per-timestep values to one AUC per (metric, method, object) group.
full_df = pd.concat([fp_df, results_df])
values_by_group = full_df.groupby(["metric", "method", "object"])["value"]
auc_results_full = values_by_group.apply(condorgmm.eval.metrics.compute_auc)
print(auc_results_full)

metric  method          object               
ADD     FoundationPose  002_master_chef_can      0.204111
                        007_tuna_fish_can        0.571778
                        025_mug                  0.650444
                        051_large_clamp          0.441000
                        052_extra_large_clamp    0.325000
        condorgmm           051_large_clamp          0.914556
ADD-S   FoundationPose  002_master_chef_can      0.629667
                        007_tuna_fish_can        0.816111
                        025_mug                  0.826111
                        051_large_clamp          0.547000
                        052_extra_large_clamp    0.979000
        condorgmm           051_large_clamp          0.964556
Name: value, dtype: float64


In [26]:
# Display the loaded baseline results table for inspection.
fp_df

Unnamed: 0,scene,method,object,timestep,predicted,gt,metric,value
0,48,FoundationPose,002_master_chef_can,0,"[-0.03245632350444794, -0.008746874518692493, ...","[-0.031677025422232274, -0.017368816807616497,...",ADD-S,0.006057
1,48,FoundationPose,002_master_chef_can,1,"[-0.024065330624580383, -0.014229506254196167,...","[-0.023487833882362825, -0.02297177083569611, ...",ADD-S,0.006084
2,48,FoundationPose,002_master_chef_can,2,"[0.0015329779125750065, -0.012915787287056446,...","[0.0007751038657471028, -0.02009170430772513, ...",ADD-S,0.005083
3,48,FoundationPose,002_master_chef_can,3,"[0.02247895672917366, -0.02795446291565895, 0....","[0.03918443426687707, -0.06518133103855758, 0....",ADD-S,0.031214
4,48,FoundationPose,002_master_chef_can,4,"[0.029076755046844482, -0.0484016053378582, 0....","[0.07212755206100327, -0.044044601048402854, 0...",ADD-S,0.044196
...,...,...,...,...,...,...,...,...
40,48,FoundationPose,052_extra_large_clamp,40,"[-0.12645815312862396, 0.0952477902173996, 0.8...","[-0.10260229912475038, 0.0939881687277426, 0.8...",ADD,0.067866
41,48,FoundationPose,052_extra_large_clamp,41,"[-0.13152475655078888, 0.08612556755542755, 0....","[-0.10976216625644014, 0.0858998113975631, 0.7...",ADD,0.067433
42,48,FoundationPose,052_extra_large_clamp,42,"[-0.1385863721370697, 0.07273939996957779, 0.7...","[-0.11486511993147, 0.07214020082416844, 0.768...",ADD,0.067781
43,48,FoundationPose,052_extra_large_clamp,43,"[-0.13955964148044586, 0.07709338515996933, 0....","[-0.11537487219915202, 0.07519616048985027, 0....",ADD,0.067805


In [13]:
# Debugging aid: convert the per-(pose, vertex) RGBD work buffer to a NumPy
# array so its contents can be inspected.
corresponding_rgbd_per_pose_and_vertex_np = corresponding_rgbd_per_pose_and_vertex.numpy()

In [14]:
# Debugging aid: the 4-vector stored for pose hypothesis 0, vertex 0.
corresponding_rgbd_per_pose_and_vertex_np[0, 0]

array([ 0.   ,  0.   , 22.   ,  0.832], dtype=float32)

In [10]:
# Debugging aid: per-hypothesis scores left in the buffer by the most recent
# inference step that ran.
scores_per_pose.numpy()

array([-9993.419, -9993.419, -9993.419, ..., -9993.419, -9993.419,
       -9993.419], dtype=float32)