In [1]:
from chat_with_nerf.chat.agent import Agent 
import os
import json
import numpy as np
from chat_with_nerf.chat.session import Session
import time
import open3d as o3d

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.
[32;20m[2023-09-08 22:09:17,212] INFO torch.distributed.nn.jit.instantiator [<module>] [instantiator.py:21] - Created a temporary directory at /tmp/tmpjn4kl1xh[0m
[32;20m[2023-09-08 22:09:17,213] INFO torch.distributed.nn.jit.instantiator [_write] [instantiator.py:76] - Writing /tmp/tmpjn4kl1xh/_remote_module_non_scriptable.py[0m


In [2]:
root_directory = '/workspace/chat-with-nerf-eval/data/scanrefer_val'  # Assuming current directory, adjust path if needed

### Useful functions

In [3]:
def construct_bbox_corners(center, box_size):
    sx, sy, sz = box_size
    x_corners = [sx / 2, sx / 2, -sx / 2, -sx / 2, sx / 2, sx / 2, -sx / 2, -sx / 2]
    y_corners = [sy / 2, -sy / 2, -sy / 2, sy / 2, sy / 2, -sy / 2, -sy / 2, sy / 2]
    z_corners = [sz / 2, sz / 2, sz / 2, sz / 2, -sz / 2, -sz / 2, -sz / 2, -sz / 2]
    corners_3d = np.vstack([x_corners, y_corners, z_corners])
    corners_3d[0, :] = corners_3d[0, :] + center[0]
    corners_3d[1, :] = corners_3d[1, :] + center[1]
    corners_3d[2, :] = corners_3d[2, :] + center[2]
    corners_3d = np.transpose(corners_3d)

    return corners_3d

In [4]:
def get_box3d_min_max(corner):
    ''' Compute min and max coordinates for 3D bounding box
        Note: only for axis-aligned bounding boxes

    Input:
        corners: numpy array (8,3), assume up direction is Z (batch of N samples)
    Output:
        box_min_max: an array for min and max coordinates of 3D bounding box IoU

    '''

    min_coord = corner.min(axis=0)
    max_coord = corner.max(axis=0)
    x_min, x_max = min_coord[0], max_coord[0]
    y_min, y_max = min_coord[1], max_coord[1]
    z_min, z_max = min_coord[2], max_coord[2]
    
    return x_min, x_max, y_min, y_max, z_min, z_max

In [5]:
def box3d_iou(corners1, corners2):
    ''' Compute 3D bounding box IoU.

    Input:
        corners1: numpy array (8,3), assume up direction is Z
        corners2: numpy array (8,3), assume up direction is Z
    Output:
        iou: 3D bounding box IoU

    '''
    # # corner points are in counter clockwise order
    # rect1 = [(corners1[i,0], corners1[i,2]) for i in range(3,-1,-1)]
    # rect2 = [(corners2[i,0], corners2[i,2]) for i in range(3,-1,-1)] 
    # area1 = poly_area(np.array(rect1)[:,0], np.array(rect1)[:,1])
    # area2 = poly_area(np.array(rect2)[:,0], np.array(rect2)[:,1])
    # inter, inter_area = convex_hull_intersection(rect1, rect2)
    # iou_2d = inter_area/(area1+area2-inter_area)
    # ymax = min(corners1[0,1], corners2[0,1])
    # ymin = max(corners1[4,1], corners2[4,1])
    # inter_vol = inter_area * max(0.0, ymax-ymin)
    # vol1 = box3d_vol(corners1)
    # vol2 = box3d_vol(corners2)
    # iou = inter_vol / (vol1 + vol2 - inter_vol)
    # return iou, iou_2d

    x_min_1, x_max_1, y_min_1, y_max_1, z_min_1, z_max_1 = get_box3d_min_max(corners1)
    x_min_2, x_max_2, y_min_2, y_max_2, z_min_2, z_max_2 = get_box3d_min_max(corners2)
    xA = np.maximum(x_min_1, x_min_2)
    yA = np.maximum(y_min_1, y_min_2)
    zA = np.maximum(z_min_1, z_min_2)
    xB = np.minimum(x_max_1, x_max_2)
    yB = np.minimum(y_max_1, y_max_2)
    zB = np.minimum(z_max_1, z_max_2)
    inter_vol = np.maximum((xB - xA), 0) * np.maximum((yB - yA), 0) * np.maximum((zB - zA), 0)
    box_vol_1 = (x_max_1 - x_min_1) * (y_max_1 - y_min_1) * (z_max_1 - z_min_1)
    box_vol_2 = (x_max_2 - x_min_2) * (y_max_2 - y_min_2) * (z_max_2 - z_min_2)
    iou = inter_vol / (box_vol_1 + box_vol_2 - inter_vol + 1e-8)

    return iou

### Data Analysis

In [6]:
def get_val_set():
    json_dict = {}
    # List of all subfolders and their files
    subfolders_files = [(dp, filenames) for dp, _, filenames in os.walk(root_directory)]

    # Dictionary comprehension to pick only the first JSON from each subfolder
    json_dict = {os.path.basename(dp): os.path.join(dp, filenames[0]) for dp, filenames in subfolders_files if any(fn.endswith('.json') for fn in filenames)}

    return json_dict

In [7]:
json_dict = get_val_set()

In [8]:
scene_name = 'scene0025_00'
scene_path = json_dict[scene_name]
agent = Agent()
print(scene_name)
print(scene_path)
with open(scene_path, 'r') as file:
    data = json.load(file)

print("label: ", data['objects'][0]['label'])
print("bbox: ", data['objects'][0]['bbox'])
print("object_id: ", data['objects'][0]['object_ids'])

[32;20m[2023-09-08 22:09:30,992] INFO chat_with_nerf [initialize_model_no_gpt_context] [model_context.py:54] - Search for all Scenes and Set the current Scene[0m
[32;20m[2023-09-08 22:09:30,993] INFO chat_with_nerf [search_scenes] [model_context.py:89] - scene_path: /workspace/chat-with-nerf-dev/chat-with-nerf/data/scene0025_00/scene0025_00.yaml[0m
[32;20m[2023-09-08 22:09:30,994] INFO root [create_model] [factory.py:154] - Loaded ViT-B-16 model config.[0m
[32;20m[2023-09-08 22:09:32,306] INFO root [create_model] [factory.py:227] - Loading pretrained ViT-B-16 weights (laion2b_s34b_b88k).[0m
/workspace/chat-with-nerf-eval/data/scannet/scans/scene0025_00/h5_embedding/embeddings.h5
scene0025_00
/workspace/chat-with-nerf-eval/data/scanrefer_val/scene0025_00/72afcc45-a8b4-48b6-8224-783ad1d1ca95.json
label:  monitor
bbox:  [-0.30863550305366516, -1.6108747720718384, 0.9761558771133423, 0.4441679120063782, 0.42981481552124023, 0.5227721333503723]
object_id:  1


In [9]:
axisAlignment_values = [
    0.707107, 0.707107, 0.000000, -4.050430, 
    -0.707107, 0.707107, 0.000000, -0.052607, 
    0.000000, 0.000000, 1.000000, -0.049991, 
    0.000000, 0.000000, 0.000000, 1.000000
]

axisAlignment_matrix = np.array(axisAlignment_values).reshape(4, 4)
print(axisAlignment_matrix)

[[ 0.707107  0.707107  0.       -4.05043 ]
 [-0.707107  0.707107  0.       -0.052607]
 [ 0.        0.        1.       -0.049991]
 [ 0.        0.        0.        1.      ]]


In [12]:
new_session = Session.create_for_scene(scene_name)
print("description: ", data['objects'][0]['description'])
result = []
start_time = time.time()  
for target_object in data['objects']:
    bbox = target_object['bbox']
    center_aligned = np.append(np.array(bbox[:3]), 1)
    extents_aligned = np.array(bbox[3:])
    
    half_extents = extents_aligned
    corners_aligned = [
        center_aligned[:3] + [-half_extents[0], -half_extents[1], -half_extents[2]],
        center_aligned[:3] + [half_extents[0], -half_extents[1], -half_extents[2]],
        center_aligned[:3] + [-half_extents[0], half_extents[1], -half_extents[2]],
        center_aligned[:3] + [half_extents[0], half_extents[1], -half_extents[2]],
        center_aligned[:3] + [-half_extents[0], -half_extents[1], half_extents[2]],
        center_aligned[:3] + [half_extents[0], -half_extents[1], half_extents[2]],
        center_aligned[:3] + [-half_extents[0], half_extents[1], half_extents[2]],
        center_aligned[:3] + [half_extents[0], half_extents[1], half_extents[2]],
    ]

    inverse_matrix = np.linalg.inv(axisAlignment_matrix)
    corners_original = [np.dot(inverse_matrix, np.append(corner, 1.0))[:3] for corner in corners_aligned]
    center_original = np.mean(corners_original, axis=0)

    extents_original = [
        np.max([corner[0] for corner in corners_original]) - np.min([corner[0] for corner in corners_original]),
        np.max([corner[1] for corner in corners_original]) - np.min([corner[1] for corner in corners_original]),
        np.max([corner[2] for corner in corners_original]) - np.min([corner[2] for corner in corners_original]),
    ]

    print("Original center:", center_original)
    print("Original extents:", extents_original)
    
    for description in data['objects'][0]['description']:
        start_time = time.time()  
        print(description)
        (center, box_size), prediction = agent.act_no_gpt(
            description,
            scene_name,
            new_session,
        )
        print(result)
        ground_truth = construct_bbox_corners(center_original, extents_original)
        # prediction = construct_bbox_corners(center, box_size)
        result.append(box3d_iou(ground_truth, prediction))
end_time = time.time()
elapsed_time = end_time - start_time  # calculate the difference in times

[32;20m[2023-09-08 22:17:02,470] INFO chat_with_nerf [create_for_scene] [session.py:42] - Creating a new session 34347b75-2a4d-468a-8c30-7876a26b27a4 with scene scene0025_00.[0m
description:  ['there is a monitor sitting on the left side of a desk. the desk is smaller and curved of the two desks sitting back to back next to the window.  the monitor is more of a light gray, where the second one nearest the window is yellowed a little bit.', 'it is a white and black computer monitor shaped like a rectangle with a smaller, rectangular base. the monitor is the the right of another, taller computer monitor and in front of a desk chair.', 'this is an white and black monitor. it is behind an all black keyboard on a tan desk. it is close to an off white and black monitor of similar size.', 'the monitor is located on top of the desk, and to the left of the other monitor facing the chair. there is a keyboard in front of the monitor.', 'walking into the room, a large office desk is in the middl

In [11]:
result

[0.07715253060991699,
 0.06190477211365443,
 0.0658956756903804,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0011340780541556875,
 0.0017314738580489345,
 0.001834060986008438,
 0.0014874540073949523,
 0.0,
 0.14368092315866415,
 0.19781611904908455,
 0.19638264977184774,
 0.08245600284739081,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0016405880780711731,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.002106738615959218,
 0.0029873790673026817,
 0.0033034793445247384,
 0.0005849371639541467,
 0.0006278455359256732,
 0.0,
 0.0,
 0.0,
 0.001877026030373136,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.19332138055656387,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.014489121196049524,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0018652183194793931,
 0.0]

In [13]:
result 

[0.04944342369588022,
 0.056973283888001075,
 0.05768480830186307,
 0.0,
 0.0,
 0.0,
 0.0030454395251981936,
 0.0019519564936158517,
 0.02329463374044079,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0003115251673616036,
 0.0004044683936187268,
 0.0004306300046859287,
 0.0001859317509691232,
 8.896876531308002e-05,
 0.03951686470139289,
 0.05130668228826685,
 0.05462527402089905,
 0.023585381267804153,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0011456880595730468,
 0.001523405425097438,
 0.0016253047281832975,
 0.001182418681381745,
 0.0005874751965703678,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.000378185635006035,
 0.0004910169460017195,
 0.0005227766448344827,
 0.000225717613455036,
 0.00010800639091407525,
 0.0,
 0.0,
 0.002403762344836196,
 0.021847499072022762,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.1333808047726815,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0014292928362991083,
 0.0018557209435269121,
 0.001975751705731725,
 0.0008530640460012869,
 0.

### Visulization

In [19]:
def create_bbox(center, extents, color=[1, 0, 0]):
    print(center)
    print(extents)
    sx, sy, sz = extents
    x_corners = [sx / 2, sx / 2, -sx / 2, -sx / 2, sx / 2, sx / 2, -sx / 2, -sx / 2]
    y_corners = [sy / 2, -sy / 2, -sy / 2, sy / 2, sy / 2, -sy / 2, -sy / 2, sy / 2]
    z_corners = [sz / 2, sz / 2, sz / 2, sz / 2, -sz / 2, -sz / 2, -sz / 2, -sz / 2]
    corners_3d = np.vstack([x_corners, y_corners, z_corners])
    corners_3d[0, :] = corners_3d[0, :] + center[0]
    corners_3d[1, :] = corners_3d[1, :] + center[1]
    corners_3d[2, :] = corners_3d[2, :] + center[2]
    corners_3d = np.transpose(corners_3d)

    lines = [
        [0, 1], [1, 2], [2, 3], [3, 0],
        [4, 5], [5, 6], [6, 7], [7, 4],
        [0, 4], [1, 5], [2, 6], [3, 7]
    ]
    
    colors = [color for i in range(len(lines))]  # Red color for all lines
    line_set = o3d.geometry.LineSet()
    line_set.points = o3d.utility.Vector3dVector(corners_3d)
    line_set.lines = o3d.utility.Vector2iVector(lines)
    line_set.colors = o3d.utility.Vector3dVector(colors)
    
    return line_set


def visualize_mesh_with_bboxes(mesh_path, bbox_centers, bbox_extents):
    # Load mesh
    mesh = o3d.io.read_triangle_mesh(mesh_path)

    # Create a list to store all geometries (mesh + bboxes)
    geometries = [mesh]

    # Create bounding boxes and add them to the list
    for i, (center, extent) in enumerate(zip(bbox_centers, bbox_extents)):
        if i == 0:
            bbox = create_bbox(center, extent, [1, 0, 0])
            geometries.append(bbox)
        else:
            bbox = create_bbox(center, extent, [0, 1, 0])
            geometries.append(bbox)

    # Visualize
    o3d.visualization.draw_plotly(geometries)

In [39]:
(center, box_size), prediction = agent.act_no_gpt(
    "computer monitor",
    scene_name,
    new_session,
)

[38;20m[2023-09-08 22:30:57,011] DEBUG chat_with_nerf [call_visual_grounder_no_gpt] [visual_grounder.py:53] - Set Positive Words in Visual Grounder[0m
[38;20m[2023-09-08 22:30:57,012] DEBUG chat_with_nerf [call_visual_grounder_no_gpt] [visual_grounder.py:54] - positive words: computer monitor[0m


In [40]:
center_aligned = np.append(np.array(data['objects'][0]['bbox'][:3]), 1)
extents_aligned = np.array(data['objects'][0]['bbox'][3:])

half_extents = extents_aligned/2
corners_aligned = [
    center_aligned[:3] + [-half_extents[0], -half_extents[1], -half_extents[2]],
    center_aligned[:3] + [half_extents[0], -half_extents[1], -half_extents[2]],
    center_aligned[:3] + [-half_extents[0], half_extents[1], -half_extents[2]],
    center_aligned[:3] + [half_extents[0], half_extents[1], -half_extents[2]],
    center_aligned[:3] + [-half_extents[0], -half_extents[1], half_extents[2]],
    center_aligned[:3] + [half_extents[0], -half_extents[1], half_extents[2]],
    center_aligned[:3] + [-half_extents[0], half_extents[1], half_extents[2]],
    center_aligned[:3] + [half_extents[0], half_extents[1], half_extents[2]],
]

inverse_matrix = np.linalg.inv(axisAlignment_matrix)
corners_original = [np.dot(inverse_matrix, np.append(corner, 1.0))[:3] for corner in corners_aligned]
center_original = np.mean(corners_original, axis=0)

extents_original = [
    np.max([corner[0] for corner in corners_original]) - np.min([corner[0] for corner in corners_original]),
    np.max([corner[1] for corner in corners_original]) - np.min([corner[1] for corner in corners_original]),
    np.max([corner[2] for corner in corners_original]) - np.min([corner[2] for corner in corners_original]),
]

print("Original center:", center_original)
print("Original extents:", extents_original)

Original center: [3.74770881 1.54398608 1.02614688]
Original extents: [0.617998922035575, 0.6179989220355748, 0.5227721333503722]


In [41]:
box3d_iou(construct_bbox_corners(np.array(center_original), np.array(extents_original)), prediction)

0.034397550722141776

In [None]:
mesh_path = "/workspace/chat-with-nerf-eval/data/scannet/scans/scene0025_00/scene0025_00_vh_clean_2.ply"
bbox_centers = []
bbox_extents = []
bbox_centers.append(np.array(center_original))
bbox_extents.append(np.array(extents_original))
bbox_centers.append(center)
bbox_extents.append(np.array(box_size))
visualize_mesh_with_bboxes(mesh_path, bbox_centers, bbox_extents)

In [None]:
for scene_id in scenes:
    # swap to a new scene
    llava_to_preserve = agent.model_context.captioner
    agent.new_model_context = ModelContextManager.intialize_with_give_captioner(
        llava_to_preserve
    )

    queries = get_text_queries(scene_id)
    for query in queries:  # loop over each scene using joblib
        new_session = Session.create_for_scene(scene_id)
        while True:
            (
                chat_history_for_display,
                chat_counter,
                server_status_code,
                session_state,
                model_3d_grounding_result,
            ) = agent.act(
                system_msg="Hello, I am a chatbot",
                inputs="new text from user simulator",
                top_p=0.9,
                temperature=0.1,
                dropdown_scene=scene_id,
                session=new_session,
            )  # act() only returns if the control is given back to the user
            
   in executor.wait()  # wait for all queries for this scene to finish