### TODOs
- Adjust the object scoring function to:
    - score directly on the 3DObject class once it is created from Matcher
- Write a plotting function for ObjectInstance debugging

In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
import plotly.io as pio
import numpy as np

# Ensure Plotly is set up to work with Jupyter notebooks
pio.renderers.default = 'notebook'

# Hack the path for now, deal with this later
cwd = '/teamspace/studios/this_studio/letsdoit'
if (cwd not in sys.path):
    sys.path.append(cwd)

from typing import List
from copy import copy
from transformers import CLIPProcessor, CLIPModel
from letsdoit.pipeline.masks_finder import MasksFinder, unrotate_masks, unrotate_bboxes
from letsdoit.pipeline.masks_matcher import MasksMatcher
from letsdoit.pipeline.masks_merger import MasksMerger
from letsdoit.pipeline.clip_retriever import ClipRetriever
from dataloader.dataloader import DataLoader
from letsdoit.pipeline.object_instance import ObjectInstance, initialize_object_instances, plot_instances_3d, generate_masks_features, filter_instances
from letsdoit.pipeline.object_3d import filter_objects_3d, denoise_objects_3d, plot_objects_3d
from letsdoit.utils.misc import select_ids
from letsdoit.scoring.primitive import SpatialPrimitive, SpatialPrimitivePair, get_primitive
from letsdoit.scoring.spatial_graph import GraphNode, retrieve_best_action_object

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [5]:
path_dataset = '/teamspace/studios/this_studio/datasets'
ASSET_TYPE = 'wide'
loader = DataLoader(path_dataset, split='dev')
retriever = ClipRetriever()
masks_finder = MasksFinder()
masks_merger = MasksMerger()
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
visit_ids = loader.visit_ids
video_ids = loader.get_video_ids(visit_ids[0])

visit_id = visit_ids[0]
video_id = video_ids[0]
pcd = loader.parser.get_highres_reconstruction(visit_id, video_id)
# Get both original and upright-rotated images and depths as outputs
images, images_rotated, image_paths, intrinsics, poses, orientations = loader.get_images(visit_id, asset_type=ASSET_TYPE, sample_freq=1)
depths, depths_rotated, depth_paths, _, _, _ = loader.get_depths(visit_id, asset_type=ASSET_TYPE, sample_freq=1)
# Retriever receives upright-rotatead images as inputs
retriever.generate_image_features(images_rotated)

final text_encoder_type: bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>


Loading rgb frames from visit 420683 and video 42445132: 100%|██████████| 159/159 [00:07<00:00, 22.42it/s]
Loading depth frames from visit 420683 and video 42445132: 100%|██████████| 159/159 [00:05<00:00, 30.84it/s]


In [6]:
# For an object, get a list of corresponding ObjectInstances
def get_object_instances(object: str) -> List[ObjectInstance]:
    best_indices = retriever.retrieve_best_images_for_object(object)
    best_images = select_ids(images, best_indices)
    best_images_rotated = select_ids(images_rotated, best_indices)
    best_image_paths = select_ids(image_paths, best_indices)
    best_intrinsics = select_ids(intrinsics, best_indices)
    best_poses = select_ids(poses, best_indices)
    best_orientations = select_ids(orientations, best_indices)
    best_depths = select_ids(depths, best_indices)
    best_depth_paths = select_ids(depth_paths, best_indices)
    # Masks we get here as outputs are for the upright-rotated images
    image_ids, masks, bboxes, confidences, labels = masks_finder.get_masks_from_imgs(best_images_rotated, object)
    # Rotate masks and bboxes back to the rotation of the original image
    mask_image_sizes = [best_images_rotated[idx].shape[:-1] for idx in image_ids]
    mask_image_orientations = [best_orientations[idx] for idx in image_ids]
    masks_unrotated = unrotate_masks(masks=masks, orientations=orientations)
    bboxes_unrotated = unrotate_bboxes(bboxes=bboxes, img_dims=mask_image_sizes, orientations=mask_image_orientations)
    # Marco's addition?
    image_features = generate_masks_features(clip_processor, clip_model, select_ids(best_images_rotated, image_ids), bboxes, masks)
    dict_object_instances = {'images': select_ids(best_images, image_ids),
                         'image_names': [Path(img_path).name.replace('.png', '') for img_path in best_image_paths],
                         'depths': select_ids(best_depths, image_ids),
                         'bboxes': bboxes_unrotated,
                         'masks': masks_unrotated,
                         'labels': labels,
                         'confidences': confidences,
                         'intrinsics': select_ids(best_intrinsics, image_ids),
                         'extrinsics': select_ids(best_poses, image_ids),
                         'orientations': select_ids(best_orientations, image_ids),
                         'image_features': image_features,
    }

    object_instances = initialize_object_instances(**dict_object_instances)

    return object_instances

In [7]:
def get_objects_3d(object: str):
    object_instances = get_object_instances(object)
    objects_3d = masks_merger(object_instances, pcd)
    denoise_objects_3d(objects_3d)
    return objects_3d

In [29]:
doors = get_objects_3d('door')



The `device` argument is deprecated and will be removed in v5 of Transformers.


torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.


None of the inputs have requires_grad=True. Gradients will be None

Extracting the masks: 100%|██████████| 10/10 [00:11<00:00,  1.15s/it]
Merging the object instances: 100%|██████████| 10/10 [00:09<00:00,  1.04it/s]
Denoising point clouds: 100%|██████████| 6/6 [00:00<00:00,  9.05it/s]


In [16]:
handle_synonym = 'knob'

# Get handles
handles = get_objects_3d(handle_synonym)

# Get drawers
drawers = get_objects_3d('drawer')
# Get beds
beds = get_objects_3d('bed')
# Get nightstands
nightstands = get_objects_3d('nightstand')

# windows = get_object_instances('window')
# radiators = get_object_instances('radiator')

Extracting the masks:   0%|          | 0/10 [00:00<?, ?it/s]


The `device` argument is deprecated and will be removed in v5 of Transformers.


torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.


None of the inputs have requires_grad=True. Gradients will be None

Extracting the masks: 100%|██████████| 10/10 [00:11<00:00,  1.15s/it]
Merging the object instances: 100%|██████████| 10/10 [00:01<00:00,  5.31it/s]
Denoising point clouds: 100%|██████████| 10/10 [00:05<00:00,  1.90it/s]

The `device` argument is deprecated and will be removed in v5 of Transformers.


torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current b

In [30]:
objects = drawers + beds + nightstands + handles + doors
# objects = windows + radiators

In [22]:
import json
path = '/teamspace/studios/this_studio/sample_instructions.json'
with open(path, 'r') as file:
    instructions = json.load(file)

visit_id = 420683
desc_id = 'a2645cef-a1a7-4130-a26a-3863f9c9dff4'
inst_visit = [inst for inst in instructions if inst['visit_id']==visit_id][0]
instruction = [i for i in inst_visit['instructions'] if i['desc_id']==desc_id][0]

In [23]:
instruction['action_object'] = handle_synonym
instruction['spatial_primitives'][0]['target_object'] = handle_synonym
instruction

{'instruction': 'Open the nightstand drawer on the right side of the bed',
 'desc_id': 'a2645cef-a1a7-4130-a26a-3863f9c9dff4',
 'action_object': 'knob',
 'spatial_primitives': [{'primitive': 'contains',
   'target_object': 'knob',
   'reference_object': 'drawer'},
  {'primitive': 'contains',
   'target_object': 'drawer',
   'reference_object': 'nightstand'},
  {'primitive': 'to the right',
   'target_object': 'nightstand',
   'reference_object': 'bed'}]}

In [31]:
plot_objects_3d(doors+handles)

In [24]:
root = GraphNode(primitives=instruction['spatial_primitives'],
                 all_objects=objects,
                 root=True
)
action_instances = filter_objects_3d(instruction['action_object'], objects)
root.expand(action_instances)

In [25]:
node = root
best_objects = []
while len(node.children) != 0:
    best_child = np.argmax([child.best_score for child in node.children])
    best_objects.append(node.children[best_child].object)
    node = node.children[best_child]

In [42]:
import itertools
# Score = action_object_confidence + sum(total spatial primitive score)
# For each primitive, create all combinations of SpatialPrimitivePair
primitives_all = []
for primitive in instruction['spatial_primitives']:
    primitive_pairs = []
    prim = get_primitive(primitive['primitive'])
    target_list = filter_objects_3d(primitive['target_object'], objects)
    reference_list = filter_objects_3d(primitive['reference_object'], objects)
    if primitive == SpatialPrimitive.BETWEEN:
        reference_2_list = filter_objects_3d(primitive['reference_object_2'])
        triplets_iter = itertools.product(target_list, reference_list, reference_2_list)
        for t, r1, r2 in triplets_iter:
            primitive_pairs.append(SpatialPrimitivePair(t, r1, prim, reference_2=r2))
    else:
        pairs_iter = itertools.product(target_list, reference_list)
        for t, r in pairs_iter:
            primitive_pairs.append(SpatialPrimitivePair(t, r, prim))
    primitive['combos'] = primitive_pairs
    primitives_all.append(primitive_pairs)
# From those, create all possible combinations of SpatialPrimitivePair series
# From those, create all possible combinations of action_object and SpatialPrimitivePair series


In [43]:
a = [1, 2, 3]
b = ['cat', 'dog', 'fart']
c = [True, False, False]
d = []

for el in d:
    print(el)

print('finished')
b.pop('cat')

finished


TypeError: 'str' object cannot be interpreted as an integer

In [None]:
del b[1]