### TODOs
- Adjust the object scoring function to:
    - score directly on the 3DObject class once it is created from Matcher
- Write a plotting function for ObjectInstance debugging

In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
import plotly.io as pio
import numpy as np

# Ensure Plotly is set up to work with Jupyter notebooks
pio.renderers.default = 'notebook'

# Temporary workaround (to be cleaned up later): put the project directory on
# sys.path so the local modules imported below can be resolved.
cwd = '/teamspace/studios/this_studio/letsdoit'
if cwd not in sys.path:
    sys.path.append(cwd)

from typing import List
from copy import copy
from transformers import CLIPProcessor, CLIPModel
from masks_finder import MasksFinder, unrotate_masks, unrotate_bboxes
from masks_matcher import MasksMatcher
from clip_retriever import ClipRetriever
from dataloader.dataloader import DataLoader
from object_scorer import ObjectScorer
from letsdoit.utils.object_instance import ObjectInstance, initialize_object_instances, plot_instances_3d, generate_masks_features
from letsdoit.utils.misc import select_ids
from letsdoit.scoring.primitive import SpatialPrimitive, SpatialPrimitivePair

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [2]:
# Dataset location and the asset type requested from the loader
path_dataset = '/teamspace/studios/this_studio/datasets'
ASSET_TYPE = 'wide'

# Data loader, retrieval/segmentation helpers and the CLIP model + processor
loader = DataLoader(path_dataset, split='dev')
retriever = ClipRetriever()
masks_finder = MasksFinder()
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Work on the first video of the first visit exposed by the loader
visit_ids = loader.visit_ids
visit_id = visit_ids[0]
video_ids = loader.get_video_ids(visit_id)
video_id = video_ids[0]

# The loader returns both the original and the upright-rotated images/depths
images, images_rotated, image_paths, intrinsics, poses, orientations = loader.get_images(
    visit_id, video_id, asset_type=ASSET_TYPE, sample_freq=1
)
depths, depths_rotated, depth_paths, _, _, _ = loader.get_depths(
    visit_id, video_id, asset_type=ASSET_TYPE, sample_freq=1
)

# The retriever receives the upright-rotated images as input
retriever.generate_image_features(images_rotated)


`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.


torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:3549.)



final text_encoder_type: bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


_IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight'])
<All keys matched successfully>


Loading rgb frames from visit 420683 and video 42445132: 100%|██████████| 159/159 [00:07<00:00, 21.89it/s]
Loading depth frames from visit 420683 and video 42445132: 100%|██████████| 159/159 [00:05<00:00, 30.06it/s]


In [3]:
def get_object_instances(object: str) -> List[ObjectInstance]:
    """Build the ObjectInstances found for an object query in the loaded video.

    The frames that best match ``object`` are retrieved, masks are detected on
    their upright-rotated versions, and the masks and bounding boxes are rotated
    back to the orientation of the original images before the instances are
    created.

    Relies on notebook-level state defined in earlier cells: ``retriever``,
    ``masks_finder``, ``clip_processor``, ``clip_model`` and the per-frame lists
    ``images``, ``images_rotated``, ``image_paths``, ``intrinsics``, ``poses``,
    ``orientations`` and ``depths``.

    Args:
        object: Text query passed to the retriever and to the mask finder.
            (The name shadows the ``object`` builtin; it is kept so existing
            keyword callers keep working.)

    Returns:
        Whatever ``initialize_object_instances`` builds from the collected
        per-mask data.
    """
    # Restrict every per-frame list to the frames that best match the query
    best_indices = retriever.retrieve_best_images_for_object(object)
    best_images = select_ids(images, best_indices)
    best_images_rotated = select_ids(images_rotated, best_indices)
    best_image_paths = select_ids(image_paths, best_indices)
    best_intrinsics = select_ids(intrinsics, best_indices)
    best_poses = select_ids(poses, best_indices)
    best_orientations = select_ids(orientations, best_indices)
    best_depths = select_ids(depths, best_indices)

    # Masks and bboxes returned here are expressed in the upright-rotated images.
    # NOTE(review): image_ids is assumed to hold, for each mask, the index of its
    # source frame within the best_* lists -- confirm against MasksFinder.
    image_ids, masks, bboxes, confidences, labels = masks_finder.get_masks_from_imgs(best_images_rotated, object)

    # Rotate masks and bboxes back to the rotation of the original image.
    # Both calls must use the per-mask orientations, not the notebook-global
    # `orientations` list, which covers every frame of the video.
    mask_image_sizes = [best_images_rotated[idx].shape[:-1] for idx in image_ids]
    mask_image_orientations = [best_orientations[idx] for idx in image_ids]
    masks_unrotated = unrotate_masks(masks=masks, orientations=mask_image_orientations)
    bboxes_unrotated = unrotate_bboxes(bboxes=bboxes, img_dims=mask_image_sizes, orientations=mask_image_orientations)

    # Image features are computed on the upright-rotated frames, hence the
    # still-rotated bboxes and masks are passed here.
    image_features = generate_masks_features(clip_processor, clip_model, select_ids(best_images_rotated, image_ids), bboxes, masks)

    # Every entry is selected with image_ids so that all lists stay aligned
    # with the masks (image names included).
    mask_image_paths = select_ids(best_image_paths, image_ids)
    dict_object_instances = {
        'images': select_ids(best_images, image_ids),
        'image_names': [Path(img_path).name.replace('.png', '') for img_path in mask_image_paths],
        'depths': select_ids(best_depths, image_ids),
        'bboxes': bboxes_unrotated,
        'masks': masks_unrotated,
        'labels': labels,
        'confidences': confidences,
        'intrinsics': select_ids(best_intrinsics, image_ids),
        'extrinsics': select_ids(best_poses, image_ids),
        'orientations': select_ids(best_orientations, image_ids),
        'image_features': image_features,
    }

    object_instances = initialize_object_instances(**dict_object_instances)

    return object_instances

In [4]:
# Collect the ObjectInstances found for the 'drawer' query.
# Other categories can be fetched the same way, e.g.:
#   beds = get_object_instances('bed')
#   nightstands = get_object_instances('nightstand')
drawers = get_object_instances('drawer')


The `device` argument is deprecated and will be removed in v5 of Transformers.


torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.


None of the inputs have requires_grad=True. Gradients will be None

100%|██████████| 10/10 [00:12<00:00,  1.22s/it]


In [5]:
# Pick three drawer instances and tag each with the role it plays below.
# NOTE: the variable names do not mirror the roles -- `t` carries the
# 'reference_2' label and `r2` carries the 'target' label.
t, r1, r2 = drawers[5], drawers[3], drawers[4]
t.label = 'reference_2'
r1.label = 'reference_1'
r2.label = 'target'

In [6]:
# The pair takes a target plus two references. Pass two distinct instances:
# r1 (labelled 'reference_1') and t (labelled 'reference_2'). Using `t` for both
# references, as before, left r1 unused and made the BETWEEN pair degenerate.
prim = SpatialPrimitive.BETWEEN
spp = SpatialPrimitivePair(target=r2, reference=r1, reference_2=t, primitive=prim)

In [7]:
# Evaluate the score of the spatial-primitive pair; as the last expression of
# the cell, the returned value is displayed as the cell output.
spp.get_score()

1

In [None]:
# Pass the three labelled instances to plot_instances_3d for visual inspection.
plot_instances_3d([t, r1, r2])