In [1]:
from pathlib import Path
import numpy as np
import torch
import matplotlib.pyplot as plt
from natsort import natsorted

# Reload imported modules automatically before each cell executes, so edits to
# source python files are picked up without restarting the kernel.
# Mode 2 = reload all modules (except those excluded via %aimport).
%load_ext autoreload
%autoreload 2

# Make the repository root the working directory: if the kernel was started
# inside `notebooks/`, step up one level so the relative paths used below resolve.
import os
started_in_notebooks_dir = Path.cwd().name == 'notebooks'
if started_in_notebooks_dir:
    os.chdir('..')
# Echo the effective working directory as a sanity check.
print(Path.cwd())

/mnt/C45ADD865ADD7620/i3d-rrc/ic-topo-nav/mickst3r


In [2]:
from src.datasets.utils import read_intrinsics

# Validation split root (relative to the working directory) and the scene ids of interest.
dataset_root = Path("./data/mapfree/val")

VAL_SCENES = ["s00460", "s00474", "s00482", "s00489", "s00495"]
scene_paths = list(map(dataset_root.joinpath, VAL_SCENES))

# Note: this reports the length of the hard-coded list; folders are not checked on disk.
print(f"Found {len(scene_paths)} scenes")

# Work on the first scene of the list.
scene_path = scene_paths[0]
print(f"Selected scene: {scene_path}")

# Per-scene intrinsics, resized to the model input size.
# K is indexed later with keys of the form 'seq0/<frame>.jpg'.
intrinsics_path = scene_path.joinpath("intrinsics.txt")
K = read_intrinsics(intrinsics_path, resize=(384, 512))

# Reference image: fixed first frame of seq0.
reference_image_folder = scene_path.joinpath("seq0")
image_r_path = reference_image_folder.joinpath("frame_00000.jpg")

# Query image: picked by its position in the naturally-sorted seq1 frame list.
query_image_folder = scene_path.joinpath("seq1")
query_image_paths = natsorted(query_image_folder.glob("*.jpg"))

QUERY_NAT_IDX = 1
image_q_path = query_image_paths[QUERY_NAT_IDX]

Found 5 scenes
Selected scene: data/mapfree/val/s00460


In [3]:
from mast3r_src.model import AsymmetricMASt3R

from dust3r_src.dust3r.inference import inference
from dust3r_src.dust3r.utils.image import load_images

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

# Load the pretrained checkpoint, then move the model to the chosen device.
model_name = "./checkpoints/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth"
model = AsymmetricMASt3R.from_pretrained(model_name)
model = model.to(device)

# Load the (reference, query) pair and run the model twice:
# once as (reference, query) and once with the pair order reversed.
images = load_images([str(image_r_path), str(image_q_path)], size=512)
output_1 = inference([tuple(images)], model, device, batch_size=1, verbose=False)
output_2 = inference([tuple(reversed(images))], model, device, batch_size=1, verbose=False)

# Schema of the two MASt3R outputs (author's notes):
#   view1, view2 -> dicts with keys 'img', 'true_shape', 'idx', 'instance'
#     view["img"]        -> torch.Tensor (1, 3, H_resize, W_resize), standardized RGB image
#     view["true_shape"] -> torch.Tensor (1, 2), holding (H_resize, W_resize)
#     view["idx"] / view["instance"] -> list of length 1 with the int / str index of the image
#   pred1, pred2 -> dicts with keys 'pts3d', 'conf', 'desc', 'desc_conf'
#     pred["pts3d"] -> torch.Tensor (1, H_resize, W_resize, 3), pointmap
#     pred["conf"] / pred["desc_conf"] -> torch.Tensor (1, H_resize, W_resize),
#                      confidence maps for pts3d and desc respectively
#     pred["desc"]  -> torch.Tensor (1, H_resize, W_resize, 24), descriptor map
#
# view2/pred2 are deliberately the *first* view/prediction of the reversed pass,
# i.e. the entries belonging to the query image when it is fed first.
# NOTE(review): presumably so each image's pointmap is expressed in its own
# camera frame - confirm against the MASt3R output convention.
view1 = output_1['view1']
pred1 = output_1['pred1']
view2 = output_2['view1']
pred2 = output_2['pred1']

# Pointmap confidence of each view, shape (1, H_resize, W_resize).
conf1 = pred1['conf']
conf2 = pred2['conf']
print(f"\nConfidence shape: {conf1.shape}, {conf2.shape}")

# Depth of each view = z-coordinate (last-axis index 2) of its pointmap.
depth1 = pred1['pts3d'][..., 2]
depth2 = pred2['pts3d'][..., 2]
print(f"Depths shape: {depth1.shape}, {depth2.shape}")

# Dense descriptor maps of both views.
desc1 = pred1['desc']
desc2 = pred2['desc']
print(f"Descriptors shape: {desc1.shape}, {desc2.shape}")

# Descriptor confidence maps of both views (kept for later use; not printed).
desc_conf1 = pred1['desc_conf']
desc_conf2 = pred2['desc_conf']

Using device: cuda
... loading model from ./checkpoints/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth
instantiating : AsymmetricMASt3R(enc_depth=24, dec_depth=12, enc_embed_dim=1024, dec_embed_dim=768, enc_num_heads=16, dec_num_heads=12, pos_embed='RoPE100',img_size=(512, 512), head_type='catmlp+dpt', output_mode='pts3d+desc24', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), patch_embed_cls='PatchEmbedDust3R', two_confs=True, desc_conf_mode=('exp', 0, inf), landscape_only=False)
<All keys matched successfully>
>> Loading a list of 2 images
 - adding data/mapfree/val/s00460/seq0/frame_00000.jpg with resolution 540x720 --> 384x512
 - adding data/mapfree/val/s00460/seq1/frame_00001.jpg with resolution 540x720 --> 384x512
 (Found 2 images)


In [5]:
# Estimate the relative pose between the reference and query views from the two
# MASt3R predictions, using the project's probabilistic Procrustes solver.

# Load the default config and overlay the settings from config/prob_pose.yaml.
# NOTE(review): the merge runs before the src.* imports below; keep this order
# in case those modules read cfg at import time - TODO confirm.
from config.default import cfg
cfg.merge_from_file('config/prob_pose.yaml')
from src.diff_downsample_maps import downsample_maps_w_kpts

from src.diff_probabilistic_procrustes import e2eProbabilisticProcrustesSolver
from src.diff_compute_correspondences import ComputeCorrespondences

# Downsample both predictions to target_size, using the pointmap confidence ('conf').
# NOTE(review): (51, 38) is roughly 1/10 of the 512/384 model resolution; the
# axis order expected by downsample_maps_w_kpts is not visible here - TODO confirm.
data_batch = downsample_maps_w_kpts(pred1, pred2, target_size=(51, 38), conf_type='conf', device=device)

# Attach per-image intrinsics: K is keyed by 'seqX/<frame>.jpg' and holds numpy
# arrays; unsqueeze(0) adds a leading batch dimension before moving to the device.
data_batch['K_color0'] = torch.from_numpy(K[f'seq0/{image_r_path.stem}.jpg']).unsqueeze(0).to(device)
data_batch['K_color1'] = torch.from_numpy(K[f'seq1/{image_q_path.stem}.jpg']).unsqueeze(0).to(device)

compute_Correspondences = ComputeCorrespondences(cfg, device)
e2e_Procrustes = e2eProbabilisticProcrustesSolver(cfg)

# prepare_data returns the batch that the solver consumes; the fourth return
# value of estimate_pose is not used here.
data_batch = compute_Correspondences.prepare_data(data_batch)
R_batch, t_batch, inliers_batch, _ = e2e_Procrustes.estimate_pose(
    data_batch, return_inliers=True
)

# Drop the batch dimension / flatten and convert to numpy for the evaluation cell.
# Note: R_batch and t_batch are rebound from torch tensors to numpy arrays here.
R_batch = R_batch.squeeze(0).detach().cpu().numpy()
t_batch = t_batch.reshape(-1).detach().cpu().numpy()

# NOTE(review): the saved output shows a non-integer "Inliers" value, so
# inliers_batch.sum() looks like a sum of soft scores rather than a count - confirm.
print(f"Estimated Rotation:\n{R_batch}\n\nEstimated Translation:\n{t_batch}\n\nInliers: {inliers_batch.sum()}")

Estimated Rotation:
[[ 0.904065   -0.07803822  0.42021033]
 [ 0.15057954  0.9783055  -0.14227915]
 [-0.3999911   0.19190446  0.8962031 ]]

Estimated Translation:
[-0.85832775 -0.12916309  0.5198691 ]

Inliers: 522.5718994140625


In [6]:
from src.tf_utils import compose_qt_tf, calculate_rot_error, calculate_translation_error

# read GT pose
pose_path = scene_path / "poses.txt"
# Each data row is `image_name qw qx qy qz tx ty tz`; everything is loaded as
# strings because the first column is the image name.
# skiprows=2 drops the first two lines of the file.
# NOTE(review): presumably the header line and the seq0 reference-frame row - confirm.
# ndmin=2 keeps the array 2-D even if only a single row remains.
pose = np.loadtxt(pose_path, skiprows=2, dtype=str, ndmin=2)

# Select the GT row by image name rather than by position: positional indexing
# silently picks the wrong pose if the row order in poses.txt ever differs from the
# natsorted seq1 listing (e.g. a frame without a pose entry). The key uses the same
# 'seq1/<frame>.jpg' format that indexes the intrinsics dict K.
gt_key = f"seq1/{image_q_path.stem}.jpg"
gt_rows = np.flatnonzero(pose[:, 0] == gt_key)
if gt_rows.size != 1:
    raise ValueError(
        f"Expected exactly one pose entry for {gt_key} in {pose_path}, found {gt_rows.size}"
    )
gt_row = pose[gt_rows[0]]
gt_name = gt_row[0]
gt_qvec = np.array(gt_row[1:5], dtype=float)  # quaternion columns (qw qx qy qz)
gt_tvec = np.array(gt_row[5:], dtype=float)   # translation columns (tx ty tz)

gt_R, gt_t = compose_qt_tf(gt_qvec, gt_tvec, return_Rt=True)

# Error of the estimated pose with respect to the ground truth.
R_error = calculate_rot_error(gt_R, R_batch)
t_error = calculate_translation_error(gt_t, t_batch)
print(f"R_error: {round(R_error, 3)} degrees | t_error: {round(t_error, 3)} meters")

# Baseline: error of the "no motion" guess (identity rotation, zero translation),
# i.e. the magnitude of the actual GT motion between reference and query.
R_delta = calculate_rot_error(gt_R, np.eye(3))
t_delta = calculate_translation_error(gt_t, np.zeros(3))
print(f"actual R_delta: {round(R_delta, 3)} degrees | t_delta: {round(t_delta, 3)} meters")

R_error: 3.367 degrees | t_error: 0.144 meters
actual R_delta: 28.902 degrees | t_delta: 0.992 meters
