In this notebook, we will build a 3D map of a scene from a small set of images and refine it with the featuremetric optimization. We then localize an image downloaded from the Internet and show the effect of the refinement.

# Setup
We start by importing the required packages and defining the output paths where the intermediate files will be stored.

In [None]:
%load_ext autoreload
%autoreload 2
import tqdm, tqdm.notebook
tqdm.tqdm = tqdm.notebook.tqdm  # notebook-friendly progress bars
import os
import time
import sys
import numpy as np
from pathlib import Path
from hloc import extract_features, match_features, reconstruction, pairs_from_exhaustive, visualization
from hloc.visualization import plot_images, read_image
from hloc.utils.viz_3d import init_figure, plot_points, plot_reconstruction, plot_camera_colmap
from pixsfm.util.visualize import init_image, plot_points2D
from pixsfm.refine_hloc import PixSfM
from pixsfm import ostream_redirect
from PIL import Image, ImageDraw
import pycolmap
from pathlib import Path
#import visualize_model
# Redirect the C++ stdout/stderr outputs to the notebook cells.
# The redirect context is entered by hand (no `with` block) and is not exited
# in this cell -- presumably so that it stays active for the cells that follow.
cpp_out = ostream_redirect(stderr=True, stdout=True)
cpp_out.__enter__()

In [None]:
# Sanity check of the PyTorch installation: print the installed version and
# the list returned by torch.cuda.get_arch_list().
import torch 
print(torch.__version__)
print(torch.cuda.get_arch_list())

In [None]:
# Input image folder and the root folder of everything this notebook writes.
images = Path('datasets/monarch/')
outputs = Path('outputs/monarch-demo/')

# Pair lists.
sfm_pairs = outputs / 'pairs-sfm.txt'
loc_pairs = outputs / 'pairs-loc.txt'

# Feature and match files.
features = outputs / 'features.h5'
matches = outputs / 'matches.h5'

# Model folders.
raw_dir = outputs / "raw"
ref_dir = outputs / "ref"
# Model location in case of intrinsics locked.
ref_dir_locked = outputs / "ref_locked"
# Model location in case of intrinsics not locked.
ref_dir_not_locked = outputs / "ref_dir_not_locked"

In [None]:
def rotate_axis(p: np.ndarray):
    """Rotate the 3D point ``p`` onto the positive x-axis.

    The rotation that maps the direction of ``p`` onto the unit x vector is
    built with Rodrigues' formula and applied to ``p``, so the result keeps
    the norm of ``p``.

    Args:
        p: The three coordinates of the point.

    Returns:
        The rotated point ``R @ p`` as a float array. A zero-length ``p`` has
        no direction and is returned unchanged (as a float copy).
    """
    p = np.asarray(p, dtype=float)
    norm = np.linalg.norm(p)
    if norm == 0.0:
        # Nothing to align; also avoids dividing by zero below.
        return p.copy()

    p_unit = p / norm
    x_unit = np.array([1.0, 0.0, 0.0])

    # v = rotation axis scaled by sin(angle); c = cos(angle).
    v = np.cross(p_unit, x_unit)
    c = np.dot(p_unit, x_unit)

    if 1.0 + c < 1e-12:
        # p points along -x: the rotation axis is undefined (v == 0), so use
        # a half turn about the z-axis, which maps -x onto +x.
        R = np.diag([-1.0, -1.0, 1.0])
    else:
        # Skew-symmetric cross-product matrix of v.
        Vx = np.array([
            [0.0, -v[2], v[1]],
            [v[2], 0.0, -v[0]],
            [-v[1], v[0], 0.0],
        ])
        # (1 - c) / s**2 equals 1 / (1 + c); this form has no 0/0 when p is
        # already on the +x axis (s == 0, c == 1), where R becomes identity.
        R = np.eye(3) + Vx + np.dot(Vx, Vx) / (1.0 + c)

    return np.dot(R, p)

### Helper functions for relative pose calculations

In [None]:
'''
e_lw => left camera pose in world frame (4 * 4)
e_rw => right camera pose in world frame (4 * 4)
'''
from scipy.spatial.transform import Rotation
    

def transformation_matrix_to_arr(T : np.ndarray):
    """Flatten a homogeneous transform into a 7-element pose list.

    Args:
        T: Matrix whose upper-left 3x3 block is a rotation and whose first
           three entries of column 3 are the translation.

    Returns:
        ``[dx, dy, dz, q0, q1, q2, q3]`` where the ``q`` values are the
        quaternion components in the order produced by
        ``Rotation.as_quat()``.
    """
    translation = T[:3, 3]
    quaternion = Rotation.from_matrix(T[:3, :3]).as_quat()
    return [*translation, *quaternion]

def relative_pose_transformation_matrix(e_lw: np.ndarray, e_rw: np.ndarray):
    """Return the relative transform ``e_rw @ inv(e_lw)`` between two poses.

    Args:
        e_lw: 4x4 pose matrix of the left camera (must be invertible).
        e_rw: 4x4 pose matrix of the right camera.

    Returns:
        The 4x4 matrix ``e_rl = e_rw @ inv(e_lw)``.

    Raises:
        numpy.linalg.LinAlgError: If ``e_lw`` is singular.
    """
    # Invert once and reuse the result in the product.
    e_wl = np.linalg.inv(e_lw)
    return np.dot(e_rw, e_wl)

def calculate_relative_pose(e_lw: np.ndarray, e_rw: np.ndarray):
    """Compute the relative pose ``e_rw @ inv(e_lw)`` as a 7-element list.

    Args:
        e_lw: 4x4 pose matrix of the left camera (must be invertible).
        e_rw: 4x4 pose matrix of the right camera.

    Returns:
        ``[dx, dy, dz, q0, q1, q2, q3]``: the translation of the relative
        transform followed by the quaternion of its rotation block, in the
        component order produced by ``Rotation.as_quat()``.

    Raises:
        numpy.linalg.LinAlgError: If ``e_lw`` is singular.
    """
    # Relative transform; the inverse is computed exactly once.
    e_rl = np.dot(e_rw, np.linalg.inv(e_lw))
    translation = e_rl[:3, 3]
    dquat = Rotation.from_matrix(e_rl[:3, :3]).as_quat()
    return [*translation, *dquat]
    

def cam_extrinsics(img):
    """Build the 4x4 homogeneous matrix ``[[R, t], [0, 0, 0, 1]]`` of an image.

    Args:
        img: Object exposing ``qvec`` and ``tvec`` -- presumably an image
             record returned by ``read_write_model.read_images_binary``.

    Returns:
        A 4x4 array: the rotation from ``qvec2rotmat(img.qvec)`` and the
        translation ``img.tvec`` stacked on top of the row ``[0, 0, 0, 1]``.
    """
    # Imported at call time, so the module only has to be importable once the
    # function is actually used.
    from read_write_model import qvec2rotmat

    rotation = qvec2rotmat(img.qvec)
    translation = img.tvec.reshape(3, -1)  # column vector
    top_rows = np.hstack((rotation, translation))
    bottom_row = np.array([0, 0, 0, 1])
    return np.vstack((top_rows, bottom_row))

def calculate_relative_pose_between(img_dict: dict, left_idx: int, right_idx: int):
    """Relative pose between two images of ``img_dict`` selected by key.

    Args:
        img_dict: Mapping from image key to an image object accepted by
                  ``cam_extrinsics``.
        left_idx: Key of the left image.
        right_idx: Key of the right image.

    Returns:
        The list returned by ``calculate_relative_pose`` for the extrinsics
        of the left and right image.

    Raises:
        KeyError: If either key is missing from ``img_dict``.
    """
    # Look both images up first so a missing key fails before any computation.
    left_img, right_img = img_dict[left_idx], img_dict[right_idx]
    left_extrinsics = cam_extrinsics(left_img)
    right_extrinsics = cam_extrinsics(right_img)
    return calculate_relative_pose(left_extrinsics, right_extrinsics)


    

In [None]:
import numpy as np

def align_baseline_axis(T: np.ndarray):
    """Rotate a 4x4 transform so that its translation lies on the +x axis.

    A rotation about y removes the z component of the translation, then a
    rotation about z removes the y component. Both rotations are applied on
    the left of ``T``; ``T`` itself is not modified.

    Args:
        T: 4x4 homogeneous transformation matrix.

    Returns:
        A new 4x4 matrix ``Rz @ (Ry @ T)`` whose translation column is set to
        exactly ``[|t|, 0, 0]``.
    """
    tx, ty, tz = T[:3, 3]
    baseline = np.linalg.norm(T[:3, 3])

    # Angles that bring the translation onto the x-axis.
    angle_y = np.arctan2(tz, tx)
    angle_z = -np.arctan2(ty, np.sqrt(tx**2 + tz**2))

    cy, sy = np.cos(angle_y), np.sin(angle_y)
    cz, sz = np.cos(angle_z), np.sin(angle_z)

    rot_y = np.array([
        [cy, 0, sy, 0],
        [0, 1, 0, 0],
        [-sy, 0, cy, 0],
        [0, 0, 0, 1],
    ])
    rot_z = np.array([
        [cz, -sz, 0, 0],
        [sz, cz, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
    ])

    aligned = rot_z @ (rot_y @ T)

    # Overwrite the rotated translation with its exact value to drop the
    # floating-point residue left in the y and z components.
    aligned[:3, 3] = [baseline, 0, 0]
    return aligned

### Camera positions WITHOUT Rig Bundle Adjustment

In [None]:
from pathlib import Path

# Binary model files inside the `ref_dir_locked` folder.
sparse_dir = ref_dir_locked
print(f"sparse_dir: {sparse_dir.as_posix()}")
sparse_images, sparse_points3D, sparse_cameras = (
    sparse_dir / filename
    for filename in ("images.bin", "points3D.bin", "cameras.bin")
)

In [None]:
# `read_write_model` is imported from COLMAP's scripts/python folder, which
# must be on sys.path before the first import of the module; this is the first
# cell that imports it.
import os
import sys

colmap_scripts_dir = os.path.expandvars('$HOME/colmap/scripts/python')
if colmap_scripts_dir not in sys.path:
    sys.path.append(colmap_scripts_dir)

from read_write_model import read_cameras_binary

# Load and display the cameras of the model without rig bundle adjustment.
cameras = read_cameras_binary(sparse_cameras)
print(cameras)

In [None]:
# Make COLMAP's python helper scripts importable, then load the images of the
# model into a dict (indexed by integer keys in the cells below).
import sys
sys.path.append(os.path.expandvars('$HOME/colmap/scripts/python'))
#sys.path.append("/home/skumar/colmap/scripts/python")
from read_write_model import read_images_binary 
sparse_img_dict = read_images_binary(sparse_images)
# Show how many images were loaded and the range of their keys; the pairing
# loop below relies on these keys.
print(f"{len(sparse_img_dict.keys())} ==> {sparse_img_dict.keys()}")
print(f"min_key: {min(sparse_img_dict.keys())} mx_key: {max(sparse_img_dict.keys())}")

In [None]:
# For every (left, right) image pair, compute the relative pose twice: once
# as-is (`rel_poses`) and once after rotating the relative transform so that
# its translation lies on the x-axis (`rel_poses_aligned`).
import numpy as np
rel_poses = []
rel_poses_aligned = []
num_images = len(sparse_img_dict.keys())
# NOTE(review): the loop assumes keys 1..num_images//2 are the left images and
# that the matching right image has key `idx + 42`. The hard-coded 42 only
# agrees with the loop bound when the model holds exactly 84 images -- confirm
# against the key range printed above.
for idx in range(1, num_images // 2 + 1):
    left_img = sparse_img_dict[idx]
    right_img = sparse_img_dict[idx + 42]
    #print(f"left_img_name: {left_img.name} right_img_name: {right_img.name}")
    e_lw = cam_extrinsics(left_img)  #left camera pose w.r.t. world
    e_rw = cam_extrinsics(right_img) #right camera pose w.r.t world
    #unaligned relative pose calculation
    e_rl = calculate_relative_pose(e_lw, e_rw)
    rel_poses.append(e_rl)
    #aligned relative pose calculation
    e_rl_transformation_matrix = relative_pose_transformation_matrix(e_lw, e_rw)
    e_rl_aligned_transformation_matrix = align_baseline_axis(e_rl_transformation_matrix)
    #print(type(e_rl_aligned_transformation_matrix))
    e_rl_aligned_transformation_arr = transformation_matrix_to_arr(e_rl_aligned_transformation_matrix)
    rel_poses_aligned.append(e_rl_aligned_transformation_arr)

In [None]:
# Quick inspection of the container type and of the first relative pose.
print(f"type(rel_poses): {type(rel_poses)}")
print(f"rel_poses[0]: {rel_poses[0]} type(rel_poses[0]): {type(rel_poses[0])}")

In [None]:
# Show the relative poses as a table, one row per stereo pair.
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.notebook_repr_html', True)
# Each pose is [dx, dy, dz] followed by the four quaternion components from
# Rotation.as_quat(); the qx/qy/qz/qw labels assume SciPy's scalar-last order.
df = pd.DataFrame(rel_poses, columns=['dx', 'dy', 'dz', 'qx' , 'qy', 'qz' , 'qw'])
df.style

### Camera poses with Rig Bundle Adjustment

In [None]:
# Binary model files of the rig bundle adjustment output folder.
rig_ba_sparse_dir = Path(os.path.expandvars('$HOME/rig_stereo_colmap_cli_output'))
rig_ba_sparse_images, rig_ba_sparse_points3D, rig_ba_sparse_cameras = (
    rig_ba_sparse_dir / filename
    for filename in ("images.bin", "points3D.bin", "cameras.bin")
)

In [None]:
# Make COLMAP's python helper scripts importable, then load the images of the
# rig bundle adjustment model into a dict.
import sys
#sys.path.append("/home/skumar/colmap/scripts/python")
sys.path.append(os.path.expandvars('$HOME/colmap/scripts/python'))
from read_write_model import read_images_binary 
rig_ba_sparse_img_dict = read_images_binary(rig_ba_sparse_images)
# Show how many images were loaded and their keys.
print(f"{len(rig_ba_sparse_img_dict.keys())} => {rig_ba_sparse_img_dict.keys()}")

In [None]:
import numpy as np

# Relative pose of every (left, right) pair of the rig bundle adjustment
# model: once as-is (`rel_poses`) and once with the relative transform rotated
# so that its translation lies on the x-axis (`rel_poses_aligned`).
rel_poses = []
rel_poses_aligned = []
# The loop bound must come from the dict that is indexed below, i.e. the
# rig BA model, not from the model without rig bundle adjustment.
num_images = len(rig_ba_sparse_img_dict)

# NOTE(review): keys 1..num_images//2 are assumed to be the left images and
# `idx + 42` the matching right image; the hard-coded 42 only agrees with the
# loop bound for a model of exactly 84 images -- confirm.
for idx in range(1, num_images // 2 + 1):
    left_img = rig_ba_sparse_img_dict[idx]
    right_img = rig_ba_sparse_img_dict[idx + 42]
    e_lw = cam_extrinsics(left_img)  # left camera pose w.r.t. world
    e_rw = cam_extrinsics(right_img)  # right camera pose w.r.t. world

    # Unaligned relative pose.
    e_rl = calculate_relative_pose(e_lw, e_rw)
    rel_poses.append(e_rl)

    # Aligned relative pose.
    e_rl_transformation_matrix = relative_pose_transformation_matrix(e_lw, e_rw)
    e_rl_aligned_transformation_matrix = align_baseline_axis(e_rl_transformation_matrix)
    e_rl_aligned_transformation_arr = transformation_matrix_to_arr(e_rl_aligned_transformation_matrix)
    rel_poses_aligned.append(e_rl_aligned_transformation_arr)


In [None]:

# Relative pose of every (left, right) pair of the rig bundle adjustment
# model, collected in `rig_ba_rel_poses`.
import numpy as np
rig_ba_rel_poses = []
num_images = len(rig_ba_sparse_img_dict.keys())
# NOTE(review): keys 1..num_images//2 are assumed to be the left images and
# `idx + 42` the matching right image; the hard-coded 42 only agrees with the
# loop bound for a model of exactly 84 images -- confirm.
for idx in range(1, num_images // 2 + 1):
    left_img = rig_ba_sparse_img_dict[idx]
    right_img = rig_ba_sparse_img_dict[idx + 42]
    # Print the names of the first few pairs to check the pairing by eye.
    if idx < 5:
        print(f"left_img_name: {left_img.name} right_img_name: {right_img.name}")
    e_lw = cam_extrinsics(left_img)  #left camera pose w.r.t. world
    e_rw = cam_extrinsics(right_img) #right camera pose w.r.t world
    rel_pose = calculate_relative_pose(e_lw, e_rw)
    rig_ba_rel_poses.append(rel_pose)

In [None]:
# Show the rig bundle adjustment relative poses as a table, one row per pair.
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.notebook_repr_html', True)
# Tabulate `rig_ba_rel_poses` (built in the previous cell) rather than
# `rel_poses`, which holds the other model's poses unless it has been
# recomputed for the rig BA model.
# Each pose is [dx, dy, dz] followed by the four quaternion components from
# Rotation.as_quat(); the qx/qy/qz/qw labels assume SciPy's scalar-last order.
df = pd.DataFrame(rig_ba_rel_poses, columns=['dx', 'dy', 'dz', 'qx' , 'qy', 'qz' , 'qw'])
df.style

In [None]:
# Load and display the cameras of the rig bundle adjustment model.
# NOTE: this rebinds `cameras`, which an earlier cell set to the cameras of
# the model without rig bundle adjustment.
from read_write_model import read_cameras_binary
cameras = read_cameras_binary(rig_ba_sparse_cameras)
print(cameras)