In [1]:
%matplotlib inline

import os
import sys
sys.path.append('../')
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

import glob
import random
import cv2
import numpy as np
import networkx as nx
import torch
import torch.nn.functional as F
from datetime import datetime

import roma
import kornia
import utils.colmap as colmap_utils


from tqdm import tqdm
from copy import deepcopy
from PIL import Image
from matplotlib import pyplot as plt

from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images_ratio, load_and_preprocess_images_square
from vggt.utils.pose_enc import pose_encoding_to_extri_intri
from vggt.utils.geometry import unproject_depth_map_to_point_map

from utils.umeyama import umeyama
from utils.metric_torch import evaluate_auc, evaluate_pcd

torch._dynamo.config.accumulated_cache_size_limit = 512
device = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 is supported on Ampere GPUs (Compute Capability 8.0+) 
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [2]:
def run_VGGT(images, device, dtype):
    """Run VGGT on a batch of images and return camera and depth predictions.

    Args:
        images: image tensor, laid out as [B, 3, H, W].
        device: device the model and the images are moved to.
        dtype: dtype the model weights and the images are cast to.

    Returns:
        Tuple ``(extrinsic, intrinsic, depth_map, depth_conf)`` of NumPy arrays.
        Each has its leading dimension squeezed when that dimension has size 1
        and is converted to float32.
    """

    def _to_numpy(tensor):
        # NumPy has no bfloat16, so .numpy() raises TypeError on bfloat16 tensors.
        # The model runs in `dtype`, so its outputs may be bfloat16; upcasting
        # is a no-op when they are already float32.
        return tensor.squeeze(0).float().cpu().numpy()

    # Run VGGT for camera and depth estimation
    model = VGGT()
    _URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
    # Deserialize onto CPU first so loading also works on hosts without CUDA;
    # the model is moved to `device` right after.
    state_dict = torch.hub.load_state_dict_from_url(_URL, map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()
    model = model.to(device).to(dtype)
    print("Model loaded")

    with torch.no_grad():
        predictions = model(images.to(device, dtype), verbose=True)
        # The pose encoding is decoded using the spatial size (last two dims) of the input.
        extrinsic, intrinsic = pose_encoding_to_extri_intri(predictions['pose_enc'], images.shape[-2:])
        extrinsic = _to_numpy(extrinsic)
        intrinsic = _to_numpy(intrinsic)
        depth_map = _to_numpy(predictions['depth'])
        depth_conf = _to_numpy(predictions['depth_conf'])

    return extrinsic, intrinsic, depth_map, depth_conf

In [5]:
# Compare ground-truth and predicted COLMAP intrinsics for every scene directory.
data_dir_gt = "../data/MipNeRF360"
data_dir_pred = "../data/MipNeRF360_vggt_opt_lr_5_avg_scale_500k_rand_order"

scenes = sorted(os.listdir(data_dir_gt))
for scene in scenes:
    # Only sub-directories of the ground-truth root are treated as scenes.
    if not os.path.isdir(os.path.join(data_dir_gt, scene)):
        continue

    print(f"Processing scene: {scene}")
    sparse_dir_gt = os.path.join(data_dir_gt, scene, "sparse", "0")
    images_dir = os.path.join(data_dir_gt, scene, "images")

    # Ground-truth reconstruction.
    cameras_gt = colmap_utils.read_cameras_binary(os.path.join(sparse_dir_gt, "cameras.bin"))
    images_gt = colmap_utils.read_images_binary(os.path.join(sparse_dir_gt, "images.bin"))
    pcd_gt = colmap_utils.read_points3D_binary(os.path.join(sparse_dir_gt, "points3D.bin"))

    # Predicted reconstruction, stored under the same scene name.
    sparse_dir_pred = os.path.join(data_dir_pred, scene, "sparse", "0")
    cameras_pred = colmap_utils.read_cameras_binary(os.path.join(sparse_dir_pred, "cameras.bin"))
    images_pred = colmap_utils.read_images_binary(os.path.join(sparse_dir_pred, "images.bin"))
    pcd_pred = colmap_utils.read_points3D_binary(os.path.join(sparse_dir_pred, "points3D.bin"))

    # print(f"GT's intrinsics: {cameras_gt[1].params}")
    # print(f"Pred's intrinsics: {cameras_pred[1].params}")
    # NOTE(review): assumes both reconstructions hold a camera with ID 1 and that
    # its params begin with (fx, fy, cx, cy) — confirm against the camera model.
    diff = cameras_gt[1].params - cameras_pred[1].params
    # Normalise the first two parameter differences by params[2:4]. The slice is
    # bounded so camera models that carry extra (e.g. distortion) parameters do
    # not break the element-wise division.
    print("Intrinsic Difference", np.linalg.norm(diff[:2] / cameras_gt[1].params[2:4]))

Processing scene: bicycle
Intrinsic Difference 0.018999718823743232
Processing scene: bonsai
Intrinsic Difference 0.02563519124160258
Processing scene: counter
Intrinsic Difference 0.027854708642436523
Processing scene: flowers
Intrinsic Difference 0.07321011271751116
Processing scene: garden
Intrinsic Difference 0.003909443165705178
Processing scene: kitchen
Intrinsic Difference 0.032984788461085386
Processing scene: room
Intrinsic Difference 0.05987657710889783
Processing scene: stump
Intrinsic Difference 0.03201081516783678
Processing scene: treehill
Intrinsic Difference 0.05678942490718071
