### setting up 

In [1]:
import sys, os
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "submodules", "vggt")))
# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Append the path to 'submodules/vggt'
sys.path.append(os.path.join(notebook_dir, "submodules", "vggt"))

In [2]:
import torch
from submodules.vggt.vggt.models.vggt import VGGT
from submodules.vggt.vggt.utils.load_fn import load_and_preprocess_images

### usage 

In [6]:
# ---> load model 
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16

# Initialize model
model = VGGT()
# Load the state dict
checkpoint = torch.load("submodules/vggt/checkpoints/model.pt", map_location=device)
model.load_state_dict(checkpoint)
model = model.to(device=device)

In [11]:
from submodules.vggt.vggt.utils.pose_enc import pose_encoding_to_extri_intri
from submodules.vggt.vggt.utils.geometry import unproject_depth_map_to_point_map

images_names = os.listdir("data/leonora/ship_of_cranes/7_views")
images_paths = [os.path.join("data/leonora/ship_of_cranes/7_views", name) for name in images_names]
images = load_and_preprocess_images(images_paths).to(device)

with torch.no_grad():
    # with torch.cuda.amp.autocast(dtype=dtype):
    images = images[None]  # add batch dimension
    aggregated_tokens_list, ps_idx = model.aggregator(images)
                
    # Predict Cameras
    pose_enc = model.camera_head(aggregated_tokens_list)[-1]
    # Extrinsic and intrinsic matrices, following OpenCV convention (camera from world)
    extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, images.shape[-2:])

    # Predict Depth Maps
    depth_map, depth_conf = model.depth_head(aggregated_tokens_list, images, ps_idx)

    # Predict Point Maps
    point_map, point_conf = model.point_head(aggregated_tokens_list, images, ps_idx)
        
    # Construct 3D Points from Depth Maps and Cameras
    # which usually leads to more accurate 3D points than point map branch
    point_map_by_unprojection = unproject_depth_map_to_point_map(depth_map.squeeze(0), 
                                                                extrinsic.squeeze(0), 
                                                                intrinsic.squeeze(0))

    

In [19]:
point_map[0][0].shape

torch.Size([392, 518, 3])

In [28]:
from PIL import Image
from torchvision import transforms

img = Image.open("data/leonora/ship_of_cranes/7_views/IMG_1.jpg")

transform = transforms.Compose([
    transforms.Resize((392, 518)), 
    transforms.ToTensor(),
])
img_resized = transform(img)
img_resized.shape

torch.Size([3, 392, 518])

In [31]:
import viser
from PIL import Image
from torchvision import transforms

idx_img = 0
server = viser.ViserServer()
points = point_map[0][idx_img].view(-1, 3).cpu().numpy()

img = Image.open("data/leonora/ship_of_cranes/7_views/IMG_1.jpg")
transform = transforms.Compose([
    transforms.Resize((392, 518)), 
    transforms.ToTensor(),
])
colors = transform(img).view(-1, 3).numpy()

# Add point cloud to the scene
server.add_point_cloud(
    name="my_pointcloud",
    points=points,
    colors=colors,
)

print("Server running. Open your browser and go to http://<your-server-ip>:8080")

# Keep the server alive (usually at the end)
import time
while True:
    time.sleep(1)


  server.add_point_cloud(


Server running. Open your browser and go to http://<your-server-ip>:8080


KeyboardInterrupt: 