GPU memory requirement: around 15 GB at full precision.

**Installing Dependencies**

First, clone this repository to your local machine, and install the dependencies (torch, torchvision, numpy, Pillow, and huggingface_hub).

In [None]:
!git clone git@github.com:facebookresearch/vggt.git 
cd vggt
!pip install -r requirements.txt
!pip install -r requirements_demo.txt

To run inference through the Gradio demo, simply run:

In [None]:
# Runs the demo script as a shell subprocess; the cell stays busy until that process exits.
# The script path is relative, so the kernel's working directory must be the cloned
# `vggt` folder. `python` is whichever interpreter the shell finds first on PATH.
!python demo_gradio.py

Load Model

In [None]:
import torch
from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# dtype used by the CUDA autocast region in the inference cell.
# bfloat16 is supported on Ampere GPUs (Compute Capability 8.0+); older GPUs use float16.
# The capability is only queried when CUDA is available, because
# torch.cuda.get_device_capability() raises on a machine without a usable GPU,
# which would make the CPU fallback above unreachable.
if device == "cuda" and torch.cuda.get_device_capability()[0] >= 8:
    dtype = torch.bfloat16
else:
    dtype = torch.float16

# Initialize the model and load the pretrained weights.
# This will automatically download the model weights the first time it's run, which may take a while.
# .eval() puts the module in evaluation mode, since this notebook only runs inference;
# it returns the module itself, so the assignment still binds the model.
model = VGGT.from_pretrained("facebook/VGGT-1B").to(device).eval()

# (OR)
#model = VGGT()
#_URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
#model.load_state_dict(torch.hub.load_state_dict_from_url(_URL))

In [None]:
# Input frames for one scene. These are placeholders: point them at your own images.
image_names = [
    "path/to/imageA.png",
    "path/to/imageB.png",
    "path/to/imageC.png",
]

# Load and preprocess the files, then move the result to the selected device.
images = load_and_preprocess_images(image_names).to(device)

# Note: the shape comments in the cells below assume the three images listed here.

You can also choose which attributes (branches) to predict, as shown below. This achieves the same result as the example above. This example uses a batch size of 1 (processing a single scene), but it works just as well for multiple scenes.

In [None]:
from vggt.utils.pose_enc import pose_encoding_to_extri_intri
from vggt.utils.geometry import unproject_depth_map_to_point_map

# Runs the aggregator once and then each prediction branch (cameras, depth,
# point maps, tracks) separately on a single scene.
# Shape comments assume three input images; the spatial size (294, 518) is an
# example value and presumably depends on the input images - confirm for your data.

with torch.no_grad():
    # Add the batch dimension only when it is missing, so that re-running this
    # cell does not keep prepending dimensions to `images`.
    # Assumes the loader returns a 4-D (num_images, 3, H, W) tensor - confirm.
    if images.dim() == 4:
        images = images[None]

    # torch.autocast("cuda", ...) is the non-deprecated spelling of torch.cuda.amp.autocast(...).
    with torch.autocast(device_type="cuda", dtype=dtype):
        aggregated_tokens_list, ps_idx = model.aggregator(images)

    # Predict Cameras
    pose_enc = model.camera_head(aggregated_tokens_list)[-1]
    # pose_enc.shape --> (1, number of images, 9) --> (1, 3, 9)

    # Extrinsic and intrinsic matrices, following OpenCV convention (camera from world)
    extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, images.shape[-2:])

    # extrinsic.shape --> (1, 3, 3, 4)
    # intrinsic.shape --> (1, 3, 3, 3)

    # Predict Depth Maps
    depth_map, depth_conf = model.depth_head(aggregated_tokens_list, images, ps_idx)

    # depth_map.shape --> (1, 3, 294, 518, 1)
    # depth_conf.shape --> (1, 3, 294, 518)

    # Predict Point Maps
    point_map, point_conf = model.point_head(aggregated_tokens_list, images, ps_idx)

    # point_map.shape --> (1, 3, 294, 518, 3)
    # point_conf.shape --> (1, 3, 294, 518)

    # Construct 3D Points from Depth Maps and Cameras
    # which usually leads to more accurate 3D points than point map branch.
    # squeeze(0) drops the batch dimension before the call.
    point_map_by_unprojection = unproject_depth_map_to_point_map(
        depth_map.squeeze(0),
        extrinsic.squeeze(0),
        intrinsic.squeeze(0),
    )

    # Predict Tracks
    # choose your own points to track, with shape (N, 2) for one scene
    query_points = torch.tensor(
        [[100.0, 200.0], [60.72, 259.94]],
        dtype=torch.float32,
        device=device,
    )
    # query_points[None] adds the batch dimension expected alongside the batched images.
    track_list, vis_score, conf_score = model.track_head(
        aggregated_tokens_list, images, ps_idx, query_points=query_points[None]
    )

    # len(track_list) = 4
    # vis_score.shape --> (1, 3, 2)
    # conf_score.shape --> (1, 3, 2)
        
        

Tracklets Visualization

In [None]:
from vggt.utils.visual_track import visualize_tracks_on_images

# Use the last entry of the list returned by the track head
# (presumably the most refined estimate - confirm against the track head).
track = track_list[-1]

# Mask handed to the visualizer: True where both the confidence score and the
# visibility score exceed 0.2.
keep_mask = (conf_score > 0.2) & (vis_score > 0.2)
visualize_tracks_on_images(images, track, keep_mask, out_dir="track_visuals")