# VistaDream Initialization Sequence
The first part of VistaDream sets up a 3D scaffold that is used to better constrain the generative process.

In [1]:
import sys
from pathlib import Path

import rerun as rr
from numpy.random import default_rng

# Add parent directory to Python path so VistaDream modules can be found
# NOTE(review): assumes the kernel's working directory is the notebook's own directory,
# so that its parent is the directory containing the `vistadream` package — confirm.
notebook_dir = Path.cwd()
parent_dir = notebook_dir.parent
# Guard against inserting the same entry again when this cell is re-run.
if str(parent_dir) not in sys.path:
    sys.path.insert(0, str(parent_dir))


# Seeded NumPy generator; it is not referenced by any other cell visible in this notebook.
rng = default_rng(12345)
# Initialise the rerun recording that the logging cells below write into.
rr.init("VistaDream Initialization")

# Presumably enables beartype runtime checking of annotations in subsequent cells — confirm
# against the ipython_beartype documentation.
%load_ext ipython_beartype
%beartype
# Reload imported modules before each cell execution so edits to project code are picked up.
%load_ext autoreload
%autoreload 2

# First start with the raw input image
Let's set up the dependencies.

In [2]:
from pathlib import Path

import numpy as np
from jaxtyping import Bool, Float, UInt8
from monopriors.depth_utils import depth_edges_mask, depth_to_points
from monopriors.relative_depth_models import (
    RelativeDepthPrediction,
    get_relative_predictor,
)
from monopriors.relative_depth_models.base_relative_depth import BaseRelativePredictor
from PIL import Image
from simplecv.camera_parameters import Extrinsics, Intrinsics, PinholeParameters
from simplecv.rerun_log_utils import log_pinhole

from vistadream.ops.flux import FluxInpainting, FluxInpaintingConfig
from vistadream.ops.gs.basic import Frame, Gaussian_Scene
from vistadream.ops.gs.train import GS_Train_Tool
from vistadream.ops.utils import save_ply
from vistadream.resize_utils import add_border_and_mask, process_image



xFormers not available
xFormers not available


Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [3]:
# Root rerun entity path under which this stage logs its cameras.
parent_log_path: Path = Path("coarse_init")
# Sample image, resolved relative to the repository root (`parent_dir`, computed in the setup
# cell) rather than through a machine-specific absolute path, so the notebook runs on any checkout.
image_path: Path = parent_dir / "data" / "sd_readingroom" / "color.original.png"
# Relative depth model (MoGe v1) and the Flux inpainting model, both built once and reused.
predictor: BaseRelativePredictor = get_relative_predictor("MogeV1Predictor")(device="cuda")
flux_inpainter: FluxInpainting = FluxInpainting(FluxInpaintingConfig())
# Total border expansion as a fraction of the image (25%); the outpainting cell halves it per side.
expansion_percent: float = 0.25

Loading MoGe model...
MoGe model loaded. Time: 2.34s
Init model
Loading checkpoint

Init AE


In [4]:
# Everything logged from this point is stamped with step 0 of the "time" timeline
rr.set_time("time", sequence=0)

# Read the source image as RGB, then let process_image make sure it is correctly sized and processed
input_img: Image.Image = process_image(Image.open(image_path).convert("RGB"))

# Outpainting setup: the user-chosen total expansion is shared equally between opposite sides,
# so each side receives half of it.
border_percent: float = expansion_percent / 2.0
border_output: tuple[Image.Image, Image.Image] = add_border_and_mask(
    input_img,
    zoom_all=1.0,
    zoom_left=border_percent,
    zoom_right=border_percent,
    zoom_up=border_percent,
    zoom_down=border_percent,
    overlap=0,
)

# First element: the image with the added border; second element: the matching mask
outpaint_mask: Image.Image
bordered_img, outpaint_mask = border_output

# Hand the bordered image and its mask to the Flux inpainter to produce the outpainted image
outpaint_img: Image.Image = flux_inpainter(
    rgb_hw3=np.array(bordered_img),
    mask=np.array(outpaint_mask),
)

In [5]:
# Rerun entity paths for the outpainted view: camera at camera_1, its images under camera_1/pinhole
outpaint_cam_log_path: Path = parent_log_path / "camera_1"
outpaint_pinhole_path: Path = outpaint_cam_log_path / "pinhole"

outpaint_rgb_hw3: UInt8[np.ndarray, "H W 3"] = np.array(outpaint_img.convert("RGB"))
# Run the depth predictor once on the full outpainted image. No intrinsics are supplied
# (K_33=None); the K_33 carried by the prediction is read back below.
outpaint_rel_depth: RelativeDepthPrediction = predictor(rgb=outpaint_rgb_hw3, K_33=None)
outpaint_depth_hw: Float[np.ndarray, "H W"] = outpaint_rel_depth.depth
# mask showing where outpainting (inpainting) is applied
outpaint_mask: Bool[np.ndarray, "H W"] = np.array(outpaint_mask).astype(np.bool_)
# depth edges, True near edges, False otherwise
outpaint_edges_mask: Bool[np.ndarray, "H W"] = depth_edges_mask(outpaint_depth_hw, threshold=0.1)
# inpaint/outpaint mask without edges (True where inpainting is applied, False near edges and where no inpainting)
outpaint_wo_edges: Bool[np.ndarray, "H W"] = outpaint_mask & ~outpaint_edges_mask

# Pinhole intrinsics taken from the predicted K matrix, sized to the outpainted image
outpaint_intri: Intrinsics = Intrinsics(
    camera_conventions="RDF",
    fl_x=outpaint_rel_depth.K_33[0, 0].item(),
    fl_y=outpaint_rel_depth.K_33[1, 1].item(),
    cx=outpaint_rel_depth.K_33[0, 2].item(),
    cy=outpaint_rel_depth.K_33[1, 2].item(),
    width=outpaint_rgb_hw3.shape[1],
    height=outpaint_rgb_hw3.shape[0],
)
# Identity pose: the camera frame coincides with the world frame
outpaint_extri: Extrinsics = Extrinsics(
    world_R_cam=np.eye(3, dtype=np.float32),
    world_t_cam=np.zeros(3, dtype=np.float32),
)
outpaint_pinhole: PinholeParameters = PinholeParameters(
    name="outpaint camera",
    intrinsics=outpaint_intri,
    extrinsics=outpaint_extri,
)

outpaint_frame: Frame = Frame(
    H=outpaint_rgb_hw3.shape[0],
    W=outpaint_rgb_hw3.shape[1],
    rgb=outpaint_rgb_hw3.astype(np.float32) / 255.0,  # Convert to [0,1] range
    dpt=outpaint_depth_hw,
    intrinsic=outpaint_intri.k_matrix,
    # Frame takes cam_T_world, so pass that matrix rather than world_T_cam; the two only
    # coincide here because the pose is the identity.
    cam_T_world=outpaint_extri.cam_T_world,
    inpaint=outpaint_mask,
    inpaint_wo_edge=outpaint_wo_edges,
)

In [6]:
# Rerun entity paths for the original (input) view: camera at camera_0, images under camera_0/pinhole
input_cam_log_path: Path = parent_log_path / "camera_0"
input_pinhole_path: Path = input_cam_log_path / "pinhole"

input_rgb_hw3: UInt8[np.ndarray, "H W 3"] = np.array(input_img.convert("RGB"))
# get input depth from outpaint depth, where outpaint mask is False.
# This allows for getting the depth of the original image and only having to run the depth model once
input_area: Bool[np.ndarray, "H W"] = ~outpaint_mask
# Boolean indexing flattens the selection; the reshape only succeeds when the un-masked region
# holds exactly input_img.height * input_img.width pixels.
input_depth_hw: Float[np.ndarray, "H W"] = outpaint_depth_hw[input_area].reshape(input_img.height, input_img.width)
"""
### Why Input Frame `inpaint` is Set to All `True`

1.  **The Conceptual Issue**
    At first glance, this seems wrong because the input frame contains the original image data, and original pixels shouldn't need "inpainting". We'd expect `inpaint=False` for original content.

2.  **The Training Logic Explanation**
    The `inpaint` mask in training determines which pixels should be supervised during optimization.
    - When `inpaint=True`: "Supervise this pixel - ensure the rendered result matches the target."
    - When `inpaint=False`: "Don't supervise this pixel - ignore it during training."

3.  **Why All `True` Makes Sense for the Input Frame**
    For the input frame, setting `inpaint=True` everywhere means: "Train the Gaussians to perfectly reproduce the original image." All original pixels become supervision targets, forcing the 3D representation to learn to render the input view accurately.

4.  **The Two-Frame Training Strategy**
    - **Input Frame**: Learns to reproduce original content. `input_frame.inpaint` is all `True`, so ALL pixels are supervised.
    - **Outpaint Frame**: Learns to reproduce only new content. `outpaint_frame.inpaint` is `True` only for outpainted areas.

5.  **What This Achieves**
    - **Input Frame Training**: Every original pixel has `inpaint=True`, so the model learns to render them perfectly.
    - **Outpaint Frame Training**: Original pixels have `inpaint=False` (they are already learned, so we skip them), while new outpainted pixels have `inpaint=True` so the model learns to render the new content.

6.  **The Alternative Would Be Problematic**
    If we set `input_frame.inpaint = False` everywhere, there would be no supervision on the original content. The Gaussians wouldn't learn to render the input view, leading to poor reconstruction quality for the reference image. Training would only learn the outpainted content.
"""
input_mask: Bool[np.ndarray, "H W"] = np.full_like(input_depth_hw, True, dtype=np.bool_)
# Crop the depth-edge mask itself. `outpaint_wo_edges` is `outpaint_mask & ~edges`, which is
# False everywhere inside the input area, so cropping it would always give an empty edge mask.
input_edges_mask: Bool[np.ndarray, "H W"] = outpaint_edges_mask[input_area].reshape(input_img.height, input_img.width)
# Same construction as for the outpaint frame: supervised pixels minus those near depth edges
input_mask_wo_edges: Bool[np.ndarray, "H W"] = input_mask & ~input_edges_mask
input_k33: Float[np.ndarray, "3 3"] = outpaint_rel_depth.K_33.copy()
# focal stays the same, but principal point is adjusted to center of input image
input_k33[0, 2] = input_rgb_hw3.shape[1] / 2.0
input_k33[1, 2] = input_rgb_hw3.shape[0] / 2.0

input_intri: Intrinsics = Intrinsics(
    camera_conventions="RDF",
    fl_x=input_k33[0, 0].item(),
    fl_y=input_k33[1, 1].item(),
    cx=input_k33[0, 2].item(),
    cy=input_k33[1, 2].item(),
    width=input_rgb_hw3.shape[1],
    height=input_rgb_hw3.shape[0],
)
# Identity pose: the input camera shares the world frame with the outpaint camera
input_extri: Extrinsics = Extrinsics(
    world_R_cam=np.eye(3, dtype=np.float32),
    world_t_cam=np.zeros(3, dtype=np.float32),
)
input_pinhole: PinholeParameters = PinholeParameters(
    name="input camera",
    intrinsics=input_intri,
    extrinsics=input_extri,
)

input_frame: Frame = Frame(
    H=input_rgb_hw3.shape[0],
    W=input_rgb_hw3.shape[1],
    rgb=input_rgb_hw3.astype(np.float32) / 255.0,  # Convert to [0,1] range
    dpt=input_depth_hw,
    intrinsic=input_k33,
    # Frame takes cam_T_world, so pass that matrix rather than world_T_cam; the two only
    # coincide here because the pose is the identity.
    cam_T_world=input_extri.cam_T_world,
    inpaint=input_mask,
    inpaint_wo_edge=input_mask_wo_edges,
)


In [7]:
# Declare the world coordinate convention once for the whole recording
rr.log("/", rr.ViewCoordinates.RDF, static=True)
# Colour and depth images of the original (input) view
rr.log(f"{input_pinhole_path}/rgb", rr.Image(input_img, color_model=rr.ColorModel.RGB))
rr.log(f"{input_pinhole_path}/depth", rr.DepthImage(input_depth_hw))
# Boolean masks are logged as 8-bit luminance images (True -> 255, False -> 0)
for mask_name, mask_hw in (
    ("inpaint_mask", input_mask),
    ("edge_mask", input_edges_mask),
    ("inpaint_wo_edges_mask", input_mask_wo_edges),
):
    rr.log(
        f"{input_pinhole_path}/{mask_name}",
        rr.Image(mask_hw.astype(np.uint8) * 255, color_model=rr.ColorModel.L),
    )
# Log the input camera as well, mirroring what is done for the outpaint camera; `input_pinhole`
# was built for this view but never logged.
log_pinhole(camera=input_pinhole, cam_log_path=input_cam_log_path)


In [8]:
# Colour and depth images of the outpainted view
rr.log(f"{outpaint_pinhole_path}/rgb", rr.Image(outpaint_img, color_model=rr.ColorModel.RGB))
rr.log(f"{outpaint_pinhole_path}/depth", rr.DepthImage(outpaint_depth_hw))
# Each boolean mask goes out as an 8-bit luminance image (True -> 255, False -> 0)
for mask_name, mask_hw in (
    ("inpaint_mask", outpaint_mask),
    ("edge_mask", outpaint_edges_mask),
    ("inpaint_wo_edges_mask", outpaint_wo_edges),
):
    mask_u8 = mask_hw.astype(np.uint8) * 255
    rr.log(f"{outpaint_pinhole_path}/{mask_name}", rr.Image(mask_u8, color_model=rr.ColorModel.L))
# Finally log the camera parameters for this view
log_pinhole(camera=outpaint_pinhole, cam_log_path=outpaint_cam_log_path)

In [9]:
# Embed the rerun viewer for the current recording inline, at 1000x800.
rr.notebook_show(width=1000, height=800)

HTML(value='<div id="75cf7c1c-d617-4966-b161-7be1019a903c"><style onload="eval(atob(\'KGFzeW5jIGZ1bmN0aW9uICgp…

Viewer()

In [10]:
# Fresh Gaussian scene; frames are added to it in the next cell.
scene = Gaussian_Scene()

In [11]:
# Register the input frame with the scene as a trainable frame.
# NOTE(review): re-running this cell without re-creating `scene` presumably adds the frame a
# second time — confirm against Gaussian_Scene._add_trainable_frame.
scene._add_trainable_frame(input_frame, require_grad=True)
# The outpaint frame is currently left out of the scene:
# scene._add_trainable_frame(outpaint_frame, require_grad=True)

In [12]:
# Build the training tool for this scene with iters=100, run it on the scene's frames, and
# rebind `scene` to whatever the tool returns.
scene = GS_Train_Tool(scene, iters=100)(scene.frames)

100%|██████████| 100/100 [00:00<00:00, 108.97it/s]


In [13]:
from vistadream.ops.visual_check import Check

# Helper from vistadream.ops.visual_check, used below to render the preview videos.
checkor: Check = Check()

# Output locations, relative to the kernel's working directory.
save_dir: Path = Path("../data/new_test_dir/")
gf_path: Path = save_dir / "gf.ply"
# Create the output directory (and any missing parents) before anything is written into it.
save_dir.mkdir(exist_ok=True, parents=True)
# Render a 250-frame video of the scene into save_dir; the next cell reads video_dpt.mp4 and
# video_rgb.mp4 from this directory.
checkor._render_video(scene, save_dir=save_dir, nframes=250)
# Save the scene to gf.ply.
save_ply(scene, gf_path)

[INFO] rendering final video with 250 frames...


In [14]:
from IPython.display import Video, display

# Locations of the depth and colour videos inside save_dir
video_dpt_path = save_dir / "video_dpt.mp4"
video_rgb_path = save_dir / "video_rgb.mp4"

# Print a heading for each video, then show it embedded in the notebook
for heading, video_path in (
    ("Depth Video:", video_dpt_path),
    ("\nRGB Video:", video_rgb_path),
):
    print(heading)
    display(Video(video_path, embed=True))

Depth Video:



RGB Video:


In [15]:
from copy import deepcopy

import torch

from vistadream.ops.trajs import _generate_trajectory

# Initialise rerun again under a different application id for the rendered-splat visualisation,
# then embed a viewer at 1000x800.
rr.init("VistaDream Splat")
rr.notebook_show(width=1000, height=800)

HTML(value='<div id="ad93d080-364f-41f8-a171-2014f760ca2a"><style onload="eval(atob(\'KGFzeW5jIGZ1bmN0aW9uICgp…

Viewer()

In [16]:
# Declare the world coordinate convention for this recording
rr.log("/", rr.ViewCoordinates.RDF, static=True)
# Rendering only: no gradients are needed
with torch.no_grad():
    nframes = 100
    # One 4x4 cam_T_world pose per frame of the trajectory
    cam_T_world_traj: Float[np.ndarray, "n_frames 4 4"] = _generate_trajectory(None, scene, nframes=nframes)
    # Every rendered view reuses the image size and intrinsics of the scene's first frame
    H, W, intrinsic = scene.frames[0].H, scene.frames[0].W, deepcopy(scene.frames[0].intrinsic)
    intri: Intrinsics = Intrinsics(
        camera_conventions="RDF",
        fl_x=intrinsic[0, 0].item(),
        fl_y=intrinsic[1, 1].item(),
        cx=intrinsic[0, 2].item(),
        cy=intrinsic[1, 2].item(),
        width=W,
        height=H,
    )
    # Entity path of the moving camera; loop-invariant, so built once
    render_cam_log_path: Path = parent_log_path / "camera"
    # render
    print(f"[INFO] rendering final video with {nframes} frames...")
    for idx, cam_T_world in enumerate(cam_T_world_traj):
        # One step on the "time" timeline per rendered pose
        rr.set_time("time", sequence=idx)
        extri: Extrinsics = Extrinsics(
            cam_R_world=cam_T_world[:3, :3],
            cam_t_world=cam_T_world[:3, 3],
        )
        pinhole: PinholeParameters = PinholeParameters(
            name="camera",
            intrinsics=intri,
            extrinsics=extri,
        )
        frame: Frame = Frame(H=H, W=W, intrinsic=intrinsic, cam_T_world=extri.cam_T_world)
        # The third return value (alpha) is not used here
        rgb_render, dpt_render, _ = scene._render_RGBD(frame)
        rgb_float: Float[np.ndarray, "H W 3"] = rgb_render.numpy(force=True)
        dpt: Float[np.ndarray, "H W"] = dpt_render.numpy(force=True)

        # Clamp to [0, 1] before scaling: a value slightly outside the range would otherwise
        # wrap around when cast to uint8.
        rgb: UInt8[np.ndarray, "H W 3"] = (np.clip(rgb_float, 0.0, 1.0) * 255).astype(np.uint8)
        rr.log(f"{render_cam_log_path}/pinhole/rgb", rr.Image(rgb, color_model=rr.ColorModel.RGB).compress(jpeg_quality=75))
        log_pinhole(camera=pinhole, cam_log_path=render_cam_log_path)
        rr.log(f"{render_cam_log_path}/pinhole/depth", rr.DepthImage(dpt))


[INFO] rendering final video with 100 frames...
