In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell execution, so edits to project code are picked up live.
%load_ext autoreload
%autoreload 2
# Step the working directory up one level (presumably to the repository root,
# since later cells resolve "submodules/" and "demo_data/" relative to the cwd
# -- TODO confirm).
# NOTE(review): `%cd ..` is relative, so re-running this cell climbs one more
# directory each time; run it exactly once per kernel session.
%cd ..

In [2]:
import os
import sys

# Expose only GPU 0 to this process. Set here, before any CUDA-using library
# is imported, so the restriction is in place when CUDA is first initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Project root: the working directory at the time this cell is executed.
BASE_DIR = os.getcwd()

# Make the vendored submodules importable. `sys` is used directly instead of
# the undocumented `os.sys` alias, and each path is appended only once so that
# re-running this cell does not pile up duplicate sys.path entries.
for _submodule_name in ("UniDepth", "DepthCrafter"):
    _submodule_path = os.path.abspath(os.path.join(BASE_DIR, "submodules", _submodule_name))
    if _submodule_path not in sys.path:
        sys.path.append(_submodule_path)

In [None]:
import torch
import torch.nn.functional as F
import numpy as np
from einops import rearrange
from torch.utils.data import Dataset, DataLoader
import io
import cv2

import numpy as np
from PIL import Image

import mediapy as media
import IPython

from tqdm import tqdm
import os
# print()

from unidepth.models import UniDepthV1, UniDepthV2
from unidepth.utils import colorize, image_grid

from diffusers.training_utils import set_seed

# DepthCrafter is an optional dependency: record in `use_depthcrafter` whether
# it could be imported so that later cells can skip the DepthCrafter steps.
try:
    from depthcrafter.depth_crafter_ppl import DepthCrafterPipeline
    from depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter
    from depthcrafter.utils import vis_sequence_depth, save_video, read_video_frames, read_folder_frames, read_video
    print("DepthCrafter is available")
    use_depthcrafter = True
except Exception as import_error:
    # Catch Exception instead of using a bare `except:` so KeyboardInterrupt
    # and SystemExit still propagate, and print the cause so that a broken
    # installation can be told apart from a missing one.
    print("DepthCrafter is not available")
    print(f"Reason: {import_error!r}")
    use_depthcrafter = False

In [None]:
# Device used for UniDepth inference in this notebook.
device = torch.device("cuda")

# Load the pretrained UniDepthV2 checkpoint by its hub id, call eval() on it
# and move it to `device` in a single chained expression.
unidepth_model = (
    UniDepthV2.from_pretrained("lpiccinelli/unidepth-v2-vitl14")
    .eval()
    .to(device)
)

In [None]:
if use_depthcrafter:
    # The UNet comes from the DepthCrafter checkpoint, loaded in half precision.
    unet = DiffusersUNetSpatioTemporalConditionModelDepthCrafter.from_pretrained(
        "tencent/DepthCrafter", low_cpu_mem_usage=True, torch_dtype=torch.float16
    )

    # Every other pipeline component is taken from the Stable Video Diffusion
    # checkpoint (fp16 variant); the UNet loaded above is passed in explicitly.
    depth_crafter_pipe = DepthCrafterPipeline.from_pretrained(
        "stabilityai/stable-video-diffusion-img2vid-xt",
        unet=unet,
        torch_dtype=torch.float16,
        variant="fp16",
    )
    depth_crafter_pipe.to("cuda")

    # xformers attention is best-effort: if enabling it fails, report the
    # error and carry on. Attention slicing is enabled unconditionally.
    try:
        depth_crafter_pipe.enable_xformers_memory_efficient_attention()
    except Exception as xformers_error:
        print(xformers_error)
        print("Xformers is not enabled")
    depth_crafter_pipe.enable_attention_slicing()

In [9]:
@torch.inference_mode()
def predict_unidepth(video, model, chunk_size=32, target_device=None):
    """Run a UniDepth-style model over a video in chunks and collect the depth.

    Args:
        video: 4-D NumPy array of frames. It is permuted with (0, 3, 1, 2), so
            it is assumed to be laid out as (frames, height, width, channels)
            -- TODO confirm against callers.
        model: Object exposing ``infer(batch)`` that returns a mapping with a
            ``"depth"`` tensor; that tensor is squeezed along dim 1.
        chunk_size: Maximum number of frames handed to ``model.infer`` at once.
            Defaults to 32, the value that was previously hard-coded.
        target_device: Device each chunk is moved to before inference. When
            ``None`` (default) the notebook-level ``device`` is used.

    Returns:
        NumPy array holding the per-chunk depth predictions concatenated
        along axis 0.
    """
    if target_device is None:
        target_device = device  # notebook-level default set when the model is loaded

    # Keep the full video on the CPU and upload one chunk at a time, so device
    # memory scales with chunk_size rather than with the video length.
    video_torch = torch.from_numpy(video).permute(0, 3, 1, 2)

    depth_chunks = []
    for chunk in torch.split(video_torch, chunk_size, dim=0):
        predictions = model.infer(chunk.to(target_device))
        depth_chunks.append(predictions["depth"].squeeze(1).cpu().numpy())

    return np.concatenate(depth_chunks, axis=0)

@torch.inference_mode()
def predict_depthcrafter(
    video,
    pipe,
    max_res=1024,
    guidance_scale=1.2,
    num_inference_steps=25,
    window_size=110,
    overlap=25,
):
    """Run the DepthCrafter pipeline on a video and return a normalised map per frame.

    Args:
        video: Input forwarded unchanged to ``read_video``.
            NOTE(review): the accepted type (path vs. frame array) is defined
            by ``depthcrafter.utils.read_video`` -- confirm there.
        pipe: Callable pipeline; its result must expose ``.frames[0]``.
        max_res: Forwarded to ``read_video`` as ``max_res``.
        guidance_scale: Forwarded to ``pipe``.
        num_inference_steps: Forwarded to ``pipe``.
        window_size: Forwarded to ``pipe``.
        overlap: Forwarded to ``pipe``.

    All keyword defaults equal the values that were previously hard-coded.

    Returns:
        NumPy array of per-frame maps, min-max normalised to [0, 1] over the
        whole video and resized to ``(ori_h, ori_w)`` as reported by
        ``read_video``. If the prediction is constant, an all-zero array is
        returned instead of NaNs.
    """
    frames, ori_h, ori_w = read_video(video, max_res=max_res)

    # frames.shape[1] / frames.shape[2] are passed as height / width, so frames
    # is presumably laid out as (frames, height, width, channels) -- TODO confirm.
    res = pipe(
        frames,
        height=frames.shape[1],
        width=frames.shape[2],
        output_type="np",
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        window_size=window_size,
        overlap=overlap,
        track_time=False,
    ).frames[0]

    # convert the three-channel output to a single channel depth map
    res = res.sum(-1) / res.shape[-1]

    # normalize the depth map to [0, 1] across the whole video; a constant
    # prediction has zero range, which would otherwise produce 0/0 = NaN.
    res_min = res.min()
    res_range = res.max() - res_min
    if res_range > 0:
        res = (res - res_min) / res_range
    else:
        res = np.zeros_like(res)

    # Resize back to the original resolution; a channel axis is added for
    # F.interpolate and removed again afterwards.
    res = F.interpolate(torch.from_numpy(res[:, None]), (ori_h, ori_w), mode='nearest').squeeze(1).numpy()

    return res

In [None]:

vid_names = ["lady-running"]

# Root folder holding one sub-directory per demo sequence.
DEMO_ROOT = "demo_data"
# File extensions (lower-case) accepted as frames inside a `color/` folder.
FRAME_EXTENSIONS = ('.png', '.jpg', '.jpeg')


def _load_rgb_video(video_dir):
    """Load the RGB frames of one demo sequence.

    ``color.mp4`` inside ``video_dir`` is preferred; otherwise the images in
    the ``color/`` sub-folder are read in sorted filename order and converted
    from OpenCV's BGR channel order to RGB.

    Args:
        video_dir: Directory of the sequence.

    Returns:
        The result of ``media.read_video`` for the mp4 case, or the frames
        stacked with ``np.stack`` along a new leading axis for the folder case.

    Raises:
        ValueError: If neither source exists, the folder contains no image
            frames, or a frame cannot be decoded.
    """
    video_path = os.path.join(video_dir, "color.mp4")
    if os.path.exists(video_path):
        return media.read_video(video_path)

    rgb_folder = os.path.join(video_dir, "color")
    if not os.path.isdir(rgb_folder):
        raise ValueError(
            f"No video found: expected file {video_path!r} or folder {rgb_folder!r}"
        )

    frame_names = sorted(
        name for name in os.listdir(rgb_folder)
        if name.lower().endswith(FRAME_EXTENSIONS)
    )
    if not frame_names:
        # np.stack([]) would fail with an unhelpful message; fail early instead.
        raise ValueError(f"No image frames found in {rgb_folder!r}")

    frames = []
    for frame_name in frame_names:
        frame_path = os.path.join(rgb_folder, frame_name)
        img = cv2.imread(frame_path)
        if img is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise ValueError(f"Could not read frame {frame_path!r}")
        frames.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    return np.stack(frames)


for vid_name in vid_names:
    vid_dir = os.path.join(DEMO_ROOT, vid_name)
    video = _load_rgb_video(vid_dir)

    print("Run Unidepth")
    depth_pred = predict_unidepth(video, unidepth_model)
    np.save(os.path.join(vid_dir, "depth_pred.npy"), depth_pred)

    # DepthCrafter is optional; its output is saved next to the UniDepth result.
    if use_depthcrafter:
        print("Run DepthCrafter")
        disp_pred = predict_depthcrafter(video, depth_crafter_pipe)
        np.save(os.path.join(vid_dir, "depth_depthcrafter.npy"), disp_pred)
