In [None]:
# Import data (stored in google drive)
# Mount Google Drive under /content/drive (Colab-only API); later cells read data from that mount.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# Select Checkpoint
!pip install ipywidgets==7.7.1 --quiet

from IPython.display import display
import ipywidgets as widgets
import shutil

ckpt_dic = {
    "Original (higher quality)": "prs-eth/marigold-v1-0",
    "LCM (faster)": "prs-eth/marigold-lcm-v1-0",
}

ckpt_name = 'LCM (faster)'
ckpt_path = ckpt_dic[ckpt_name]
w = widgets.Dropdown(
    options=['Original (higher quality)', 'LCM (faster)'],
    value=ckpt_name,
    description='Checkpoint:',
)


def on_change(change):
    """Widget callback: store the newly selected checkpoint in the notebook globals.

    Args:
        change: Event dict passed by the widget; only events with
            ``type == 'change'`` and ``name == 'value'`` are acted on, and
            ``change['new']`` is used as the key into ``ckpt_dic``.
    """
    # Without this declaration the assignments below create function locals and
    # the selection is silently discarded.
    global ckpt_name, ckpt_path
    if change['type'] == 'change' and change['name'] == 'value':
        ckpt_name = change['new']
        ckpt_path = ckpt_dic[ckpt_name]

# Register the callback on the dropdown, then render the widget in the cell output.
w.observe(on_change)

display(w)

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.6 MB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.6/1.6 MB[0m [31m22.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25h

Dropdown(description='Checkpoint:', index=1, options=('Original (higher quality)', 'LCM (faster)'), value='LCM…

In [None]:
%%shell
# Clone Marigold repository -- notify Erich about pulling Marigold model
# (The %%shell cell magic has to be the first line of the cell, so this comment lives below it.)
cd /content

# Update an existing checkout in place, otherwise clone a fresh copy.
if [ -d "Marigold" ]; then
    cd Marigold
    git pull
else
    git clone https://github.com/prs-eth/Marigold.git
    cd Marigold
fi


Cloning into 'Marigold'...
remote: Enumerating objects: 472, done.[K
remote: Counting objects: 100% (321/321), done.[K
remote: Compressing objects: 100% (182/182), done.[K
remote: Total 472 (delta 204), reused 224 (delta 133), pack-reused 151[K
Receiving objects: 100% (472/472), 5.66 MiB | 27.98 MiB/s, done.
Resolving deltas: 100% (267/267), done.




In [None]:
%%shell
# Install dependencies from Marigold
# (The %%shell cell magic has to be the first line of the cell, so this comment lives below it.)

cd /content/Marigold

# pip install -r requirements.txt --upgrade  --quiet
pip install accelerate diffusers matplotlib scipy torch transformers --quiet

# for progress bar
pip install ipywidgets==7.7.1 --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25h



In [None]:
# Setup directories
import os

# Directories
repo_dir = "/content/Marigold"
input_dir = os.path.join("/content/drive/MyDrive/sample", "Image")
output_dir = os.path.join(repo_dir, "outputs")
# One sub-folder of output_dir per output format.
output_dir_color, output_dir_tif, output_dir_npy = (
    os.path.join(output_dir, subdir)
    for subdir in ("depth_colored", "depth_bw", "depth_npy")
)

# Work from inside the repository checkout from here on.
os.chdir(repo_dir)

In [None]:
import os
import numpy as np
from glob import glob
from collections import deque
from PIL import Image
import torch
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import logging
from scipy.ndimage import zoom
from sklearn.metrics import mean_squared_error

In [None]:
def calculate_rmse(predictions, ground_truth):
    """
    Calculate the root-mean-square error over all elements of two depth maps.

    If the shapes differ, the prediction is first resized to the ground-truth
    shape with ``resize_depth_map``.

    Args:
        predictions: Array of predicted values.
        ground_truth: Array of reference values.

    Returns:
        float: ``sqrt(mean((predictions - ground_truth) ** 2))`` taken over every element.
    """
    if predictions.shape != ground_truth.shape:
        predictions = resize_depth_map(predictions, ground_truth.shape)

    # Computed directly with numpy: sklearn's mean_squared_error(..., squared=False)
    # treats a 2-D input as multi-output and averages per-column RMSEs, which is not
    # the global RMSE, and its `squared` argument is gone from recent releases.
    diff = np.asarray(predictions, dtype=np.float64) - np.asarray(ground_truth, dtype=np.float64)
    return float(np.sqrt(np.mean(diff ** 2)))

In [None]:
def resize_depth_map(source, target_shape):
    """
    Resample ``source`` so that its first two dimensions match ``target_shape``.

    Uses ``scipy.ndimage.zoom`` with linear interpolation (``order=1``), so output
    values are interpolated rather than copied from the input.
    """
    # Per-axis scale factor = desired size / current size.
    row_factor = target_shape[0] / source.shape[0]
    col_factor = target_shape[1] / source.shape[1]
    return zoom(source, (row_factor, col_factor), order=1)

In [None]:
def average_depth_maps(buffer):
    """Return the element-wise mean of the arrays in ``buffer``, or None if it is empty."""
    if len(buffer) == 0:
        return None
    stacked = np.array(buffer)
    return np.mean(stacked, axis=0)

In [None]:
def weighted_average_depth_maps(depth_maps, weights):
    """Return the weighted mean of ``depth_maps``, one weight per map.

    Raises:
        ValueError: If the number of maps and the number of weights differ.
    """
    if len(weights) != len(depth_maps):
        raise ValueError("The number of depth maps and weights must match")

    # Scale the weights so they sum to 1 before averaging.
    unit_weights = np.array(weights) / np.sum(weights)
    # Maps are stacked on a new trailing axis, which is the axis averaged over.
    stack = np.stack(depth_maps, axis=-1)
    return np.average(stack, axis=-1, weights=unit_weights)

In [None]:
def optimize_weights(depth_maps, depth_gt, initial_weights=None, epochs=100, learning_rate=0.01):
    """Fit blending weights for a set of depth maps by gradient descent.

    Minimises the mean squared error between ``depth_gt`` and the weighted
    average ``sum_i(w_i * depth_maps[i]) / sum_i(w_i)`` using Adam.

    Args:
        depth_maps: Iterable of torch tensors of equal shape (a stacked tensor also
            works); they are stacked along a new leading dimension. The weight
            broadcasting below expects each map to be 2-D.
        depth_gt: Torch tensor compared against the weighted average.
        initial_weights: Optional starting weights, one per depth map. When None,
            weights are drawn from a standard normal distribution.
        epochs: Number of Adam steps.
        learning_rate: Adam learning rate.

    Returns:
        numpy.ndarray: The final weights divided by their sum.
    """
    depth_maps_tensor = torch.stack([dm.clone().detach() for dm in depth_maps])
    depth_gt_tensor = depth_gt.clone().detach()

    # Initialize weights or use provided initial weights
    # NOTE(review): the random initialisation is unseeded and can sum to a value
    # close to zero, which makes the normalisation below unstable.
    if initial_weights is None:
        weights = torch.randn(depth_maps_tensor.shape[0], requires_grad=True)
    else:
        weights = torch.tensor(initial_weights, dtype=torch.float32, requires_grad=True)

    optimizer = torch.optim.Adam([weights], lr=learning_rate)

    # Re-enable autograd explicitly so this works when the caller is inside
    # torch.no_grad(). Forcing requires_grad on the intermediates instead cuts the
    # loss off from `weights`, so backward() produces no gradient for them and the
    # optimizer never changes anything.
    with torch.enable_grad():
        for _ in range(epochs):
            optimizer.zero_grad()

            # Calculate weighted average of depth maps
            expanded_weights = weights.unsqueeze(1).unsqueeze(2)
            weighted_depth_maps = torch.mul(expanded_weights, depth_maps_tensor)
            weighted_average = torch.sum(weighted_depth_maps, dim=0) / torch.sum(weights)

            # Compute loss
            loss = torch.mean((weighted_average - depth_gt_tensor) ** 2)

            # Backpropagate the loss and update the weights
            loss.backward()
            optimizer.step()

    # Normalize weights to ensure they sum to 1
    with torch.no_grad():
        normalized_weights = weights / weights.sum()

    return normalized_weights.cpu().numpy()

In [None]:
# Initialize directories and pipeline for Marigold
from marigold import MarigoldPipeline
repo_dir = "/content/Marigold"
# NOTE: ckpt_path is reassigned here, so the checkpoint picked in the dropdown cell is not what gets loaded.
ckpt_path = "prs-eth/marigold-lcm-v1-0"
# Load the pretrained pipeline and move it to the GPU (requires a CUDA runtime).
pipe = MarigoldPipeline.from_pretrained(ckpt_path).to("cuda")

In [None]:
import os
import random
import re
from sklearn.model_selection import train_test_split

In [None]:
# Define base directory for training data
base_dir = "/content/drive/MyDrive/data"
# Process each dataset: every entry under base_dir whose name starts with "sample_", in sorted order
datasets = glob(os.path.join(base_dir, "sample_*"))
datasets.sort()
# Define lists to store paths for each set (three independent, initially empty lists)
train_paths, val_paths, test_paths = [], [], []

In [None]:
def numerical_sort_key(s):
    """
    Build a natural-sort key: runs of digits in ``s`` become ints, everything else stays text.

    Example: ``'img10.png'`` -> ``['img', 10, '.png']``, so ``'img2'`` sorts before ``'img10'``.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    # isdecimal() rather than isdigit(): isdigit() is also true for characters such as
    # superscript digits, which the pattern does not capture and int() cannot parse.
    return [int(text) if text.isdecimal() else text for text in re.split(r'(\d+)', s)]

In [None]:
def split_data(files, test_size, val_size):
    """
    Slice ``files`` in order into train/validation/test parts (no shuffling).

    The first ``int(len(files) * test_size)`` items form the test set, the next
    ``int(len(files) * val_size)`` items the validation set, and the remainder the
    training set. Both fractions are relative to the full length of ``files``.

    Returns:
        tuple: ``(train_files, val_files, test_files)``.
    """
    total = len(files)
    test_end = int(total * test_size)
    val_end = test_end + int(total * val_size)
    return files[val_end:], files[test_end:val_end], files[:test_end]

In [None]:
# Start from empty lists so that re-running this cell does not append duplicates.
train_paths, val_paths, test_paths = [], [], []

# Iterate over each video folder (sorted, because os.listdir returns entries in arbitrary order)
for video_folder in sorted(os.listdir(base_dir), key=numerical_sort_key):
    # Skip directories that do not contain 'sample_'
    if "sample_" not in video_folder:
        continue
    video_path = os.path.join(base_dir, video_folder)
    if not os.path.isdir(video_path):
        continue

    # Gather paths to input images and depth maps
    image_dir = os.path.join(video_path, "Image")
    depth_dir = os.path.join(video_path, "Depth")

    # Skip folders that lack either sub-directory instead of failing in os.listdir
    if not os.path.isdir(image_dir) or not os.path.isdir(depth_dir):
        continue

    # Sorting files numerically; images and depth maps are paired by position after sorting
    image_files = sorted(os.listdir(image_dir), key=numerical_sort_key)
    depth_files = sorted(os.listdir(depth_dir), key=numerical_sort_key)
    assert len(image_files) == len(depth_files), f"Mismatch in number of files for {video_folder}"

    # Split paths into training, validation, and test sets without shuffling
    # NOTE(review): split_data applies both fractions to the full list, so this yields
    # roughly 67/18/15 train/val/test. 0.1765 looks like 0.15/0.85, i.e. a fraction meant
    # for the non-test remainder; confirm whether 70/15/15 was intended.
    image_train, image_val, image_test = split_data(image_files, test_size=0.15, val_size=0.1765)
    depth_train, depth_val, depth_test = split_data(depth_files, test_size=0.15, val_size=0.1765)

    # Store (image_path, depth_path) pairs for each set
    train_paths.extend([(os.path.join(image_dir, img), os.path.join(depth_dir, depth)) for img, depth in zip(image_train, depth_train)])
    val_paths.extend([(os.path.join(image_dir, img), os.path.join(depth_dir, depth)) for img, depth in zip(image_val, depth_val)])
    test_paths.extend([(os.path.join(image_dir, img), os.path.join(depth_dir, depth)) for img, depth in zip(image_test, depth_test)])


In [None]:
# Print the number of samples in each set
# (each entry is one (image_path, depth_path) pair)
print(f"Number of samples in training set: {len(train_paths)}")
print(f"Number of samples in validation set: {len(val_paths)}")
print(f"Number of samples in test set: {len(test_paths)}")

In [None]:
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import pickle

In [None]:
def extract_video_id(filepath):
    """Return the digits that follow the first 'sample_' in ``filepath`` as a string, or None."""
    found = re.search(r'sample_(\d+)', filepath)
    # group(1) is the captured run of digits only, without the 'sample_' prefix.
    return found.group(1) if found else None


In [None]:
# Training
# For each split, run Marigold frame by frame, keep the last 5 depth predictions,
# blend them with per-frame weights and record the RMSE against the current ground truth.
train_errors = []
val_errors = []
test_errors = []
errors = []
datasets = [train_paths[:400], val_paths[:80], test_paths[:80]]   # Assuming these are already defined
dataset_names = ["Training", "Validation", "Testing"]
errors_by_name = {"Training": train_errors, "Validation": val_errors, "Testing": test_errors}
current_video_id = None
# Placeholder whose length never matches the 5-frame buffer, so the first call to
# optimize_weights starts without initial weights.
optimized_weights = [1,1,1,1,1,1,1,1,1,1]
weights_history = []  # To store weights for each iteration

for dataset, name in zip(datasets, dataset_names):
    # Create a new figure and axis object for each dataset. A single line artist is
    # created once and updated in place; calling ax.plot on every iteration would stack
    # a new line per frame.
    fig, ax = plt.subplots(figsize=(10, 5))
    (rmse_line,) = ax.plot([], [], label=f'RMSE per Image for {name}')
    ax.set_xlabel('Image Index')
    ax.set_ylabel('RMSE')
    ax.set_title(f'Running RMSE Across {name} Dataset')
    ax.legend()

    print(f"Processing {name} dataset")
    rgb_files = [path[0] for path in dataset]
    depth_gt_files = [path[1] for path in dataset]

    depth_buffer = deque(maxlen=5)

    # `errors` aliases the per-split list so results stay available after the loop.
    errors = errors_by_name[name]

    for rgb_path, depth_gt_path in tqdm(zip(rgb_files, depth_gt_files), total=len(rgb_files), desc=f"Processing {name} images"):
        new_video_id = extract_video_id(rgb_path)

        # Check if the video ID has changed (new video sequence)
        if current_video_id is not None and new_video_id != current_video_id:
            # Flush the buffer so frames from different videos are never blended
            depth_buffer.clear()

        current_video_id = new_video_id

        with torch.no_grad():
            input_image = Image.open(rgb_path).convert("RGB")
            depth_gt_image = Image.open(depth_gt_path).convert("RGB")

            # Ground truth: average the RGB channels, scale to [0, 1], resize to the input size
            depth_gt_array = np.array(depth_gt_image)
            depth_gt_decoded = depth_gt_array.mean(axis=2).astype(np.float32) / 255.0
            depth_gt_resized = Image.fromarray(depth_gt_decoded).resize(input_image.size, Image.BILINEAR)
            depth_gt_final = np.array(depth_gt_resized)

            # Call depth estimation pipeline
            pipeline_output = pipe(
                input_image,
                denoising_steps=4,
                ensemble_size=5,
                processing_res=768,
                match_input_res=True,
                color_map="Spectral",
                show_progress_bar=True
            )
            depth_pred = pipeline_output.depth_np

            depth_buffer.append(depth_pred)

            depth_maps_tensor = torch.stack([torch.tensor(dm, dtype=torch.float32) for dm in depth_buffer])
            depth_gt_tensor = torch.tensor(depth_gt_final, dtype=torch.float32)

            # Weights are only fitted on the Training and Validation splits.
            # NOTE(review): this call runs inside torch.no_grad(); optimize_weights has to
            # enable gradients itself for the fit to change the weights at all.
            if name != "Testing":
                if len(optimized_weights) != len(depth_buffer):
                    optimized_weights = optimize_weights(depth_maps_tensor, depth_gt_tensor)
                else:
                    optimized_weights = optimize_weights(depth_maps_tensor, depth_gt_tensor, initial_weights=optimized_weights)
            depth_maps_array = np.array(list(depth_buffer))
            if optimized_weights is None or len(optimized_weights) != len(depth_buffer):
                # Initialize or reset weights if they don't match the current buffer size
                current_weights = np.ones(len(depth_buffer)) / len(depth_buffer)
            else:
                # Ensure optimized_weights is a numpy array with the correct length
                current_weights = optimized_weights[:len(depth_buffer)]

            # Compute the weighted average
            weighted_depth_pred = np.average(depth_maps_array, axis=0, weights=current_weights)
            rmse = calculate_rmse(weighted_depth_pred, depth_gt_final)
            errors.append(rmse)
            print(f'RMSE for {name}: {rmse}')

            weights_history.append(optimized_weights.copy())  # Store a copy of weights for each iteration

            # Save errors and weights_history to a file for every new image processed
            with open(f"{name.lower()}_errors_and_weights.pkl", "wb") as f:
                pickle.dump({"errors": errors, "weights_history": weights_history}, f)

        # Update the plot for every iteration
        rmse_line.set_data(np.arange(len(errors)), errors)
        ax.relim()
        ax.autoscale_view()

        clear_output(wait=True)
        display(fig)
        print("Current Weights", optimized_weights)

    # Save the graph as a PDF before changing from training to validation to testing.
    # fig.savefig targets this figure explicitly instead of whatever pyplot considers current.
    fig.savefig(f"{name.lower()}_graph.pdf")

    plt.close(fig)
    print(optimized_weights)

    print(f"Average RMSE for {name} dataset:", np.mean(errors))

In [None]:
# Obtain each video data for testing
import os
import re
from glob import glob
base_dir = "/content/drive/MyDrive/data"

# Create a dictionary to store paths for each dataset: folder name -> list of (image_path, depth_path)
dataset_paths = {}
# Iterate over each video folder in natural order. os.listdir returns entries in arbitrary
# order, which would make the key order of dataset_paths (and anything indexing into it)
# differ between runs.
for video_folder in sorted(os.listdir(base_dir), key=numerical_sort_key):
    # Skip directories that do not contain 'sample_'
    if "sample_" not in video_folder:
        continue
    video_path = os.path.join(base_dir, video_folder)
    if not os.path.isdir(video_path):
        continue

    # Gather paths to input images and depth maps
    image_dir = os.path.join(video_path, "Image")
    depth_dir = os.path.join(video_path, "Depth")
    # print(image_dir, depth_dir)

    # Ensure directories exist
    if not os.path.exists(image_dir) or not os.path.exists(depth_dir):
        continue

    # Sorting files numerically; images and depth maps are paired by position after sorting
    image_files = sorted(os.listdir(image_dir), key=numerical_sort_key)
    depth_files = sorted(os.listdir(depth_dir), key=numerical_sort_key)
    assert len(image_files) == len(depth_files), f"Mismatch in number of files for {video_folder}"

    # Store paths for each image-depth pair in the dictionary
    dataset_paths[video_folder] = [(os.path.join(image_dir, img), os.path.join(depth_dir, depth)) for img, depth in zip(image_files, depth_files)]

In [None]:
# Print a per-video summary rather than dumping every path into the cell output.
for video_name, pairs in dataset_paths.items():
    print(f"{video_name}: {len(pairs)} image/depth pairs")

In [None]:
def plot_errors(errors, title):
    """Show a line plot of ``errors`` against their index, titled ``title``, then close the figure."""
    figure, axes = plt.subplots(figsize=(10, 5))
    axes.plot(errors, label='RMSE per Image')
    axes.set(xlabel='Image Index', ylabel='RMSE', title=title)
    axes.legend()
    plt.show()
    # Close explicitly so repeated calls do not accumulate open figures.
    plt.close(figure)

In [None]:
def process_video_optimized(rgb_depth_pairs):
    """Evaluate Marigold on frames blended with a fixed set of five weights.

    For every (rgb_path, depth_gt_path) pair, the current frame and up to four previous
    frames are blended with the hard-coded weights below, the blended image is passed to
    the depth pipeline, and the prediction is compared with the ground truth of the
    current frame.

    Args:
        rgb_depth_pairs: Iterable of (rgb_path, depth_gt_path) tuples, in frame order.

    Returns:
        tuple: (mean RMSE over all frames, list of per-frame RMSE values).
    """
    errors = []
    frame_buffer = deque(maxlen=5)
    original_weights = np.array([0.2854117, 0.04321782, -0.33442682, 0.27889505, 0.7269023])

    for rgb_path, depth_gt_path in tqdm(rgb_depth_pairs, desc="Processing images"):
        with torch.no_grad():
            input_image = Image.open(rgb_path).convert("RGB")
            input_array = np.array(input_image, dtype=np.float32)
            frame_buffer.append(input_array)

            # Use the most recent weights matching the number of buffered frames
            # (all five once the buffer is full), rescaled to sum to 1.
            recent_weights = original_weights[-len(frame_buffer):]
            normalized_weights = recent_weights / np.sum(recent_weights)

            # Compute the weighted average of images
            blended_image_array = np.average(np.stack(frame_buffer, axis=0), axis=0, weights=normalized_weights)
            # One weight is negative, so the blend can leave [0, 255]; clip before the
            # uint8 cast, which would otherwise wrap out-of-range values around.
            blended_image = Image.fromarray(np.clip(blended_image_array, 0, 255).astype(np.uint8))

            # Ground truth: average the RGB channels, scale to [0, 1], resize to the blended image size
            depth_gt_image = Image.open(depth_gt_path).convert("RGB")
            depth_gt_array = np.array(depth_gt_image)
            depth_gt_decoded = depth_gt_array.mean(axis=2).astype(np.float32) / 255.0
            depth_gt_resized = Image.fromarray(depth_gt_decoded).resize(blended_image.size, Image.BILINEAR)
            depth_gt_final = np.array(depth_gt_resized)

            # Call your depth estimation pipeline on the blended image
            pipeline_output = pipe(
                blended_image,
                denoising_steps=4,
                ensemble_size=5,
                processing_res=768,
                match_input_res=True,
                color_map="Spectral",
                show_progress_bar=True
            )
            depth_pred = pipeline_output.depth_np

            # RMSE on the latest depth map only
            rmse = calculate_rmse(depth_pred, depth_gt_final)
            errors.append(rmse)
            print(f'RMSE: {rmse}')

    return np.mean(errors), errors

In [None]:
def process_video_exponential_decay(rgb_depth_pairs, lambda_decay=0.8):
    """Evaluate Marigold on frames blended with exponentially decaying weights.

    The current frame and up to four previous frames are blended; a frame that is
    ``i`` steps old gets weight ``lambda_decay ** i`` before normalisation, so the
    newest frame always has the largest weight when ``lambda_decay < 1``.

    Args:
        rgb_depth_pairs: Iterable of (rgb_path, depth_gt_path) tuples, in frame order.
        lambda_decay: Decay factor per frame of age (defaults to 0.8).

    Returns:
        tuple: (mean RMSE over all frames, list of per-frame RMSE values).
    """
    errors = []
    frame_buffer = deque(maxlen=5)

    for rgb_path, depth_gt_path in tqdm(rgb_depth_pairs, desc="Processing images"):
        with torch.no_grad():
            input_image = Image.open(rgb_path).convert("RGB")
            input_array = np.array(input_image, dtype=np.float32)
            frame_buffer.append(input_array)

            # Calculate weights based on buffer size (oldest frame first, newest last)
            weights = np.array([lambda_decay**i for i in range(len(frame_buffer)-1, -1, -1)])
            normalized_weights = weights / np.sum(weights)

            # Compute weighted average of images
            blended_image_array = np.average(np.stack(frame_buffer, axis=0), axis=0, weights=normalized_weights)
            blended_image = Image.fromarray(np.uint8(blended_image_array))

            # Ground truth: average the RGB channels, scale to [0, 1], resize to the blended image size
            depth_gt_image = Image.open(depth_gt_path).convert("RGB")
            depth_gt_array = np.array(depth_gt_image)
            depth_gt_decoded = depth_gt_array.mean(axis=2).astype(np.float32) / 255.0
            depth_gt_resized = Image.fromarray(depth_gt_decoded).resize(blended_image.size, Image.BILINEAR)
            depth_gt_final = np.array(depth_gt_resized)

            # Call your depth estimation pipeline on the blended image
            pipeline_output = pipe(
                blended_image,
                denoising_steps=4,
                ensemble_size=5,
                processing_res=768,
                match_input_res=True,
                color_map="Spectral",
                show_progress_bar=True
            )
            depth_pred = pipeline_output.depth_np

            # RMSE on the latest depth map only
            rmse = calculate_rmse(depth_pred, depth_gt_final)
            errors.append(rmse)
            print(f'RMSE: {rmse}')

    return np.mean(errors), errors

In [None]:
def process_video_average(rgb_depth_pairs):
    """Evaluate Marigold on the plain average of the current and up to four previous frames.

    Args:
        rgb_depth_pairs: Iterable of (rgb_path, depth_gt_path) tuples, in frame order.

    Returns:
        tuple: (mean RMSE over all frames, list of per-frame RMSE values).
    """
    errors = []
    frame_buffer = deque(maxlen=5)

    for rgb_path, depth_gt_path in tqdm(rgb_depth_pairs, desc="Processing images"):
        with torch.no_grad():
            input_image = Image.open(rgb_path).convert("RGB")
            input_array = np.array(input_image, dtype=np.float32)
            frame_buffer.append(input_array)

            # Equal weights for all images in the buffer (never empty: a frame was just appended)
            normalized_weights = np.ones(len(frame_buffer)) / len(frame_buffer)

            # Compute average of images
            blended_image_array = np.average(np.stack(frame_buffer, axis=0), axis=0, weights=normalized_weights)
            blended_image = Image.fromarray(np.uint8(blended_image_array))

            # Ground truth: average the RGB channels, scale to [0, 1], resize to the blended image size
            depth_gt_image = Image.open(depth_gt_path).convert("RGB")
            depth_gt_array = np.array(depth_gt_image)
            depth_gt_decoded = depth_gt_array.mean(axis=2).astype(np.float32) / 255.0
            depth_gt_resized = Image.fromarray(depth_gt_decoded).resize(blended_image.size, Image.BILINEAR)
            depth_gt_final = np.array(depth_gt_resized)

            # Call your depth estimation pipeline on the blended image
            pipeline_output = pipe(
                blended_image,
                denoising_steps=4,
                ensemble_size=5,
                processing_res=768,
                match_input_res=True,
                color_map="Spectral",
                show_progress_bar=True
            )
            depth_pred = pipeline_output.depth_np

            # RMSE on the latest depth map only
            rmse = calculate_rmse(depth_pred, depth_gt_final)
            errors.append(rmse)
            print(f'RMSE: {rmse}')
    return np.mean(errors), errors

In [None]:
def process_video_baseline(rgb_depth_pairs):
    """Evaluate Marigold frame by frame with no temporal blending.

    Args:
        rgb_depth_pairs: Iterable of (rgb_path, depth_gt_path) tuples.

    Returns:
        tuple: (mean RMSE over all frames, list of per-frame RMSE values).
    """
    # Same pipeline settings for every frame.
    pipe_options = dict(
        denoising_steps=4,
        ensemble_size=5,
        processing_res=768,
        match_input_res=True,
        color_map="Spectral",
        show_progress_bar=True,
    )
    errors = []
    for rgb_path, depth_gt_path in tqdm(rgb_depth_pairs, desc="Processing images"):
        with torch.no_grad():
            frame = Image.open(rgb_path).convert("RGB")

            # Ground truth: average the RGB channels, scale to [0, 1], resize to the frame size
            gt_rgb = np.array(Image.open(depth_gt_path).convert("RGB"))
            gt_scaled = gt_rgb.mean(axis=2).astype(np.float32) / 255.0
            gt_depth = np.array(Image.fromarray(gt_scaled).resize(frame.size, Image.BILINEAR))

            # Call your depth estimation pipeline
            predicted_depth = pipe(frame, **pipe_options).depth_np

            rmse = calculate_rmse(predicted_depth, gt_depth)
            errors.append(rmse)
            print(f'RMSE: {rmse}')
    return np.mean(errors), errors

In [None]:
keys = list(dataset_paths.keys())
current_key = keys[0]

# Process only the first video in dataset_paths (keys[0]) with the fixed-weight blending,
# then plot and report its RMSE.
mean_rmse, errors = process_video_optimized(dataset_paths[current_key])
plot_errors(errors, f'Running RMSE Across {current_key}')
print(f"Average RMSE for {current_key}: {mean_rmse}")

In [None]:
# Per-frame RMSE values from the previous cell, followed by the video they belong to.
print(f"RMSE for {current_key}: {errors}")
print(current_key)

RMSE for sample_2: [0.3688975, 0.36967906, 0.30891892, 0.33703595, 0.34714332, 0.35215288, 0.3646447, 0.34880856, 0.35053036, 0.3452124, 0.35682526, 0.36253852, 0.3408827, 0.3533279, 0.34189105, 0.34139425, 0.33831543, 0.34103617, 0.34061489, 0.34724608, 0.3412142, 0.3329564, 0.3344652, 0.3415812, 0.3592501, 0.34997138, 0.34696892, 0.35890526, 0.34587604, 0.36945957, 0.3492622, 0.3307263, 0.32984254, 0.33561787, 0.3272429, 0.34261015, 0.35631803, 0.3292814, 0.33928084, 0.34562105, 0.34719127, 0.31634128, 0.34739035, 0.3520074, 0.33738714, 0.3329149, 0.33168855, 0.33326676, 0.3363141, 0.34884605, 0.32949933, 0.3333555, 0.33736798, 0.3481781, 0.3367785, 0.3414029, 0.30640128, 0.31506127, 0.33648556, 0.36096376, 0.32493883, 0.33809954, 0.3200681, 0.329629, 0.34908876, 0.3562902, 0.32989317, 0.31921762, 0.36628598, 0.3594858, 0.34200868, 0.36178297, 0.3584098, 0.3209798, 0.33532596, 0.35584727, 0.33648467, 0.33627844, 0.34045452, 0.33428434, 0.34827194, 0.3325955, 0.3495238, 0.3279648, 0.3

In [None]:
keys = list(dataset_paths.keys())
current_key = keys[0]


# Baseline (no blending) on the first video in dataset_paths; plot and report its RMSE.
mean_rmse_base_1, errors_base_1 = process_video_baseline(dataset_paths[current_key])
plot_errors(errors_base_1, f'Running RMSE Across {current_key}')
print(f"Average RMSE for {current_key}: {mean_rmse_base_1}")
print(f"RMSE for {current_key}: {errors_base_1}")

In [None]:
# Re-print the baseline results without re-running the pipeline.
print(f"Average RMSE for {current_key}: {mean_rmse_base_1}")
print(f"RMSE for {current_key}: {errors_base_1}")

Average RMSE for sample_2: 0.3507709503173828
RMSE for sample_2: [0.37633568, 0.36740068, 0.37558514, 0.37047654, 0.3632187, 0.3709728, 0.35682863, 0.36476618, 0.36804646, 0.35948792, 0.35404092, 0.3601339, 0.34770527, 0.35543835, 0.34955055, 0.34321564, 0.34540886, 0.35107002, 0.35141963, 0.35417795, 0.35982662, 0.36105496, 0.35624057, 0.34340587, 0.3721493, 0.36884487, 0.36636475, 0.35299277, 0.3525359, 0.3580638, 0.3610465, 0.36061114, 0.36244515, 0.35645455, 0.3493328, 0.3432749, 0.34648037, 0.3570198, 0.33345327, 0.3435514, 0.33834785, 0.34554338, 0.34646392, 0.36042428, 0.3589725, 0.37231988, 0.37409192, 0.3492754, 0.35811773, 0.36068016, 0.35750026, 0.3657754, 0.35585278, 0.360988, 0.36458892, 0.36142832, 0.34967345, 0.3538665, 0.36086658, 0.36107618, 0.35462108, 0.3581254, 0.35027164, 0.36851376, 0.36163244, 0.36829343, 0.36417198, 0.3658908, 0.3568185, 0.36375442, 0.3746945, 0.37187123, 0.38424954, 0.367514, 0.35438952, 0.35714942, 0.35419384, 0.35362962, 0.3662249, 0.35445768

In [None]:
keys = list(dataset_paths.keys())
current_key = keys[0]


# Exponential-decay blending on the first video in dataset_paths; plot and report its RMSE.
mean_rmse_base_2, errors_base_2 = process_video_exponential_decay(dataset_paths[current_key])
plot_errors(errors_base_2, f'Running RMSE Across {current_key}')
print(f"Average RMSE for {current_key}: {mean_rmse_base_2}")
print(f"RMSE for {current_key}: {errors_base_2}")

In [None]:
# Re-print the exponential-decay results without re-running the pipeline.
print(f"Average RMSE for {current_key}: {mean_rmse_base_2}")
print(f"RMSE for {current_key}: {errors_base_2}")

In [None]:
# NOTE: `keys` is not used below; the video is selected by its hard-coded folder name instead.
keys = list(dataset_paths.keys())
current_key = 'sample_2'


# Equal-weight blending on the selected video; plot and report its RMSE.
mean_rmse_base_3, errors_base_3 = process_video_average(dataset_paths[current_key])
plot_errors(errors_base_3, f'Running RMSE Across {current_key}')
print(f"Average RMSE for {current_key}: {mean_rmse_base_3}")
print(f"RMSE for {current_key}: {errors_base_3}")

In [None]:
# Test on data with weights as trained
# NOTE(review): `dataset` is not defined in this cell; presumably it is the loop variable
# left over from the training cell (i.e. the last split that was processed) -- confirm.
# The per-frame work is delegated to process_video_optimized, which uses the same weights
# and pipeline settings.
mean_rmse, errors = process_video_optimized(dataset)

# Finalize plot
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(errors, label='RMSE per Image')
ax.set_xlabel('Image Index')
ax.set_ylabel('RMSE')
ax.set_title('Running RMSE Across Dataset')
ax.legend()
# Save through the figure object and before plt.show(): saving via pyplot after show()
# can write an empty figure.
# NOTE: every dataset-level test cell writes to this same file name.
fig.savefig("dataset_rmse_graph.pdf")
plt.show()
plt.close(fig)

print(f"Average RMSE for the dataset: {mean_rmse}")

In [None]:
# Test on data with exponentially decaying weights
# NOTE(review): `dataset` is not defined in this cell; presumably it is the loop variable
# left over from the training cell (i.e. the last split that was processed) -- confirm.
# The per-frame work is delegated to process_video_exponential_decay, which uses the same
# decay factor (0.8) and pipeline settings.
mean_rmse, errors = process_video_exponential_decay(dataset)

# Finalize plot
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(errors, label='RMSE per Image')
ax.set_xlabel('Image Index')
ax.set_ylabel('RMSE')
ax.set_title('Running RMSE Across Dataset')
ax.legend()
# Save through the figure object and before plt.show(): saving via pyplot after show()
# can write an empty figure.
# NOTE: every dataset-level test cell writes to this same file name.
fig.savefig("dataset_rmse_graph.pdf")
plt.show()
plt.close(fig)

print(f"Average RMSE for the dataset: {mean_rmse}")

In [None]:
# Test on data with equal weights throughout (just average)
# NOTE(review): `dataset` is not defined in this cell; presumably it is the loop variable
# left over from the training cell (i.e. the last split that was processed) -- confirm.
# The per-frame work is delegated to process_video_average, which uses the same buffer
# size and pipeline settings.
mean_rmse, errors = process_video_average(dataset)

# Finalize plot
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(errors, label='RMSE per Image')
ax.set_xlabel('Image Index')
ax.set_ylabel('RMSE')
ax.set_title('Running RMSE Across Dataset')
ax.legend()
# Save through the figure object and before plt.show(): saving via pyplot after show()
# can write an empty figure.
# NOTE: every dataset-level test cell writes to this same file name.
fig.savefig("dataset_rmse_graph.pdf")
plt.show()
plt.close(fig)

print(f"Average RMSE for the dataset: {mean_rmse}")

In [None]:
# Test on data without any conditional inputs (baseline)
# NOTE(review): `dataset` is not defined in this cell; presumably it is the loop variable
# left over from the training cell (i.e. the last split that was processed) -- confirm.
# The per-frame work is delegated to process_video_baseline, which uses the same pipeline
# settings.
mean_rmse, errors = process_video_baseline(dataset)

# Finalize plot
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(errors, label='RMSE per Image')
ax.set_xlabel('Image Index')
ax.set_ylabel('RMSE')
ax.set_title('Running RMSE Across Dataset')
ax.legend()
# Save through the figure object and before plt.show(): saving via pyplot after show()
# can write an empty figure.
# NOTE: every dataset-level test cell writes to this same file name.
fig.savefig("dataset_rmse_graph.pdf")
plt.show()
plt.close(fig)

print(f"Average RMSE for the dataset: {mean_rmse}")