In [2]:
from elastic_nerf.utils import wandb_utils as wu
from pathlib import Path
import pandas as pd
from IPython.display import display

# Sweeps included in this analysis, keyed by sweep ID (presumably Weights & Biases
# sweep IDs, given the `wandb_utils` helper -- confirm). Each value is a
# human-readable sweep name: it is stored in the `sweep_name` column of the
# aggregated results, and the model type ("ngp_prop" vs "ngp_occ") is derived
# from whether the name contains the substring "prop".
# Entries that are commented out are simply not fetched.
sweep_mappings = {
    "2uxektzo": "ngp_occ-mipnerf360-baseline",
    # "kebumdc0": "ngp_occ-mipnerf360-baseline",
    "xxjsfkbw": "ngp_prop-mipnerf360-baseline",
    # "8w0wks0x": "ngp_prop-mipnerf360-baseline",
    # "qfkjdvv2": "ngp_occ-mipnerf360-sampling_single",
    # "hy03dx0e": "ngp_occ-mipnerf360-sampling",
    # "wsxh6gjo": "ngp_prop-mipnerf360-sampling",
    # "8ishbvau": "ngp_prop-mipnerf360-sampling_single",
    # "b674pjcs": "ngp_occ-mipnerf360-baseline_head_depth1",
    # "58hgroe5": "ngp_prop-mipnerf360-baseline_head_depth1",
    # "c6g1mc5g": "ngp_occ-mipnerf360-baseline-mup",
    # "ccrwhsr5": "ngp_prop-mipnerf360-baseline-mup",
}

# TR1a_Baseline_Fused
This experiment benchmarks the baseline for the Nerfacc NGP Occ and Nerfacc Prop models on all scenes from the Mip-NeRF 360 dataset. Similar to MatFormer, we sample exponentially spaced widths $d \in \{64, 32, 16, 8\}$ (with $d=64$ being the baseline full width) and evaluate the performance of both the Nerfacc-Occ and Nerfacc-Prop models after naively shrinking every linear layer to these widths. The goal here is to understand how much performance drops when we train a much smaller model. Note that models at all widths are trained using the same hyperparameters (batch size, learning rate, etc.) as the baseline full-width implementation. While this will not result in optimally tuned small-width models, keep in mind that our overarching goal is to be able to train models of multiple widths optimally and simultaneously; before that, we need to establish baselines.

In [3]:
# Fetch the logged results for every active sweep. `refresh_cache=False`
# presumably reuses results already stored under `results_cache_dir` instead of
# re-downloading them -- confirm against `wu.fetch_sweep_results`.
tables = ["EvalResultsSummarytable"]
sweeps = sweep_mappings.keys()
results_cache_dir = Path("/home/user/shared/results/elastic-nerf")
sweep_results = {}

for sweep in sweeps:
    sweep_results[sweep] = wu.fetch_sweep_results(
        sweep=sweep,
        refresh_cache=False,
        download_history=True,
        tables=tables,
        results_cache_dir=results_cache_dir,
    )

# Build one long dataframe: every history row of every run, annotated with the
# sweep/run identifiers and the run's flattened config.
all_history = []
for sweep_name in sweep_results:
    for run in sweep_results[sweep_name]:
        # Flatten the nested config into "a.b.c"-style keys.
        flat_config = wu.flatten_dict(run.config, sep=".")
        # The history is already a dataframe. Work on a copy so the fetched run
        # object is left untouched and re-running this cell stays idempotent.
        history = run.history.copy()
        history["sweep_id"] = sweep_name
        history["run_id"] = run.run_id
        history["model_type"] = (
            "ngp_prop" if "prop" in sweep_mappings[sweep_name] else "ngp_occ"
        )
        history["sweep_name"] = sweep_mappings[sweep_name]
        for key in flat_config:
            try:
                # Config values are stringified so that lists/dicts can be
                # broadcast to every row as a single scalar.
                history[key] = str(flat_config[key])
            except Exception:
                # Report which key failed, then propagate the original error.
                print(f"Failed to add {key} to history with value {flat_config[key]}")
                raise
        all_history.append(history)

# %%
# Concatenate all the history results into a single dataframe
final_df = pd.concat(all_history, ignore_index=True)


# %%
# Round-trip through CSV: the config columns were stringified above, so reading
# the file back lets pandas re-infer numeric dtypes (the table cell further down
# compares `hidden_dim == 64` numerically).
fp = "results_tr1a_baseline_fused.csv"
final_df.to_csv(fp, index=False)
print(f"Saved results to {fp}")
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns (and the
# associated warning this line used to emit).
df = pd.read_csv(fp, low_memory=False)



Saved results to results_tr1a_baseline_fused.csv


  df = pd.read_csv(fp)


# Results
Looking at the following tables, we can see that in general, across scenes and models, the smaller-width architectures perform worse than the larger-width architectures. On average, shrinking the Nerfacc Prop model to width 8 causes a larger reduction in PSNR than it does for the Nerfacc Occ model, potentially due to the presence of the 2 proposal networks (which are also being shrunk). The Nerfacc Occ model, on the other hand, uses the NGP hash grid estimator (which we do not reduce in size), potentially making it more robust to downstream width reductions. One exception to the overall trend is the Stump scene for the Nerfacc Occ model, where the width 16 model performs considerably worse than the width 8 model. This will require additional scrutiny in a future experiment.

Overall, though, even after shrinking all the layers to width 8, the performance drop on every scene is less than 9% relative to its respective baseline. Given that this is not terrible, how much smaller are we actually making these networks? To answer that, let's take a look at the parameter counts of both the Nerfacc Occ and Nerfacc Prop models for each stage and compare the architectural complexity at full width vs. reduced widths.

## Issues
After running the experiment, I realized that I had created non-elastic (aka fused NGP) versions of all the models. At the time, I thought this would be appropriate because I was trying to benchmark the baseline. However, due to the way tiny-cuda-nn implements padding, the small-width layers (for models with more than 2 hidden layers) get padded to the nearest block size, and the padded entries are actually trainable parameters. I think this is only really relevant for the width 8 models, but it could explain why the width 16 model does worse than the width 8 model on Stump (not necessarily because the padded width 8 model is effectively bigger, but because we now have more confounding variables). I will be re-running this baseline benchmark, this time using my native PyTorch implementation.

In [4]:
num_scenes = len(df["scene"].unique())
table_cols = ["Scene", "Width 64", "Width 32", "Width 16", "Width 8"]
EVAL_STEP = 20000  # training step at which PSNR is reported (multiple of 1000)
BASELINE_WIDTH = 64  # full-width model that the other widths are compared against


def _format_model_name(model_type):
    """Turn a model-type key such as "ngp_occ" into a display name ("NGP Occ")."""
    parts = [part.capitalize() for part in model_type.split("_")]
    parts[0] = parts[0].upper()
    return " ".join(parts)


def _scene_psnr_row(scene, scene_rows, baseline_width=BASELINE_WIDTH):
    """Build one table row (a dict) of formatted PSNR values for a single scene.

    Args:
        scene: Scene name; used capitalised in the "Scene" column.
        scene_rows: Rows for this scene at the evaluation step. Must contain a
            `hidden_dim` column and, for every width present, a column named
            "Eval Results Summary/psnr_avg/elastic_<width>".
        baseline_width: Width whose PSNR is the reference for the percentages.

    Returns:
        A dict with a "Scene" entry plus one "Width <dim>" entry per width found
        in `scene_rows`. Non-baseline widths include the percentage difference
        to the baseline in brackets. If the scene has no (or a NaN) baseline
        PSNR, the raw value is shown without a percentage instead of failing.
    """
    baseline_rows = scene_rows[scene_rows["hidden_dim"] == baseline_width]
    if len(baseline_rows):
        base_psnr = baseline_rows[
            f"Eval Results Summary/psnr_avg/elastic_{baseline_width}"
        ].iloc[0]
    else:
        # e.g. the full-width run for this scene never reached the eval step.
        base_psnr = float("nan")

    row = {"Scene": scene.capitalize()}
    for dim, dim_rows in scene_rows.groupby(by="hidden_dim"):
        # If several runs share a width, the first one is reported.
        psnr_avg = dim_rows[f"Eval Results Summary/psnr_avg/elastic_{dim}"].iloc[0]
        cell = f"{psnr_avg:.2f}"
        if dim != baseline_width and pd.notna(base_psnr):
            pc_diff = 100 * (psnr_avg - base_psnr) / base_psnr
            cell = f"{cell} ({pc_diff:.2f}%)"
        row[f"Width {dim}"] = cell
    return row


# One table per model type: scenes as rows, widths as columns.
for model_type, model_group in df.groupby(by="model_type"):
    model_type_name = _format_model_name(model_type)
    # Keep only the rows logged at the evaluation step.
    rows_at_eval = model_group[model_group["_step"] == EVAL_STEP]
    table_rows = [
        _scene_psnr_row(scene, scene_group)
        for scene, scene_group in rows_at_eval.groupby(by="scene")
    ]

    # `columns=` fixes the column order; widths without data show up as NaN.
    table_frame = pd.DataFrame(table_rows, columns=table_cols)
    caption = (
        f"PSNR values after {EVAL_STEP // 1000}k steps of training for {model_type_name} model at different widths across scenes from the MipNeRF-360 dataset."
        f"  Values in brackets are the percentage difference compared to the baseline PSNR for each model at full-size (width {BASELINE_WIDTH})."
    )
    table_data = table_frame.style.set_caption(caption)
    display(table_data)
    # A LaTeX version can be produced from the unstyled frame via
    # `table_frame.to_latex(index=False, ...)` if needed for a write-up.

Unnamed: 0,Scene,Width 64,Width 32,Width 16,Width 8
0,Bicycle,22.37,22.12 (-1.13%),22.01 (-1.61%),21.74 (-2.86%)
1,Bonsai,29.35,29.12 (-0.80%),28.38 (-3.31%),27.76 (-5.41%)
2,Counter,26.56,26.31 (-0.94%),25.89 (-2.53%),25.33 (-4.63%)
3,Garden,24.36,24.28 (-0.34%),24.04 (-1.30%),23.65 (-2.92%)
4,Kitchen,27.95,27.26 (-2.49%),26.64 (-4.71%),25.88 (-7.43%)
5,Room,30.13,29.87 (-0.86%),29.80 (-1.10%),29.42 (-2.35%)
6,Stump,23.1,22.69 (-1.75%),21.53 (-6.79%),22.62 (-2.05%)


Unnamed: 0,Scene,Width 64,Width 32,Width 16,Width 8
0,Bicycle,23.04,22.95 (-0.38%),22.68 (-1.55%),22.33 (-3.07%)
1,Bonsai,29.87,29.36 (-1.70%),28.76 (-3.72%),28.11 (-5.90%)
2,Counter,26.42,25.93 (-1.87%),25.58 (-3.19%),24.29 (-8.07%)
3,Garden,25.11,24.91 (-0.77%),24.70 (-1.62%),24.47 (-2.56%)
4,Kitchen,30.01,29.50 (-1.70%),28.74 (-4.23%),27.47 (-8.45%)
5,Room,30.53,30.34 (-0.62%),30.06 (-1.54%),29.83 (-2.27%)
6,Stump,24.94,24.77 (-0.68%),24.64 (-1.19%),24.20 (-2.96%)
