# Experiment 01: Storage Format Analysis

## 1. Hypothesis & Rationale

**Research Question:** How do different storage formats affect I/O performance?

**Hypothesis:** Decompression and parsing overhead vary significantly across formats, with TileDB showing advantages due to its tile-based access pattern.

**Why This Matters:** Format choice directly impacts training iteration time and cloud storage costs.

In [None]:
# Parameters (papermill)
# Plain top-level assignments so papermill can override them per run.
#
# NOTE(review): BATCH_SIZE, PATCH_SIZE, NUM_WORKERS, N_BATCHES, RANDOM_SEED and
# S3_BUCKET are not referenced by the cells visible in this notebook —
# presumably kept so all experiments share one parameter set; confirm before
# removing.
BATCH_SIZE = 4
PATCH_SIZE = (64, 64, 64)
NUM_WORKERS = 0
# Forwarded to benchmark_operation as n_warmup / n_runs for every measurement.
N_WARMUP = 5
N_RUNS = 10
N_BATCHES = 20
# Upper bound on how many source NIfTI files are selected and converted.
N_SUBJECTS = 20
RANDOM_SEED = 42
S3_BUCKET = "souzy-scratch"
# Each entry selects the BENCHMARK_DIR / "radiobject-<strategy>" store to benchmark.
TILING_STRATEGIES = ["axial", "isotropic"]

In [None]:
import sys

import nibabel as nib
import numpy as np
import pandas as pd

# Derive project root from absolute config paths
from benchmarks.config import _BENCHMARKS_DIR, BENCHMARK_DIR, FIGURES_DIR, NIFTI_DIR

project_root = _BENCHMARKS_DIR.parent
sys.path.insert(0, str(project_root / "src"))

from benchmarks.infrastructure import (
    benchmark_operation,
    measure_disk_space,
    plot_bar_comparison,
    prepare_nifti_formats,
    prepare_zarr_formats,
)
from radiobject import RadiObject

## 2. Dataset Preparation

In [None]:
# Find source NIfTI files (first N_SUBJECTS in sorted filename order)
nifti_paths = sorted(NIFTI_DIR.glob("*.nii.gz"))[:N_SUBJECTS]
# Raise explicitly rather than `assert`: asserts are stripped under `python -O`,
# which would let the notebook continue with no input data.
if not nifti_paths:
    raise FileNotFoundError(f"No NIfTI files found in {NIFTI_DIR}")
print(f"Found {len(nifti_paths)} NIfTI files")

# Prepare storage formats under BENCHMARK_DIR; the return values are kept for
# interactive inspection but are not read again in this cell.
storage_formats = prepare_nifti_formats(nifti_paths, BENCHMARK_DIR, N_SUBJECTS)
zarr_formats = prepare_zarr_formats(nifti_paths, BENCHMARK_DIR, N_SUBJECTS)

# Get paths for each format
nifti_gz_paths = sorted((BENCHMARK_DIR / "nifti-compressed").glob("*.nii.gz"))
nifti_paths_uncompressed = sorted((BENCHMARK_DIR / "nifti-uncompressed").glob("*.nii"))
numpy_paths = sorted((BENCHMARK_DIR / "numpy").glob("*.npy"))
zarr_axial_paths = sorted((BENCHMARK_DIR / "zarr-axial").glob("*.zarr"))
zarr_iso_paths = sorted((BENCHMARK_DIR / "zarr-isotropic").glob("*.zarr"))

prepared_path_lists = {
    "NIfTI compressed": nifti_gz_paths,
    "NIfTI uncompressed": nifti_paths_uncompressed,
    "NumPy": numpy_paths,
    "Zarr axial": zarr_axial_paths,
    "Zarr isotropic": zarr_iso_paths,
}
for format_label, found_paths in prepared_path_lists.items():
    print(f"{format_label}: {len(found_paths)}")

# Later cells index the first file of every format; fail here with a clear
# message instead of a bare IndexError further down the notebook.
missing_formats = [
    format_label for format_label, found_paths in prepared_path_lists.items() if not found_paths
]
if missing_formats:
    raise FileNotFoundError(
        f"No prepared files found under {BENCHMARK_DIR} for: {', '.join(missing_formats)}"
    )

## 3. Disk Space Measurement

In [None]:
# Compute raw voxel size for compression ratio.
# The total is summed per file from each header rather than extrapolated from
# the first volume, so it stays correct when volumes differ in shape or dtype.
# nib.load only parses the header here; voxel data is never requested.
if not nifti_gz_paths:
    raise FileNotFoundError(f"No compressed NIfTI files found in {BENCHMARK_DIR / 'nifti-compressed'}")

raw_voxel_bytes = 0
for nifti_gz_path in nifti_gz_paths:
    img = nib.load(str(nifti_gz_path))
    # Force a 64-bit product so large volumes cannot overflow on platforms
    # where NumPy's default integer is 32-bit.
    n_voxels = int(np.prod(img.shape, dtype=np.int64))
    bytes_per_voxel = np.dtype(img.get_data_dtype()).itemsize
    raw_voxel_bytes += n_voxels * bytes_per_voxel

disk_space_results = []

# Display name -> on-disk directory for every format being compared.
format_dirs = {
    "NIfTI (.nii.gz)": BENCHMARK_DIR / "nifti-compressed",
    "NIfTI (.nii)": BENCHMARK_DIR / "nifti-uncompressed",
    "NumPy (.npy)": BENCHMARK_DIR / "numpy",
    "TileDB (AXIAL)": BENCHMARK_DIR / "radiobject-axial",
    "TileDB (ISOTROPIC)": BENCHMARK_DIR / "radiobject-isotropic",
    "Zarr (AXIAL)": BENCHMARK_DIR / "zarr-axial",
    "Zarr (ISOTROPIC)": BENCHMARK_DIR / "zarr-isotropic",
}

# Every format is measured against the same raw-byte reference so the
# reported compression ratios are comparable with each other.
for name, path in format_dirs.items():
    result = measure_disk_space(path, name, raw_voxel_bytes)
    disk_space_results.append(result)
    print(
        f"{name}: {result.size_mb:.1f} MB ({result.n_files} files, ratio={result.compression_ratio:.2f})"
    )

## 4. Format Loading Benchmarks

In [None]:
import zarr

all_results = []


def _run_local_load(label, loader, framework, **tags):
    """Benchmark one local full-volume load, record it, and print mean +/- std.

    `loader` is the zero-argument callable being timed; `tags` are forwarded
    to benchmark_operation unchanged (storage_format, tiling).
    """
    timing = benchmark_operation(
        loader,
        framework,
        "format_load",
        "local",
        **tags,
        n_warmup=N_WARMUP,
        n_runs=N_RUNS,
    )
    all_results.append(timing)
    print(f"{label}: {timing.time_mean_ms:.1f} +/- {timing.time_std_ms:.1f} ms")
    return timing


# One-file-per-volume formats: each timed call opens the first file and reads
# the whole volume.
result = _run_local_load(
    "NIfTI compressed",
    lambda: nib.load(str(nifti_gz_paths[0])).get_fdata(),
    "nibabel",
    storage_format="nifti_gz",
)
result = _run_local_load(
    "NIfTI uncompressed",
    lambda: nib.load(str(nifti_paths_uncompressed[0])).get_fdata(),
    "nibabel",
    storage_format="nifti",
)
result = _run_local_load(
    "NumPy",
    lambda: np.load(str(numpy_paths[0])),
    "numpy",
    storage_format="numpy",
)

# TileDB: the RadiObject and its first volume are resolved outside the timed
# callable, so only to_numpy() is measured.
for strategy in TILING_STRATEGIES:
    store = RadiObject(str(BENCHMARK_DIR / f"radiobject-{strategy}"))
    first_collection = list(store.collection_names)[0]
    first_volume = store.collection(first_collection).iloc[0]
    result = _run_local_load(
        f"TileDB ({strategy})",
        # Default argument pins this iteration's volume to the callable.
        lambda volume=first_volume: volume.to_numpy(),
        "RadiObject",
        tiling=strategy,
        storage_format="tiledb",
    )

# Zarr: the array handle is opened outside the timed callable, so only the
# full-array read is measured.
zarr_stores = zip(("axial", "isotropic"), (zarr_axial_paths, zarr_iso_paths))
for strategy, store_paths in zarr_stores:
    zarr_array = zarr.open_array(str(store_paths[0]), mode="r")
    result = _run_local_load(
        f"Zarr ({strategy})",
        lambda handle=zarr_array: handle[:],
        "zarr",
        tiling=strategy,
        storage_format="zarr",
    )

## 5. Results (Tidy Format)

In [None]:
# Build tidy results DataFrame
# Source column -> display name, in display order. Renaming by name (not by
# position) keeps the labels correct even when a result dict omits one of the
# optional columns; a positional rename would shift every later label.
tidy_column_names = {
    "framework": "framework",
    "storage_format": "format",
    "tiling_strategy": "tiling",
    "scenario": "scenario",
    "time_mean_ms": "time_ms",
    "time_std_ms": "time_std_ms",
    "peak_heap_mb": "heap_mb",
    "cpu_percent_mean": "cpu_pct",
}
df = pd.DataFrame([r.to_dict() for r in all_results])
# Keep only the columns that are actually present, in the order defined above.
cols = [c for c in tidy_column_names if c in df.columns]
df = df[cols].rename(columns=tidy_column_names)
print(df.to_string(index=False))

In [None]:
# Disk space tidy table: select the four reported fields and give them short
# display names.
disk_column_names = {
    "format_name": "format",
    "size_mb": "size_mb",
    "n_files": "files",
    "compression_ratio": "compression",
}
disk_records = [entry.to_dict() for entry in disk_space_results]
disk_df = pd.DataFrame(disk_records)[list(disk_column_names)].rename(columns=disk_column_names)
print(disk_df.to_string(index=False))

## 6. Visualizations

In [None]:
# Plot format loading comparison
FIGURES_DIR.mkdir(parents=True, exist_ok=True)


def _bar_label(res):
    """Label a result by framework plus its tiling strategy, or its storage format if untiled."""
    qualifier = res.tiling_strategy if res.tiling_strategy else res.storage_format
    return res.framework + f" ({qualifier})"


# Bar heights are mean load times; error bars are the matching standard deviations.
bar_labels = [_bar_label(res) for res in all_results]
data = dict(zip(bar_labels, (res.time_mean_ms for res in all_results)))
errors = dict(zip(bar_labels, (res.time_std_ms for res in all_results)))

plot_bar_comparison(
    data,
    "Storage Format Loading Comparison",
    "Time (ms)",
    FIGURES_DIR / "format_overhead_local.png",
    errors=errors,
)

In [None]:
# Plot disk space comparison: one bar per format, height = on-disk size in MB.
disk_space_figure = FIGURES_DIR / "disk_space_comparison.png"
data = {entry.format_name: entry.size_mb for entry in disk_space_results}
plot_bar_comparison(data, "Disk Space by Format", "Size (MB)", disk_space_figure)

## 7. Key Findings

1. **Gzip Overhead:** Compressed NIfTI (`.nii.gz`) takes roughly 10x longer to load than uncompressed NIfTI (`.nii`)
2. **NumPy Baseline:** Raw NumPy is fastest (no parsing overhead)
3. **TileDB Competitive:** Full-volume TileDB loads are competitive with raw formats
4. **Zarr Comparable:** Zarr and TileDB show similar full-volume load performance with matched compression

In [None]:
# Export results: write run settings, disk-space measurements, and benchmark
# timings to a single JSON file under RESULTS_DIR.
import json
from datetime import datetime

from benchmarks.config import RESULTS_DIR

results_json = {
    "timestamp": datetime.now().isoformat(),
    "experiment": "01_storage_format_analysis",
    # Only the settings that govern measurement repetition and dataset size
    # are recorded.
    "config": dict(n_warmup=N_WARMUP, n_runs=N_RUNS, n_subjects=N_SUBJECTS),
    "disk_space": [entry.to_dict() for entry in disk_space_results],
    "benchmarks": [entry.to_dict() for entry in all_results],
}

output_path = RESULTS_DIR / "01_storage_format_results.json"
# Create the results directory on first run.
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("w") as f:
    json.dump(results_json, f, indent=2)
print(f"Results saved to {output_path}")