<h3> Imports and set device

In [None]:
import time
import numpy as np

import torch
from torch_geometric.loader import DataLoader

import sys
sys.path.append("../src")
from acoupipe_extensions import VariableArrayConfig, random_positions
from on_the_fly_dataset import OnTheFlyDataset
from precomputed_dataset import precomputedDataset

<h3> Initiate both datasets

In [10]:
# --- generate config for on the flydataset ---
test_generator = np.random.default_rng(seed=20006)

config = VariableArrayConfig(
    mpos_fn=random_positions,
    mode="analytic",
    mic_sig_noise=False,
    generator=test_generator,
    min_nsources=1,
    max_nsources=4,
    min_num_mics=32,
    max_num_mics=32,
)

# --- instantiate on the fly dataset ---
on_the_fly_ds = OnTheFlyDataset(
    features=[
        "csm",
        "eigmode",
        "cartesian_coordinates",
        "loc",
        "source_strength_analytic",
    ],
    split="training",
    size=100,
    f=1000,
    num=0,
    progress_bar=False,
    config=config,
)

# --- instantiate precomputed dataset ---
h5_path = "../data/10000samples.h5"  #same settings as generated on the fly dataset
precomputed_ds = precomputedDataset(h5_path)


<h2> Benchmark Settings

In [None]:
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
pin_memory = (device.type == "cuda")
N = 50
WARMUP = 5

<h2> Benchmark precomputed dataset

In [49]:
# =========================
# PRECOMPUTED DATASET
# =========================
loader = DataLoader(precomputed_ds, batch_size=1, shuffle=False, num_workers=0, pin_memory=pin_memory)

# warmup
for _ in range(WARMUP):
    for data in loader:
        _ = data.to(device, non_blocking=pin_memory)
        if device.type == "cuda":
            torch.cuda.synchronize(device)
        break

load_times, h2d_times, both_times = [], [], []

for _ in range(N):
    t0 = time.perf_counter()
    for data in loader:
        t1 = time.perf_counter()  # after load
        _ = data.to(device, non_blocking=pin_memory)
        if device.type == "cuda":
            torch.cuda.synchronize(device)
        t2 = time.perf_counter()  # after H2D
        break

    load_times.append(t1 - t0)
    h2d_times.append(t2 - t1)
    both_times.append(t2 - t0)

print("Precomputed via DataLoader")
print(f"  load only:  {np.mean(load_times)*1000:.2f} ± {np.std(load_times)*1000:.2f} ms")
print(f"  H2D only:   {np.mean(h2d_times)*1000:.2f} ± {np.std(h2d_times)*1000:.2f} ms")
print(f"  load+H2D:   {np.mean(both_times)*1000:.2f} ± {np.std(both_times)*1000:.2f} ms")

Precomputed via DataLoader
  load only:  4.11 ± 0.65 ms
  H2D only:   0.22 ± 0.06 ms
  load+H2D:   4.33 ± 0.66 ms


In [51]:
# =========================
# ON-THE-FLY DATASET
# =========================
loader = DataLoader(on_the_fly_ds, batch_size=1, shuffle=False, num_workers=0, pin_memory=pin_memory)

# warmup
for _ in range(WARMUP):
    for data in loader:
        _ = data.to(device, non_blocking=pin_memory)
        if device.type == "cuda":
            torch.cuda.synchronize(device)
        break

load_times, h2d_times, both_times = [], [], []

for _ in range(N):
    t0 = time.perf_counter()
    for data in loader:
        t1 = time.perf_counter()  # after create/load
        _ = data.to(device, non_blocking=pin_memory)
        if device.type == "cuda":
            torch.cuda.synchronize(device)
        t2 = time.perf_counter()  # after H2D
        break

    load_times.append(t1 - t0)
    h2d_times.append(t2 - t1)
    both_times.append(t2 - t0)

print("On-the-fly via DataLoader")
print(f"  create only:{np.mean(load_times)*1000:.2f} ± {np.std(load_times)*1000:.2f} ms")
print(f"  H2D only:   {np.mean(h2d_times)*1000:.2f} ± {np.std(h2d_times)*1000:.2f} ms")
print(f"  create+H2D: {np.mean(both_times)*1000:.2f} ± {np.std(both_times)*1000:.2f} ms")

On-the-fly via DataLoader
  create only:34.13 ± 37.51 ms
  H2D only:   0.22 ± 0.01 ms
  create+H2D: 34.35 ± 37.52 ms
