# WOD-E2E segment inspection (step-by-step)

帮助快速验证 `seg_index_val_enriched.csv` 读取和 `SegmentRecord` 聚合逻辑。

In [1]:
from pathlib import Path
import pickle
import tensorflow as tf
from typing import List, Optional

from wod_e2e_exporter.io_reader import read_seg_index_enriched
from wod_e2e_exporter.utils import win_to_wsl_path

def _sanitize_name(s: str) -> str:
    return str(s).replace("/", "_").replace("\\", "_")

def cache_segments_full(
    seg_index_csv: str,
    out_root: str = r"D:\Datasets\WOD_E2E_Camera_v1\val_cache",
    seg_ids: Optional[List[str]] = None,
    max_segments: Optional[int] = None,
    skip_existing: bool = True,
    log_every: int = 1,
) -> list[Path]:
    """Cache the raw TFRecord payloads of each segment into one pickle per segment.

    For every selected segment the index rows are sorted by
    ``(frame_id, record_idx)`` (rows whose ``frame_id`` is ``None`` sort last),
    the referenced records are read from their TFRecord files, and a dict with
    the keys ``seg_id``, ``scenario_cluster``, ``rows``, ``frame_ids`` and
    ``e2e_bytes`` is pickled to ``<out_root>/<scenario_cluster>_<seg_id>.pkl``.

    The pickle is written to a temporary sibling file and renamed into place,
    so an interrupted run cannot leave a truncated ``.pkl`` that a later call
    with ``skip_existing=True`` would mistake for a finished cache entry.

    Args:
        seg_index_csv: Path handed to ``read_seg_index_enriched``; its result
            is used as a mapping from segment id to a list of row dicts.
        out_root: Output directory. It is passed through ``win_to_wsl_path``
            and created if it does not exist.
        seg_ids: Segment ids to cache, in the given order. ``None`` selects
            every key of the index.
        max_segments: If not ``None``, only the first ``max_segments`` ids are
            processed.
        skip_existing: If true, a segment whose pickle already exists is not
            read again; its path is still included in the result.
        log_every: Print a progress line every ``log_every`` processed
            segments; a falsy value disables progress output.

    Returns:
        The pickle paths, one per processed segment, in processing order.

    Raises:
        KeyError: If a requested segment id is not present in the index.
        RuntimeError: If a referenced record is not found in its TFRecord file.
    """
    out_dir = Path(win_to_wsl_path(out_root))
    out_dir.mkdir(parents=True, exist_ok=True)

    idx = read_seg_index_enriched(seg_index_csv)
    seg_id_list = seg_ids if seg_ids is not None else list(idx.keys())
    if max_segments is not None:
        seg_id_list = seg_id_list[:max_segments]

    total = len(seg_id_list)
    saved: list[Path] = []

    def _record_done(path: Path) -> None:
        # Register one finished segment and emit the periodic progress line.
        saved.append(path)
        done = len(saved)
        if log_every and done % log_every == 0:
            print(f"[cache] total={total}, done={done}, remaining={total - done}")

    for seg_id in seg_id_list:
        rows = idx[seg_id]
        rows_sorted = sorted(
            rows,
            key=lambda r: (r.get("frame_id") if r.get("frame_id") is not None else 1e9, r.get("record_idx", 0)),
        )

        scenario_cluster = rows_sorted[0].get("scenario_cluster", "UNKNOWN")
        fname = f"{_sanitize_name(scenario_cluster)}_{_sanitize_name(seg_id)}.pkl"
        out_path = out_dir / fname

        if skip_existing and out_path.exists():
            _record_done(out_path)
            continue

        # One (tfrecord_file, record_idx) key per row, in sorted row order.
        record_keys = [(r["tfrecord_file"], int(r["record_idx"])) for r in rows_sorted]

        # Group the wanted record indices by TFRecord file so each file is
        # scanned at most once per segment.
        by_file = {}
        for tfp, record_idx in record_keys:
            by_file.setdefault(tfp, set()).add(record_idx)

        e2e_bytes_by_key = {}
        for tfp, needed in by_file.items():
            ds = tf.data.TFRecordDataset(tfp, compression_type="")
            found = 0
            for i, rec in enumerate(ds):
                if i in needed:
                    e2e_bytes_by_key[(tfp, i)] = bytes(rec.numpy())
                    found += 1
                    # Stop scanning as soon as every wanted record was seen.
                    if found == len(needed):
                        break

        missing = [key for key in record_keys if key not in e2e_bytes_by_key]
        if missing:
            raise RuntimeError(f"missing records for seg_id={seg_id}: {len(missing)}")

        cache_obj = {
            "seg_id": seg_id,
            "scenario_cluster": scenario_cluster,
            "rows": rows_sorted,
            "frame_ids": [r.get("frame_id") for r in rows_sorted],
            "e2e_bytes": [e2e_bytes_by_key[key] for key in record_keys],
        }

        # Write to a temporary file first and rename it into place afterwards;
        # the final path only ever holds a complete pickle.
        tmp_path = out_path.with_name(out_path.name + ".tmp")
        try:
            with tmp_path.open("wb") as f:
                pickle.dump(cache_obj, f, protocol=pickle.HIGHEST_PROTOCOL)
            tmp_path.replace(out_path)
        except BaseException:
            # Also covers KeyboardInterrupt: drop the partial file, then re-raise.
            tmp_path.unlink(missing_ok=True)
            raise

        _record_done(out_path)

    return saved


2026-01-22 17:40:56.236350: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-22 17:40:56.237561: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2026-01-22 17:40:56.258039: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2026-01-22 17:40:56.259249: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# # Cache a single segment only
# saved = cache_segments_full("seg_index_val_enriched.csv", seg_ids=["801a330a4f575841d82bd5200e2c4fb2"])
# print(saved)

# Or cache the first N segments
# saved = cache_segments_full("seg_index_val_enriched.csv", max_segments=10)

# # Cache everything
# saved = cache_segments_full("seg_index_val_enriched.csv")
# print("cached segments:", len(saved))

# Cache every segment in the index, reusing pickles that already exist on disk.
saved = cache_segments_full(seg_index_csv="seg_index_val_enriched.csv", skip_existing=True)


[cache] total=479, done=1, remaining=478
[cache] total=479, done=2, remaining=477
[cache] total=479, done=3, remaining=476
[cache] total=479, done=4, remaining=475
[cache] total=479, done=5, remaining=474
[cache] total=479, done=6, remaining=473
[cache] total=479, done=7, remaining=472
[cache] total=479, done=8, remaining=471
[cache] total=479, done=9, remaining=470
[cache] total=479, done=10, remaining=469
[cache] total=479, done=11, remaining=468
[cache] total=479, done=12, remaining=467
[cache] total=479, done=13, remaining=466
[cache] total=479, done=14, remaining=465
[cache] total=479, done=15, remaining=464
[cache] total=479, done=16, remaining=463
[cache] total=479, done=17, remaining=462
[cache] total=479, done=18, remaining=461
[cache] total=479, done=19, remaining=460
[cache] total=479, done=20, remaining=459
[cache] total=479, done=21, remaining=458
[cache] total=479, done=22, remaining=457
[cache] total=479, done=23, remaining=456
[cache] total=479, done=24, remaining=455
[

2026-01-22 17:41:03.212502: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2026-01-22 17:41:03.212744: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


[cache] total=479, done=190, remaining=289
[cache] total=479, done=191, remaining=288
[cache] total=479, done=192, remaining=287
[cache] total=479, done=193, remaining=286
[cache] total=479, done=194, remaining=285
[cache] total=479, done=195, remaining=284
[cache] total=479, done=196, remaining=283
[cache] total=479, done=197, remaining=282
[cache] total=479, done=198, remaining=281
[cache] total=479, done=199, remaining=280
[cache] total=479, done=200, remaining=279
[cache] total=479, done=201, remaining=278
[cache] total=479, done=202, remaining=277
[cache] total=479, done=203, remaining=276
[cache] total=479, done=204, remaining=275
[cache] total=479, done=205, remaining=274
[cache] total=479, done=206, remaining=273
[cache] total=479, done=207, remaining=272
[cache] total=479, done=208, remaining=271
[cache] total=479, done=209, remaining=270
[cache] total=479, done=210, remaining=269
[cache] total=479, done=211, remaining=268
[cache] total=479, done=212, remaining=267
[cache] tot