In [2]:
import os, bz2, json, random, _pickle as pkl
from pathlib import Path
from typing   import Tuple, Iterator
import numpy as np
import pandas as pd
import tifffile as tiff
from skimage.measure import label, regionprops
from tqdm import tqdm

# ───────────────────────────── PARAMETERS ─────────────────────────────
SEED          = 42          # fixes the train/test split
TEST_RATIO    = 0.20        # 20 % of the source volumes → test
PATCH_SIZE    = (12, 256, 256)   # (Z, Y, X)!
# Step between patch origins, (Z, Y, X): half a patch along Z, a full patch along Y and X.
STRIDE        = (6, 256, 256)
MAX_INST      = 20          # MAX_GT_INSTANCES in the config
PICKLE_PROTO  = 4           # compatible with Python 3.6
# Running maximum of instance channels seen in a single patch; updated by the main loop.
GLOBAL_MAX    = 0
# Source folders holding the image volumes and their mask volumes (*.tif).
SRC_IMG = Path("/NAS/mmaiurov/Datasets/Hela_MRC/images")
SRC_MSK = Path("/NAS/mmaiurov/Datasets/Hela_MRC/masks")

# Root folder that receives the per-split patch folders and the CSV indices.
DST_ROOT = Path("/NAS/mmaiurov/Datasets/hela_mrc_patches256")
# ─────────────────────────────────────────────────────────────────────


# --- helpers ---------------------------------------------------------
def extract_patches(vol: np.ndarray,
                    ps: Tuple[int,int,int],
                    st: Tuple[int,int,int]) -> Iterator[Tuple[np.ndarray,Tuple[int,int,int]]]:
    """Yield full-size sliding-window patches of a 3-D volume.

    Args:
        vol: 3-D array indexed as (Z, Y, X).
        ps:  patch size (dz, dy, dx); any length-3 sequence is accepted.
        st:  stride (sz, sy, sx) between consecutive patch origins.

    Yields:
        ``(patch, (z0, y0, x0))`` where ``patch`` is a view into ``vol`` of
        exactly the requested size and the tuple is its origin.

    Only windows that fit entirely inside the volume are produced: a tail that
    is not reachable with the given stride is dropped, and a volume smaller
    than the patch along any axis yields nothing.
    """
    # Unpacking works for lists/arrays as well as tuples, so a patch size that
    # arrives as a list no longer silently produces zero patches.
    dz, dy, dx = ps
    sz, sy, sx = st
    Z, Y, X    = vol.shape

    # A volume that cannot hold a single full patch produces no output.
    if Z < dz or Y < dy or X < dx:
        return

    # Every origin below satisfies origin + size <= extent, so each slice has
    # the full patch shape and no per-patch shape test is needed.
    for z0 in range(0, Z - dz + 1, sz):
        for y0 in range(0, Y - dy + 1, sy):
            for x0 in range(0, X - dx + 1, sx):
                yield vol[z0:z0+dz, y0:y0+dy, x0:x0+dx], (z0, y0, x0)


def instance_masks_from_semantic(seg_patch: np.ndarray) -> np.ndarray:
    """Split the foreground of a semantic patch into per-instance masks.

    Foreground is every voxel with a value > 0 (class ids are not
    distinguished). It is split into connected components with
    ``connectivity=1``; at most ``MAX_INST`` components, taken in ascending
    label order, are kept.

    Returns:
        ``uint8`` array of shape ``seg_patch.shape + (N,)`` holding 0/1, one
        channel per kept component; ``N`` is 0 when there is no foreground.
    """
    components = label(seg_patch > 0, connectivity=1)

    # Component ids present in the patch, without background, capped at MAX_INST.
    kept_ids = np.unique(components)
    kept_ids = kept_ids[kept_ids != 0][:MAX_INST]

    # Comparing against the id vector along a new trailing axis builds every
    # channel at once; an empty id vector naturally gives a (..., 0) result.
    return (components[..., np.newaxis] == kept_ids).astype(np.uint8)


def bboxes_from_masks(masks: np.ndarray) -> list[str]:
    """Describe each instance channel by its axis-aligned 3-D bounding box.

    Args:
        masks: array of shape (Z, Y, X, N); channel ``k`` is a binary (0/1)
            mask of instance ``k``.

    Returns:
        One line per non-empty channel, formatted as
        ``'1 z0 y0 x0 z1 y1 x1'``: the class id is always 1, the minimum
        corner is inclusive and the maximum corner is exclusive.

    Channels with no foreground voxel are skipped instead of raising, so the
    result may contain fewer than N lines.
    """
    if masks.size == 0:
        return []

    lines = []
    for k in range(masks.shape[-1]):
        zs, ys, xs = np.nonzero(masks[..., k])
        if zs.size == 0:
            # An all-zero channel has no box to report.
            continue
        # Half-open extents: max index + 1 on every axis.
        z0, y0, x0 = int(zs.min()), int(ys.min()), int(xs.min())
        z1, y1, x1 = int(zs.max()) + 1, int(ys.max()) + 1, int(xs.max()) + 1
        lines.append(f"1 {z0} {y0} {x0} {z1} {y1} {x1}")
    return lines


def save_pickle(arr: np.ndarray, path: Path):
    """Pickle ``arr`` into a bz2-compressed file named ``<path>.pickle``.

    Args:
        arr:  object to serialise (pickle protocol ``PICKLE_PROTO``).
        path: destination without the extension; a path that already ends in
            ``.pickle`` is used as is.

    The extension is appended to the file name rather than substituted for an
    existing suffix, so a name containing dots (e.g. ``vol_1.5um_p0000_image``)
    is not truncated at its last dot and matches the ``f"{name}.pickle"`` paths
    written elsewhere. Missing parent directories are created.
    """
    if path.suffix == ".pickle":
        target = path
    else:
        target = path.with_name(path.name + ".pickle")

    target.parent.mkdir(parents=True, exist_ok=True)
    # The file is bz2-compressed despite the plain ".pickle" extension.
    with bz2.BZ2File(str(target), "wb") as fh:
        pkl.dump(arr, fh, protocol=PICKLE_PROTO)


def write_csv(rows: list[dict], dst: Path):
    """Write ``rows`` (one dict per record) to ``dst`` as CSV without an index column.

    All missing parent directories of ``dst`` are created first, matching the
    behaviour of ``save_pickle``.
    """
    dst.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(rows).to_csv(dst, index=False)


# --- train / test split ---------------------------------------------
# Images and masks are paired purely by their position in the sorted listings;
# file names are not compared, only the counts are checked.
img_files = sorted(SRC_IMG.glob("*.tif"))
msk_files = sorted(SRC_MSK.glob("*.tif"))
assert len(img_files) == len(msk_files) > 0, "Не найдены парные TIF-ы"

pairs = list(zip(img_files, msk_files))
# Seed the global RNG immediately before shuffling so the split is reproducible.
random.seed(SEED)
random.shuffle(pairs)

# Whole volumes (not patches) are split: the first (1 - TEST_RATIO) share of the
# shuffled list goes to train and the rest to test; int() truncates, so any
# remainder ends up in test.
split_idx      = int(len(pairs) * (1 - TEST_RATIO))
split_to_pairs = {"train": pairs[:split_idx],
                  "test":  pairs[split_idx:]}

print(f"Исходных томов: train={len(split_to_pairs['train'])}, test={len(split_to_pairs['test'])}")

# --- main loop -------------------------------------------------------
# For every split, cut each image/mask volume into patches and write, per patch:
# debug TIFFs, a normalised image pickle, an instance-mask pickle and a .dat
# file with bounding boxes. One CSV index per split is written at the end.
for split, split_pairs in split_to_pairs.items():   # distinct name: keeps the full `pairs` list intact
    img_out  = DST_ROOT / split / "images"
    seg_out  = DST_ROOT / split / "seg"
    cab_out  = DST_ROOT / split / "classes_and_boxes"
    ipkl_out = DST_ROOT / split / "img_pickle"
    mpkl_out = DST_ROOT / split / "masks"

    for p in (img_out, seg_out, cab_out, ipkl_out, mpkl_out):
        p.mkdir(parents=True, exist_ok=True)

    csv_rows = []

    for img_path, msk_path in tqdm(split_pairs, desc=f"{split}: патчи"):
        base_name = img_path.stem           # without .tif
        img_vol   = tiff.imread(img_path)
        msk_vol   = tiff.imread(msk_path)

        # The two patch generators are zipped by position, so the volumes must
        # have identical shapes; otherwise image and mask patches would be
        # silently misaligned or truncated.
        if img_vol.shape != msk_vol.shape:
            raise ValueError(
                f"{base_name}: image shape {img_vol.shape} != mask shape {msk_vol.shape} "
                f"({img_path} vs {msk_path})"
            )

        for idx, ((i_patch,_), (s_patch,_)) in enumerate(
                zip(extract_patches(img_vol, PATCH_SIZE, STRIDE),
                    extract_patches(msk_vol, PATCH_SIZE, STRIDE))):

            p_tag = f"{base_name}_p{idx:04d}"
            # --- save raw tiff (for debugging) ------------------------
            tiff.imwrite(img_out / f"{p_tag}_image.tiff", i_patch, dtype=img_vol.dtype)
            tiff.imwrite(seg_out / f"{p_tag}_seg.tiff",   s_patch, dtype=msk_vol.dtype)

            # --- pickle image ----------------------------------------
            # Maps [0, 255] to [-1, 1].
            # NOTE(review): assumes 8-bit intensities — confirm the image dtype.
            img_float = 2 * (i_patch.astype(np.float32) / 255.) - 1.
            save_pickle(img_float, ipkl_out / f"{p_tag}_image")

            # --- instance masks & pickle -----------------------------
            m_inst = instance_masks_from_semantic(s_patch)
            inst_cnt = m_inst.shape[-1]
            # NOTE(review): m_inst holds at most MAX_INST channels, so this
            # statistic saturates at MAX_INST and cannot reveal larger counts.
            GLOBAL_MAX = max(GLOBAL_MAX, inst_cnt)
            save_pickle(m_inst, mpkl_out / f"{p_tag}_seg")

            # --- classes_and_boxes (.dat) ----------------------------
            # The same box list is stored under both the _seg and _image names.
            lines = bboxes_from_masks(m_inst)
            dat_text = "\n".join(lines)
            for suff in ("_seg.dat", "_image.dat"):
                (cab_out / f"{p_tag}{suff}").write_text(dat_text)

            # --- row for CSV -----------------------------------------
            csv_rows.append({
                "names":  p_tag,
                "images": str(ipkl_out / f"{p_tag}_image.pickle"),
                "segs":   str(seg_out  / f"{p_tag}_seg.tiff"),
                "cabs":   str(cab_out  / f"{p_tag}_image.dat"),
                "masks":  str(mpkl_out / f"{p_tag}_seg.pickle")
            })

    write_csv(csv_rows, DST_ROOT / "datasets" / f"{split}.csv")
    print(f"✓ {split}.csv   ({len(csv_rows)} патчей)")

print("Готово — все патчи, pickle и CSV созданы.")
print(f"\n★ Максимум объектов в одном патче = {GLOBAL_MAX}")


Исходных томов: train=4, test=2


train: патчи: 100%|██████████| 4/4 [12:43<00:00, 190.81s/it]


✓ train.csv   (2176 патчей)


test: патчи: 100%|██████████| 2/2 [03:59<00:00, 119.76s/it]

✓ test.csv   (896 патчей)
Готово — все патчи, pickle и CSV созданы.

★ Максимум объектов в одном патче = 20





In [4]:
# Rebuild the per-split CSV indices from the patch TIFFs already on disk.
# Unlike the CSVs written by the patch-generation cell, "names" and "images"
# both hold the full path of the image TIFF here.
DST_ROOT = Path("/NAS/mmaiurov/Datasets/hela_mrc_patches256")
OUT_DIR = DST_ROOT / "datasets"
OUT_DIR.mkdir(exist_ok=True)

for split in ("train", "test"):
    img_dir = DST_ROOT / split / "images"
    seg_dir = DST_ROOT / split / "seg"      # segmentation TIFFs live here, not under images/
    cab_dir = DST_ROOT / split / "classes_and_boxes"
    mask_dir = DST_ROOT / split / "masks"

    rows = []
    for tiff_path in sorted(img_dir.glob("*_image.tiff")):
        # Strip the marker only at the end of the name to recover the patch tag.
        base = tiff_path.name.removesuffix("_image.tiff")
        image_tiff = str(img_dir / f"{base}_image.tiff")
        rows.append({
            "names":  image_tiff,
            "images": image_tiff,
            "segs":   str(seg_dir  / f"{base}_seg.tiff"),
            "cabs":   str(cab_dir  / f"{base}_image.dat"),
            "masks":  str(mask_dir / f"{base}_seg.pickle"),
        })

    df = pd.DataFrame(rows)
    df.to_csv(OUT_DIR / f"{split}.csv", index=False)
    print(f"✓ Создано: {split}.csv ({len(rows)} строк)")

✓ Создано: train.csv (2176 строк)
✓ Создано: test.csv (896 строк)
