In [1]:
# Cell 1: Imports and Constants
import shutil
from pathlib import Path
import pandas as pd
from tqdm import tqdm

# Columns read from the annotations CSV (passed to pd.read_csv as `usecols`).
COLUMNS = [
    "study_id",
    "series_id",
    "image_id",
    "laterality",
    "view_position",
    "height",
    "width",
    "breast_birads",
    "breast_density",
    "split",
]

# Single place to point the notebook at the dataset: every path below is
# derived from this root, so relocating the archive is a one-line change.
DATA_ROOT = Path("/home/pranaypalem/Downloads/VinDr_png_archive")

CSV_PATH = DATA_ROOT / "breast-level_annotations.csv"  # annotation table
IMAGES_ROOT = DATA_ROOT / "images_png"                 # source images, one sub-folder per study_id
OUT_ROOT = DATA_ROOT / "split_images"                  # destination, one sub-folder per split


In [2]:
# Cell 2: Read the annotation table (expected columns only) and preview it
df = pd.read_csv(filepath_or_buffer=CSV_PATH, usecols=COLUMNS)
print("Columns found:", df.columns.tolist())
df.head(5)


Columns found: ['study_id', 'series_id', 'image_id', 'laterality', 'view_position', 'height', 'width', 'breast_birads', 'breast_density', 'split']


Unnamed: 0,study_id,series_id,image_id,laterality,view_position,height,width,breast_birads,breast_density,split
0,b8d273e8601f348d3664778dae0e7e0b,b36517b9cbbcfd286a7ae04f643af97a,d8125545210c08e1b1793a5af6458ee2,L,CC,3518,2800,BI-RADS 2,DENSITY C,training
1,b8d273e8601f348d3664778dae0e7e0b,b36517b9cbbcfd286a7ae04f643af97a,290c658f4e75a3f83ec78a847414297c,L,MLO,3518,2800,BI-RADS 2,DENSITY C,training
2,b8d273e8601f348d3664778dae0e7e0b,b36517b9cbbcfd286a7ae04f643af97a,cd0fc7bc53ac632a11643ac4cc91002a,R,CC,3518,2800,BI-RADS 2,DENSITY C,training
3,b8d273e8601f348d3664778dae0e7e0b,b36517b9cbbcfd286a7ae04f643af97a,71638b1e853799f227492bfb08a01491,R,MLO,3518,2800,BI-RADS 2,DENSITY C,training
4,8269f5971eaca3e5d3772d1796e6bd7a,d931832a0815df082c085b6e09d20aac,dd9ce3288c0773e006a294188aadba8e,L,CC,3518,2800,BI-RADS 1,DENSITY C,training


In [3]:
# Cell 3: Define the function to resolve image paths
def resolve_source_path(images_root: Path, study_id: str, image_id: str) -> Path | None:
    """
    Try a few sensible filename patterns to find the source image.
    """
    p = images_root / study_id / image_id
    if p.exists():
        return p

    p_png = (images_root / study_id / image_id).with_suffix(".png")
    if p_png.exists():
        return p_png

    candidates = list((images_root / study_id).glob(f"{image_id}.*"))
    if candidates:
        return candidates[0]

    return None


In [4]:
# Cell 4: Create split folders and copy images
#
# For each distinct value in the "split" column, create OUT_ROOT/<split>/ and
# copy every row's image into it as "<study_id>_<source file name>".
# Rows whose image cannot be found, or whose copy fails, are logged and
# skipped; a tally is printed at the end so a partial copy is visible.
OUT_ROOT.mkdir(parents=True, exist_ok=True)

# groupby() drops rows whose key is NaN, so report them here rather than
# letting them disappear without a trace.
n_unassigned = int(df["split"].isna().sum())
if n_unassigned:
    print(f"WARNING: {n_unassigned} rows have no split value and are skipped")

copied = missing = failed = 0
for split_name, group in df.groupby("split"):
    dest_dir = OUT_ROOT / str(split_name)
    dest_dir.mkdir(parents=True, exist_ok=True)

    print(f"\nCopying {len(group)} images to: {dest_dir}")
    for row in tqdm(group.itertuples(index=False), total=len(group), desc=str(split_name)):
        # str() guards against pandas having parsed an id column as non-string.
        study_id = str(row.study_id)
        image_id = str(row.image_id)

        src = resolve_source_path(IMAGES_ROOT, study_id, image_id)
        if src is None:
            missing += 1
            # tqdm.write keeps the message from breaking the progress bar.
            tqdm.write(f"WARNING: Missing image for study_id={study_id}, image_id={image_id}")
            continue

        dst = dest_dir / f"{study_id}_{src.name}"
        try:
            shutil.copy2(src, dst)
        except Exception as e:
            # Best effort: record the failure and carry on with the next image.
            failed += 1
            tqdm.write(f"ERROR copying {src} -> {dst}: {e}")
        else:
            copied += 1

print(f"\nDone. Copied: {copied}, missing: {missing}, failed: {failed}.")



Copying 4000 images to: /home/pranaypalem/Downloads/VinDr_png_archive/split_images/test


test: 100%|██████████| 4000/4000 [00:02<00:00, 1410.48it/s]



Copying 16000 images to: /home/pranaypalem/Downloads/VinDr_png_archive/split_images/training


training: 100%|██████████| 16000/16000 [00:11<00:00, 1398.26it/s]


Done.



