In [8]:
import os
from pathlib import Path
import pandas as pd
import pydicom
import dicom2nifti
from tqdm import tqdm

# Set paths
BASE_DIR = Path("data/TCGA-SARC").resolve()
OUTPUT_DIR = BASE_DIR.parent / "NIfTI"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Step 1: Traverse directory structure
print("[INFO] Scanning folder structure...")
all_dirs = [Path(dp) for dp, _, _ in os.walk(BASE_DIR)]
df_structure = pd.DataFrame({'path': all_dirs})
print(f"[INFO] {len(df_structure)} folders found.")

# Step 2: Filter folders that contain valid DICOM series
valid_series = []

print("[INFO] Checking for valid DICOM series...")
for folder in tqdm(df_structure['path']):
    try:
        dicom_files = [f for f in os.listdir(folder) if f.lower().endswith(".dcm")]
        if not dicom_files:
            continue

        # Check first file to confirm it's a valid DICOM with SeriesInstanceUID
        ds = pydicom.dcmread(folder / dicom_files[0], stop_before_pixels=True)
        if hasattr(ds, "SeriesInstanceUID"):
            valid_series.append(folder)
    except Exception:
        continue

print(f"[INFO] {len(valid_series)} valid DICOM series found.")

# Step 3: Convert each valid series to NIfTI
print("[INFO] Converting to NIfTI...")
for series_path in tqdm(valid_series, desc="Converting"):
    try:
        patient_id = series_path.parts[-1]
        output_path = OUTPUT_DIR / f"{patient_id}.nii.gz"
        dicom2nifti.convert_directory(str(series_path), str(OUTPUT_DIR), reorient=True)
    except Exception as e:
        print(f"[WARNING] Failed to convert {series_path}: {e}")

print("[INFO] Conversion process completed.")


[INFO] Scanning folder structure...
[INFO] 0 folders found.
[INFO] Checking for valid DICOM series...


0it [00:00, ?it/s]


[INFO] 0 valid DICOM series found.
[INFO] Converting to NIfTI...


Converting: 0it [00:00, ?it/s]

[INFO] Conversion process completed.



