In [6]:
import os
import re

# Path to your main folder
root_dir = "Breast-diagnosis/manifest-BbshIhaG7188578559074019493/BREAST-DIAGNOSIS"

# Keywords to identify MRI protocols (matched as substrings of the upper-cased name).
# NOTE(review): short keywords such as "NA" and "AX" also match inside longer
# words (e.g. "CORONAL", "MAX") - confirm that this over-matching is intended.
MRI_KEYWORDS = ["MRI", "T2", "STIR", "BLISS", "AX", "SENSE", "NA"]

# File that receives the sorted, de-duplicated names (one per line)
output_path = "mri_protocols.txt"


def clean_protocol_name(name):
    """Normalize a folder/file name into a protocol label.

    The name is upper-cased, a leading date prefix such as ``04-12-2008-`` is
    removed, every character that is not alphanumeric, dash, underscore or
    space is dropped, and surrounding whitespace is stripped.

    Args:
        name: Raw directory or file name.

    Returns:
        The cleaned, upper-case label.
    """
    without_date = re.sub(r"^\d{2}-\d{2}-\d{4}-", "", name.upper())
    kept_chars = "".join(c for c in without_date if c.isalnum() or c in "-_ ")
    return kept_chars.strip()


def find_mri_protocols(root_dir, keywords):
    """Collect cleaned names of all entries under ``root_dir`` matching a keyword.

    Every directory name and file name yielded by ``os.walk`` is examined; an
    entry matches when any keyword is a substring of its upper-cased name.

    Args:
        root_dir: Folder to walk recursively.
        keywords: Upper-case substrings that mark a name as MRI-related.

    Returns:
        A set of cleaned names (see ``clean_protocol_name``).
    """
    protocols = set()
    for _dirpath, dirnames, filenames in os.walk(root_dir):
        for name in dirnames + filenames:
            name_upper = name.upper()
            if any(keyword in name_upper for keyword in keywords):
                protocols.add(clean_protocol_name(name))
    return protocols


if os.path.isdir(root_dir):
    mri_protocols = find_mri_protocols(root_dir, MRI_KEYWORDS)

    # Explicit UTF-8: isalnum() keeps non-ASCII letters, which a platform
    # default encoding may be unable to write.
    with open(output_path, "w", encoding="utf-8") as f:
        for protocol in sorted(mri_protocols):
            f.write(protocol + "\n")

    print(f"Extracted {len(mri_protocols)} unique MRI protocols. Saved to {output_path}")
else:
    # os.walk yields nothing for a missing folder; report it instead of
    # silently overwriting an earlier result file with an empty one.
    mri_protocols = set()
    print(f"Dataset folder not found: {root_dir!r}. Nothing scanned; {output_path} was not written.")



Extracted 491 unique MRI protocols. Saved to mri_protocols.txt


In [7]:
import os
import re
from collections import defaultdict

# Path to your main folder
root_dir = "Breast-diagnosis/manifest-BbshIhaG7188578559074019493/BREAST-DIAGNOSIS"

# Keywords to identify MRI protocols (matched as substrings of the upper-cased name).
# NOTE(review): short keywords such as "NA" and "AX" also match inside longer
# words (e.g. "CORONAL", "MAX"), which inflates their counts - confirm intent.
MRI_KEYWORDS = ["MRI", "T2", "STIR", "BLISS", "AX", "SENSE", "NA"]

# File that receives the sorted, de-duplicated names (one per line)
output_path = "mri_protocols.txt"


def scan_mri_names(root_dir, keywords):
    """Walk ``root_dir`` and gather MRI-related names plus per-keyword hit counts.

    Every directory and file name yielded by ``os.walk`` is upper-cased and
    tested against each keyword by substring match. For a matching name:

    * a cleaned version is added to the result set (leading ``MM-DD-YYYY-``
      date removed; only alphanumerics, dashes, underscores and spaces kept;
      surrounding whitespace stripped);
    * the counter of every keyword contained in the name is incremented, so a
      single name can contribute to several keywords.

    Args:
        root_dir: Folder to walk recursively.
        keywords: Upper-case substrings that mark a name as MRI-related.

    Returns:
        ``(protocols, counts)`` where ``protocols`` is a set of cleaned names
        and ``counts`` is a ``defaultdict(int)`` mapping keyword to the number
        of names that contained it.
    """
    protocols = set()
    counts = defaultdict(int)

    for _dirpath, dirnames, filenames in os.walk(root_dir):
        for name in dirnames + filenames:
            name_upper = name.upper()

            # Evaluate the keywords once; reused for both the filter and the tally.
            matched = [keyword for keyword in keywords if keyword in name_upper]
            if not matched:
                continue

            # Remove leading date like 04-12-2008-
            cleaned_name = re.sub(r"^\d{2}-\d{2}-\d{4}-", "", name_upper)

            # Normalize name: keep alphanumerics, dashes, underscores and spaces
            cleaned_name = "".join(c for c in cleaned_name if c.isalnum() or c in "-_ ")

            protocols.add(cleaned_name.strip())

            for keyword in matched:
                counts[keyword] += 1

    return protocols, counts


if os.path.isdir(root_dir):
    mri_protocols, keyword_counts = scan_mri_names(root_dir, MRI_KEYWORDS)

    # Explicit UTF-8: isalnum() keeps non-ASCII letters, which a platform
    # default encoding may be unable to write.
    with open(output_path, "w", encoding="utf-8") as f:
        for protocol in sorted(mri_protocols):
            f.write(protocol + "\n")
else:
    # os.walk yields nothing for a missing folder; report it instead of
    # silently overwriting an earlier result file with an empty one.
    mri_protocols, keyword_counts = set(), defaultdict(int)
    print(f"Dataset folder not found: {root_dir!r}. Nothing scanned; {output_path} was not written.")

# Print results
print(f"\n✅ Total unique MRI protocols found: {len(mri_protocols)}\n")

print("📊 Keyword Occurrence Counts:")
for keyword in MRI_KEYWORDS:
    print(f"  {keyword}: {keyword_counts[keyword]}")



✅ Total unique MRI protocols found: 491

📊 Keyword Occurrence Counts:
  MRI: 114
  T2: 120
  STIR: 108
  BLISS: 119
  AX: 136
  SENSE: 317
  NA: 152
