In [None]:
import zipfile
from pathlib import Path
import os

In [5]:
def extract_mprage_files(zip_path, target_folder):
    """
    Extract all MPRAGE files from a zip archive, maintaining subject structure.

    A member is selected when its archive path contains 'MPRAGE' or
    'MP-RAGE' and at least one path component contains '_S_' (the subject
    ID, e.g. "133_S_0525").  The first path component (the archive root,
    e.g. "ADNI") is dropped and the rest of the path is recreated below
    ``target_folder``.

    Args:
        zip_path: Path of the zip archive to read.
        target_folder: Directory that receives the files.  It is created,
            including missing parent directories, when it does not exist.

    Returns:
        Number of files written to ``target_folder``.

    Failures on individual members are printed and skipped so that one bad
    member does not abort the whole extraction.
    """
    import shutil
    import zipfile
    from pathlib import Path

    # Create target folder (parents too, so a fresh nested path works)
    target_base = Path(target_folder)
    target_base.mkdir(parents=True, exist_ok=True)
    resolved_base = target_base.resolve()

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Find all MPRAGE-related items (files and directory entries alike)
        mprage_items = [
            info for info in zip_ref.infolist()
            if 'MPRAGE' in info.filename or 'MP-RAGE' in info.filename
        ]

        print(f"🔍 Found {len(mprage_items)} MPRAGE items")

        extracted_count = 0
        for info in mprage_items:
            item = info.filename

            # Directory entries carry no data.  Writing them with open()
            # would create a regular file where a folder is needed and make
            # every later member below it fail.
            if info.is_dir():
                continue

            # Parse the path: ADNI/133_S_0525/MPRAGE/...
            parts = item.split('/')

            # Only keep members that live under a subject ID folder/name
            if not any('_S_' in part for part in parts):
                continue

            # Drop the first component (the archive root) and keep the rest
            relative_parts = [p for p in parts[1:] if p not in ('', '.')]
            if not relative_parts:
                print(f"⚠️ Skipping {item}: no path below the archive root")
                continue

            relative_path = '/'.join(relative_parts)
            target_path = target_base.joinpath(*relative_parts)

            # Refuse members such as "a/../../x" that would land outside
            # the target folder.
            try:
                target_path.resolve().relative_to(resolved_base)
            except ValueError:
                print(f"⚠️ Skipping {item}: path escapes the target folder")
                continue

            try:
                target_path.parent.mkdir(parents=True, exist_ok=True)

                # Stream the member to disk instead of loading it in memory
                with zip_ref.open(info) as source, open(target_path, 'wb') as target:
                    shutil.copyfileobj(source, target)
            except (OSError, RuntimeError, zipfile.BadZipFile) as e:
                print(f"❌ Error extracting {item}: {e}")
                continue

            extracted_count += 1
            if extracted_count <= 10:  # Show first 10
                print(f"   📁 Extracted: {relative_path}")

        print(f"✅ Successfully extracted {extracted_count} MPRAGE files")
        return extracted_count

# Usage: extract every MPRAGE member of MRI_3.zip into the MRI_2 folder.
# NOTE: this runs as soon as the cell/module executes and uses
# machine-specific absolute paths.
zip_path = r"D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\MRI\MRI_3.zip"
target_folder = r"D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\MRI\MRI_2"

print(f"🚀 Extracting MPRAGE files from: {zip_path}")
print(f"📁 Target folder: {target_folder}")

extracted_count = extract_mprage_files(zip_path, target_folder)

# extracted_count is the number of files actually written, not the number
# of matching archive members, so report it as "Extracted".
print(f"\n🎯 Extraction complete! Extracted {extracted_count} MPRAGE files")
print(f"📁 Files extracted to: {target_folder}")

🚀 Extracting MPRAGE files from: D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\MRI\MRI_3.zip
📁 Target folder: D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\MRI\MRI_2
🔍 Found 35447 MPRAGE items
   📁 Extracted: 023_S_0031/MPRAGE_Repeat/2005-10-28_12_35_33.0/I8299/ADNI_023_S_0031_MR_MPRAGE_Repeat__br_raw_20051030161558586_1_S10009_I8299.dcm
   📁 Extracted: 023_S_0031/MPRAGE_Repeat/2005-10-28_12_35_33.0/I8299/ADNI_023_S_0031_MR_MPRAGE_Repeat__br_raw_20051030161558883_10_S10009_I8299.dcm
   📁 Extracted: 023_S_0031/MPRAGE_Repeat/2005-10-28_12_35_33.0/I8299/ADNI_023_S_0031_MR_MPRAGE_Repeat__br_raw_20051030161559164_100_S10009_I8299.dcm
   📁 Extracted: 023_S_0031/MPRAGE_Repeat/2005-10-28_12_35_33.0/I8299/ADNI_023_S_0031_MR_MPRAGE_Repeat__br_raw_20051030161559461_101_S10009_I8299.dcm
   📁 Extracted: 023_S_0031/MPRAGE_Repeat/2005-10-28_12_35_33.0/I8299/ADNI_023_S_0031_MR_MPRAGE_Repeat__br_raw_20051030161559961_102_S10009_I8299.dcm
   📁 Extracted: 023_

In [3]:
def count_mprage_spellings(target_folder):
    """
    Count different spellings of MPRAGE among the folder names below a path.

    Every directory below ``target_folder`` whose name contains 'MPRAGE',
    'MP-RAGE' or 'MP_RAGE' (compared case-insensitively) is grouped by its
    exact folder name.  A report is printed to stdout.

    Args:
        target_folder: Root directory to scan recursively.

    Returns:
        ``None`` when ``target_folder`` does not exist.  Otherwise a dict
        mapping each exact folder name to a dict with the keys
        ``'count'`` (number of folders with that name), ``'examples'``
        (the folder name repeated once per hit) and ``'paths'`` (each
        hit's path relative to ``target_folder``, as a string).  The dict
        is empty when nothing matches.
    """
    target_base = Path(target_folder)

    if not target_base.exists():
        print(f"❌ Target folder not found: {target_folder}")
        return None

    print(f"🔍 Scanning for MPRAGE spellings in: {target_folder}")
    print("=" * 60)

    # Upper-case markers; folder names are upper-cased before comparing
    variants = ('MPRAGE', 'MP-RAGE', 'MP_RAGE')

    # Exact folder name -> {'count', 'examples', 'paths'}
    spelling_counts = {}

    # Walk through all directories; only directory names are inspected
    for root, dirs, _files in os.walk(target_base):
        for dir_name in dirs:
            upper_name = dir_name.upper()
            if not any(variant in upper_name for variant in variants):
                continue

            # Group by the exact spelling (case preserved); each folder is
            # recorded exactly once no matter how many markers it matches.
            entry = spelling_counts.setdefault(
                dir_name, {'count': 0, 'examples': [], 'paths': []}
            )
            entry['count'] += 1
            entry['examples'].append(dir_name)

            # Keep the path relative to the scan root for context
            relative_path = (Path(root) / dir_name).relative_to(target_base)
            entry['paths'].append(str(relative_path))

    # Display results
    if spelling_counts:
        print(f"📊 Found {len(spelling_counts)} different MPRAGE spellings:")
        print("-" * 60)

        total_folders = sum(info['count'] for info in spelling_counts.values())

        for spelling, info in spelling_counts.items():
            print(f"\n🔍 '{spelling}': {info['count']} folders")
            print("   Examples:")
            for path in info['paths'][:5]:  # Show first 5 examples
                print(f"      • {path}")
            if len(info['paths']) > 5:
                print(f"      ... and {len(info['paths']) - 5} more")

        # On a tie, max() keeps the spelling that was encountered first
        most_common = max(spelling_counts.items(), key=lambda x: x[1]['count'])[0]

        print("\n📈 Summary:")
        print(f"   • Total different spellings: {len(spelling_counts)}")
        print(f"   • Total MPRAGE folders: {total_folders}")
        print(f"   • Most common: {most_common}")

    else:
        print("❌ No MPRAGE folders found!")

    return spelling_counts

# Usage: scan the extracted "MRI_1 (2)" folder and report which MPRAGE
# folder-name spellings occur in it.
# NOTE: this rebinds the module-level name target_folder and runs as soon
# as the cell/module executes; the path is machine-specific.
target_folder = r"D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\MRI\MRI_1 (2)"

print("🚀 Counting MPRAGE spelling variations...")
# spelling_counts is None when the folder does not exist, otherwise a dict
# keyed by exact folder name.
spelling_counts = count_mprage_spellings(target_folder)

🚀 Counting MPRAGE spelling variations...
🔍 Scanning for MPRAGE spellings in: D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\MRI\MRI_1 (2)
📊 Found 19 different MPRAGE spellings:
------------------------------------------------------------

🔍 'MPRAGE_Repeat': 17 folders
   Examples:
      • 023_S_0031\MPRAGE_Repeat
      • 023_S_0061\MPRAGE_Repeat
      • 032_S_0479\MPRAGE_Repeat
      • 033_S_0724\MPRAGE_Repeat
      • 023_S_0926\MPRAGE_Repeat
      ... and 12 more

🔍 'MPRAGE': 23 folders
   Examples:
      • 023_S_0031\MPRAGE
      • 067_S_0029\MPRAGE
      • 023_S_0061\MPRAGE
      • 133_S_0433\MPRAGE
      • 032_S_0479\MPRAGE
      ... and 18 more

🔍 'HHP_6_DOF_AC-PC_registered_MPRAGE': 4 folders
   Examples:
      • 023_S_0031\HHP_6_DOF_AC-PC_registered_MPRAGE
      • 002_S_0413\HHP_6_DOF_AC-PC_registered_MPRAGE
      • 067_S_1253\HHP_6_DOF_AC-PC_registered_MPRAGE
      • 133_S_0525\HHP_6_DOF_AC-PC_registered_MPRAGE

🔍 'MP-RAGE': 18 folders
   Examples:
      •

In [None]:
def count_subject_ids_at_level(zip_paths):
    """
    Count subject IDs at the specific directory level: ADNI/[Subject_ID]/

    A subject ID is the second path component of any archive member whose
    path has at least three '/'-separated parts, starts with 'ADNI' and
    whose second component contains '_S_'.  A per-archive summary, the
    totals and any subjects present in more than one archive are printed.

    Args:
        zip_paths: Iterable of zip archive paths to inspect.

    Returns:
        A tuple ``(zip_subjects, total_unique_subjects,
        total_subject_entries)``:

        * ``zip_subjects`` - dict mapping each archive's file name to the
          set of subject IDs found in it (an empty set when the archive
          could not be read).  Archives sharing a file name overwrite each
          other's entry.
        * ``total_unique_subjects`` - set of all subject IDs seen.
        * ``total_subject_entries`` - sum of the per-archive subject counts.
    """
    import zipfile
    from pathlib import Path
    from collections import defaultdict

    print("🔍 Counting subject IDs at ADNI/[Subject_ID] level across ZIP files...")
    print("=" * 70)

    # Archive file name -> set of subject IDs
    zip_subjects = {}
    # Subject ID -> archive file names containing it (one entry per archive)
    subject_occurrences = defaultdict(list)

    for zip_path in zip_paths:
        zip_name = Path(zip_path).name
        print(f"\n📦 Processing: {zip_name}")

        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                all_items = zip_ref.namelist()
        except (OSError, zipfile.BadZipFile) as e:
            # Best effort: report the unreadable archive and keep going
            print(f"   ❌ Error processing {zip_name}: {e}")
            zip_subjects[zip_name] = set()
            continue

        # Find subject IDs ONLY at the ADNI/[Subject_ID] level
        subjects_in_zip = set()
        for item in all_items:
            parts = item.split('/')
            if len(parts) >= 3 and parts[0] == 'ADNI' and '_S_' in parts[1]:
                subjects_in_zip.add(parts[1])

        # Record each subject once per archive.  Recording once per member
        # would make every subject with several files in a single archive
        # look like it appears in multiple archives.
        for subject_id in sorted(subjects_in_zip):
            subject_occurrences[subject_id].append(zip_name)

        zip_subjects[zip_name] = subjects_in_zip
        print(f"   ✅ Found {len(subjects_in_zip)} unique subjects at ADNI/[Subject_ID] level")

    # Calculate totals
    print("\n📊 SUBJECT COUNT SUMMARY")
    print("=" * 70)

    total_unique_subjects = set()
    total_subject_entries = 0

    for zip_name, subjects in zip_subjects.items():
        count = len(subjects)
        total_subject_entries += count
        total_unique_subjects.update(subjects)
        print(f"   {zip_name}: {count} subjects")

    print("\n📈 TOTALS:")
    print(f"   • Total unique subjects across all ZIPs: {len(total_unique_subjects)}")
    print(f"   • Total subject entries (sum of all ZIPs): {total_subject_entries}")

    # Check for overlaps: subjects listed for more than one archive
    overlapping_subjects = {
        subject: zips
        for subject, zips in subject_occurrences.items()
        if len(zips) > 1
    }

    if overlapping_subjects:
        print("\n⚠️ OVERLAPS DETECTED:")
        print(f"   • Subjects in multiple ZIPs: {len(overlapping_subjects)}")
        print(f"   • Subjects in single ZIPs: {len(total_unique_subjects) - len(overlapping_subjects)}")

        print("\n🔍 Overlapping subjects:")
        for subject, zips in overlapping_subjects.items():
            print(f"   • {subject}: appears in {len(zips)} ZIPs")
            for zip_name in zips:
                print(f"      - {zip_name}")
    else:
        print("\n✅ NO OVERLAPS: Each subject appears in only one ZIP file")

    return zip_subjects, total_unique_subjects, total_subject_entries

# Define the 5 ZIP file paths (MRI_1 (1).zip through MRI_5.zip).
# NOTE: these are machine-specific absolute paths.
zip_paths = [
    r"D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\MRI\MRI_1 (1).zip",
    r"D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\MRI\MRI_2.zip",
    r"D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\MRI\MRI_3.zip",
    r"D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\MRI\MRI_4.zip",
    r"D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\MRI\MRI_5.zip"
]

# Count subject IDs at the specific level.  The call runs as soon as the
# cell/module executes and unpacks the three returned values: the
# per-archive subject sets, the set of unique subjects, and the sum of the
# per-archive subject counts.
print("🚀 Starting subject ID count at ADNI/[Subject_ID] level...")
zip_subjects, total_unique, total_entries = count_subject_ids_at_level(zip_paths)