In [18]:
import os
from pathlib import Path
from collections import Counter, defaultdict
import itertools

def analyze_scan_type_combinations(directory_path):
    """
    Analyze unique combinations of scan types across all subjects.
    
    Args:
        directory_path (str): Path to the directory containing subject folders
        
    Returns:
        dict: Dictionary with combination analysis results
    """
    directory = Path(directory_path)
    
    if not directory.exists():
        print(f"❌ Directory not found: {directory}")
        return None
    
    print(f"🔍 Analyzing scan type combinations in: {directory}")
    print("=" * 80)
    
    # Dictionary to store each subject's scan types
    subject_scan_types = {}
    # Counter for individual scan types
    scan_type_counts = Counter()
    # Counter for scan type combinations
    combination_counts = Counter()
    
    # Get all subject directories
    subject_dirs = [d for d in directory.iterdir() if d.is_dir()]
    total_subjects = len(subject_dirs)
    
    print(f"📊 Total subjects found: {total_subjects}")
    print()
    
    # Analyze each subject
    for subject_dir in subject_dirs:
        subject_id = subject_dir.name
        scan_types = set()
        
        # Get all scan type directories for this subject
        scan_type_dirs = [d for d in subject_dir.iterdir() if d.is_dir()]
        
        for scan_type_dir in scan_type_dirs:
            scan_type = scan_type_dir.name
            scan_types.add(scan_type)
            scan_type_counts[scan_type] += 1
        
        subject_scan_types[subject_id] = scan_types
        
        # Create a sorted tuple for the combination (for consistent counting)
        if scan_types:
            combination = tuple(sorted(scan_types))
            combination_counts[combination] += 1
    
    # Print individual scan type counts
    print("📋 INDIVIDUAL SCAN TYPE COUNTS")
    print("=" * 50)
    for scan_type, count in scan_type_counts.most_common():
        print(f"{scan_type:<60} : {count:>4}")
    
    print()
    print("🔗 UNIQUE SCAN TYPE COMBINATIONS")
    print("=" * 80)
    print(f"Total unique combinations: {len(combination_counts)}")
    print()
    
    # Sort combinations by frequency
    sorted_combinations = combination_counts.most_common()
    
    print("📊 Combination Counts (sorted by frequency):")
    print("-" * 80)
    for i, (combination, count) in enumerate(sorted_combinations, 1):
        combination_str = " + ".join(combination)
        print(f"{i:>3}. {combination_str:<70} : {count:>4} subjects")
    
    print()
    print("📈 COMBINATION STATISTICS")
    print("=" * 50)
    print(f"Total subjects: {total_subjects}")
    print(f"Total unique scan types: {len(scan_type_counts)}")
    print(f"Total unique combinations: {len(combination_counts)}")
    
    # Most common combination
    if sorted_combinations:
        most_common = sorted_combinations[0]
        print(f"Most common combination: {' + '.join(most_common[0])} ({most_common[1]} subjects)")
    
    # Subjects with most/least scan types
    subject_scan_counts = {subj: len(scan_types) for subj, scan_types in subject_scan_types.items()}
    if subject_scan_counts:
        max_count = max(subject_scan_counts.values())
        min_count = min(subject_scan_counts.values())
        max_subjects = [subj for subj, count in subject_scan_counts.items() if count == max_count]
        min_subjects = [subj for subj, count in subject_scan_counts.items() if count == min_count]
        
        print(f"Maximum scan types per subject: {max_count}")
        print(f"Subjects with {max_count} scan types: {', '.join(max_subjects[:5])}{'...' if len(max_subjects) > 5 else ''}")
        print(f"Minimum scan types per subject: {min_count}")
        print(f"Subjects with {min_count} scan types: {', '.join(min_subjects[:5])}{'...' if len(min_subjects) > 5 else ''}")
    
    # Show some examples of unique combinations
    print()
    print("🔍 EXAMPLES OF UNIQUE COMBINATIONS")
    print("=" * 50)
    for i, (combination, count) in enumerate(sorted_combinations[:10], 1):
        combination_str = " + ".join(combination)
        print(f"{i:>2}. {combination_str} ({count} subjects)")
    
    return {
        'subject_scan_types': subject_scan_types,
        'scan_type_counts': dict(scan_type_counts),
        'combination_counts': dict(combination_counts),
        'total_subjects': total_subjects,
        'total_scan_types': len(scan_type_counts),
        'total_combinations': len(combination_counts)
    }

# Run the analysis
directory_path = r"C:\Users\User\github_repos\AD_CN_all_available_data_final\ADNI"
result = analyze_scan_type_combinations(directory_path)


🔍 Analyzing scan type combinations in: C:\Users\User\github_repos\AD_CN_all_available_data_final\ADNI
📊 Total subjects found: 1540

📋 INDIVIDUAL SCAN TYPE COUNTS
MPRAGE                                                       :  527
Accelerated_Sagittal_MPRAGE__MSV21_                          :  486
Accelerated_Sagittal_MPRAGE                                  :  442
MPRAGE_GRAPPA2                                               :  217
MPRAGE_Repeat                                                :  210
MP-RAGE                                                      :  200
MP-RAGE_REPEAT                                               :  165
MPRAGE_SENSE2                                                :   95
Accelerated_Sagittal_MPRAGE_ND                               :   89
Sagittal_3D_Accelerated_MPRAGE                               :   84
Accelerated_Sagittal_MPRAGE__MSV22_                          :   64
CS_Sagittal_MPRAGE__MSV22_                                   :   40
HS_Sagittal_MPRAGE__MS

In [7]:
def create_training_dataset(source_dir, target_dir, resume=True):
    """
    Create training dataset by selecting one scan per subject with progress bar and resume capability.
    Selection logic:
    1. Look for 'MPRAGE' scan type
    2. If not available, look for 'MP-RAGE' scan type
    3. Copy the selected scan maintaining directory organization
    
    Args:
        source_dir (str): Source directory containing subject folders
        target_dir (str): Target directory for training dataset
        resume (bool): Resume from where left off (skip existing files)
        
    Returns:
        dict: Dictionary with selection and copy results
    """
    import shutil
    from pathlib import Path
    from tqdm import tqdm
    import time
    
    source_path = Path(source_dir)
    target_path = Path(target_dir)
    
    if not source_path.exists():
        print(f"❌ Source directory not found: {source_path}")
        return None
    
    # Create target directory
    target_path.mkdir(parents=True, exist_ok=True)
    
    print(f"🔍 Creating training dataset")
    print(f"📂 Source: {source_path}")
    print(f"📁 Target: {target_path}")
    print(f"🔄 Resume mode: {'ON' if resume else 'OFF'}")
    print("=" * 80)
    
    # Categories for tracking
    subjects_with_mprage = []
    subjects_with_mp_rage = []
    subjects_with_neither = []
    
    # Copy statistics
    copied_count = 0
    skipped_count = 0
    error_count = 0
    
    # Get all subject directories
    subject_dirs = [d for d in source_path.iterdir() if d.is_dir()]
    total_subjects = len(subject_dirs)
    
    print(f"📊 Total subjects found: {total_subjects}")
    
    # Check existing files for resume
    if resume:
        existing_subjects = set()
        for existing_dir in target_path.iterdir():
            if existing_dir.is_dir():
                # Check if subject has scan type directory
                scan_dirs = [d for d in existing_dir.iterdir() if d.is_dir()]
                if scan_dirs:  # Has at least one scan type
                    existing_subjects.add(existing_dir.name)
        print(f"🔄 Found {len(existing_subjects)} existing subjects, will skip if complete")
    
    print()
    
    # Process each subject with progress bar
    start_time = time.time()
    
    with tqdm(total=total_subjects, desc="Processing subjects", unit="subj") as pbar:
        for subject_dir in subject_dirs:
            subject_id = subject_dir.name
            
            # Skip if already processed (resume mode)
            if resume and subject_id in existing_subjects:
                # Check if this subject has a complete scan
                target_subject_dir = target_path / subject_id
                scan_dirs = [d for d in target_subject_dir.iterdir() if d.is_dir()]
                if scan_dirs:  # Has scan type directory
                    skipped_count += 1
                    pbar.set_postfix({
                        'Copied': copied_count, 
                        'Skipped': skipped_count, 
                        'Errors': error_count
                    })
                    pbar.update(1)
                    continue
            
            scan_types = []
            
            # Get all scan type directories for this subject
            scan_type_dirs = [d for d in subject_dir.iterdir() if d.is_dir()]
            
            for scan_type_dir in scan_type_dirs:
                scan_type = scan_type_dir.name
                scan_types.append(scan_type)
            
            # Selection logic: prioritize MPRAGE over MP-RAGE
            selected_scan_type = None
            
            if 'MPRAGE' in scan_types:
                selected_scan_type = 'MPRAGE'
                subjects_with_mprage.append(subject_id)
            elif 'MP-RAGE' in scan_types:
                selected_scan_type = 'MP-RAGE'
                subjects_with_mp_rage.append(subject_id)
            else:
                subjects_with_neither.append(subject_id)
                pbar.set_postfix({
                    'Copied': copied_count, 
                    'Skipped': skipped_count, 
                    'Errors': error_count
                })
                pbar.update(1)
                continue
            
            # Copy the selected scan type
            source_scan_dir = subject_dir / selected_scan_type
            target_subject_dir = target_path / subject_id
            target_scan_dir = target_subject_dir / selected_scan_type
            
            try:
                # Create target subject directory
                target_subject_dir.mkdir(parents=True, exist_ok=True)
                
                # Copy the scan type directory
                if target_scan_dir.exists():
                    skipped_count += 1
                else:
                    shutil.copytree(source_scan_dir, target_scan_dir)
                    copied_count += 1
                    
            except Exception as e:
                error_count += 1
                # Only log errors, not every operation
                if error_count <= 5:  # Log first 5 errors
                    print(f"\n❌ Error copying {subject_id}: {e}")
                elif error_count == 6:
                    print(f"\n⚠️  Additional errors will be counted but not displayed...")
            
            # Update progress bar
            pbar.set_postfix({
                'Copied': copied_count, 
                'Skipped': skipped_count, 
                'Errors': error_count
            })
            pbar.update(1)
    
    # Calculate final statistics
    total_selected = len(subjects_with_mprage) + len(subjects_with_mp_rage)
    total_excluded = len(subjects_with_neither)
    elapsed_time = time.time() - start_time
    
    print()
    print("📋 SELECTION SUMMARY")
    print("=" * 60)
    print(f"Subjects with 'MPRAGE' selected:        {len(subjects_with_mprage):>4}")
    print(f"Subjects with 'MP-RAGE' selected:       {len(subjects_with_mp_rage):>4}")
    print(f"Subjects excluded (no MPRAGE):          {len(subjects_with_neither):>4}")
    print("-" * 60)
    print(f"Total subjects processed:               {total_subjects:>4}")
    print(f"Total subjects selected:                {total_selected:>4}")
    
    print()
    print("📁 COPY OPERATIONS")
    print("=" * 60)
    print(f"✅ Successfully copied:                 {copied_count:>4}")
    print(f"⏭ Skipped (already exists):            {skipped_count:>4}")
    print(f"❌ Errors:                              {error_count:>4}")
    
    print()
    print("📊 COVERAGE STATISTICS")
    print("=" * 60)
    coverage_percentage = (total_selected / total_subjects) * 100
    print(f"Dataset coverage: {coverage_percentage:.1f}% ({total_selected}/{total_subjects})")
    print(f"Processing time: {elapsed_time:.1f} seconds")
    
    # Show some examples (only if not too many)
    if total_selected <= 20:
        print()
        print("🔍 ALL SELECTIONS")
        print("=" * 60)
        print("Subjects with 'MPRAGE' selected:")
        for i, subj in enumerate(subjects_with_mprage, 1):
            print(f"  {i:>2}. {subj}")
        
        print("\nSubjects with 'MP-RAGE' selected:")
        for i, subj in enumerate(subjects_with_mp_rage, 1):
            print(f"  {i:>2}. {subj}")
    else:
        print()
        print("🔍 SAMPLE SELECTIONS (first 10 of each)")
        print("=" * 60)
        print("Subjects with 'MPRAGE' selected:")
        for i, subj in enumerate(subjects_with_mprage[:10], 1):
            print(f"  {i:>2}. {subj}")
        if len(subjects_with_mprage) > 10:
            print(f"  ... and {len(subjects_with_mprage) - 10} more")
        
        print("\nSubjects with 'MP-RAGE' selected:")
        for i, subj in enumerate(subjects_with_mp_rage[:10], 1):
            print(f"  {i:>2}. {subj}")
        if len(subjects_with_mp_rage) > 10:
            print(f"  ... and {len(subjects_with_mp_rage) - 10} more")
    
    if subjects_with_neither and len(subjects_with_neither) <= 10:
        print(f"\nSubjects excluded (no MPRAGE):")
        for i, subj in enumerate(subjects_with_neither, 1):
            print(f"  {i:>2}. {subj}")
    elif subjects_with_neither:
        print(f"\nSubjects excluded: {len(subjects_with_neither)} (no MPRAGE available)")
    
    print()
    print(f"🎯 Training dataset created at: {target_path}")
    print(f"📁 Directory structure: {target_path}/<subject_id>/<scan_type>/...")
    
    return {
        'total_subjects': total_subjects,
        'subjects_with_mprage': subjects_with_mprage,
        'subjects_with_mp_rage': subjects_with_mp_rage,
        'subjects_with_neither': subjects_with_neither,
        'total_selected': total_selected,
        'total_excluded': total_excluded,
        'copied_count': copied_count,
        'skipped_count': skipped_count,
        'error_count': error_count,
        'coverage_percentage': coverage_percentage,
        'processing_time': elapsed_time,
        'target_directory': str(target_path)
    }

# Create training dataset
source_directory = r"C:\Users\User\github_repos\AD_CN_all_available_data_final\ADNI"
target_directory = r"C:\Users\User\github_repos\AD_CN_train_v1"

training_result = create_training_dataset(source_directory, target_directory)


🔍 Creating training dataset
📂 Source: C:\Users\User\github_repos\AD_CN_all_available_data_final\ADNI
📁 Target: C:\Users\User\github_repos\AD_CN_train_v1
🔄 Resume mode: ON
📊 Total subjects found: 1540
🔄 Found 136 existing subjects, will skip if complete



Processing subjects: 100%|██████████| 1540/1540 [08:06<00:00,  3.16subj/s, Copied=538, Skipped=136, Errors=0]


📋 SELECTION SUMMARY
Subjects with 'MPRAGE' selected:         423
Subjects with 'MP-RAGE' selected:        115
Subjects excluded (no MPRAGE):           866
------------------------------------------------------------
Total subjects processed:               1540
Total subjects selected:                 538

📁 COPY OPERATIONS
✅ Successfully copied:                  538
⏭ Skipped (already exists):             136
❌ Errors:                                 0

📊 COVERAGE STATISTICS
Dataset coverage: 34.9% (538/1540)
Processing time: 486.8 seconds

🔍 SAMPLE SELECTIONS (first 10 of each)
Subjects with 'MPRAGE' selected:
   1. 016_S_0991
   2. 016_S_1263
   3. 018_S_0043
   4. 018_S_0055
   5. 018_S_0286
   6. 018_S_0335
   7. 018_S_0369
   8. 018_S_0425
   9. 018_S_0633
  10. 018_S_0682
  ... and 413 more

Subjects with 'MP-RAGE' selected:
   1. 021_S_0159
   2. 021_S_0337
   3. 021_S_0343
   4. 021_S_0642
   5. 021_S_0647
   6. 021_S_0984
   7. 021_S_1109
   8. 024_S_0985
   9. 024_S_1063
  1




In [16]:
import pandas as pd
from pathlib import Path
from datetime import datetime
import re
from tqdm import tqdm

def create_training_metadata(csv_path, training_dir, output_path=None):
    """
    Create metadata for training dataset by matching CSV data with training directory structure.
    
    Args:
        csv_path (str): Path to the CSV file with metadata
        training_dir (str): Path to training dataset directory
        output_path (str): Path to save metadata CSV (optional)
        
    Returns:
        pd.DataFrame: Training dataset metadata
    """
    print("🔍 Creating training dataset metadata...")
    print("=" * 60)
    
    # Read CSV file
    print(f"📂 Reading CSV: {csv_path}")
    df = pd.read_csv(csv_path)
    print(f"📊 CSV loaded: {len(df)} records")
    
    # Filter out unwanted scan types to speed up processing
    print("🔍 Filtering out unwanted scan types...")
    unwanted_keywords = ['mapping', 'localizer', 'survey', 'scout']
    
    # Create a mask to exclude rows with unwanted keywords in Description
    mask = ~df['Description'].str.lower().str.contains('|'.join(unwanted_keywords), na=False)
    df_filtered = df[mask].copy()
    
    print(f"📊 After filtering: {len(df_filtered)} records (removed {len(df) - len(df_filtered)} unwanted scans)")
    
    # Use filtered dataframe for processing
    df = df_filtered
    
    # Load training directory structure
    print(f"📁 Scanning training directory: {training_dir}")
    training_path = Path(training_dir)
    
    # Get all subject directories first for progress tracking
    subject_dirs = [d for d in training_path.iterdir() if d.is_dir()]
    total_subjects = len(subject_dirs)
    
    print(f"📊 Found {total_subjects} subjects to process")
    
    # Collect unique Image Data IDs (3D scans) instead of individual files
    training_scans = []
    
    print("\n🔍 Scanning training scans (by Image Data ID)...")
    with tqdm(total=total_subjects, desc="Scanning subjects", unit="subj") as pbar:
        for subject_dir in subject_dirs:
            subject_id = subject_dir.name
            
            # Find scan type directories
            for scan_dir in subject_dir.iterdir():
                if not scan_dir.is_dir():
                    continue
                    
                scan_type = scan_dir.name
                
                # Find timestamp directories
                for timestamp_dir in scan_dir.iterdir():
                    if not timestamp_dir.is_dir():
                        continue
                        
                    timestamp_str = timestamp_dir.name
                    
                    # Find Image Data ID directories (I238627, etc.)
                    for image_id_dir in timestamp_dir.iterdir():
                        if not image_id_dir.is_dir():
                            continue
                            
                        image_data_id = image_id_dir.name
                        
                        # Count DICOM files in this scan
                        dcm_files = list(image_id_dir.glob("*.dcm"))
                        if dcm_files:  # Only include if has DICOM files
                            # Get one representative file path
                            sample_file = dcm_files[0]
                            training_scans.append({
                                'subject_id': subject_id,
                                'scan_type': scan_type,
                                'timestamp': timestamp_str,
                                'image_data_id': image_data_id,
                                'dcm_count': len(dcm_files),
                                'sample_file_path': str(sample_file),
                                'relative_scan_path': str(image_id_dir.relative_to(training_path))
                            })
            
            pbar.set_postfix({'Scans found': len(training_scans)})
            pbar.update(1)
    
    print(f"📊 Found {len(training_scans)} unique training scans (3D images)")
    
    # Convert to DataFrame
    training_df = pd.DataFrame(training_scans)
    
    # Create metadata by matching using Image Data ID (much faster!)
    metadata_records = []
    
    print("\n🔗 Matching training scans with CSV metadata by Image Data ID...")
    
    # Visual inspection counter
    inspection_count = 0
    
    with tqdm(total=len(training_df), desc="Matching scans", unit="scan") as pbar:
        for idx, (_, training_scan) in enumerate(training_df.iterrows()):
            subject_id = training_scan['subject_id']
            image_data_id = training_scan['image_data_id']
            
            # Find matching record in CSV by Image Data ID (exact match - much faster!)
            matching_record = df[df['Image Data ID'] == image_data_id]
            
            if len(matching_record) == 0:
                pbar.update(1)
                continue
            
            # Should be exactly one match since Image Data ID is unique
            best_match = matching_record.iloc[0]
            
            # Visual inspection for first 5 scans
            if inspection_count < 5:
                print(f"\n🔍 INSPECTION #{inspection_count + 1}")
                print("-" * 60)
                print(f"Training Scan: {training_scan['relative_scan_path']}")
                print(f"Subject ID: {subject_id}")
                print(f"Scan Type: {training_scan['scan_type']}")
                print(f"Timestamp: {training_scan['timestamp']}")
                print(f"Image Data ID: {image_data_id}")
                print(f"DICOM files count: {training_scan['dcm_count']}")
                print(f"CSV Match:")
                print(f"  - Description: {best_match['Description']}")
                print(f"  - Acq Date: {best_match['Acq Date']}")
                print(f"  - Group: {best_match['Group']}")
                print(f"  - Sex: {best_match['Sex']}")
                print(f"  - Age: {best_match['Age']}")
                print(f"  - Visit: {best_match['Visit']}")
                print("-" * 60)
                inspection_count += 1
            
            # Create one metadata record per 3D scan (not per DICOM file)
            metadata_record = {
                'Subject': subject_id,
                'ScanPath': training_scan['relative_scan_path'],
                'ScanType': training_scan['scan_type'],
                'Timestamp': training_scan['timestamp'],
                'ImageDataID': image_data_id,
                'DicomCount': training_scan['dcm_count'],
                'Group': best_match['Group'],
                'Sex': best_match['Sex'],
                'Age': best_match['Age'],
                'Visit': best_match['Visit'],
                'Modality': best_match['Modality'],
                'Description': best_match['Description'],
                'Type': best_match['Type'],
                'AcqDate': best_match['Acq Date'],
                'Format': best_match['Format']
            }
            
            metadata_records.append(metadata_record)
            
            pbar.set_postfix({'Scans matched': idx + 1, 'Total scans': len(metadata_records)})
            pbar.update(1)
    
    # Create final metadata DataFrame
    metadata_df = pd.DataFrame(metadata_records)
    
    print(f"✅ Created metadata for {len(metadata_df)} 3D scans")
    
    # Summary statistics
    print("\n📊 METADATA SUMMARY")
    print("=" * 60)
    print(f"Total training scans (3D images): {len(training_df)}")
    print(f"Scans with metadata: {len(metadata_df)}")
    print(f"Total DICOM files: {metadata_df['DicomCount'].sum()}")
    print(f"Coverage: 100.0% (exact Image Data ID matching)")
    
    # Group distribution
    if 'Group' in metadata_df.columns:
        print(f"\nGroup distribution:")
        group_counts = metadata_df['Group'].value_counts()
        for group, count in group_counts.items():
            print(f"  {group}: {count}")
    
    # Scan type distribution
    if 'ScanType' in metadata_df.columns:
        print(f"\nScan type distribution:")
        scan_counts = metadata_df['ScanType'].value_counts()
        for scan_type, count in scan_counts.items():
            print(f"  {scan_type}: {count}")
    
    # Matching quality (Image Data ID is exact match)
    print(f"\nMatching quality:")
    print(f"  Method: Exact Image Data ID matching")
    print(f"  Accuracy: 100% (no fuzzy matching needed)")
    
    # Save metadata if output path provided
    if output_path:
        print(f"\n💾 Saving metadata to: {output_path}")
        metadata_df.to_csv(output_path, index=False)
        print(f"✅ Metadata saved successfully!")
    
    return metadata_df

# Run the metadata creation
csv_file = r"C:\Users\User\github_repos\AD_CN_all_available_data_final\AD_CN_all_available_data.csv"
training_directory = r"C:\Users\User\github_repos\AD_CN_train_v1"
output_file = r"C:\Users\User\github_repos\AD_CN_train_v1\metadata.csv"

metadata = create_training_metadata(csv_file, training_directory, output_file)


🔍 Creating training dataset metadata...
📂 Reading CSV: C:\Users\User\github_repos\AD_CN_all_available_data_final\AD_CN_all_available_data.csv
📊 CSV loaded: 31182 records
🔍 Filtering out unwanted scan types...
📊 After filtering: 15852 records (removed 15330 unwanted scans)
📁 Scanning training directory: C:\Users\User\github_repos\AD_CN_train_v1
📊 Found 674 subjects to process

🔍 Scanning training scans (by Image Data ID)...


Scanning subjects: 100%|██████████| 674/674 [00:05<00:00, 117.14subj/s, Scans found=2968]


📊 Found 2968 unique training scans (3D images)

🔗 Matching training scans with CSV metadata by Image Data ID...


Matching scans:   1%|          | 34/2968 [00:00<00:14, 201.67scan/s, Scans matched=35, Total scans=35]


🔍 INSPECTION #1
------------------------------------------------------------
Training Scan: 002_S_0295\MPRAGE\2011-06-02_07_58_50.0\I238627
Subject ID: 002_S_0295
Scan Type: MPRAGE
Timestamp: 2011-06-02_07_58_50.0
Image Data ID: I238627
DICOM files count: 170
CSV Match:
  - Description: MPRAGE
  - Acq Date: 6/02/2011
  - Group: CN
  - Sex: M
  - Age: 90
  - Visit: v06
------------------------------------------------------------

🔍 INSPECTION #2
------------------------------------------------------------
Training Scan: 002_S_0295\MPRAGE\2012-05-10_15_44_50.0\I303066
Subject ID: 002_S_0295
Scan Type: MPRAGE
Timestamp: 2012-05-10_15_44_50.0
Image Data ID: I303066
DICOM files count: 170
CSV Match:
  - Description: MPRAGE
  - Acq Date: 5/10/2012
  - Group: CN
  - Sex: M
  - Age: 91
  - Visit: v11
------------------------------------------------------------

🔍 INSPECTION #3
------------------------------------------------------------
Training Scan: 002_S_0413\MPRAGE\2006-11-15_14_23_26.0\I

Matching scans: 100%|██████████| 2968/2968 [00:15<00:00, 192.50scan/s, Scans matched=2968, Total scans=2968]


✅ Created metadata for 2968 files

📊 METADATA SUMMARY
Total training scans (3D images): 2968
Total DICOM files: 2968
Files with metadata: 2968
Coverage: 100.0% (exact Image Data ID matching)

Group distribution:
  CN: 1989
  AD: 979

Scan type distribution:
  MPRAGE: 2276
  MP-RAGE: 692

Matching quality:
  Method: Exact Image Data ID matching
  Accuracy: 100% (no fuzzy matching needed)

💾 Saving metadata to: C:\Users\User\github_repos\AD_CN_train_v1\metadata.csv
✅ Metadata saved successfully!


# Analyze demographics

In [25]:
import pandas as pd
import numpy as np

def analyze_demographics(csv_path):
    """
    Analyze demographic statistics from the training metadata CSV
    """
    print("🔍 Loading metadata...")
    df = pd.read_csv(csv_path)
    
    print(f"📊 Total scans in dataset: {len(df)}")
    print(f"📊 Unique subjects: {df['Subject'].nunique()}")
    
    # 1. AD vs CN class count
    print("\n" + "="*60)
    print("📋 1. AD vs CN CLASS DISTRIBUTION")
    print("="*60)
    
    group_counts = df['Group'].value_counts()
    print(f"CN (Cognitively Normal): {group_counts.get('CN', 0)} scans")
    print(f"AD (Alzheimer's Disease): {group_counts.get('AD', 0)} scans")
    
    # Calculate percentages
    total_scans = len(df)
    cn_pct = (group_counts.get('CN', 0) / total_scans) * 100
    ad_pct = (group_counts.get('AD', 0) / total_scans) * 100
    print(f"CN: {cn_pct:.1f}%")
    print(f"AD: {ad_pct:.1f}%")
    
    # 2. Male vs Female distribution for AD and CN
    print("\n" + "="*60)
    print("📋 2. SEX DISTRIBUTION BY GROUP")
    print("="*60)
    
    # Cross-tabulation of Group and Sex
    cross_tab = pd.crosstab(df['Group'], df['Sex'], margins=True)
    print("\nSex Distribution:")
    print(cross_tab)
    
    # Detailed breakdown
    print("\nDetailed Breakdown:")
    for group in ['CN', 'AD']:
        group_data = df[df['Group'] == group]
        if len(group_data) > 0:
            male_count = len(group_data[group_data['Sex'] == 'M'])
            female_count = len(group_data[group_data['Sex'] == 'F'])
            total_group = len(group_data)
            
            print(f"\n{group} Group:")
            print(f"  Male: {male_count} scans ({male_count/total_group*100:.1f}%)")
            print(f"  Female: {female_count} scans ({female_count/total_group*100:.1f}%)")
    
    # 3. Age statistics for AD and CN
    print("\n" + "="*60)
    print("📋 3. AGE STATISTICS BY GROUP")
    print("="*60)
    
    for group in ['CN', 'AD']:
        group_data = df[df['Group'] == group]
        if len(group_data) > 0:
            ages = group_data['Age'].dropna()
            if len(ages) > 0:
                mean_age = ages.mean()
                std_age = ages.std()
                min_age = ages.min()
                max_age = ages.max()
                
                print(f"\n{group} Group:")
                print(f"  Mean Age: {mean_age:.1f} years")
                print(f"  Std Dev: {std_age:.1f} years")
                print(f"  Age Range: {min_age:.0f} - {max_age:.0f} years")
                print(f"  Sample Size: {len(ages)} scans")
    
    # Overall age statistics
    print(f"\nOverall Dataset:")
    overall_ages = df['Age'].dropna()
    if len(overall_ages) > 0:
        print(f"  Mean Age: {overall_ages.mean():.1f} years")
        print(f"  Std Dev: {overall_ages.std():.1f} years")
        print(f"  Age Range: {overall_ages.min():.0f} - {overall_ages.max():.0f} years")
    
    # Additional insights
    print("\n" + "="*60)
    print("📋 ADDITIONAL INSIGHTS")
    print("="*60)
    
    # Scan type distribution
    scan_type_counts = df['ScanType'].value_counts()
    print(f"\nScan Type Distribution:")
    for scan_type, count in scan_type_counts.items():
        print(f"  {scan_type}: {count} scans")
    
    # Visit distribution
    visit_counts = df['Visit'].value_counts()
    print(f"\nTop 10 Visits:")
    for visit, count in visit_counts.head(10).items():
        print(f"  {visit}: {count} scans")
    
    return df

if __name__ == "__main__":
    csv_path = r"C:\Users\User\github_repos\AD_CN_train_v2\metadata.csv"
    df = analyze_demographics(csv_path)
    print("\n✅ Analysis completed!")

🔍 Loading metadata...
📊 Total scans in dataset: 674
📊 Unique subjects: 674

📋 1. AD vs CN CLASS DISTRIBUTION
CN (Cognitively Normal): 385 scans
AD (Alzheimer's Disease): 289 scans
CN: 57.1%
AD: 42.9%

📋 2. SEX DISTRIBUTION BY GROUP

Sex Distribution:
Sex      F    M  All
Group               
AD     135  154  289
CN     194  191  385
All    329  345  674

Detailed Breakdown:

CN Group:
  Male: 191 scans (49.6%)
  Female: 194 scans (50.4%)

AD Group:
  Male: 154 scans (53.3%)
  Female: 135 scans (46.7%)

📋 3. AGE STATISTICS BY GROUP

CN Group:
  Mean Age: 76.5 years
  Std Dev: 6.3 years
  Age Range: 57 - 95 years
  Sample Size: 385 scans

AD Group:
  Mean Age: 76.0 years
  Std Dev: 7.7 years
  Age Range: 56 - 91 years
  Sample Size: 289 scans

Overall Dataset:
  Mean Age: 76.2 years
  Std Dev: 6.9 years
  Age Range: 56 - 95 years

📋 ADDITIONAL INSIGHTS

Scan Type Distribution:
  MPRAGE: 527 scans
  MP-RAGE: 147 scans

Top 10 Visits:
  sc: 111 scans
  v02: 90 scans
  m06: 90 scans
  m12: 

In [24]:
def analyze_subject_groups(csv_path):
    """
    Analyze no of unique subjects and check for subjects with both AD and CN groups
    """
    print("🔍 Loading metadata...")
    df = pd.read_csv(csv_path)
    
    print(f"📊 Total scans in dataset: {len(df)}")
    
    # Count unique subjects
    unique_subjects = df['Subject'].nunique()
    print(f"📊 Unique subjects: {unique_subjects}")
    
    # Analyze subject-group combinations
    subject_groups = df.groupby('Subject')['Group'].unique().reset_index()
    subject_groups['GroupCount'] = subject_groups['Group'].apply(len)
    subject_groups['Groups'] = subject_groups['Group'].apply(lambda x: ', '.join(sorted(x)))
    
    print(f"\n📋 SUBJECT-GROUP ANALYSIS")
    print("=" * 60)
    
    # Count subjects by number of groups
    group_counts = subject_groups['GroupCount'].value_counts().sort_index()
    print(f"\nSubjects by number of groups:")
    for num_groups, count in group_counts.items():
        group_type = "group" if num_groups == 1 else "groups"
        print(f"  {num_groups} {group_type}: {count} subjects")
    
    # Find subjects with both AD and CN
    subjects_with_both = subject_groups[subject_groups['GroupCount'] > 1]
    
    if len(subjects_with_both) > 0:
        print(f"\n🔍 SUBJECTS WITH MULTIPLE GROUPS ({len(subjects_with_both)} subjects):")
        print("-" * 60)
        
        for _, row in subjects_with_both.iterrows():
            subject_id = row['Subject']
            groups = row['Groups']
            group_list = row['Group']
            
            # Get scan counts for this subject
            subject_data = df[df['Subject'] == subject_id]
            scan_counts = subject_data['Group'].value_counts()
            
            print(f"Subject {subject_id}:")
            print(f"  Groups: {groups}")
            for group in group_list:
                count = scan_counts.get(group, 0)
                print(f"    {group}: {count} scans")
            print()
    else:
        print(f"\n✅ No subjects found with multiple groups")
        print("All subjects belong to only one group (AD or CN)")
    
    # Summary statistics
    print(f"\n📊 SUMMARY:")
    print("=" * 60)
    print(f"Total unique subjects: {unique_subjects}")
    print(f"Subjects with single group: {len(subject_groups[subject_groups['GroupCount'] == 1])}")
    print(f"Subjects with multiple groups: {len(subjects_with_both)}")
    
    if len(subjects_with_both) > 0:
        print(f"Percentage with multiple groups: {len(subjects_with_both)/unique_subjects*100:.1f}%")
    
    return subject_groups, subjects_with_both

# Run the analysis
csv_path = r"C:\Users\User\github_repos\AD_CN_train_v1\metadata.csv"
subject_groups, subjects_with_both = analyze_subject_groups(csv_path)

🔍 Loading metadata...
📊 Total scans in dataset: 2968
📊 Unique subjects: 674

📋 SUBJECT-GROUP ANALYSIS

Subjects by number of groups:
  1 group: 674 subjects

✅ No subjects found with multiple groups
All subjects belong to only one group (AD or CN)

📊 SUMMARY:
Total unique subjects: 674
Subjects with single group: 674
Subjects with multiple groups: 0


In [None]:
## Given there is no subjects with MPRAGE or MP-RAGE that has change in labels is CN to AD

# Select 1 scan per 'Subject'

In [22]:
import pandas as pd
import shutil
from pathlib import Path
from tqdm import tqdm
import os
import numpy as np

def select_one_scan_per_subject(csv_path, selection_strategy='random'):
    """
    Select one scan per unique subject based on selection strategy
    
    Args:
        csv_path: Path to metadata CSV
        selection_strategy: 'random', 'earliest', 'latest', or 'baseline'
    
    Returns:
        DataFrame with one scan per subject
    """
    print("🔍 Loading metadata...")
    df = pd.read_csv(csv_path)
    
    print(f"📊 Original dataset: {len(df)} scans from {df['Subject'].nunique()} subjects")
    
    # Convert timestamp to datetime for proper sorting
    df['TimestampDateTime'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d_%H_%M_%S.%f')
    
    selected_scans = []
    
    print(f"\n🎯 Selecting scans using '{selection_strategy}' strategy...")
    
    # Set random seed for reproducibility
    np.random.seed(42)
    
    # Group by subject and select one scan per subject
    for subject_id, subject_data in tqdm(df.groupby('Subject'), desc="Processing subjects"):
        if selection_strategy == 'random':
            # Select random scan
            selected_scan = subject_data.sample(n=1, random_state=np.random.randint(0, 10000)).iloc[0]
        elif selection_strategy == 'earliest':
            # Select scan with earliest timestamp
            selected_scan = subject_data.loc[subject_data['TimestampDateTime'].idxmin()]
        elif selection_strategy == 'latest':
            # Select scan with latest timestamp
            selected_scan = subject_data.loc[subject_data['TimestampDateTime'].idxmax()]
        elif selection_strategy == 'baseline':
            # Prefer baseline visits (bl, m00, sc) then earliest
            baseline_visits = subject_data[subject_data['Visit'].str.lower().isin(['bl', 'm00', 'sc'])]
            if len(baseline_visits) > 0:
                selected_scan = baseline_visits.loc[baseline_visits['TimestampDateTime'].idxmin()]
            else:
                selected_scan = subject_data.loc[subject_data['TimestampDateTime'].idxmin()]
        else:
            # Default to random
            selected_scan = subject_data.sample(n=1, random_state=np.random.randint(0, 10000)).iloc[0]
        
        selected_scans.append(selected_scan)
    
    # Create DataFrame from selected scans
    selected_df = pd.DataFrame(selected_scans)
    
    print(f"✅ Selected {len(selected_df)} scans (1 per subject)")
    
    # Show selection summary
    print(f"\n📊 SELECTION SUMMARY:")
    print("=" * 50)
    print(f"Original scans: {len(df)}")
    print(f"Selected scans: {len(selected_df)}")
    print(f"Reduction: {len(df) - len(selected_df)} scans removed")
    
    # Group distribution
    group_dist = selected_df['Group'].value_counts()
    print(f"\nSelected dataset distribution:")
    for group, count in group_dist.items():
        print(f"  {group}: {count} subjects")
    
    return selected_df

def copy_selected_scans(source_dir, target_dir, selected_df, resume=True):
    """
    Copy selected scans to new directory maintaining structure
    
    Args:
        source_dir: Source directory path
        target_dir: Target directory path  
        selected_df: DataFrame with selected scans
        resume: Whether to skip already copied scans
    """
    source_path = Path(source_dir)
    target_path = Path(target_dir)
    
    # Create target directory
    target_path.mkdir(parents=True, exist_ok=True)
    
    print(f"📁 Copying selected scans...")
    print(f"   Source: {source_dir}")
    print(f"   Target: {target_dir}")
    
    copied_count = 0
    skipped_count = 0
    error_count = 0
    
    # Copy each selected scan
    with tqdm(total=len(selected_df), desc="Copying scans") as pbar:
        for _, scan in selected_df.iterrows():
            try:
                # Parse scan path to get components
                scan_path_parts = Path(scan['ScanPath']).parts
                subject_id = scan_path_parts[0]
                scan_type = scan_path_parts[1]
                timestamp = scan_path_parts[2]
                image_data_id = scan_path_parts[3]
                
                # Source and target paths
                source_scan_dir = source_path / subject_id / scan_type / timestamp / image_data_id
                target_scan_dir = target_path / subject_id / scan_type / timestamp / image_data_id
                
                # Check if already copied (resume functionality)
                if resume and target_scan_dir.exists():
                    # Check if target has same number of files
                    if source_scan_dir.exists():
                        source_files = list(source_scan_dir.glob("*.dcm"))
                        target_files = list(target_scan_dir.glob("*.dcm"))
                        if len(source_files) == len(target_files):
                            skipped_count += 1
                            pbar.set_postfix({
                                'Copied': copied_count, 
                                'Skipped': skipped_count,
                                'Errors': error_count
                            })
                            pbar.update(1)
                            continue
                
                # Copy the scan directory
                if source_scan_dir.exists():
                    # Create parent directories
                    target_scan_dir.parent.mkdir(parents=True, exist_ok=True)
                    
                    # Remove target if it exists (for clean copy)
                    if target_scan_dir.exists():
                        shutil.rmtree(target_scan_dir)
                    
                    # Copy the directory
                    shutil.copytree(source_scan_dir, target_scan_dir)
                    copied_count += 1
                else:
                    print(f"⚠️  Source not found: {source_scan_dir}")
                    error_count += 1
                
            except Exception as e:
                print(f"❌ Error copying {scan['Subject']}: {e}")
                error_count += 1
            
            pbar.set_postfix({
                'Copied': copied_count, 
                'Skipped': skipped_count,
                'Errors': error_count
            })
            pbar.update(1)
    
    print(f"\n✅ Copy completed!")
    print(f"   Copied: {copied_count} scans")
    print(f"   Skipped: {skipped_count} scans (already exist)")
    print(f"   Errors: {error_count} scans")
    
    return copied_count, skipped_count, error_count

def create_single_scan_dataset(source_csv, source_dir, target_dir, selection_strategy='random'):
    """
    Main function to create dataset with one scan per subject
    
    Args:
        source_csv: Path to source metadata CSV
        source_dir: Source directory with scans
        target_dir: Target directory for new dataset
        selection_strategy: 'random', 'earliest', 'latest', or 'baseline'
    """
    print("🚀 Creating single-scan dataset...")
    print("=" * 60)
    
    # Step 1: Select one scan per subject
    selected_df = select_one_scan_per_subject(source_csv, selection_strategy)
    
    # Step 2: Copy selected scans
    copied_count, skipped_count, error_count = copy_selected_scans(
        source_dir, target_dir, selected_df, resume=True
    )
    
    # Step 3: Save updated metadata
    target_metadata_path = Path(target_dir) / "metadata.csv"
    
    # Remove the TimestampDateTime column before saving
    metadata_to_save = selected_df.drop('TimestampDateTime', axis=1)
    
    metadata_to_save.to_csv(target_metadata_path, index=False)
    print(f"💾 Saved metadata: {target_metadata_path}")
    
    # Final summary
    print(f"\n🎉 DATASET CREATION COMPLETE!")
    print("=" * 60)
    print(f"Strategy used: {selection_strategy}")
    print(f"Original dataset: {len(pd.read_csv(source_csv))} scans")
    print(f"New dataset: {len(selected_df)} scans (1 per subject)")
    print(f"Files copied: {copied_count}")
    print(f"Target directory: {target_dir}")
    print(f"Metadata file: {target_metadata_path}")
    
    return selected_df

if __name__ == "__main__":
    # Configuration
    source_csv = r"C:\Users\User\github_repos\AD_CN_train_v1\metadata.csv"
    source_dir = r"C:\Users\User\github_repos\AD_CN_train_v1"
    target_dir = r"C:\Users\User\github_repos\AD_CN_train_v2"
    
    # Create the dataset
    selected_df = create_single_scan_dataset(
        source_csv=source_csv,
        source_dir=source_dir, 
        target_dir=target_dir,
        selection_strategy='random'  # Options: 'random', 'earliest', 'latest', 'baseline'
    )
    
    print("\n✅ Single-scan dataset creation completed!")


🚀 Creating single-scan dataset...
🔍 Loading metadata...
📊 Original dataset: 2968 scans from 674 subjects

🎯 Selecting scans using 'random' strategy...


Processing subjects: 100%|██████████| 674/674 [00:00<00:00, 888.11it/s] 


✅ Selected 674 scans (1 per subject)

📊 SELECTION SUMMARY:
Original scans: 2968
Selected scans: 674
Reduction: 2294 scans removed

Selected dataset distribution:
  CN: 385 subjects
  AD: 289 subjects
📁 Copying selected scans...
   Source: C:\Users\User\github_repos\AD_CN_train_v1
   Target: C:\Users\User\github_repos\AD_CN_train_v2


Copying scans: 100%|██████████| 674/674 [03:54<00:00,  2.88it/s, Copied=644, Skipped=30, Errors=0]


✅ Copy completed!
   Copied: 644 scans
   Skipped: 30 scans (already exist)
   Errors: 0 scans
💾 Saved metadata: C:\Users\User\github_repos\AD_CN_train_v2\metadata.csv

🎉 DATASET CREATION COMPLETE!
Strategy used: random
Original dataset: 2968 scans
New dataset: 674 scans (1 per subject)
Files copied: 644
Target directory: C:\Users\User\github_repos\AD_CN_train_v2
Metadata file: C:\Users\User\github_repos\AD_CN_train_v2\metadata.csv

✅ Single-scan dataset creation completed!



