In [None]:
import zipfile
from pathlib import Path
import os
import pandas as pd

# Extract MRI scans for subjects with paired MRI and protein data and class labels

In [3]:
def extract_mprage_files(zip_path, target_folder):
    """
    Extract all MPRAGE files from a zip archive, maintaining subject structure.

    An archive member is selected when its path contains 'MPRAGE' or 'MP-RAGE'
    and at least one of its '/'-separated parts contains '_S_' (the subject-ID
    marker, e.g. "133_S_0525"). The first path part (the archive root, e.g.
    "ADNI") is dropped and the remainder is recreated under ``target_folder``.

    Args:
        zip_path: Path to the source zip archive.
        target_folder: Directory that receives the extracted files. It is
            created, including missing parent directories, if absent.

    Returns:
        int: Number of files written to ``target_folder``.

    Failures on individual members are printed and do not stop the run.
    Directory entries and members whose path would resolve outside
    ``target_folder`` are skipped.
    """
    import shutil
    import zipfile
    from pathlib import Path

    # Create target folder (parents too, so a brand-new output location works)
    target_base = Path(target_folder)
    target_base.mkdir(parents=True, exist_ok=True)
    # Resolved root, used below to reject members that escape the target folder
    target_root = target_base.resolve()

    extracted_count = 0
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Find all MPRAGE-related items (directory entries are still counted here)
        mprage_items = [
            item for item in zip_ref.namelist()
            if 'MPRAGE' in item or 'MP-RAGE' in item
        ]
        print(f"🔍 Found {len(mprage_items)} MPRAGE items")

        for item in mprage_items:
            # Directory entries end with '/'. Writing one out as a file would
            # occupy the directory's name and break every file inside it.
            if item.endswith('/'):
                continue

            try:
                # Parse the path: ADNI/133_S_0525/MPRAGE/...
                parts = item.split('/')

                # Only members with a subject-ID-like part are extracted
                if not any('_S_' in part for part in parts):
                    continue

                # Drop the first part (archive root) and keep the rest:
                # target_folder/subject_id/MPRAGE/...
                relative_path = '/'.join(parts[1:])
                target_path = target_base / relative_path

                # Reject paths that climb out of the target folder via '..'
                try:
                    target_path.resolve().relative_to(target_root)
                except ValueError:
                    print(f"❌ Skipping unsafe path: {item}")
                    continue

                # Create parent directories
                target_path.parent.mkdir(parents=True, exist_ok=True)

                # Stream the member to disk instead of loading it all in memory
                with zip_ref.open(item) as source, open(target_path, 'wb') as target:
                    shutil.copyfileobj(source, target)

                extracted_count += 1
                if extracted_count <= 10:  # Show first 10
                    print(f"   📁 Extracted: {relative_path}")

            except Exception as e:
                # Best effort: report the failing member and carry on
                print(f"❌ Error extracting {item}: {e}")

        print(f"✅ Successfully extracted {extracted_count} MPRAGE files")

    return extracted_count

# Usage: extract the MPRAGE files of one archive into the target folder.
# NOTE(review): the archive is MRI_5.zip but the target folder is MRI_2 —
# confirm this pairing is intended.
zip_path = r"D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\MRI\MRI_5.zip"
target_folder = r"D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\MRI\MRI_2"

print(f"🚀 Extracting MPRAGE files from: {zip_path}")
print(f"📁 Target folder: {target_folder}")

extracted_count = extract_mprage_files(zip_path, target_folder)

# The return value is the number of files written, so it is reported as
# "Extracted" rather than "Found".
print(f"\n🎯 Extraction complete! Extracted {extracted_count} MPRAGE files")
print(f"📁 Files extracted to: {target_folder}")

🚀 Extracting MPRAGE files from: D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\MRI\MRI_5.zip
📁 Target folder: D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\MRI\MRI_2
🔍 Found 208 MPRAGE items
   📁 Extracted: 002_S_0413/Accelerated_Sagittal_MPRAGE__MSV21_/2025-02-19_10_21_54.0/I11128519/ADNI_002_S_0413_MR_Accelerated_Sagittal_MPRAGE_(MSV21)__br_raw_20250226190620274_90.dcm
   📁 Extracted: 002_S_0413/Accelerated_Sagittal_MPRAGE__MSV21_/2025-02-19_10_21_54.0/I11128519/ADNI_002_S_0413_MR_Accelerated_Sagittal_MPRAGE_(MSV21)__br_raw_20250226190620365_179.dcm
   📁 Extracted: 002_S_0413/Accelerated_Sagittal_MPRAGE__MSV21_/2025-02-19_10_21_54.0/I11128519/ADNI_002_S_0413_MR_Accelerated_Sagittal_MPRAGE_(MSV21)__br_raw_20250226190620557_47.dcm
   📁 Extracted: 002_S_0413/Accelerated_Sagittal_MPRAGE__MSV21_/2025-02-19_10_21_54.0/I11128519/ADNI_002_S_0413_MR_Accelerated_Sagittal_MPRAGE_(MSV21)__br_raw_20250226190620686_101.dcm
   📁 Extracted: 002_S_0413/Acc

In [10]:
def count_subject_ids_at_level(zip_paths):
    """
    Collect the subject IDs found at the ADNI/[Subject_ID]/ level of each zip.

    A member contributes a subject ID when its path has at least three
    '/'-separated parts, the first part is exactly 'ADNI' and the second part
    contains '_S_'.

    Args:
        zip_paths: Iterable of paths to zip archives.

    Returns:
        dict: Maps each archive's file name (not its full path) to the set of
        subject IDs found in it. An archive that cannot be processed maps to
        an empty set, and an error line is printed for it.
    """
    import zipfile
    from pathlib import Path

    print("🔍 Counting subject IDs at ADNI/[Subject_ID] level across ZIP files...")
    print("=" * 70)

    # Subjects per ZIP, keyed by archive file name
    zip_subjects = {}

    for zip_path in zip_paths:
        zip_name = Path(zip_path).name
        print(f"\n📦 Processing: {zip_name}")

        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                subjects_in_zip = set()
                for item in zip_ref.namelist():
                    parts = item.split('/')
                    # Required structure: ADNI/[Subject_ID]/...
                    if len(parts) >= 3 and parts[0] == 'ADNI' and '_S_' in parts[1]:
                        subjects_in_zip.add(parts[1])

            zip_subjects[zip_name] = subjects_in_zip
            print(f"   ✅ Found {len(subjects_in_zip)} unique subjects at ADNI/[Subject_ID] level")

        except Exception as e:
            # Best effort: a missing or corrupt archive is recorded as empty
            print(f"   ❌ Error processing {zip_name}: {e}")
            zip_subjects[zip_name] = set()

    return zip_subjects

# The 5 ZIP archives all sit in the same MRI directory, so the paths are
# assembled from that directory plus each archive's file name.
mri_dir = r"D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\MRI"
zip_names = (
    "MRI_1 (1).zip",
    "MRI_2.zip",
    "MRI_3.zip",
    "MRI_4.zip",
    "MRI_5.zip",
)
# Joined with an explicit backslash so the strings match the Windows paths exactly
zip_paths = ["\\".join((mri_dir, name)) for name in zip_names]

# Collect the subject IDs found in each archive
print("🚀 Starting subject ID count at ADNI/[Subject_ID] level...")
zip_subjects = count_subject_ids_at_level(zip_paths)

🚀 Starting subject ID count at ADNI/[Subject_ID] level...
🔍 Counting subject IDs at ADNI/[Subject_ID] level across ZIP files...

📦 Processing: MRI_1 (1).zip
   ✅ Found 39 unique subjects at ADNI/[Subject_ID] level

📦 Processing: MRI_2.zip
   ✅ Found 39 unique subjects at ADNI/[Subject_ID] level

📦 Processing: MRI_3.zip
   ✅ Found 39 unique subjects at ADNI/[Subject_ID] level

📦 Processing: MRI_4.zip
   ✅ Found 39 unique subjects at ADNI/[Subject_ID] level

📦 Processing: MRI_5.zip
   ✅ Found 39 unique subjects at ADNI/[Subject_ID] level


# Extract Additional MRI Training Images

In [None]:
# Plan: return the subject_id values for 200 additional images (AD and CN)

# Raw string: in a regular literal "\A" is an invalid escape sequence.
r"D:\ADNI\AD_CN\AD,CN_My_Table_25Jun2025.csv"
# read the csv above into a pandas df called ad_cn_df
# filter for rows where visit is 'bl'

# read D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\rid_subject_mapping.csv into pandas
# and call it rid_subject_mapping_df

# in ad_cn_df, drop the rows whose subject_id appears in rid_subject_mapping_df

In [None]:
# Load the AD/CN table. The path is a raw string because "\A" in a regular
# literal is an invalid escape sequence and triggers a SyntaxWarning.
ad_cn_df = pd.read_csv(r"D:\ADNI\AD_CN\AD,CN_My_Table_25Jun2025.csv")
print(f"Original AD/CN table shape: {ad_cn_df.shape}")

# Filter for baseline visit ('bl')
ad_cn_df = ad_cn_df[ad_cn_df['visit'] == 'bl']
print(f"After filtering for baseline visit: {ad_cn_df.shape}")

# Read the RID subject mapping file
rid_subject_mapping_df = pd.read_csv(r"D:\ADNI\AD_CN\proteomics\Biomarkers Consortium Plasma Proteomics MRM\rid_subject_mapping.csv")
print(f"RID subject mapping shape: {rid_subject_mapping_df.shape}")

# # Display sample of both DataFrames
# print(f"\n📋 Sample of AD/CN table:")
# print(ad_cn_df.head())
# print(f"\n📋 Sample of RID subject mapping:")
# print(rid_subject_mapping_df.head())

# Get the subject IDs from the mapping file
subject_ids_to_remove = rid_subject_mapping_df['subject_id'].unique()
print(f"\n🔍 Found {len(subject_ids_to_remove)} unique subject IDs to remove")

# Drop rows where subject_id exists in the mapping file.
# Only rows actually present in the baseline table are dropped, so the number
# of rows removed can be smaller than the number of IDs listed above.
original_count = len(ad_cn_df)
ad_cn_df = ad_cn_df[~ad_cn_df['subject_id'].isin(subject_ids_to_remove)]
final_count = len(ad_cn_df)

print(f"\n✅ Filtering complete:")
print(f"   • Original rows: {original_count}")
print(f"   • Rows removed: {original_count - final_count}")
print(f"   • Final rows: {final_count}")

Original AD/CN table shape: (2693, 6)
After filtering for baseline visit: (804, 6)
RID subject mapping shape: (39, 2)

�� Found 39 unique subject IDs to remove

✅ Filtering complete:
   • Original rows: 804
   • Rows removed: 37
   • Final rows: 767


  ad_cn_df = pd.read_csv("D:\ADNI\AD_CN\AD,CN_My_Table_25Jun2025.csv")


In [9]:
print(ad_cn_df.head())

# Sampling parameters: rows drawn per research group, and the seed that makes
# the draw reproducible. Every message and check below is derived from these,
# so they cannot drift from the sampling that is actually performed.
n_per_group = 100
random_seed = 42

# Randomly sample n_per_group AD and n_per_group CN subjects
print(f"\n🎯 SAMPLING {n_per_group} AD AND {n_per_group} CN SUBJECTS")
print("=" * 50)

# Check current research_group distribution
print(f"Current research_group distribution:")
print(ad_cn_df['research_group'].value_counts())

# Randomly sample AD subjects (raises ValueError if the group is too small)
ad_subjects = ad_cn_df[ad_cn_df['research_group'] == 'AD'].sample(n=n_per_group, random_state=random_seed)
print(f"\n✅ Sampled {len(ad_subjects)} AD subjects")

# Randomly sample CN subjects
cn_subjects = ad_cn_df[ad_cn_df['research_group'] == 'CN'].sample(n=n_per_group, random_state=random_seed)
print(f"✅ Sampled {len(cn_subjects)} CN subjects")

# Combine the sampled subjects; the index is renumbered from 0
sampled_df = pd.concat([ad_subjects, cn_subjects], ignore_index=True)
print(f"✅ Combined sample size: {len(sampled_df)}")

# # Display the sampled subjects with subject_id and research_group
# print(f"\n📋 SAMPLED SUBJECTS:")
# print("=" * 60)
# print(f"{'Index':<6} {'subject_id':<15} {'research_group':<15}")
# print("-" * 60)

# One line per sampled subject: index, subject_id, research_group
for row in sampled_df[['subject_id', 'research_group']].itertuples():
    print(f"{row.Index:<6} {row.subject_id:<15} {row.research_group:<15}")

# Print all sampled subject_ids with commas
print(f"\n📋 ALL {len(sampled_df)} SUBJECT IDs (comma-separated):")
print("=" * 60)
subject_ids_list = sampled_df['subject_id'].tolist()
subject_ids_string = ','.join(map(str, subject_ids_list))
print(subject_ids_string)

# Show summary statistics
print(f"\n📊 SAMPLE SUMMARY:")
print(f"   • Total subjects: {len(sampled_df)}")
print(f"   • AD subjects: {len(sampled_df[sampled_df['research_group'] == 'AD'])}")
print(f"   • CN subjects: {len(sampled_df[sampled_df['research_group'] == 'CN'])}")

# Verify the sampling worked correctly
print(f"\n🔍 VERIFICATION:")
print(f"   • AD count = {n_per_group}: {'✅' if len(ad_subjects) == n_per_group else '❌'}")
print(f"   • CN count = {n_per_group}: {'✅' if len(cn_subjects) == n_per_group else '❌'}")
print(f"   • Total = {2 * n_per_group}: {'✅' if len(sampled_df) == 2 * n_per_group else '❌'}")

# Save the sampled data if needed
# sampled_df.to_csv('sampled_ad_cn_subjects.csv', index=False)
# print(f"\n💾 Sampled data saved to 'sampled_ad_cn_subjects.csv'")

   subject_id visit  PROTEIN research_group subject_date  subject_age
1  011_S_0023    bl     30.0             CN   2005-10-18        71.79
2  011_S_0010    bl     35.0             AD   2005-10-26        73.90
3  011_S_0005    bl     43.0             CN   2005-08-23        73.73
4  011_S_0003    bl     86.0             AD   2005-08-18        81.30
5  011_S_0008    bl     41.0             CN   2005-08-29        84.50

🎯 SAMPLING 100 AD AND 100 CN SUBJECTS
Current research_group distribution:
research_group
CN    516
AD    251
Name: count, dtype: int64

✅ Sampled 100 AD subjects
✅ Sampled 100 CN subjects
✅ Combined sample size: 200
0      033_S_5017      AD             
1      023_S_0083      AD             
2      127_S_5028      AD             
3      130_S_1337      AD             
4      135_S_4657      AD             
5      005_S_5119      AD             
6      073_S_5090      AD             
7      033_S_10249     AD             
8      005_S_0221      AD             
9      137_

# Extract additional MRI training data