In [None]:
import os

image_dataset = r"MIMICCXR\mimic-cxr-images\files"


def _subdirs(parent):
    """Yield the joined path of every directory found directly inside `parent`."""
    for entry in os.listdir(parent):
        candidate = os.path.join(parent, entry)
        if os.path.isdir(candidate):
            yield candidate


# Walk exactly three directory levels below the root (top-level folder ->
# patient -> study) and count the files in each study directory whose name
# ends in ".jpg", compared case-insensitively.
total_images = sum(
    1
    for group_dir in _subdirs(image_dataset)
    for patient_dir in _subdirs(group_dir)
    for study_dir in _subdirs(patient_dir)
    for file_name in os.listdir(study_dir)
    if file_name.lower().endswith('.jpg')
)

print(f"✅ Total JPG images found: {total_images:,}")



✅ Total JPG images found: 105,475


In [8]:
import os

image_dataset = r"MIMICCXR\mimic-cxr-images\files"
report_dataset = r"MIMICCXR\mimic-cxr-reports\files"

# Running totals for the diagnostic summary printed at the end of the cell.
total_images = 0
images_with_reports = 0
images_without_reports = 0
missing_report_files = []

# Directory layout walked here:
#   <image_dataset>/<big_folder>/<patient_id>/<study_id>/<image>.jpg
# and the report for a study is looked up at:
#   <report_dataset>/<big_folder>/<patient_id>/<study_id>.txt
for big_folder in os.listdir(image_dataset):
    big_folder_path = os.path.join(image_dataset, big_folder)

    if not os.path.isdir(big_folder_path):
        continue

    for patient_id in os.listdir(big_folder_path):
        patient_path = os.path.join(big_folder_path, patient_id)

        if not os.path.isdir(patient_path):
            continue

        for study_id in os.listdir(patient_path):
            study_path = os.path.join(patient_path, study_id)

            if not os.path.isdir(study_path):
                continue

            # Check if report exists for this study
            report_path = os.path.join(report_dataset, big_folder, patient_id, f"{study_id}.txt")
            report_exists = os.path.exists(report_path)

            # Count images in this study (".jpg" suffix, case-insensitive)
            num_images = sum(
                1 for f in os.listdir(study_path) if f.lower().endswith('.jpg')
            )
            total_images += num_images

            if report_exists:
                images_with_reports += num_images
            else:
                images_without_reports += num_images
                if len(missing_report_files) < 10:  # Store first 10 examples
                    missing_report_files.append({
                        'big_folder': big_folder,
                        'patient_id': patient_id,
                        'study_id': study_id,
                        'num_images': num_images,
                        'expected_report': report_path
                    })

# Guard the percentage: a tree that exists but contains no JPGs would
# otherwise end this cell with a ZeroDivisionError instead of a report.
if total_images:
    percent_matched = (images_with_reports / total_images) * 100
else:
    percent_matched = 0.0

print("="*70)
print("DIAGNOSTIC REPORT")
print("="*70)
print(f"Total images scanned:          {total_images:,}")
print(f"Images WITH matching reports:  {images_with_reports:,}")
print(f"Images WITHOUT reports:        {images_without_reports:,}")
print(f"Percentage matched:            {percent_matched:.1f}%")

print(f"\n{'='*70}")
print("EXAMPLES OF MISSING REPORTS (first 10):")
print("="*70)
for i, missing in enumerate(missing_report_files, 1):
    print(f"\n{i}. Study: {missing['study_id']}")
    print(f"   Patient: {missing['patient_id']}")
    print(f"   Folder: {missing['big_folder']}")
    print(f"   Images: {missing['num_images']}")
    print(f"   Expected report at: {missing['expected_report']}")
    # Re-checked at print time, so this reflects the current state on disk.
    print(f"   Report exists: {os.path.exists(missing['expected_report'])}")


DIAGNOSTIC REPORT
Total images scanned:          105,475
Images WITH matching reports:  105,475
Images WITHOUT reports:        0
Percentage matched:            100.0%

EXAMPLES OF MISSING REPORTS (first 10):


In [9]:
import os

image_dataset = r"MIMICCXR\mimic-cxr-images\files"
report_dataset = r"MIMICCXR\mimic-cxr-reports\files"

# Parallel lists: images[i] is a JPG path and reports[i] is the path of the
# report for the study directory that image was found in.
images = []
reports = []

# os.listdir() returns entries in arbitrary order. Every listing is sorted so
# the pair order (and anything derived from it later, such as a seeded
# train/val split) is the same on every machine and every run.
for big_folder in sorted(os.listdir(image_dataset)):
    big_folder_path = os.path.join(image_dataset, big_folder)

    if not os.path.isdir(big_folder_path):
        continue

    for patient_id in sorted(os.listdir(big_folder_path)):
        patient_path = os.path.join(big_folder_path, patient_id)

        if not os.path.isdir(patient_path):
            continue

        for study_id in sorted(os.listdir(patient_path)):
            study_path = os.path.join(patient_path, study_id)

            if not os.path.isdir(study_path):
                continue

            # One report per study; every image in the study shares it.
            # The path is not checked for existence in this cell.
            report_path = os.path.join(report_dataset, big_folder, patient_id, f"{study_id}.txt")

            # Only process .jpg files (suffix compared case-insensitively)
            for image_file in sorted(os.listdir(study_path)):
                if image_file.lower().endswith('.jpg'):
                    images.append(os.path.join(study_path, image_file))
                    reports.append(report_path)

print(f"✅ Total image-report pairs: {len(images):,}")
print(f"✅ Unique images: {len(set(images)):,}")
print(f"✅ Unique reports: {len(set(reports)):,}")

✅ Total image-report pairs: 105,475
✅ Unique images: 105,475
✅ Unique reports: 63,751


In [10]:
import pandas as pd

# Create DataFrame: one row per image, paired with the report path of its study
df = pd.DataFrame({
    'image_path': images,
    'report_path': reports
})

# Read report texts
print("Reading report texts...")
def read_report(path):
    """Return the whitespace-stripped text of the report file at `path`.

    The file is decoded as UTF-8 and undecodable bytes are dropped. If the
    file cannot be opened or read (OSError, e.g. it does not exist), an empty
    string is returned so the row can be filtered out afterwards. Any other
    exception is allowed to propagate.
    """
    try:
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            return f.read().strip()
    except OSError:
        return ""

# Several images share one report, so read each distinct report file once
# and map the text back onto every row that references it.
report_text_by_path = {path: read_report(path) for path in df['report_path'].unique()}
df['report_text'] = df['report_path'].map(report_text_by_path)

# Remove empty reports. The .copy() makes the filtered frame independent, so
# adding a column below does not write into a slice of the original frame.
df = df[df['report_text'] != ""].copy()

# Add metadata: number of whitespace-separated tokens in each report
df['report_words'] = df['report_text'].str.split().str.len()

print(f"\n{'='*70}")
print("FINAL DATASET")
print(f"{'='*70}")
print(f"Total samples:       {len(df):,}")
print(f"Unique images:       {df['image_path'].nunique():,}")
print(f"Unique reports:      {df['report_path'].nunique():,}")
print(f"\nReport statistics:")
print(f"  Mean words:        {df['report_words'].mean():.1f}")
print(f"  Median words:      {df['report_words'].median():.0f}")
print(f"  Min words:         {df['report_words'].min()}")
print(f"  Max words:         {df['report_words'].max()}")

# Images per study distribution (rows are grouped by their shared report path)
images_per_study = df.groupby('report_path').size()
print(f"\nImages per study:")
print(f"  Mean:              {images_per_study.mean():.2f}")
print(f"  Distribution:")
# value_counts() is ordered most-frequent first: keep the five most common
# study sizes, then list them in ascending size order.
study_size_counts = images_per_study.value_counts()
for n, count in study_size_counts.head(5).sort_index().items():
    print(f"    {n} image(s):  {count:,} studies")

# Save
df.to_csv('mimic_complete_dataset.csv', index=False)
print(f"\n✅ Saved to: mimic_complete_dataset.csv")
print(f"{'='*70}")

Reading report texts...

FINAL DATASET
Total samples:       105,475
Unique images:       105,475
Unique reports:      63,751

Report statistics:
  Mean words:        85.8
  Median words:      77
  Min words:         15
  Max words:         552

Images per study:
  Mean:              1.65
  Distribution:
    1 image(s):  28,787 studies
    2 image(s):  28,870 studies
    3 image(s):  5,468 studies
    4 image(s):  594 studies
    5 image(s):  24 studies

✅ Saved to: mimic_complete_dataset.csv


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# ==============================================================================
# LOAD DATASET
# ==============================================================================

df = pd.read_csv('mimic_complete_dataset.csv')

print("="*70)
print("DATASET SPLITTING - BY STUDY")
print("="*70)
print(f"Total images:    {len(df):,}")
print(f"Unique studies:  {df['report_path'].nunique():,}")

# ==============================================================================
# SPLIT BY STUDY (90% train, 10% val)
# ==============================================================================

# Each distinct report path is treated as one study, so all images that share
# a report end up on the same side of the split.
# NOTE(review): the split unit is the study, not the patient. If one patient
# can have several studies, that patient may appear in both train and val;
# confirm whether patient-level separation is required for evaluation.
unique_studies = df['report_path'].unique()

# Split: 90% train, 10% val (fixed seed so the split is repeatable for a
# given ordering of unique_studies)
train_studies, val_studies = train_test_split(
    unique_studies,
    test_size=0.1,      # 10% for validation
    random_state=42
)

print(f"\nTrain studies: {len(train_studies):,} ({len(train_studies)/len(unique_studies)*100:.1f}%)")
print(f"Val studies:   {len(val_studies):,} ({len(val_studies)/len(unique_studies)*100:.1f}%)")

# ==============================================================================
# GET ALL IMAGES FOR EACH SPLIT
# ==============================================================================

# Get all images that belong to train studies
train_df = df[df['report_path'].isin(train_studies)]

# Get all images that belong to val studies
val_df = df[df['report_path'].isin(val_studies)]

print(f"\n{'='*70}")
print("FINAL SPLIT")
print("="*70)
print(f"Train: {len(train_df):,} images from {len(train_studies):,} studies")
print(f"Val:   {len(val_df):,} images from {len(val_studies):,} studies")

# Verify the split on the frames that are actually written to disk.
# Comparing train_studies with val_studies alone can never fail, because
# train_test_split partitions its input.
overlap = set(train_df['report_path']) & set(val_df['report_path'])
assert len(overlap) == 0, "ERROR: Overlap detected!"
# Every row of the dataset must have landed in exactly one of the two frames.
assert len(train_df) + len(val_df) == len(df), "ERROR: Split does not cover every image!"
print(f"\n✅ No overlap - split is valid!")

# ==============================================================================
# SAVE SPLITS
# ==============================================================================

train_df.to_csv('train.csv', index=False)
val_df.to_csv('val.csv', index=False)

print(f"\n✅ Saved train.csv ({len(train_df):,} samples)")
print(f"✅ Saved val.csv ({len(val_df):,} samples)")




DATASET SPLITTING - BY STUDY
Total images:    105,475
Unique studies:  63,751

Train studies: 57,375 (90.0%)
Val studies:   6,376 (10.0%)

FINAL SPLIT
Train: 94,949 images from 57,375 studies
Val:   10,526 images from 6,376 studies

✅ No overlap - split is valid!

✅ Saved train.csv (94,949 samples)
✅ Saved val.csv (10,526 samples)
