# Debug NIH Dataset Loading

This notebook helps debug why the dataset isn't loading images.

In [None]:
# === Check Dataset Structure ===
from pathlib import Path
import pandas as pd

# Root folder of the dataset; every check below is relative to this path.
data_dir = Path('/content/drive/MyDrive/NIH Chest XRAY Dataset')

print("üìÅ Checking dataset structure...\n")

# Check if directory exists
print(f"Directory exists: {data_dir.exists()}")
print(f"Directory path: {data_dir}\n")

# List all contents
print("Contents of data directory:")
if data_dir.is_dir():
    for item in sorted(data_dir.iterdir()):
        if item.is_dir():
            # Count entries lazily so a folder with many files is not
            # materialised as a full list just to take its length.
            file_count = sum(1 for _ in item.iterdir())
            print(f"  üìÇ {item.name}/ ({file_count} items)")
        else:
            size_mb = item.stat().st_size / (1024*1024)
            print(f"  üìÑ {item.name} ({size_mb:.2f} MB)")
else:
    # iterdir() raises on a missing path (or a path that is a file), which
    # would abort the diagnostics in exactly the situation they are meant to
    # explain. Report it and let the remaining checks run.
    print("  (path is missing or not a directory - check that the drive is mounted and the path is correct)")

# Check for CSV file: report its size and columns, then preview the first
# image names so they can be compared by eye with the files on disk.
print("\nüìä Checking labels file...")
labels_file = data_dir / 'Data_Entry_2017.csv'
if labels_file.exists():
    df = pd.read_csv(labels_file)
    print(f"‚úì Labels file found: {len(df)} entries")
    print(f"  Columns: {list(df.columns)}")
    if 'Image Index' in df.columns:
        print(f"\n  First few image names:")
        for i, name in enumerate(df['Image Index'].head(5), start=1):
            print(f"    {i}. {name}")
    else:
        # A CSV with a different schema would otherwise raise KeyError here;
        # the column list printed above shows what is actually available.
        print("\n  Column 'Image Index' not found - cannot list image names.")
else:
    print("‚úó Labels file NOT found!")

# Check image directories: for every folder whose name starts with "images",
# count PNG/JPG files directly inside it; if there are none, look one level
# deeper and report image counts for up to three subfolders.
print("\nüñºÔ∏è  Checking image directories...")
# glob('images*') also matches plain files (for example archives named
# images_*), so keep only real directories before reporting the count.
image_dirs = sorted(p for p in data_dir.glob('images*') if p.is_dir())
print(f"Found {len(image_dirs)} image directories:")

for img_dir in image_dirs:
    # Non-recursive patterns: only files directly inside img_dir are counted.
    png_files = list(img_dir.glob('*.png'))
    jpg_files = list(img_dir.glob('*.jpg'))

    print(f"\n  {img_dir.name}/:")
    print(f"    PNG files: {len(png_files)}")
    print(f"    JPG files: {len(jpg_files)}")

    # Show sample files
    all_files = png_files + jpg_files
    if all_files:
        print(f"    Sample files:")
        for f in all_files[:3]:
            print(f"      - {f.name}")
    else:
        # No images at the top level: check whether they are nested one
        # level down. Sorted so the three sampled subfolders are stable
        # between runs (iterdir() order is arbitrary).
        subdirs = sorted(d for d in img_dir.iterdir() if d.is_dir())
        if subdirs:
            print(f"    Has {len(subdirs)} subdirectories")
            for subdir in subdirs[:3]:
                sub_files = list(subdir.glob('*.png')) + list(subdir.glob('*.jpg'))
                print(f"      {subdir.name}/: {len(sub_files)} images")


In [None]:
# === Test Dataset Loading ===
import sys
sys.path.insert(0, '/content/fednams-plus')

from data import NIHChestXrayDataset

print("üß™ Testing NIH dataset loading...\n")

dataset = NIHChestXrayDataset(data_dir=data_dir)
dataset.load()

print(f"\n‚úì Dataset loaded: {len(dataset)} images")
print(f"  Classes: {len(dataset.class_names)}")
print(f"  Class names: {dataset.class_names[:5]}...")

if len(dataset) > 0:
    print("\n‚úÖ SUCCESS! Dataset is loading correctly.")
    
    # Test loading a sample
    print("\nüß™ Testing sample loading...")
    img, label = dataset[0]
    print(f"  Image shape: {img.size if hasattr(img, 'size') else 'N/A'}")
    print(f"  Label shape: {label.shape}")
    print(f"  Active labels: {label.sum():.0f}")
else:
    print("\n‚ùå PROBLEM: Dataset loaded 0 images!")
    print("\nPossible issues:")
    print("  1. Images are in subdirectories (not directly in images_xxx/)")
    print("  2. Image file names don't match CSV entries")
    print("  3. Wrong file extension (.jpg vs .png)")
