In [1]:
import os
from PIL import Image
from tqdm import tqdm
import warnings

# ---------------------------------------------------------------------------
# Settings
# ---------------------------------------------------------------------------
# Root folder whose immediate sub-directories are scanned (one per class).
PROCESSED_DIR = r"/home/sandeshprasai/Final_Semester_Project/AI_Attendance_System/ai-ml-model/DataSets/raw"

# Destination of the plain-text report.
OUTPUT_FILE = r"/home/sandeshprasai/Documents/images_size_analysis.txt"

# File suffixes (lower-case, with the leading dot) that count as images.
IMAGE_EXTENSIONS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".bmp",
    ".tiff",
    ".webp",
}

# Silence every warning category for the whole process so warnings do not
# interleave with the progress bar output.
warnings.filterwarnings("ignore")

# =========================
# MAIN LOGIC
# =========================
def _list_image_files(directory):
    """Return the names in *directory* whose suffix is in IMAGE_EXTENSIONS."""
    return [
        name for name in os.listdir(directory)
        if os.path.splitext(name.lower())[1] in IMAGE_EXTENSIONS
    ]


def _percent(part, whole):
    """Return *part* as a percentage of *whole*, or 0.0 when *whole* is 0."""
    return part / whole * 100 if whole else 0.0


def main():
    """Analyse image dimensions per class directory and write a text report.

    Every immediate sub-directory of ``PROCESSED_DIR`` is treated as one
    class.  For each file with a suffix in ``IMAGE_EXTENSIONS`` the image
    header is opened with Pillow and its width/height are compared against
    the 112 and 224 pixel thresholds (both dimensions must be strictly
    larger).  Files that cannot be opened are counted as failed and are
    excluded from the totals used for the percentages.

    The report is written to ``OUTPUT_FILE`` and a short summary is printed.
    Returns ``None``; prints an error and returns early when
    ``PROCESSED_DIR`` is missing or contains no sub-directories.
    """
    # Check if processed directory exists
    if not os.path.isdir(PROCESSED_DIR):
        print(f"Error: Directory not found: {PROCESSED_DIR}")
        return

    # Get all class directories
    class_dirs = sorted(
        d for d in os.listdir(PROCESSED_DIR)
        if os.path.isdir(os.path.join(PROCESSED_DIR, d))
    )

    if not class_dirs:
        print(f"No class directories found in: {PROCESSED_DIR}")
        return

    # List each class directory exactly once.  The listing is reused for
    # both the progress-bar total and the processing loop, so the two can
    # never disagree and the file system is not scanned twice.
    print("Counting total images...")
    files_by_class = {
        class_name: _list_image_files(os.path.join(PROCESSED_DIR, class_name))
        for class_name in class_dirs
    }
    total_to_process = sum(len(files) for files in files_by_class.values())

    # Overall statistics
    total_images = 0
    total_gt_112 = 0
    total_gt_224 = 0
    total_failed = 0
    class_results = {}

    # Process all images with single progress bar
    print(f"\nProcessing {total_to_process:,} images across {len(class_dirs)} directories...")

    with tqdm(total=total_to_process, desc="Analyzing images", unit="img",
              bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]") as pbar:

        for class_name in class_dirs:
            class_path = os.path.join(PROCESSED_DIR, class_name)

            # Counters for this class
            class_total = 0
            class_gt_112 = 0
            class_gt_224 = 0
            class_failed = 0

            for img_file in files_by_class[class_name]:
                img_path = os.path.join(class_path, img_file)
                try:
                    with Image.open(img_path) as img:
                        width, height = img.size
                except Exception:
                    # Deliberate best-effort: any unreadable or corrupt file
                    # is tallied as failed instead of aborting the scan.
                    class_failed += 1
                else:
                    class_total += 1
                    if width > 112 and height > 112:
                        class_gt_112 += 1
                        if width > 224 and height > 224:
                            class_gt_224 += 1

                # Advance once per file, readable or not.
                pbar.update(1)

            # Store class results
            class_results[class_name] = {
                'total': class_total,
                'gt_112': class_gt_112,
                'gt_224': class_gt_224,
                'failed': class_failed
            }

            # Update overall totals
            total_images += class_total
            total_gt_112 += class_gt_112
            total_gt_224 += class_gt_224
            total_failed += class_failed

    # Make sure the report's parent directory exists so a long scan is not
    # lost to a missing folder.
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    not_gt_112 = total_images - total_gt_112
    not_gt_224 = total_images - total_gt_224

    # Write results to text file
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        # Header
        f.write("=" * 80 + "\n")
        f.write("IMAGE SIZE ANALYSIS REPORT\n")
        f.write("=" * 80 + "\n\n")

        f.write(f"Source Directory: {PROCESSED_DIR}\n")
        f.write(f"Total Directories Analyzed: {len(class_dirs)}\n")
        f.write(f"Total Images Processed: {total_images:,}\n")
        f.write(f"Total Images Failed to Read: {total_failed}\n\n")

        # Class-wise statistics table
        f.write("CLASS-WISE STATISTICS\n")
        f.write("-" * 80 + "\n")
        f.write(f"{'Class Name':<30} {'Total':>8} {'>112x112':>12} {'>224x224':>12} {'% >224':>8}\n")
        f.write("-" * 80 + "\n")

        for class_name, stats in sorted(class_results.items()):
            percentage = _percent(stats['gt_224'], stats['total'])
            f.write(f"{class_name:<30} {stats['total']:>8,} {stats['gt_112']:>12,} "
                    f"{stats['gt_224']:>12,} {percentage:>7.1f}%\n")

        # Summary section
        f.write("\n" + "=" * 80 + "\n")
        f.write("OVERALL SUMMARY\n")
        f.write("=" * 80 + "\n\n")

        # Image size distribution.  _percent() yields 0.0 when no image was
        # readable, instead of raising ZeroDivisionError mid-report.
        f.write("IMAGE SIZE DISTRIBUTION:\n")
        f.write(f"  Images > 112x112:  {total_gt_112:,} ({_percent(total_gt_112, total_images):.1f}%)\n")
        f.write(f"  Images > 224x224:  {total_gt_224:,} ({_percent(total_gt_224, total_images):.1f}%)\n")
        f.write(f"  Images ≤ 112x112:  {not_gt_112:,} ({_percent(not_gt_112, total_images):.1f}%)\n")
        f.write(f"  Images ≤ 224x224:  {not_gt_224:,} ({_percent(not_gt_224, total_images):.1f}%)\n\n")

        # Directory analysis
        f.write("DIRECTORY ANALYSIS:\n")

        # Directories with all images > 224x224
        perfect_dirs = [name for name, stats in class_results.items()
                        if stats['total'] > 0 and stats['gt_224'] == stats['total']]
        f.write(f"  Directories with ALL images > 224x224: {len(perfect_dirs)}\n")

        # Directories with no images > 224x224
        poor_dirs = [name for name, stats in class_results.items()
                     if stats['total'] > 0 and stats['gt_224'] == 0]
        f.write(f"  Directories with NO images > 224x224: {len(poor_dirs)}\n\n")

        # Quality assessment
        f.write("QUALITY ASSESSMENT:\n")
        if total_images == 0:
            # Ratios are undefined without at least one readable image.
            f.write("  ⚠ Warning: No readable images found; quality assessment skipped\n")
        else:
            ratio_gt_224 = total_gt_224 / total_images
            if ratio_gt_224 >= 0.8:
                f.write("  ✅ Excellent: Over 80% of images are high resolution (>224x224)\n")
            elif ratio_gt_224 >= 0.5:
                f.write("  ⚠ Moderate: 50-80% of images are high resolution\n")
            else:
                f.write("  ❌ Poor: Less than 50% of images are high resolution\n")

            if total_gt_112 / total_images >= 0.9:
                f.write("  ✅ Good: Over 90% of images are acceptable (>112x112)\n")
            else:
                f.write("  ⚠ Warning: Less than 90% of images are acceptable size\n")

    # Final console message
    print("\n✓ Analysis complete!")
    print(f"  Report saved to: {OUTPUT_FILE}")
    print(f"  Total images: {total_images:,}")
    print(f"  Images > 224x224: {total_gt_224:,} ({_percent(total_gt_224, total_images):.1f}%)")

# Run the analysis only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Counting total images...

Processing 197,693 images across 540 directories...


Analyzing images: 100%|██████████| 197693/197693 [01:14<00:00]


✓ Analysis complete!
  Report saved to: /home/sandeshprasai/Documents/images_size_analysis.txt
  Total images: 197,693
  Images > 224x224: 85,804 (43.4%)



