In [1]:
import os
import shutil
from tqdm import tqdm

# =========================
# CONFIGURATION
# =========================
# Source root: each immediate sub-directory is treated as one class.
RAW_DIR = r"D:\Final_Semester_Project\AI_Attendance_System\ai-ml-model\DataSets\raw"
# Destination root: qualifying class folders are copied here under the same name.
PROCESSED_DIR = r"D:\Final_Semester_Project\AI_Attendance_System\ai-ml-model\DataSets\processed_500"
# Lower-case extensions (with leading dot) that count as image files.
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp"}
# A class is copied only when it holds at least this many image files.
MIN_IMAGES_PER_CLASS = 500

# =========================
# UTILITY FUNCTIONS
# =========================
def is_image_file(filename: str) -> bool:
    """Return True when *filename* carries a recognised image extension.

    The extension is lower-cased before it is looked up in
    ``IMAGE_EXTENSIONS``, so the check is case-insensitive. Only the name is
    inspected; the file itself is never opened.
    """
    _, extension = os.path.splitext(filename)
    return extension.lower() in IMAGE_EXTENSIONS


def count_images_in_directory(dir_path: str) -> int:
    """Return how many entries directly inside *dir_path* look like images.

    Entries are judged by name only (via ``is_image_file``); sub-directories
    are not descended into.
    """
    return sum(1 for entry in os.listdir(dir_path) if is_image_file(entry))


def copy_class_directory(src_class_path: str, dst_class_path: str):
    """Copy every image file of one class folder into *dst_class_path*.

    The destination folder is created when missing. Only entries accepted by
    ``is_image_file`` are copied; anything else in the source folder is
    left behind. Files keep their original names.
    """
    os.makedirs(dst_class_path, exist_ok=True)

    image_names = (
        name for name in os.listdir(src_class_path) if is_image_file(name)
    )
    for name in image_names:
        shutil.copy2(
            os.path.join(src_class_path, name),
            os.path.join(dst_class_path, name),
        )


# =========================
# MAIN LOGIC
# =========================
def _print_class_counts(heading: str, class_names, image_counts) -> None:
    """Print *heading*, then one aligned ``name : count`` line per class (sorted)."""
    print(heading)
    for cls in sorted(class_names):
        print(f"  {cls:<30} : {image_counts[cls]} images")


def main():
    """Filter the raw dataset by class size and copy the qualifying classes.

    Workflow:
      1. Count the images in every class folder under ``RAW_DIR``.
      2. Report which classes reach ``MIN_IMAGES_PER_CLASS`` and which do not.
      3. If at least one class qualifies, ask for confirmation and copy the
         qualifying class folders into ``PROCESSED_DIR``.
      4. Re-count the copied folders and compare against the expected total.

    Raises:
        FileNotFoundError: if ``RAW_DIR`` is not an existing directory.
    """
    # Validate raw directory exists
    if not os.path.isdir(RAW_DIR):
        raise FileNotFoundError(f"Raw directory not found: {RAW_DIR}")

    # Create processed directory if it doesn't exist
    os.makedirs(PROCESSED_DIR, exist_ok=True)
    print(f"Processed directory created/verified: {PROCESSED_DIR}")

    # Every immediate sub-directory of RAW_DIR is one class
    class_dirs = [
        d for d in os.listdir(RAW_DIR)
        if os.path.isdir(os.path.join(RAW_DIR, d))
    ]

    print(f"Found {len(class_dirs)} class directories in raw folder.\n")

    # STEP 1: Analyze classes and filter those with enough images
    print("Analyzing class image counts...")
    qualified_classes = []
    image_counts = {}

    for class_name in tqdm(class_dirs, desc="Analyzing classes", unit="class"):
        class_path = os.path.join(RAW_DIR, class_name)
        image_count = count_images_in_directory(class_path)
        image_counts[class_name] = image_count

        if image_count >= MIN_IMAGES_PER_CLASS:
            qualified_classes.append(class_name)

    # Display analysis results
    print("\n" + "=" * 60)
    print("IMAGE COUNT ANALYSIS")
    print("=" * 60)
    print(f"Minimum images required per class: {MIN_IMAGES_PER_CLASS}")
    print(f"Total classes found: {len(class_dirs)}")
    print(f"Classes with ≥{MIN_IMAGES_PER_CLASS} images: {len(qualified_classes)}")
    print(f"Classes to be excluded: {len(class_dirs) - len(qualified_classes)}")

    if qualified_classes:
        _print_class_counts(
            f"\nQUALIFIED CLASSES (≥{MIN_IMAGES_PER_CLASS} images):",
            qualified_classes,
            image_counts,
        )

    # Set membership keeps this linear instead of scanning the list per class
    qualified_set = set(qualified_classes)
    excluded = [cls for cls in class_dirs if cls not in qualified_set]
    if excluded:
        _print_class_counts(
            f"\nEXCLUDED CLASSES (<{MIN_IMAGES_PER_CLASS} images):",
            excluded,
            image_counts,
        )

    # Bail out before prompting: there is nothing to confirm when no class
    # qualifies, so asking "Proceed with copying?" would be misleading.
    if not qualified_classes:
        print("No classes meet the minimum image requirement.")
        return

    # Ask for confirmation before copying
    total_images = sum(image_counts[cls] for cls in qualified_classes)
    print(f"\n{'=' * 60}")
    print("COPY OPERATION SUMMARY:")
    print(f"  Source:      {RAW_DIR}")
    print(f"  Destination: {PROCESSED_DIR}")
    print(f"  Classes to copy: {len(qualified_classes)}")
    print(f"  Total images to copy: {total_images}")
    print(f"{'=' * 60}")

    response = input("\nProceed with copying? (y/n): ").strip().lower()
    if response not in ('y', 'yes'):
        print("Operation cancelled.")
        return

    # STEP 2: Copy qualified classes with single progress bar
    print(f"\nCopying {len(qualified_classes)} classes to processed directory...")

    for class_name in tqdm(qualified_classes, desc="Copying classes", unit="class"):
        src_class_path = os.path.join(RAW_DIR, class_name)
        dst_class_path = os.path.join(PROCESSED_DIR, class_name)
        copy_class_directory(src_class_path, dst_class_path)

    # STEP 3: Final verification and summary
    print("\n" + "=" * 60)
    print("COPYING COMPLETE")
    print("=" * 60)
    print(f"Successfully copied {len(qualified_classes)} classes to:")
    print(f"  {PROCESSED_DIR}")

    # Re-count what actually landed in the processed directory
    total_copied = 0
    print("\nVERIFICATION - Images in processed directory:")
    for class_name in sorted(qualified_classes):
        class_path = os.path.join(PROCESSED_DIR, class_name)
        if os.path.exists(class_path):
            count = count_images_in_directory(class_path)
            total_copied += count
            print(f"  {class_name:<30} : {count} images")

    print(f"\nTotal images copied: {total_copied}")

    # NOTE: images already present in the destination from an earlier run are
    # included in total_copied, so a mismatch does not always mean a failed copy.
    if total_images == total_copied:
        print("✓ All images copied successfully!")
    else:
        print(f"⚠ Warning: Expected {total_images} images, but found {total_copied}")


# Run the filter-and-copy workflow only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Processed directory created/verified: D:\Final_Semester_Project\AI_Attendance_System\ai-ml-model\DataSets\processed_500
Found 540 class directories in raw folder.

Analyzing class image counts...


Analyzing classes: 100%|█████████████████████████████████████████████████████████| 540/540 [00:03<00:00, 178.64class/s]


IMAGE COUNT ANALYSIS
Minimum images required per class: 500
Total classes found: 540
Classes with ≥500 images: 52
Classes to be excluded: 488

QUALIFIED CLASSES (≥500 images):
  n000032                        : 519 images
  n000067                        : 511 images
  n000080                        : 594 images
  n000081                        : 536 images
  n000097                        : 603 images
  n000103                        : 572 images
  n000105                        : 515 images
  n000111                        : 511 images
  n000129                        : 585 images
  n000149                        : 522 images
  n000163                        : 542 images
  n000176                        : 560 images
  n000185                        : 554 images
  n000186                        : 500 images
  n000192                        : 518 images
  n000202                        : 648 images
  n000203                        : 526 images
  n000223                        : 506 im





Proceed with copying? (y/n):  y



Copying 52 classes to processed directory...


Copying classes: 100%|██████████████████████████████████████████████████████████████| 52/52 [13:18<00:00, 15.35s/class]


COPYING COMPLETE
Successfully copied 52 classes to:
  D:\Final_Semester_Project\AI_Attendance_System\ai-ml-model\DataSets\processed_500

VERIFICATION - Images in processed directory:
  n000032                        : 519 images
  n000067                        : 511 images
  n000080                        : 594 images
  n000081                        : 536 images
  n000097                        : 603 images
  n000103                        : 572 images
  n000105                        : 515 images
  n000111                        : 511 images
  n000129                        : 585 images
  n000149                        : 522 images
  n000163                        : 542 images
  n000176                        : 560 images
  n000185                        : 554 images
  n000186                        : 500 images
  n000192                        : 518 images
  n000202                        : 648 images
  n000203                        : 526 images
  n000223                        :


