In [2]:
%pip install tqdm Pillow ipywidgets jupyter ipykernel

Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Downloading widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Collecting notebook (from jupyter)
  Downloading notebook-7.3.3-py3-none-any.whl.metadata (10 kB)
Collecting jupyter-console (from jupyter)
  Downloading jupyter_console-6.6.3-py3-none-any.whl.metadata (5.8 kB)
Collecting nbconvert (from jupyter)
  Downloading nbconvert-7.16.6-py3-none-any.whl.metadata (8.5 kB)
Collecting jupyterlab (from jupyter)
  Downloading jupyterlab-4.3.6-py3-none-any.whl.metadata (16 kB)
Collecting async-lru>=1.0.0 (from jupyterlab->jupyter)
  Downloading async_lru-2.0.5-py3-none-any.whl.metadata (4.5 kB)
Collecting httpx>=0.25.0 (fro

In [None]:
import os
import pathlib
from PIL import Image
import multiprocessing
from functools import partial
import concurrent.futures
from tqdm.notebook import tqdm  # Using notebook version of tqdm

# Set these variables to your desired paths and size
parent_dir = '/Volumes/External SSD/rxrx3/'
output_dir = '/Volumes/External SSD/rxrx3_resized/'
target_size = (512, 512)  # Change this if you want a different size

# Function to resize a single image
def resize_image(src_path, dst_path):
    try:
        # Create destination directory if needed
        os.makedirs(os.path.dirname(dst_path), exist_ok=True)

        # Open, resize and save the image
        with Image.open(src_path) as img:
            # Resize with antialiasing
            resized_img = img.resize(target_size, Image.LANCZOS)
            # Save the resized image
            resized_img.save(dst_path)
        return True
    except Exception as e:
        return (src_path, str(e))

# Get all image paths from a directory
def get_image_paths(source_dir, dest_dir):
    image_paths = []

    # Get all directories first
    all_dirs = list(os.walk(source_dir))

    # Display progress for directory scanning
    for root, _, files in tqdm(all_dirs, desc="Scanning directories"):
        rel_path = os.path.relpath(root, source_dir)

        for file in files:
            if file.lower().endswith((".png", ".jpg", ".jpeg", ".tif", ".tiff")):
                src_path = os.path.join(root, file)

                # Construct destination path
                if rel_path == ".":
                    dst_path = os.path.join(dest_dir, file)
                else:
                    dst_path = os.path.join(dest_dir, rel_path, file)

                image_paths.append((src_path, dst_path))

    return image_paths

# Process a single directory
def process_directory(input_dir, output_base_dir):
    """Process a single directory of images"""
    
    # Get directory name and create output dir name with _512 suffix
    dir_name = os.path.basename(input_dir)
    output_dir = os.path.join(output_base_dir, f"{dir_name}_512")
    
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    print(f"\nProcessing directory: {dir_name}")
    print(f"Output directory: {output_dir}")
    
    # Get all image paths
    image_paths = get_image_paths(input_dir, output_dir)
    print(f"Found {len(image_paths)} images to resize")
    
    if not image_paths:
        print(f"No images found in {input_dir}. Skipping...")
        return (0, 0)
    
    # Get number of CPU cores
    num_cores = multiprocessing.cpu_count()
    # Use 80% of available cores to avoid overloading the system
    num_workers = max(1, int(num_cores * 0.8))
    print(f"Using {num_workers} workers")
    
    # Process images in parallel
    results = []
    failures = []

    # Process in batches to show progress
    batch_size = 100  # Adjust based on memory constraints
    total_batches = (len(image_paths) + batch_size - 1) // batch_size

    for i in range(0, len(image_paths), batch_size):
        batch = image_paths[i:i + batch_size]
        batch_num = i // batch_size + 1
        print(f"Processing batch {batch_num}/{total_batches}")

        # Process current batch
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
            # Map function directly to all items in batch
            batch_src_paths, batch_dst_paths = zip(*batch)
            for idx, result in enumerate(
                tqdm(
                    executor.map(resize_image, batch_src_paths, batch_dst_paths),
                    total=len(batch),
                    desc=f"Resizing images ({dir_name})"
                )
            ):
                if result is True:
                    results.append(result)
                else:
                    failures.append(result)
    
    print(f"\nResizing complete for {dir_name}!")
    print(f"Successfully resized: {len(results)}/{len(image_paths)} images")
    
    if failures:
        print(f"Failed to resize: {len(failures)} images")
        print("First few errors:")
        for src_path, error in failures[:5]:
            print(f"- {src_path}: {error}")

        if len(failures) > 5:
            print(f"... and {len(failures) - 5} more errors")
            
    return (len(results), len(failures))

# Main execution code
# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Get subdirectories
subdirs = [os.path.join(parent_dir, d) for d in os.listdir(parent_dir) 
          if os.path.isdir(os.path.join(parent_dir, d))]

if not subdirs:
    print(f"No subdirectories found in {parent_dir}")
else:
    print(f"Found {len(subdirs)} subdirectories to process:")
    for d in subdirs:
        print(f"- {os.path.basename(d)}")
    print("-" * 50)
    
    # Process each subdirectory
    total_success = 0
    total_failures = 0
    
    for subdir in subdirs:
        success, failures = process_directory(subdir, output_dir)
        total_success += success
        total_failures += failures
    
    # Final statistics
    print("\n" + "="*50)
    print("OVERALL STATISTICS:")
    print(f"Processed {len(subdirs)} directories")
    print(f"Successfully resized: {total_success} images")
    print(f"Failed: {total_failures} images")
    print("="*50)

Found 2 subdirectories to process:
- rxrx3
- rxrx3_temp
--------------------------------------------------

Processing directory: rxrx3
Output directory: /Volumes/External SSD/rxrx3_resized/rxrx3_512


Scanning directories:   0%|          | 0/6 [00:00<?, ?it/s]

Found 30510 images to resize
Using 12 workers
Processing batch 1/306


Resizing images (rxrx3):   0%|          | 0/100 [00:00<?, ?it/s]

Processing batch 2/306


Resizing images (rxrx3):   0%|          | 0/100 [00:00<?, ?it/s]

Processing batch 3/306


Resizing images (rxrx3):   0%|          | 0/100 [00:00<?, ?it/s]

Processing batch 4/306


Resizing images (rxrx3):   0%|          | 0/100 [00:00<?, ?it/s]