# Image Slide Processor - Tile Processing and Quality Control

This notebook handles post-extraction processing of histopathology image tiles, including splitting large tiles into smaller ones and filtering out uninformative (background) tiles.

## Pipeline Overview
1. Split large tiles (2048×2048) into smaller tiles (256×256)
2. Filter tiles based on tissue content (remove background-only tiles)
3. Organize processed tiles by stain type

## 1. Imports

In [None]:
import os
import shutil

import numpy as np
from PIL import Image
from tqdm import tqdm

## 2. Configuration

In [None]:
# -----------------------------------------------------------------------------
# Paths
# -----------------------------------------------------------------------------

# Where the large source tiles are read from.
INPUT_TILES_DIR = "Images/2048All"

# Where the small tiles produced by the split are written to.
OUTPUT_TILES_DIR = "Images/2048Tiles"

# -----------------------------------------------------------------------------
# Processing parameters
# -----------------------------------------------------------------------------

# Edge length (pixels) every source tile is expected to have.
INPUT_TILE_SIZE = 2048

# Edge length (pixels) of each tile produced by the split.
OUTPUT_TILE_SIZE = 256

# Largest fraction of white pixels a tile may have and still count as
# informative; tiles at or above this fraction are treated as background.
INFORMATIVE_THRESHOLD = 0.91

# A pixel whose gray value is strictly above this is counted as white.
WHITE_PIXEL_THRESHOLD = 224

# Stain names looked for in filenames when grouping tiles. Order matters:
# the first keyword found in a filename wins.
STAIN_KEYWORDS = [
    "PIR",
    "PBP",
    "MITF",
    "HE",
    "ANX",
    "BRAF",
    "BCL2",
    "BCL3",
]

## 3. Tile Informativeness Detection

In [None]:
def _white_pixel_fraction(tile, white_threshold):
    """
    Compute the fraction of pixels brighter than `white_threshold`.

    Single implementation behind `is_informative` and `get_white_ratio`, so
    the two can never disagree about what counts as a white pixel.

    Args:
        tile: PIL Image or array-like; grayscale (H, W), colour (H, W, C),
            optionally with a trailing alpha channel (2 or 4 channels)
        white_threshold (int): Gray value a pixel must exceed to count as white

    Returns:
        float: Fraction of white pixels in the range 0.0-1.0
    """
    pixels = np.asarray(tile)

    if pixels.size == 0:
        # A zero-area tile holds no tissue; report it as pure background
        # instead of dividing by zero further down.
        return 1.0

    if pixels.ndim < 3:
        # Single-channel input: the values already are the brightness.
        # Averaging over the last axis here would collapse image columns
        # rather than colour channels.
        gray = pixels
    else:
        if pixels.shape[-1] in (2, 4):
            # Trailing alpha channel (LA / RGBA): opacity is not brightness
            # and must not be averaged in.
            pixels = pixels[..., :-1]
        # Grayscale approximation: unweighted mean of the colour channels.
        gray = pixels.mean(axis=-1)

    # Strictly greater-than: a pixel exactly at the threshold is not white.
    return float(np.count_nonzero(gray > white_threshold) / gray.size)


def is_informative(tile, threshold=0.91, white_threshold=224):
    """
    Determine if a tile contains meaningful tissue content.

    A tile is informative when the fraction of white/near-white (background)
    pixels is strictly below `threshold`. This filters out tiles that show
    only slide background or very little tissue.

    Args:
        tile: PIL Image object or array-like (RGB; grayscale and
            alpha-carrying inputs are handled as well)
        threshold (float): Maximum allowed fraction of white pixels (0.0-1.0)
        white_threshold (int): Grayscale value above which pixels are considered white (0-255)

    Returns:
        bool: True if tile is informative (contains tissue), False otherwise

    Example:
        >>> img = Image.open("tile.png")
        >>> if is_informative(img, threshold=0.91):
        ...     print("Tile contains tissue")
    """
    return bool(_white_pixel_fraction(tile, white_threshold) < threshold)


def get_white_ratio(tile, white_threshold=224):
    """
    Calculate the fraction of white pixels in a tile.

    Useful for debugging and threshold tuning; this is the exact value that
    `is_informative` compares against its threshold.

    Args:
        tile: PIL Image object or array-like
        white_threshold (int): Grayscale value threshold

    Returns:
        float: Fraction of pixels above the white threshold
    """
    return _white_pixel_fraction(tile, white_threshold)

## 4. Split Large Tiles into Smaller Tiles

In [None]:
def split_into_tiles(input_dir, output_dir, input_size=2048, tile_size=256):
    """
    Split large tiles into smaller tiles.

    Every image in `input_dir` that is exactly `input_size` x `input_size`
    is cut into a grid of `tile_size` x `tile_size` tiles, saved to
    `output_dir` as `<original name>_slide_<k><original extension>` with `k`
    counting from 1 in row-major order. For example, a 2048x2048 image split
    into 256x256 tiles produces 64 tiles.

    Only complete tiles are written: when `input_size` is not a multiple of
    `tile_size`, the leftover strip at the right and bottom edge is dropped,
    so the number of tiles always matches the announced grid.

    Images with a different size, and files that cannot be read, are reported
    and counted as skipped instead of aborting the run.

    Args:
        input_dir (str): Directory containing large tiles
        output_dir (str): Directory for output small tiles
        input_size (int): Expected size of input images (assumes square)
        tile_size (int): Size of output tiles (assumes square)

    Returns:
        tuple: (total_images_processed, total_tiles_created)

    Raises:
        ValueError: If `tile_size` is not between 1 and `input_size`
    """
    # Validate before touching the filesystem so a bad call has no side effects.
    if tile_size <= 0 or tile_size > input_size:
        raise ValueError(
            f"tile_size must be between 1 and input_size ({input_size}), got {tile_size}"
        )

    os.makedirs(output_dir, exist_ok=True)

    # Calculate tiles per image
    tiles_per_side = input_size // tile_size
    tiles_per_image = tiles_per_side ** 2

    print(f"Splitting {input_size}x{input_size} images into {tile_size}x{tile_size} tiles")
    print(f"Each image will produce {tiles_per_image} tiles ({tiles_per_side}x{tiles_per_side} grid)")

    # Top-left offsets at which a complete tile fits. Stepping all the way to
    # the image edge would add partial tiles reaching past the border and
    # produce more tiles than announced above.
    offsets = range(0, tiles_per_side * tile_size, tile_size)

    # Get list of image files
    image_files = [f for f in os.listdir(input_dir)
                   if f.lower().endswith((".png", ".jpg", ".jpeg", ".tif", ".bmp"))]

    total_images = 0
    total_tiles = 0
    skipped = 0

    for fname in tqdm(image_files, desc="Splitting images"):
        img_path = os.path.join(input_dir, fname)

        try:
            # The context manager closes the underlying file as soon as the
            # pixel data has been copied into the converted image.
            with Image.open(img_path) as source:
                w, h = source.size
                # Decode pixel data only for images that will actually be split.
                size_ok = (w == input_size and h == input_size)
                image = source.convert("RGB") if size_ok else None
        except OSError as exc:
            print(f"Skipping {fname}: could not be read ({exc})")
            skipped += 1
            continue

        if image is None:
            print(f"Skipping {fname}: size {w}x{h} != expected {input_size}x{input_size}")
            skipped += 1
            continue

        base, ext = os.path.splitext(fname)
        tiles_written = 0

        # Extract tiles in row-major order
        for row in offsets:
            for col in offsets:
                tile = image.crop((col, row, col + tile_size, row + tile_size))
                tiles_written += 1
                tile.save(os.path.join(output_dir, f"{base}_slide_{tiles_written}{ext}"))

        image.close()
        total_images += 1
        total_tiles += tiles_written

    print("\nSplitting complete:")
    print(f"  Images processed: {total_images}")
    print(f"  Images skipped: {skipped}")
    print(f"  Total tiles created: {total_tiles}")

    return total_images, total_tiles

## 5. Run Tile Splitting

In [None]:
# Split large tiles into smaller ones
# split_into_tiles(INPUT_TILES_DIR, OUTPUT_TILES_DIR, input_size=INPUT_TILE_SIZE, tile_size=OUTPUT_TILE_SIZE)

## 6. Remove Uninformative Tiles

In [None]:
def clean_uninformative_tiles(tiles_dir, threshold=0.91, white_threshold=224, dry_run=False):
    """
    Remove uninformative (background-only) tiles from a directory.

    Iterates through all images in a directory, checks if each contains
    meaningful tissue content, and deletes those that don't. Files that
    cannot be opened are reported, counted as errors and left untouched.

    Args:
        tiles_dir (str): Directory containing tiles to filter
        threshold (float): Informativeness threshold (max white pixel ratio)
        white_threshold (int): Grayscale value for white pixel detection
        dry_run (bool): If True, only report what would be deleted without actually deleting

    Returns:
        tuple: (kept_count, deleted_count); (0, 0) when the directory
            contains no image files
    """
    image_files = [f for f in os.listdir(tiles_dir)
                   if f.lower().endswith((".png", ".jpg", ".jpeg", ".tif", ".bmp"))]

    if not image_files:
        # Nothing to check; also avoids a 0/0 retention rate below.
        print(f"No image files found in {tiles_dir}")
        return 0, 0

    deleted = 0
    kept = 0
    errors = 0

    print(f"Checking {len(image_files)} tiles for informativeness...")
    print(f"Threshold: {threshold} (tiles with >{threshold*100:.0f}% white pixels will be removed)")
    if dry_run:
        print("DRY RUN - no files will be deleted")

    for fname in tqdm(image_files, desc="Filtering tiles"):
        fpath = os.path.join(tiles_dir, fname)

        try:
            # Close the file handle right after decoding so the file is no
            # longer held open when it may be deleted below.
            with Image.open(fpath) as source:
                img = source.convert("RGB")
        except Exception as e:
            print(f"Error opening {fname}: {e}")
            errors += 1
            continue

        informative = is_informative(img, threshold=threshold, white_threshold=white_threshold)
        img.close()

        if informative:
            kept += 1
        else:
            if not dry_run:
                os.remove(fpath)
            deleted += 1

    print(f"\nCleanup {'preview' if dry_run else 'complete'}:")
    print(f"  Kept: {kept} informative tiles")
    print(f"  {'Would delete' if dry_run else 'Deleted'}: {deleted} uninformative tiles")
    if errors > 0:
        print(f"  Errors: {errors} files could not be processed")

    checked = kept + deleted
    if checked:
        print(f"  Retention rate: {kept/checked*100:.1f}%")
    else:
        # Every file failed to open, so there is no rate to report.
        print("  Retention rate: n/a (no tiles could be checked)")

    return kept, deleted


def analyze_informativeness_distribution(tiles_dir, sample_size=None):
    """
    Analyze the distribution of white pixel ratios in a tile directory.

    Useful for choosing an appropriate threshold value. Prints summary
    statistics and, for a handful of candidate thresholds, how many tiles
    each one would keep. Tiles that cannot be processed are skipped and
    counted in the printed report.

    Args:
        tiles_dir (str): Directory containing tiles
        sample_size (int): Number of tiles to sample (None for all)

    Returns:
        numpy.ndarray: White pixel ratios for each analyzed tile (empty when
            no tile could be analyzed)
    """
    image_files = [f for f in os.listdir(tiles_dir)
                   if f.lower().endswith((".png", ".jpg", ".jpeg", ".tif", ".bmp"))]

    if not image_files:
        print(f"No image files found in {tiles_dir}")
        return np.array([], dtype=float)

    if sample_size and sample_size < len(image_files):
        import random
        image_files = random.sample(image_files, sample_size)

    ratios = []
    failed = 0

    for fname in tqdm(image_files, desc="Analyzing tiles"):
        fpath = os.path.join(tiles_dir, fname)
        try:
            # Both the source file and the converted copy are released even
            # if computing the ratio fails.
            with Image.open(fpath) as source, source.convert("RGB") as img:
                ratios.append(get_white_ratio(img))
        except Exception:
            # Best-effort scan: one bad tile must not abort the analysis.
            # `Exception` (not a bare except) lets Ctrl-C through.
            failed += 1

    ratios = np.array(ratios, dtype=float)

    if ratios.size == 0:
        # min()/max() and the percentages below are undefined without data.
        print(f"None of the {failed} image files in {tiles_dir} could be processed")
        return ratios

    # Print statistics
    print(f"\nWhite pixel ratio statistics (n={len(ratios)}):")
    if failed:
        print(f"  Skipped: {failed} files could not be processed")
    print(f"  Min: {ratios.min():.3f}")
    print(f"  Max: {ratios.max():.3f}")
    print(f"  Mean: {ratios.mean():.3f}")
    print(f"  Median: {np.median(ratios):.3f}")
    print(f"  Std: {ratios.std():.3f}")

    # Show what different thresholds would keep
    for thresh in [0.85, 0.90, 0.91, 0.92, 0.95]:
        kept = np.sum(ratios < thresh)
        print(f"  Threshold {thresh}: would keep {kept} ({kept/len(ratios)*100:.1f}%)")

    return ratios

## 7. Run Tile Cleanup

In [None]:
# First, analyze to choose a good threshold (optional)
# ratios = analyze_informativeness_distribution(OUTPUT_TILES_DIR, sample_size=1000)

# Preview what would be deleted (dry run)
# clean_uninformative_tiles(OUTPUT_TILES_DIR, threshold=INFORMATIVE_THRESHOLD, dry_run=True)

# Actually clean the tiles
# clean_uninformative_tiles(OUTPUT_TILES_DIR, threshold=INFORMATIVE_THRESHOLD, dry_run=False)

## 8. Organize Tiles by Stain Type

In [None]:
def organize_by_stain(source_dir, dest_root, keywords=None, move=False):
    """
    Organize image tiles into subdirectories based on stain type keywords.

    Each image filename is compared, case-insensitively, against the
    keywords. The test is a plain substring match on the whole filename, and
    the first keyword (in the order given) that occurs in the name decides
    the destination: `dest_root/<keyword>/<filename>`. Files matching no
    keyword are left where they are and counted as unmatched.

    Args:
        source_dir (str): Directory containing tiles to organize
        dest_root (str): Root directory for organized output
        keywords (list): Stain type keywords to match (defaults to
            STAIN_KEYWORDS)
        move (bool): If True, move files; if False, copy files

    Returns:
        dict: Count of files organized per stain type
    """
    if keywords is None:
        keywords = STAIN_KEYWORDS
    # Materialise once so a one-shot iterable survives being walked twice.
    keywords = list(keywords)

    counts = {kw: 0 for kw in keywords}
    unmatched = 0

    # Filenames are upper-cased before matching, so the keywords must be as
    # well; otherwise a lowercase keyword could never match. The original
    # spelling is kept for folder names and result keys.
    needles = [(kw, kw.upper()) for kw in keywords]

    transfer = shutil.move if move else shutil.copy
    prepared_folders = set()

    files = [f for f in os.listdir(source_dir)
             if f.lower().endswith((".png", ".jpg", ".jpeg", ".tif", ".bmp"))]

    print(f"Processing {len(files)} files...")

    for filename in tqdm(files, desc="Organizing files"):
        upper_name = filename.upper()
        keyword = next((kw for kw, needle in needles if needle in upper_name), None)

        if keyword is None:
            unmatched += 1
            continue

        keyword_folder = os.path.join(dest_root, keyword)
        if keyword not in prepared_folders:
            # Create each destination folder once, and only when needed.
            os.makedirs(keyword_folder, exist_ok=True)
            prepared_folders.add(keyword)

        transfer(os.path.join(source_dir, filename),
                 os.path.join(keyword_folder, filename))
        counts[keyword] += 1

    print("\nOrganization Summary:")
    print("-" * 30)
    for kw, count in counts.items():
        if count > 0:
            print(f"  {kw}: {count} files")
    print(f"  Unmatched: {unmatched} files")

    return counts


# Organize tiles by stain type
# organize_by_stain(OUTPUT_TILES_DIR, "Images/", move=False)

## 9. Testing and Debugging Utilities

In [None]:
def test_single_tile(tile_path, threshold=0.91):
    """
    Report the informativeness of one tile.

    Loads the image as RGB, prints its file name, size, white pixel ratio,
    the threshold used and the resulting verdict.

    Args:
        tile_path (str): Path to tile image
        threshold (float): Threshold to test against

    Returns:
        The verdict of `is_informative` for this tile at `threshold`
    """
    image = Image.open(tile_path).convert("RGB")

    ratio = get_white_ratio(image)
    verdict = is_informative(image, threshold=threshold)

    # Assemble the report first, then emit it line by line.
    report = [
        f"Tile: {os.path.basename(tile_path)}",
        f"  Size: {image.size}",
        f"  White pixel ratio: {ratio:.3f} ({ratio*100:.1f}%)",
        f"  Threshold: {threshold}",
        f"  Is informative: {verdict}",
    ]
    for line in report:
        print(line)

    image.close()
    return verdict


def preview_tiles(tiles_dir, n=5, informative_only=True, threshold=0.91):
    """
    Print the white ratio and verdict for a few randomly chosen tiles.

    The image files of the directory are visited in shuffled order until `n`
    tiles have been reported or the directory is exhausted.

    Args:
        tiles_dir (str): Directory containing tiles
        n (int): Number of tiles to show info for
        informative_only (bool): If True, only show informative tiles
        threshold (float): Informativeness threshold
    """
    import random

    image_suffixes = (".png", ".jpg", ".jpeg", ".tif", ".bmp")
    candidates = [entry for entry in os.listdir(tiles_dir)
                  if entry.lower().endswith(image_suffixes)]

    random.shuffle(candidates)

    remaining = n
    for name in candidates:
        if remaining <= 0:
            break

        tile = Image.open(os.path.join(tiles_dir, name)).convert("RGB")
        verdict = is_informative(tile, threshold=threshold)

        # Report the tile unless it is filtered out as uninformative.
        if verdict or not informative_only:
            ratio = get_white_ratio(tile)
            print(f"{name}: {ratio:.3f} white ratio, informative={verdict}")
            remaining -= 1

        tile.close()


# Test a specific tile
# test_single_tile("Images/2048Tiles/example_slide_1.png", threshold=0.91)

# Preview some tiles
# preview_tiles(OUTPUT_TILES_DIR, n=10, informative_only=False)

## 10. Summary Statistics

In [None]:
def get_directory_stats(tiles_dir):
    """
    Print summary statistics for a tiles directory.

    Reports the number of image files, the pixel size of the first listed
    image, a per-extension count and a per-stain count (first matching
    entry of STAIN_KEYWORDS, otherwise 'other'). Prints a notice and returns
    early when the directory holds no image files.

    Args:
        tiles_dir (str): Directory to analyze
    """
    image_suffixes = (".png", ".jpg", ".jpeg", ".tif", ".bmp")
    image_files = [entry for entry in os.listdir(tiles_dir)
                   if entry.lower().endswith(image_suffixes)]

    if len(image_files) == 0:
        print(f"No image files found in {tiles_dir}")
        return

    # Tally files per (lower-cased) extension.
    extensions = {}
    for name in image_files:
        suffix = os.path.splitext(name)[1].lower()
        extensions.setdefault(suffix, 0)
        extensions[suffix] += 1

    # The first listed image stands in for the size of all tiles.
    with Image.open(os.path.join(tiles_dir, image_files[0])) as first_image:
        sample_size = first_image.size

    print(f"Directory: {tiles_dir}")
    print(f"  Total images: {len(image_files)}")
    print(f"  Sample size: {sample_size}")
    print(f"  Extensions: {extensions}")

    # Tally files per stain; the first keyword found in the name wins.
    stain_counts = dict.fromkeys(STAIN_KEYWORDS, 0)
    stain_counts['other'] = 0

    for name in image_files:
        upper = name.upper()
        label = next((kw for kw in STAIN_KEYWORDS if kw in upper), 'other')
        stain_counts[label] += 1

    print("  By stain type:")
    for label, total in stain_counts.items():
        if total > 0:
            print(f"    {label}: {total}")


# Get stats for output directory
# get_directory_stats(OUTPUT_TILES_DIR)