# Ideal Processing Pipeline for Consistent CTCF in Fluorescence Microscopy

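This notebook implements a complete pipeline for analyzing multi-channel fluorescence microscopy images and measuring corrected total cell fluorescence (CTCF) consistently across experimental conditions and microscopes. For each segmented cell and channel, CTCF = integrated intensity − (cell area × mean background intensity). The pipeline estimates the background of every channel with a Gaussian mixture model, segments cells with Cellpose, measures CTCF per cell, and writes QC figures and CSV tables for each experiment folder.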



In [1]:
## Import necessary libraries
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import tifffile as tiff
import time, os, sys
import pandas as pd
import json
from datetime import datetime
from skimage import measure, draw
from skimage.transform import resize
from PIL import Image
from tqdm import tqdm
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
import scipy.stats
import h5py

import gc
import concurrent.futures as cf
import multiprocessing
import threading
mpl.rcParams['figure.dpi'] = 200

from AutoImgUtils import * 

In [2]:
!nvcc --version
!nvidia-smi

import os, shutil
import numpy as np
import matplotlib.pyplot as plt
from cellpose import core, utils, io, models, metrics, denoise
from glob import glob

# Ask Cellpose whether a GPU can be used and report the answer.
use_GPU = core.use_gpu()
# Lookup table indexed by the flag above (False -> 'NO', True -> 'YES');
# assumes core.use_gpu() returns a bool or 0/1 — TODO confirm.
yn = ['NO', 'YES']
print(f'>>> GPU activated? {yn[use_GPU]}')


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Fri_Feb_21_20:42:46_Pacific_Standard_Time_2025
Cuda compilation tools, release 12.8, V12.8.93
Build cuda_12.8.r12.8/compiler.35583870_0
Tue Apr  8 22:12:51 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 572.61                 Driver Version: 572.61         CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA TITAN RTX             WDDM  |   00000000:81:00.0 Off |                  N/A |
| 40%   39C    P8             21W /  280W |    1001MiB /  24576MiB |     18%      Default |
|     

## Function Definitions

### Visualization functions

In [3]:
def visualize_background_mask(channel_image, bg_model, output_path, n_components=3):
    """Save a 2x2 QC figure describing the background model of one channel.

    Panels: the raw channel, the boolean background mask, the channel with
    background pixels painted red, and (when a fitted model is available) the
    intensity histogram overlaid with the weighted GMM component densities.

    Parameters:
    - channel_image: 2D array with the image channel
    - bg_model: dict with at least 'mask', 'mean' and 'std'; an optional 'gmm'
      entry holds the fitted 1-D GaussianMixture used for the histogram panel
    - output_path: file path the figure is written to
    - n_components: unused, kept for backward compatibility (the number of
      components is read from the fitted model)

    Returns:
    - None; the figure is written to `output_path` and then closed.
    """
    bg_mask = np.asarray(bg_model['mask'], dtype=bool)

    fig, axes = plt.subplots(2, 2, figsize=(15, 12), gridspec_kw={'height_ratios': [3, 1]})
    try:
        # Original image
        raw_artist = axes[0, 0].imshow(channel_image, cmap='gray')
        axes[0, 0].set_title('Original Channel')
        fig.colorbar(raw_artist, ax=axes[0, 0])

        # Background mask
        mask_artist = axes[0, 1].imshow(bg_mask, cmap='hot')
        axes[0, 1].set_title(f'Background Mask\nMean: {bg_model["mean"]:.2f}, Std: {bg_model["std"]:.2f}')
        fig.colorbar(mask_artist, ax=axes[0, 1])

        # Min-max normalise for display. A constant image has zero range, so
        # fall back to an all-black image instead of dividing by zero.
        img_float = np.asarray(channel_image, dtype=np.float64)
        img_min = img_float.min()
        img_range = img_float.max() - img_min
        if img_range > 0:
            norm_img = (img_float - img_min) / img_range
        else:
            norm_img = np.zeros_like(img_float)
        rgb_img = np.stack([norm_img, norm_img, norm_img], axis=-1)

        # Highlight background in pure red
        rgb_img[bg_mask] = (1.0, 0.0, 0.0)

        axes[1, 0].imshow(rgb_img)
        axes[1, 0].set_title('Background Regions (Red)')

        # Plot intensity histogram with GMM distributions
        if 'gmm' in bg_model:
            gmm = bg_model['gmm']
            flat_img = channel_image.flatten()

            hist_range = (np.min(flat_img), np.max(flat_img))
            n_bins = 100
            axes[1, 1].hist(flat_img, bins=n_bins, range=hist_range, density=True,
                            alpha=0.6, color='gray', label='Pixel Intensity')

            # x values for plotting the component curves
            x = np.linspace(hist_range[0], hist_range[1], 1000)

            colors = ['blue', 'green', 'red', 'purple', 'orange']
            bg_component = np.argmin(gmm.means_.flatten())

            for i in range(gmm.n_components):
                weight = gmm.weights_[i]
                mean = gmm.means_[i, 0]
                # Read the variance the same way the estimators do, so the
                # lookup does not hard-code a (n, 1, 1) covariance layout.
                std = np.sqrt(np.ravel(gmm.covariances_[i])[0])

                # Weighted normal density of this component
                y = weight * scipy.stats.norm.pdf(x, mean, std)

                # Emphasise the background component
                alpha = 0.8 if i == bg_component else 0.5
                label = f"Background (μ={mean:.1f})" if i == bg_component else f"Component {i+1} (μ={mean:.1f})"
                axes[1, 1].plot(x, y, color=colors[i % len(colors)],
                                alpha=alpha, linewidth=2, label=label)

            axes[1, 1].set_title('Pixel Intensity Distribution')
            axes[1, 1].set_xlabel('Pixel Value')
            axes[1, 1].set_xscale('log')
            axes[1, 1].set_ylabel('Density')
            axes[1, 1].legend()
        else:
            # Nothing to draw without a fitted model; hide the empty panel.
            axes[1, 1].axis('off')

        fig.tight_layout()
        fig.savefig(output_path, dpi=200)
    finally:
        # Close only this figure (even if saving failed) so figures owned by
        # other code stay open and batch runs do not accumulate figures.
        plt.close(fig)
    
def create_visualization(image, masks, measurements, output_path):
    """Create a multi-panel QC figure: segmentation plus one panel per channel.

    Parameters:
    - image: array indexed as image[:, :, channel]
    - masks: 2D integer label image (0 = background)
    - measurements: list of per-cell dicts providing 'label' and 'centroid'
      (centroid is unpacked as (y, x))
    - output_path: file path the figure is written to

    Returns:
    - None; the figure is written to `output_path` and then closed.
    """
    n_channels = image.shape[2]
    fig, axes = plt.subplots(1, n_channels + 1, figsize=(5 * (n_channels + 1), 5))
    try:
        # Plot segmentation mask
        axes[0].imshow(masks > 0, cmap='gray')
        axes[0].set_title('Cell Segmentation')

        # Add cell labels
        for cell in measurements:
            y, x = cell['centroid']
            axes[0].text(x, y, str(cell['label']), color='red', fontsize=5)

        # The outlines do not depend on the channel, so build them once.
        # Merging them into a single boolean image also avoids stacking one
        # semi-transparent full-frame layer per cell on top of every panel.
        outlines = np.zeros(masks.shape, dtype=bool)
        for cell in measurements:
            outlines |= np.asarray(find_boundaries(masks == cell['label']), dtype=bool)
        # Mask out non-outline pixels so only the outlines themselves are drawn.
        outline_overlay = np.ma.masked_where(~outlines, outlines.astype(float))

        # Plot each channel
        for ch in range(n_channels):
            # Normalize this channel only (not the whole multi-channel stack).
            # NOTE(review): assumes normalize_img accepts a 2D array — confirm.
            norm_channel = np.clip(normalize_img(image[:, :, ch], method='percentile'), 0, 1)

            axes[ch + 1].imshow(norm_channel, cmap='hot')
            axes[ch + 1].set_title(f'Channel {ch+1}')

            # Overlay cell outlines
            if measurements:
                axes[ch + 1].imshow(outline_overlay, alpha=0.6, cmap='cool', vmin=0, vmax=1)

        fig.tight_layout()
        fig.savefig(output_path, dpi=300)
    finally:
        # Close only this figure so unrelated open figures are left alone.
        plt.close(fig)

In [4]:
def process_experiments_batch(main_directory, config):
    """
    Process all experiments in the main directory with a robust CTCF analysis pipeline

    Every non-hidden sub-folder of `main_directory` (except earlier
    "CTCF_Analysis" output folders) is treated as one experiment; every TIFF
    file inside it is passed to `process_single_image`.

    Parameters:
    - main_directory: folder containing one sub-folder per experiment
    - config: pipeline configuration dict, forwarded to `process_single_image`

    Returns:
    - dict mapping experiment name -> DataFrame with one row per image.
      Empty when no experiment folder is found.

    Side effects:
    - Creates "<main_directory>/CTCF_Analysis_<timestamp>/" containing one
      sub-folder and "<experiment>_results.csv" per experiment, plus
      "batch_summary.csv" when at least one experiment was processed.
    """
    # Create results directory
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_dir = os.path.join(main_directory, f"CTCF_Analysis_{timestamp}")
    os.makedirs(results_dir, exist_ok=True)

    # Find all experiment folders (sorted so runs are reproducible); the
    # context manager releases the directory handle promptly.
    with os.scandir(main_directory) as entries:
        experiment_folders = sorted(
            entry.path for entry in entries
            if entry.is_dir()
            and not entry.name.startswith('.')
            and "CTCF_Analysis" not in entry.name
        )

    batch_results = {}

    # Process each experiment folder
    for experiment_folder in experiment_folders:
        exp_name = os.path.basename(experiment_folder)
        print(f"\nProcessing experiment: {exp_name}")

        # Create experiment results folder
        exp_results_dir = os.path.join(results_dir, exp_name)
        os.makedirs(exp_results_dir, exist_ok=True)

        # Collect TIFF files; the extension match is case-insensitive so
        # ".TIF" / ".TIFF" files written by some acquisition software count.
        image_files = sorted(
            name for name in os.listdir(experiment_folder)
            if name.lower().endswith(('.tif', '.tiff')) and not name.startswith('.')
        )

        # Full image processing pipeline for each image
        exp_results = [
            process_single_image(
                os.path.join(experiment_folder, image_file),
                exp_results_dir,
                config
            )
            for image_file in image_files
        ]

        # Compile experiment results
        exp_df = pd.DataFrame(exp_results)
        exp_df.to_csv(os.path.join(exp_results_dir, f"{exp_name}_results.csv"), index=False)
        batch_results[exp_name] = exp_df

    # pd.concat raises ValueError on an empty list, so skip the summary when
    # there was nothing to process.
    if not batch_results:
        print(f"No experiment folders found in: {main_directory}")
        return batch_results

    # Compile batch summary
    all_results = pd.concat(list(batch_results.values()), keys=list(batch_results.keys()))
    all_results.to_csv(os.path.join(results_dir, "batch_summary.csv"))

    return batch_results

def resample_image(image, scale_factor=0.5):
    """Return `image` rescaled in-plane by `scale_factor`.

    The input must be 3-dimensional (rows, columns, channels). Only the first
    two axes are rescaled; the channel count and the dtype are kept.
    """
    rows, cols, n_channels = image.shape

    # Target shape: scaled rows/columns (truncated to int), channels untouched
    target_shape = (int(rows * scale_factor), int(cols * scale_factor), n_channels)

    rescaled = resize(image, target_shape, preserve_range=True, anti_aliasing=True)

    # resize() returns floats; cast back to the input dtype
    return rescaled.astype(image.dtype)


def process_single_image(image_path, output_dir, config):
    """Process a single fluorescence microscopy image for CTCF analysis

    Steps: load the TIFF, estimate the background of every channel with a
    GMM, segment cells, measure CTCF per cell and channel, write a QC figure
    and a per-cell CSV.

    Parameters:
    - image_path: path of the TIFF file to analyse
    - output_dir: folder receiving the figures and the per-cell CSV
    - config: pipeline configuration dict. Keys read here:
      'channels_of_interest' (0-based channel indices, default: all channels)
      and 'visualize_steps' (default True); the dict is also forwarded to
      the segmentation step.

    Returns:
    - dict with 'image_name', 'total_cells', 'image_path' and, per channel of
      interest, 'channel_<n>_mean_ctcf' / '_median_ctcf' / '_std_ctcf'
      (n is 1-based; NaN when no cell was found).

    Raises:
    - ValueError: if the loaded image is not 2- or 3-dimensional.
    """
    # Load image
    image = tiff.imread(image_path)
    img_name = os.path.basename(image_path)
    img_base = os.path.splitext(img_name)[0]

    print(f"\nProcessing image: {img_name}")

    if image.ndim == 2:
        # Single-channel image: add a trailing channel axis so the rest of the
        # pipeline can index image[:, :, ch].
        image = image[:, :, np.newaxis]
    elif image.ndim == 3:
        # Move the shortest axis (assumed to be the channels) to the last index
        shortest_axis = int(np.argmin(image.shape))
        image = np.moveaxis(image, shortest_axis, -1)
    else:
        raise ValueError(
            f"Expected a 2D or 3D image, got shape {image.shape} for {img_name}"
        )

    n_channels = image.shape[-1]

    # Extract configuration
    channels_of_interest = config.get('channels_of_interest', list(range(n_channels)))

    # 1. BACKGROUND ESTIMATION USING GMM
    print("Estimating background using GMM...")
    bg_models = {}
    for ch in tqdm(range(n_channels), desc="Background estimation", leave=False):
        bg_models[ch] = fast_estimate_background_gmm(image[:, :, ch])
        # Save background mask visualization
        if config.get('visualize_steps', True):
            visualize_background_mask(image[:, :, ch], bg_models[ch],
                                      os.path.join(output_dir, f"{img_base}_bg_mask_ch{ch+1}.png"))
    print("Background estimation complete.")

    # 2. CELL SEGMENTATION USING CELLPOSE
    cell_masks = segment_cells_with_downsampling(image, config)

    # 3. MEASURE CTCF FOR EACH CELL AND CHANNEL
    cell_measurements = measure_cells_ctcf(image, cell_masks, bg_models)

    # 4. GENERATE VISUALIZATIONS
    create_visualization(image, cell_masks, cell_measurements,
                         os.path.join(output_dir, img_name + "_analysis.png"))

    # 5. SAVE RESULTS
    results = {
        'image_name': img_name,
        'total_cells': len(cell_measurements),
        'image_path': image_path,
    }

    # Add summarized measurements. With no cells the statistics are undefined;
    # report NaN explicitly instead of triggering numpy's empty-slice warning.
    for ch in channels_of_interest:
        ch_ctcf = [cell['ctcf'][ch] for cell in cell_measurements]
        has_cells = len(ch_ctcf) > 0
        results[f'channel_{ch+1}_mean_ctcf'] = np.mean(ch_ctcf) if has_cells else np.nan
        results[f'channel_{ch+1}_median_ctcf'] = np.median(ch_ctcf) if has_cells else np.nan
        results[f'channel_{ch+1}_std_ctcf'] = np.std(ch_ctcf) if has_cells else np.nan

    # Save detailed cell measurements
    cell_df = pd.DataFrame([
        {
            'image': img_name,
            'cell_id': i,
            # Mask label of the cell, i.e. the number drawn on the QC figure
            'label': cell['label'],
            'area': cell['area'],
            **{f'channel_{ch+1}_ctcf': cell['ctcf'][ch] for ch in channels_of_interest},
            **{f'channel_{ch+1}_mean': cell['mean'][ch] for ch in channels_of_interest},
            # regionprops centroids are (row, col), i.e. (y, x)
            'centroid_x': cell['centroid'][1],
            'centroid_y': cell['centroid'][0]
        }
        for i, cell in enumerate(cell_measurements)
    ])
    cell_df.to_csv(os.path.join(output_dir, f"{img_base}_cells.csv"), index=False)

    return results

def fast_estimate_background_gmm(channel_image, n_components=3, sample_ratio=0.1, max_iter=100):
    """
    Faster background estimation using GMM with downsampling

    The mixture is fitted on a random subset of the pixels; the component
    with the lowest mean is taken as background and every pixel of the full
    image is then classified to build the background mask.

    Parameters:
    - channel_image: 2D array with image channel
    - n_components: Number of GMM components
    - sample_ratio: Fraction of pixels to sample (smaller = faster)
    - max_iter: Maximum EM iterations

    Returns:
    - Dictionary with keys 'mean', 'std', 'mask', 'gmm', 'bg_percentage',
      'component_means', 'n_components' and 'bg_component'

    Raises:
    - ValueError: if `channel_image` contains no pixels.
    """
    # Flatten image
    flat_img = np.ravel(channel_image)
    if flat_img.size == 0:
        raise ValueError("channel_image is empty; cannot estimate background")

    # Sample at least 10000 pixels, but never more than the image holds
    # (sampling without replacement would raise otherwise).
    n_samples = min(flat_img.size, max(10000, int(sample_ratio * flat_img.size)))

    # A seeded Generator keeps the estimate reproducible (the GMM below is
    # seeded as well) and draws without replacement without first permuting
    # the whole index range, which matters for very large images.
    rng = np.random.default_rng(42)
    indices = rng.choice(flat_img.size, size=n_samples, replace=False)
    sample_data = flat_img[indices].reshape(-1, 1)

    # GaussianMixture runs its own k-means initialisation (init_params),
    # so no separate clustering pass is required beforehand.
    gmm = GaussianMixture(
        n_components=n_components,
        random_state=42,
        n_init=1,
        max_iter=max_iter,
        tol=1e-3,
        init_params='kmeans'
    )

    # Fit on sample data
    gmm.fit(sample_data)

    # Get background component (lowest mean)
    means = gmm.means_.flatten()
    bg_component = np.argmin(means)
    bg_mean = means[bg_component]
    bg_std = np.sqrt(gmm.covariances_[bg_component].flatten()[0])

    # Classify the full image in chunks: predict() allocates temporaries
    # proportional to (pixels x components), which is prohibitive for
    # whole-slide images when done in a single call.
    chunk_size = 4_000_000
    bg_flat = np.empty(flat_img.size, dtype=bool)
    for start in range(0, flat_img.size, chunk_size):
        stop = start + chunk_size
        labels = gmm.predict(flat_img[start:stop].reshape(-1, 1))
        bg_flat[start:stop] = labels == bg_component
    bg_mask = bg_flat.reshape(channel_image.shape)

    return {
        'mean': bg_mean,
        'std': bg_std,
        'mask': bg_mask,
        'gmm': gmm,
        'bg_percentage': np.sum(bg_mask) / bg_mask.size * 100,
        'component_means': means,
        'n_components': n_components,
        'bg_component': bg_component
    }

def estimate_background_gmm(channel_image, n_components=3, adaptive=False, max_components=6):
    """
    Fit a Gaussian Mixture Model to all pixels and derive the background.

    The mixture component with the lowest mean is treated as background.

    Parameters:
    - channel_image: 2D array with image channel
    - n_components: number of mixture components used when adaptive=False
    - adaptive: when True, fit 1..max_components components and keep the
      model with the lowest BIC
    - max_components: upper bound on components tried when adaptive=True

    Returns:
    - Dictionary with background statistics, the boolean background mask
      and the fitted GMM model
    """
    # One column of pixel values, as expected by GaussianMixture
    pixels = channel_image.flatten().reshape(-1, 1)

    if not adaptive:
        # Fixed number of components
        gmm = GaussianMixture(n_components=n_components, random_state=42, n_init=3)
        gmm.fit(pixels)
    else:
        # Fit one candidate per component count and score each with BIC
        candidates = []
        bic_scores = []
        for k in range(1, max_components + 1):
            candidate = GaussianMixture(n_components=k, random_state=42, n_init=3)
            candidate.fit(pixels)
            bic_scores.append(candidate.bic(pixels))
            candidates.append(candidate)

        # Lowest BIC wins; candidates are stored in order 1, 2, 3, ...
        winner = np.argmin(bic_scores)
        gmm = candidates[winner]
        n_components = winner + 1
        print(f"Adaptive GMM selected {n_components} components with BIC: {bic_scores[winner]:.2f}")

    # Background = component with the smallest mean
    component_means = gmm.means_.flatten()
    bg_index = np.argmin(component_means)

    # Label every pixel and keep those assigned to the background component
    labels = gmm.predict(pixels)
    background = (labels == bg_index).reshape(channel_image.shape)

    return {
        'mean': component_means[bg_index],
        'std': np.sqrt(gmm.covariances_[bg_index].flatten()[0]),
        'mask': background,
        'gmm': gmm,  # fitted model, reused for plotting
        'bg_percentage': np.sum(background) / background.size * 100,
        'component_means': component_means,
        'n_components': n_components,
        'bg_component': bg_index
    }

def segment_cells_cellpose(image, config):
    """
    Optimized cell segmentation using CellPose with proper channel mapping

    Parameters:
    - image: multi-channel image with the channels on the last axis
    - config: dict with channel mappings and model settings. Keys read:
      'cytoplasm_channel' (default 4), 'nucleus_channel' (default 1),
      'segmentation_type' ('nuclei_only' selects the nuclei model),
      'use_gpu' (default True), 'cell_diameter' (default 30),
      'flow_threshold' (default 0.4), 'cellprob_threshold' (default 0.0).
      Channel numbers follow Cellpose's `channels` convention: 1-based
      positions on the channel axis, 0 meaning "none"/grayscale.

    Returns:
    - Label image of segmented cells (first element returned by `model.eval`)

    Raises:
    - ValueError: if a configured channel number exceeds the channel count.
    """

    io.logger_setup()

    # Extract the right channels based on your staining
    cyto_channel_idx = config.get('cytoplasm_channel', 4)  # Default to channel 4 (Far Red)
    nuc_channel_idx = config.get('nucleus_channel', 1)     # Default to channel 1 (Blue)
    use_gpu = config.get('use_gpu', True)

    # Get the right model based on what's visible in your images
    if config.get('segmentation_type') == 'nuclei_only':
        # The denoising wrapper is defined in cellpose.denoise, not cellpose.models.
        model = denoise.CellposeDenoiseModel(model_type='nuclei', gpu=use_gpu,
                                             restore_type='denoise_nuclei')
        # Segment the nuclear channel on its own. Letting Cellpose pick the
        # channel keeps the same 1-based numbering as the cyto+nuclei branch
        # and keeps channel_axis=-1 valid for the array passed in.
        cellpose_channels = [nuc_channel_idx, 0]
    else:
        # For cytoplasm+nuclei segmentation
        model = models.Cellpose(gpu=use_gpu, model_type="cyto3")
        # Cellpose expects [channel to segment, optional nuclear channel]
        cellpose_channels = [cyto_channel_idx, nuc_channel_idx]

    # Fail early with a readable message instead of deep inside Cellpose
    if image.ndim == 3:
        n_channels = image.shape[-1]
        for channel_number in cellpose_channels:
            if channel_number > n_channels:
                raise ValueError(
                    f"Configured channel {channel_number} does not exist: "
                    f"image has {n_channels} channel(s)"
                )

    # Run segmentation with optimized parameters
    diameter = config.get('cell_diameter', 30)
    flow_th = config.get('flow_threshold', 0.4)
    cell_prob_th = config.get('cellprob_threshold', 0.0)

    # Both model classes return a tuple whose first element is the masks
    eval_output = model.eval(
        image,
        channels=cellpose_channels,
        channel_axis=-1,
        normalize=True,
        diameter=diameter,
        flow_threshold=flow_th,
        cellprob_threshold=cell_prob_th
    )

    return eval_output[0]

def segment_cells_with_downsampling(image, config):
    """Run cell segmentation, optionally on a downsampled copy of the image.

    `config['downsample_factor']` (default 1.0) controls the behaviour: a
    value of 1.0 or more segments the image as-is; a smaller value segments a
    rescaled copy with a proportionally scaled cell diameter and resizes the
    resulting label image back to the original height and width.
    """
    factor = config.get('downsample_factor', 1.0)

    # No downsampling requested: segment at full resolution
    if factor >= 1.0:
        return segment_cells_cellpose(image, config)

    reduced_image = resample_image(image, factor)

    # Work on a copy so the caller's config keeps its original diameter
    reduced_config = config.copy()
    reduced_config['cell_diameter'] = config.get('cell_diameter', 30) * factor

    reduced_masks = segment_cells_cellpose(reduced_image, reduced_config)

    # order=0 (nearest neighbour) so label values are never blended
    full_size_masks = resize(reduced_masks, image.shape[0:2], order=0, preserve_range=True)

    # resize() returns floats; restore integer labels
    return full_size_masks.astype(np.int32)

def measure_cells_ctcf(image, cell_masks, bg_models):
    """Measure CTCF for all cells and channels

    CTCF = integrated intensity - (cell area x mean background intensity).

    Parameters:
    - image: array indexed as image[:, :, channel]
    - cell_masks: 2D integer label image (0 = background)
    - bg_models: mapping channel index -> dict providing the background 'mean'

    Returns:
    - List with one dict per labelled cell holding 'label', 'area',
      'centroid' and the per-channel dicts 'ctcf', 'mean', 'total' and
      'bg_value' (all keyed by 0-based channel index)
    """
    n_channels = image.shape[-1]
    measurements = []

    for prop in measure.regionprops(cell_masks):
        # Basic cell properties
        cell_data = {
            'label': prop.label,
            'area': prop.area,
            'centroid': prop.centroid,
            'ctcf': {},
            'mean': {},
            'total': {},
            'bg_value': {}
        }

        # Restrict the work to the cell's bounding box: prop.slice selects the
        # box and prop.image is the cell's boolean mask inside it. This avoids
        # comparing the whole label image against every single label.
        bbox_slice = prop.slice
        cell_mask = prop.image

        # Calculate measurements for each channel
        for ch in range(n_channels):
            # Pixels of this channel that belong to the cell
            cell_region = image[bbox_slice + (ch,)][cell_mask]

            # Background value
            bg_value = bg_models[ch]['mean']
            cell_data['bg_value'][ch] = bg_value

            # Accumulate in float64 so integer images cannot overflow on
            # platforms where numpy's default integer is 32-bit.
            total_intensity = np.sum(cell_region, dtype=np.float64)
            mean_intensity = np.mean(cell_region, dtype=np.float64)

            # CTCF = Integrated Density - (Area × Mean Background)
            ctcf = total_intensity - (prop.area * bg_value)

            # Store results
            cell_data['total'][ch] = total_intensity
            cell_data['mean'][ch] = mean_intensity
            cell_data['ctcf'][ch] = ctcf

        measurements.append(cell_data)

    return measurements


## Process Images

In [5]:
# Choose the root folder whose sub-folders are the experiments to process.
# select_folder is not defined in this notebook; presumably it comes from the
# AutoImgUtils star import and opens an interactive folder picker — TODO confirm.
folder_path = select_folder()
print(f'>>> Selected folder: {folder_path}')

>>> Selected folder: H:/01_Colaboration/02_Slide_Scanner/20250605_TestFolder


In [None]:
# Define the configuration for the pipeline.
# Two different channel numbering schemes are used below:
#   * 'cytoplasm_channel' / 'nucleus_channel' are forwarded to Cellpose's
#     `channels=` argument by segment_cells_cellpose (presumably 1-based, per
#     Cellpose's convention — confirm against the Cellpose documentation).
#   * 'channels_of_interest' holds 0-based indices into the channel axis; the
#     output columns are named with index + 1 (e.g. index 0 -> "channel_1_...").
config = {
    # CellPose settings
    'segmentation_type': 'cyto_and_nuclei',  # or 'nuclei_only'
    'cytoplasm_channel': 4,  # Far Red channel for cytoplasm/membrane
    'nucleus_channel': 1,    # Blue channel (DAPI) for nuclei
    'cell_diameter': 20,     # Approximate diameter in pixels (scaled by downsample_factor when downsampling)
    'use_gpu': True,         # Use GPU acceleration
    'flow_threshold': 0.4,   # Flow threshold for CellPose
    'cellprob_threshold': 0.0,  # Cell probability threshold for CellPose
    'downsample_factor': 0.5,  # Downsample factor for speed (any value >= 1.0 = no downsampling)
    
    # Visualization settings
    'visualize_steps': True,  # Set to False if you don't want intermediate visualizations
    
    # Other pipeline settings
    'channels_of_interest': [0, 1, 2, 3]  # All channels to measure (0-based indices)
}

In [None]:
# Process all experiments in the selected folder.
# batch_results maps each experiment name to a DataFrame of per-image results,
# so report how many experiments ran and where the output was written rather
# than dumping the whole dictionary into the message.
batch_results = process_experiments_batch(folder_path, config)
print(f'>>> Batch processing complete: {len(batch_results)} experiment(s) processed. '
      f'Results saved in the newest "CTCF_Analysis_<timestamp>" folder under: {folder_path}')


Processing experiment: Exp1

Processing image: EXP10_2_1_1_1_2_3_scFl_DoxCont_Light-OME_s5.ome.tiff
Estimating background using GMM...


Background estimation:   0%|          | 0/4 [00:00<?, ?it/s]