In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from scipy.ndimage import label, find_objects
from sklearn.metrics import confusion_matrix, classification_report

# Only reached when every import above succeeded, so this line confirms the environment is ready
print("Libraries imported successfully")

## 1. Load Synthetic Plate Image Dataset

In [None]:
# Read the plate-image dataset description (metadata and summary statistics) from JSON
with open('resources/datasets/synthetic-plate-images-500-samples.json', 'r') as f:
    dataset = json.load(f)

metadata = dataset['metadata']
print(f"Dataset: {metadata['dataset_id']}")

# Total = synthetic images + annotated real images
composition = metadata['data_composition']
total_images = composition['synthetic_images'] + composition['annotated_real_images']
print(f"Total images: {total_images}")

print("\nOrganism distribution:")
organism_counts = dataset['dataset_statistics']['organism_distribution']
for organism, n_images in organism_counts.items():
    print(f"  {organism}: {n_images}")

## 2. Generate Synthetic Plate Images

In [None]:
def generate_synthetic_plate_image(colony_count, image_size=256, noise_level=0.05, seed=None):
    """
    Generate a synthetic plate image with colonies.

    Colonies are drawn as bright discs (255 at the centre, fading by up to 50
    grey levels towards the rim) on a uniform background of 200, then Gaussian
    noise is added and the result is clipped to 0-255.

    Parameters:
    - colony_count: number of colony centres to place (discs may overlap, so
      the number of visually separate blobs can be lower)
    - image_size: image dimensions (square); must be larger than 40 because
      centres are kept at least 20 pixels from every edge
    - noise_level: Gaussian noise standard deviation as a fraction of 255
    - seed: optional integer seed for a private random stream. When None
      (default) the global NumPy random state is used *without reseeding it*,
      so a preceding np.random.seed(...) call makes the output reproducible.

    Returns:
    - image: grayscale plate image (uint8, 0-255)
    - true_colonies: list of (y, x) colony centers
    """
    # The global stream must not be reseeded here: doing so would discard any
    # seed the caller set beforehand and make replicate runs irreproducible.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Uniform agar background
    img = np.full((image_size, image_size), 200.0)
    true_colonies = []

    # The coordinate grids do not depend on the colony, so build them once.
    yy, xx = np.ogrid[:image_size, :image_size]

    for _ in range(colony_count):
        # Random position (avoid edges)
        y = rng.randint(20, image_size - 20)
        x = rng.randint(20, image_size - 20)
        true_colonies.append((y, x))

        # Colony radius in pixels: 3..14 (randint's upper bound is exclusive)
        radius = rng.randint(3, 15)
        dist = np.sqrt((yy - y) ** 2 + (xx - x) ** 2)
        mask = dist <= radius

        # Radial gradient: 255 at the centre down to 205 at the rim.
        # np.maximum keeps the brighter value where colonies overlap.
        intensity = 255 - (dist[mask] / radius) * 50
        img[mask] = np.maximum(img[mask], intensity)

    # Additive Gaussian noise, then clip back into the valid grey range
    noise = rng.normal(0, noise_level * 255, img.shape)
    img = np.clip(img + noise, 0, 255).astype(np.uint8)

    return img, true_colonies

def _plate_record(target_count):
    """Generate one synthetic plate and bundle it with its ground truth."""
    plate, centers = generate_synthetic_plate_image(target_count)
    return {'image': plate, 'true_colonies': centers, 'true_count': len(centers)}


# One synthetic plate per target colony count, keyed by that count
test_counts = [25, 50, 100]
test_images = {target: _plate_record(target) for target in test_counts}

print(f"Generated {len(test_images)} synthetic plate images for testing")

## 3. Image Preprocessing Pipeline

In [None]:
def preprocess_plate_image(img, bright_colonies=True):
    """
    Preprocess plate image: normalize, subtract background, clean artifacts.

    Parameters:
    - img: grayscale plate image with values in 0-255
    - bright_colonies: True (default) when colonies are brighter than the
      agar, as in the synthetic plates generated in this notebook (colonies
      205-255 on a background of 200). Pass False for dark colonies on a
      light background, which uses a local-maximum background instead.

    Returns:
    - img_norm: float32 image scaled to 0-1
    - img_bg_sub: background-subtracted image in which colonies are positive
    - img_binary: boolean mask, img_bg_sub above (mean + 0.5 * std)
    - img_clean: img_binary after binary closing (2 iterations) and opening
      (1 iteration)
    """
    from scipy.ndimage import (
        binary_closing,
        binary_opening,
        maximum_filter,
        minimum_filter,
    )

    # Normalize to 0-1
    img_norm = np.asarray(img).astype(np.float32) / 255.0

    # Background estimate over a 51x51 neighbourhood. The polarity has to
    # match the image: subtracting a local *maximum* from bright colonies
    # would highlight the agar surrounding them instead of the colonies.
    if bright_colonies:
        background = minimum_filter(img_norm, size=51)
        img_bg_sub = img_norm - background
    else:
        background = maximum_filter(img_norm, size=51)
        img_bg_sub = background - img_norm

    # Simple global threshold: half a standard deviation above the mean
    threshold = np.mean(img_bg_sub) + 0.5 * np.std(img_bg_sub)
    img_binary = img_bg_sub > threshold

    # Morphological cleaning: closing fills small holes, opening drops specks
    img_clean = binary_closing(img_binary, iterations=2)
    img_clean = binary_opening(img_clean, iterations=1)

    return img_norm, img_bg_sub, img_binary, img_clean

# Run the pipeline on the 50-colony plate and keep every intermediate stage
test_count = 50
img = test_images[test_count]['image']
img_norm, img_bg_sub, img_binary, img_clean = preprocess_plate_image(img)

# One panel per stage, filled row by row; the sixth grid cell stays empty
fig, axes = plt.subplots(2, 3, figsize=(14, 8))
stage_panels = [
    (img, 'Original Image'),
    (img_norm, 'Normalized'),
    (img_bg_sub, 'Background Subtracted'),
    (img_binary, 'Binary Threshold'),
    (img_clean, 'Morphologically Cleaned'),
]
for panel_ax, (stage_img, stage_title) in zip(axes.flat, stage_panels):
    panel_ax.imshow(stage_img, cmap='gray')
    panel_ax.set_title(stage_title)
    panel_ax.axis('off')

# Hide the frame of the unused bottom-right cell
axes[1, 2].axis('off')

plt.tight_layout()
plt.savefig('resources/notebooks/preprocessing-pipeline.png', dpi=100, bbox_inches='tight')
plt.show()

print("Preprocessing pipeline visualized")

## 4. Colony Counting via Connected Component Analysis

In [None]:
def count_colonies(img_binary, min_size=5, max_size=500):
    """
    Count colonies using connected component analysis (CCA).

    Parameters:
    - img_binary: 2-D binary segmentation mask
    - min_size: minimum colony size (pixels), inclusive
    - max_size: maximum colony size (pixels), inclusive

    Returns:
    - colony_count: integer count of components within the size limits
    - labeled_img: labeled image; rejected components are set to 0 and each
      accepted component carries the same value as its feature 'id'
    - colony_features: list of dicts with id, area_pixels, centroid (row, col),
      solidity (pixel area / convex hull area, at most 1) and eccentricity
      (0 for isotropic shapes, 1 for a straight line)
    """
    from scipy.spatial import ConvexHull

    # Connected component labeling
    labeled_img, num_features = label(img_binary)

    # Areas of all components in one pass, plus a bounding box per component
    # so each mask is built on a small window rather than the full image.
    areas = np.bincount(labeled_img.ravel(), minlength=num_features + 1)
    boxes = find_objects(labeled_img)

    # Offsets from a pixel centre to its four corners. The hull is built on
    # the corners so that it encloses the whole pixel area; a hull of pixel
    # centres is too small and yields solidity values above 1.
    corner_offsets = np.array([[-0.5, -0.5], [-0.5, 0.5], [0.5, -0.5], [0.5, 0.5]])

    colony_features = []

    for component, box in enumerate(boxes, start=1):
        local_mask = labeled_img[box] == component
        area = areas[component]

        # Size filtering
        if area < min_size or area > max_size:
            labeled_img[box][local_mask] = 0  # Remove (writes through the view)
            continue

        colony_id = len(colony_features) + 1

        # Pixel coordinates in full-image space
        rows, cols = np.nonzero(local_mask)
        rows = rows + box[0].start
        cols = cols + box[1].start
        centroid = (np.mean(rows), np.mean(cols))

        # Solidity (pixel area / convex hull area)
        pixels = np.column_stack((rows, cols)).astype(float)
        corners = (pixels[:, None, :] + corner_offsets[None, :, :]).reshape(-1, 2)
        corners = np.unique(corners, axis=0)
        try:
            hull_area = ConvexHull(corners).volume  # 2-D hull: volume is the area
            solidity = min(float(area / hull_area), 1.0) if hull_area > 0 else 1.0
        except (RuntimeError, ValueError):
            # Qhull failures surface as RuntimeError subclasses; fall back to
            # the neutral value instead of aborting the whole count.
            solidity = 1.0

        # Eccentricity (elongation) from the coordinate covariance
        if area > 2:
            lam_min, lam_max = np.linalg.eigvalsh(np.cov(np.vstack((rows, cols))))
            lam_min = max(float(lam_min), 0.0)  # guard against tiny negative round-off
            if lam_max > 0:
                # A zero minor axis (straight line) correctly gives 1.0
                eccentricity = float(np.sqrt(max(0.0, 1.0 - lam_min / lam_max)))
            else:
                eccentricity = 0.0
        else:
            eccentricity = 0

        # Relabel so the image value matches the feature id. This is safe to
        # do in place: colony_id never exceeds the current label, and every
        # smaller label has already been processed.
        labeled_img[box][local_mask] = colony_id

        colony_features.append({
            'id': colony_id,
            'area_pixels': area,
            'centroid': centroid,
            'solidity': solidity,
            'eccentricity': eccentricity
        })

    return len(colony_features), labeled_img, colony_features

# Run preprocessing + counting on every test plate and record the signed error
results = {}

for plate_key, plate in test_images.items():
    truth = plate['true_count']
    *_, cleaned_mask = preprocess_plate_image(plate['image'])
    predicted_count, labeled_img, features = count_colonies(cleaned_mask)
    delta = predicted_count - truth

    results[plate_key] = {
        'predicted_count': predicted_count,
        'labeled_img': labeled_img,
        'features': features,
        'error': delta,
        'error_percent': 100 * delta / truth
    }

# Per-plate report
print("Colony Counting Results:")
print("-" * 60)
for plate_key in test_images:
    outcome = results[plate_key]
    true = test_images[plate_key]['true_count']
    pred = outcome['predicted_count']
    err = outcome['error']
    err_pct = outcome['error_percent']
    print(f"True: {true:3d} | Predicted: {pred:3d} | Error: {err:+3d} ({err_pct:+6.1f}%)")

# Aggregate error metrics across plates
errors = [outcome['error'] for outcome in results.values()]
abs_errors = np.abs(errors)
print(f"\nMAE: {np.mean(abs_errors):.2f} colonies")
print(f"Median AE: {np.median(abs_errors):.2f} colonies")
print(f"RMSE: {np.sqrt(np.mean(np.array(errors)**2)):.2f} colonies")

## 5. Visualize Detected Colonies

In [None]:
# Side-by-side view for the 50-colony plate: ground truth vs. detections
true_count = 50
img = test_images[true_count]['image']
labeled_img = results[true_count]['labeled_img']
features = results[true_count]['features']
true_centers = test_images[true_count]['true_colonies']

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax_truth, ax_detected = axes

# Left: original image with a red cross on every true colony centre
ax_truth.imshow(img, cmap='gray')
for center_row, center_col in true_centers:
    ax_truth.plot(center_col, center_row, 'r+', markersize=10, markeredgewidth=2)
ax_truth.set_title(f'True Colonies (n={len(true_centers)})')
ax_truth.axis('off')

# Right: label image with a green dot on every detected centroid
ax_detected.imshow(labeled_img, cmap='nipy_spectral')
for detection in features:
    centroid_row, centroid_col = detection['centroid']
    ax_detected.plot(centroid_col, centroid_row, 'g.', markersize=8)
ax_detected.set_title(f'Detected Colonies (n={len(features)})')
ax_detected.axis('off')

plt.tight_layout()
plt.savefig('resources/notebooks/colony-detection.png', dpi=100, bbox_inches='tight')
plt.show()

print("Colony detection visualization saved")

## 6. CFU Calculation

In [None]:
def calculate_cfu(colony_count, dilution_factor, well_volume_ml=0.1):
    """
    Convert a plate colony count into CFU per millilitre.

    The count is first divided by the plated volume (well_volume_ml, in mL)
    and the result is then multiplied by dilution_factor:

        CFU/mL = (colony_count / well_volume_mL) × dilution_factor
    """
    colonies_per_ml = colony_count / well_volume_ml
    return colonies_per_ml * dilution_factor

# Plating parameters used for the example (and reported again in the summary)
dilution_factor = 1e6  # 10^-6 dilution
well_volume_ml = 0.1

print("CFU Estimation:")
print("-" * 60)
# Convert each plate's predicted colony count into a CFU/mL estimate
for plate_key in test_images:
    pred_count = results[plate_key]['predicted_count']
    cfu_per_ml = calculate_cfu(pred_count, dilution_factor, well_volume_ml)
    print(f"Colonies: {pred_count:3d} | CFU/mL: {cfu_per_ml:.2e}")

## 7. Bootstrap Uncertainty Quantification

In [None]:
def bootstrap_uncertainty(labeled_img, n_resamples=1000, confidence=0.95,
                          noise_sigma=0.05, min_size=5, max_size=500, seed=None):
    """
    Estimate uncertainty in colony count by repeatedly perturbing the mask.

    For each resample, Gaussian noise is added to the 0/1 foreground mask, the
    result is re-thresholded at 0.5, relabeled, and components whose area lies
    in [min_size, max_size] are counted.

    NOTE(review): with the default noise_sigma of 0.05 a pixel only flips when
    the noise exceeds 0.5 (ten standard deviations), so in practice every
    resample reproduces the input mask and the interval collapses to a single
    value. Pass a larger noise_sigma to obtain a non-degenerate spread;
    the default is kept for backward compatibility.

    Parameters:
    - labeled_img: label image; any value > 0 is treated as foreground
    - n_resamples: number of perturbed recounts (must be >= 1)
    - confidence: central interval mass, e.g. 0.95
    - noise_sigma: standard deviation of the noise added to the 0/1 mask
    - min_size, max_size: inclusive component-size limits in pixels; the
      defaults match those of count_colonies
    - seed: optional integer seed for a private random stream; None uses the
      global NumPy random state

    Returns:
    - counts: array of per-resample colony counts
    - mean_count: mean of counts
    - ci_lower, ci_upper: percentile bounds of the central interval

    Raises:
    - ValueError: if n_resamples is less than 1
    """
    if n_resamples < 1:
        raise ValueError("n_resamples must be at least 1")

    rng = np.random if seed is None else np.random.RandomState(seed)
    foreground = (np.asarray(labeled_img) > 0).astype(float)

    counts = np.empty(n_resamples, dtype=int)
    for k in range(n_resamples):
        # Perturb the mask and re-threshold
        noise = rng.normal(0, noise_sigma, foreground.shape)
        perturbed = (foreground + noise) > 0.5

        # Recount with perturbation
        labeled_pert, num_features = label(perturbed)

        # Component areas in one pass (index 0 is the background, so drop it)
        areas = np.bincount(labeled_pert.ravel(), minlength=num_features + 1)[1:]
        counts[k] = np.count_nonzero((areas >= min_size) & (areas <= max_size))

    mean_count = np.mean(counts)

    # Percentile confidence interval
    alpha = 1 - confidence
    ci_lower = np.percentile(counts, 100 * alpha / 2)
    ci_upper = np.percentile(counts, 100 * (1 - alpha / 2))

    return counts, mean_count, ci_lower, ci_upper

# Perturbation-based uncertainty for the 50-colony plate
true_count = 50
labeled_img = results[true_count]['labeled_img']
point_estimate = results[true_count]['predicted_count']

counts_boot, mean_count, ci_lower, ci_upper = bootstrap_uncertainty(labeled_img, n_resamples=500)

print(f"\nBootstrap Uncertainty Quantification (n={true_count} true colonies):")
print(f"  Mean count: {mean_count:.1f}")
print(f"  95% CI: [{ci_lower:.1f}, {ci_upper:.1f}]")
print(f"  CI width: {ci_upper - ci_lower:.1f} colonies")
print(f"  CV: {np.std(counts_boot) / np.mean(counts_boot) * 100:.1f}%")

# Histogram of resampled counts with the mean, interval bounds and point estimate marked
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(counts_boot, bins=20, alpha=0.7, color='steelblue', edgecolor='black')

mean_label = f'Mean: {mean_count:.1f}'
ci_label = f'95% CI: [{ci_lower:.1f}, {ci_upper:.1f}]'
point_label = f'Point est: {point_estimate}'

ax.axvline(mean_count, color='red', linestyle='--', linewidth=2, label=mean_label)
ax.axvline(ci_lower, color='green', linestyle=':', linewidth=2, label=ci_label)
ax.axvline(ci_upper, color='green', linestyle=':', linewidth=2)  # unlabeled: shares the CI legend entry
ax.axvline(point_estimate, color='orange', linestyle='-', linewidth=2, label=point_label)

ax.set_xlabel('Colony Count')
ax.set_ylabel('Frequency')
ax.set_title('Bootstrap Distribution of Colony Count')
ax.legend()
ax.grid(True, alpha=0.3)
plt.savefig('resources/notebooks/bootstrap-distribution.png', dpi=100, bbox_inches='tight')
plt.show()

print("\nBootstrap visualization saved")

## 8. Replicate Homogeneity Testing

In [None]:
# Simulate replicate measurements of the same nominal plate
np.random.seed(42)
n_replicates = 3
true_count = 50


def _replicate_count(target):
    """Generate one plate with `target` colonies and return the pipeline's count."""
    plate, _ = generate_synthetic_plate_image(target)
    cleaned_mask = preprocess_plate_image(plate)[-1]
    return count_colonies(cleaned_mask)[0]


replicates = np.array([_replicate_count(true_count) for _ in range(n_replicates)])

# Descriptive statistics (sample standard deviation, ddof=1)
rep_mean = np.mean(replicates)
rep_sd = np.std(replicates, ddof=1)
cv = rep_sd / rep_mean * 100

print(f"Replicate Colony Counts: {replicates}")
print(f"Mean: {rep_mean:.1f}")
print(f"Std Dev: {rep_sd:.2f}")
print(f"CV: {cv:.1f}%")
print(f"SEM: {rep_sd / np.sqrt(n_replicates):.2f}")

# One-sample t-test of the replicate counts against the expected count.
# (This is not an ANOVA; comparing several conditions would need one.)
expected = true_count
t_stat, p_value = stats.ttest_1samp(replicates, expected)

print(f"\nOne-sample t-test against expected ({expected}):")
print(f"  t-statistic: {t_stat:.4f}")
print(f"  p-value: {p_value:.4f}")
print(f"  Homogeneous? {p_value > 0.05}")

# Acceptance criterion on the coefficient of variation
cv_threshold = 15  # %
cv_status = '✓ PASS' if cv <= cv_threshold else '✗ FAIL'

print(f"\nAcceptance Criterion (CV ≤ {cv_threshold}%):")
print(f"  Measured CV: {cv:.1f}%")
print(f"  Status: {cv_status}")

## 9. Summary Report

In [None]:
def _status(ok):
    """Render a boolean check as the report's PASS/FAIL marker."""
    return '✓ PASS' if ok else '✗ FAIL'


# Compute the error metrics once so the printed values and the acceptance
# checks below are guaranteed to agree.
abs_errors = np.abs([r['error'] for r in results.values()])
mae = np.mean(abs_errors)
median_ae = np.median(abs_errors)
max_ae = float(np.max(abs_errors))

ci_width = ci_upper - ci_lower
# Relative width is undefined when the mean count is zero
ci_width_pct = ci_width / mean_count * 100 if mean_count else float('nan')

print("="*70)
print("VIABLE COUNTS IMAGE ANALYSIS SUMMARY REPORT")
print("="*70)

print("\nIMAGE ANALYSIS RESULTS:")
print(f"  Images analyzed: {len(results)}")
print(f"  Mean Absolute Error: {mae:.2f} colonies")
print(f"  Median Absolute Error: {median_ae:.2f} colonies")
print(f"  Max error: {max_ae:.0f} colonies")

print("\nCFU ESTIMATION:")
print(f"  Dilution factor: {dilution_factor:.0e}")
print(f"  Well volume: {well_volume_ml} mL")

print("\nUNCERTAINTY QUANTIFICATION:")
# Report the number of resamples actually run instead of a hard-coded figure
print(f"  Bootstrap resamples: {len(counts_boot)}")
print("  Confidence level: 95%")
print(f"  CI width: {ci_width:.1f} colonies ({ci_width_pct:.1f}% of estimate)")

print("\nREPLICATE HOMOGENEITY:")
print(f"  Replicates: {n_replicates}")
print(f"  CV: {cv:.1f}%")
print(f"  Threshold: {cv_threshold}%")
print(f"  Status: {_status(cv <= cv_threshold)}")

print("\nACCEPTANCE CRITERIA:")
criteria = [
    ('MAE ≤ 2 colonies', mae <= 2),
    ('Median AE ≤ 1 colony', median_ae <= 1),
    ('Replicate CV ≤ 15%', cv <= cv_threshold),
    # NOTE(review): this only checks that resampling produced output, so it
    # passes whenever the bootstrap ran; it does not measure CI coverage.
    ('CI coverage adequate', len(counts_boot) > 0)
]

passed = 0
for criterion, result in criteria:
    print(f"  {criterion}: {_status(result)}")
    if result:
        passed += 1

print(f"\nOVERALL DECISION: {passed}/{len(criteria)} criteria met")
if passed == len(criteria):
    print("STATUS: ✓ APPROVED - Ready for production use")
else:
    print("STATUS: ✗ HOLD - Address failures before deployment")

print("="*70)