---
title: "Selection Bias & Missing Data Challenge"
subtitle: "Creating a Statistics Meme: Write Your Own Functions"
format:
  html: default
execute:
  echo: false
  eval: true
---


In [None]:
#| label: step1-prepare
#| echo: false
#| fig-cap: Original image prepared for processing
#| include: false
import numpy as np
import matplotlib.pyplot as plt
from step1_prepare_image import prepare_image

# Step 1: run the source photo through the project's prepare_image helper.
# The result (gray_image) is the input to every later step in this document.
# CHANGE img_path to use your own image!
img_path = 'IMG_0417.JPG'  # Example image - replace with your own image
gray_image = prepare_image(img_path, max_size=512)

# Preview the prepared image: no axes, gray colormap pinned to the 0-1 range
# so brightness is not rescaled to the image's own min/max.
fig, ax = plt.subplots(figsize=(6.5, 5))
ax.imshow(gray_image, cmap='gray', vmin=0, vmax=1)
ax.set_axis_off()
ax.set_title(
    'Step 1: Prepared Image',
    fontsize=14,
    fontweight='bold',
    pad=10,
)
fig.tight_layout()
plt.show()

In [None]:
#| label: step2-stipple
#| echo: false
#| fig-cap: Blue noise stippling pattern
#| warning: false
#| include: false
from step2_create_stipple import create_stipple

# Step 2: tuning knobs for the stippling pass, gathered in one place so they
# are easy to adjust together. They are forwarded unchanged as keyword
# arguments to create_stipple.
stipple_settings = {
    'percentage': 0.08,  # 8% of pixels will be stippled
    'sigma': 0.9,  # Repulsion radius
    'content_bias': 0.9,  # Strongly follow importance map
    'noise_scale_factor': 0.1,  # Moderate exploration
    'extreme_downweight': 0.5,  # Moderate downweighting of extremes
    'extreme_threshold_low': 0.2,  # Downweight tones below 0.2
    'extreme_threshold_high': 0.8,  # Downweight tones above 0.8
    'extreme_sigma': 0.1,  # Smooth transition width
}

# create_stipple returns the stipple image plus a second value (samples)
# that this document keeps but does not use in the cells shown here.
stipple_pattern, samples = create_stipple(gray_image, **stipple_settings)

# Preview the stipple pattern with the same layout as the other steps.
fig, ax = plt.subplots(figsize=(6.5, 5))
ax.imshow(stipple_pattern, cmap='gray', vmin=0, vmax=1)
ax.set_axis_off()
ax.set_title(
    'Step 2: Stippled Image',
    fontsize=14,
    fontweight='bold',
    pad=10,
)
fig.tight_layout()
plt.show()

In [None]:
#| label: step3-tonal
#| echo: false
#| fig-cap: Box-averaged tonal analysis showing brightness distribution
#| include: false
from step3_create_tonal import create_tonal
import matplotlib.pyplot as plt

# Step 3: summarise the prepared image on a 16×12 grid.
# create_tonal returns three values: an image to display, a per-cell array
# of average tones (indexed [row, col] below), and a dict of summary
# statistics with the keys 'mean', 'std', 'min' and 'max' read further down.
grid_rows = 16
grid_cols = 12
tonal_image, average_tones, tonal_stats = create_tonal(
    gray_image,
    grid_rows=grid_rows,
    grid_cols=grid_cols,
    return_full_image=True
)

# Display the box-averaged tonal image with text annotations
fig, ax = plt.subplots(figsize=(6.5, 5))

# Show box-averaged tonal image
ax.imshow(tonal_image, cmap='gray', vmin=0, vmax=1)
ax.axis('off')
ax.set_title('Step 3: Box-Averaged Tonal Analysis', fontsize=14, fontweight='bold', pad=10)

# Calculate grid cell dimensions for text placement.
# NOTE(review): the cell size is derived from gray_image, which assumes
# tonal_image has the same pixel dimensions — confirm against create_tonal.
h, w = gray_image.shape
section_h = h / grid_rows
section_w = w / grid_cols

# Add text annotations showing tone values (2 decimals) at the center of each grid cell
for i in range(grid_rows):
    for j in range(grid_cols):
        tone = average_tones[i, j]
        # Calculate center position of the grid cell
        y_center = (i + 0.5) * section_h
        x_center = (j + 0.5) * section_w
        # Use white text for dark sections, black text for light sections
        text_color = 'white' if tone < 0.5 else 'black'
        ax.text(x_center, y_center, f'{tone:.2f}',
                ha='center', va='center',
                color=text_color, fontsize=6, fontweight='bold')

plt.tight_layout()
plt.show()

# Print key statistics for parameter tuning.
# The two heading strings previously contained mis-decoded bytes (UTF-8 emoji
# read as cp1252); they are restored to the intended characters here.
print("\n📊 Tonal Statistics for Parameter Tuning:")
print(f"  Mean brightness: {tonal_stats['mean']:.3f}")
print(f"  Standard deviation: {tonal_stats['std']:.3f}")
print(f"  Brightness range: [{tonal_stats['min']:.3f}, {tonal_stats['max']:.3f}]")
print("\n💡 Tuning Tips:")
print("  - If mean < 0.4: Image is dark, consider lowering extreme_threshold_low")
print("  - If mean > 0.6: Image is light, consider raising extreme_threshold_high")
print("  - If std > 0.2: High contrast, may need stronger extreme_downweight")
print(f"  - Use mid_tone_center around {tonal_stats['mean']:.2f} to emphasize average tones")

In [None]:
#| label: step4-block-letter
#| echo: false
#| fig-cap: Block letter S representing selection bias
#| include: false
from step4_create_block_letter import create_block_letter_s

# Step 4: build the block-letter image from the prepared image's
# height and width, so it can be combined with the stipple pattern later.
h, w = gray_image.shape
block_letter = create_block_letter_s(
    h,
    w,
    letter="S",
    font_size_ratio=0.9,
)

# Preview the block letter with the same layout as the other steps.
fig, ax = plt.subplots(figsize=(6.5, 5))
ax.imshow(block_letter, cmap='gray', vmin=0, vmax=1)
ax.set_axis_off()
ax.set_title(
    'Step 4: Selection Bias (Block Letter S)',
    fontsize=14,
    fontweight='bold',
    pad=10,
)
fig.tight_layout()
plt.show()

In [None]:
#| label: step5-masked
#| echo: false
#| fig-cap: Masked stippled image showing selection bias effect
#| include: false
from step5_create_masked import create_masked_stipple

# Step 5: combine the stipple pattern with the block letter.
# Pixels below 0.5 are considered part of the mask.
masked_stipple = create_masked_stipple(stipple_pattern, block_letter, threshold=0.5)

# Preview the masked result with the same layout as the other steps.
fig, ax = plt.subplots(figsize=(6.5, 5))
ax.imshow(masked_stipple, cmap='gray', vmin=0, vmax=1)
ax.set_axis_off()
ax.set_title(
    'Step 5: Masked Stippled Image (Estimate)',
    fontsize=14,
    fontweight='bold',
    pad=10,
)
fig.tight_layout()
plt.show()

In [None]:
#| label: create-final-meme
#| echo: false
#| include: false
from create_meme import create_statistics_meme
from IPython.display import Image, display

# NOTE(review): `include: false` above tells Quarto to drop this cell's
# output from the rendered page, so the display() call at the bottom will
# not be visible there — confirm that hiding the final meme is intended.

# Single source of truth for the output file: the same path is written by
# create_statistics_meme and then read back for display.
meme_path = "my_statistics_meme.png"

# Assemble the final meme from the four images produced in steps 1-5.
create_statistics_meme(
    original_img=gray_image,
    stipple_img=stipple_pattern,
    block_letter_img=block_letter,
    masked_stipple_img=masked_stipple,
    output_path=meme_path,
    dpi=150,
    background_color="white",  # or "pink", "lightgray", etc.
)

# Show the saved file inline.
display(Image(meme_path))

## Understanding Selection Bias Through This Meme

This meme demonstrates selection bias by showing how systematically missing data distorts our understanding of reality. The original image (Reality) represents the true population, while the stippled version (Your Model) represents the data we collect from it. When selection bias removes data points in a systematic "S"-shaped pattern, the resulting estimate is biased and no longer represents the true population. In the same way, data that are systematically missing from real-world studies can lead to incorrect conclusions.