# Calculate the background per plate

In [4]:
import os
import re
import sys
import glob
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from skimage.io import imread
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

# Make the project-local helper module in ../scripts importable from this notebook.
sys.path.append("../scripts")
from img_utils import letter_dict, channel_dict
# Inverted lookups (value -> key) of the imported dicts, so the codes found in
# image file names can be mapped back to labels (e.g. row code '01' -> 'A').
letter_dict_rev = {v: k for k, v in letter_dict.items()}
channel_dict_rev = {v: k for k, v in channel_dict.items()}

In [5]:
letter_dict_rev

{'01': 'A',
 '02': 'B',
 '03': 'C',
 '04': 'D',
 '05': 'E',
 '06': 'F',
 '07': 'G',
 '08': 'H',
 '09': 'I',
 '10': 'J',
 '11': 'K',
 '12': 'L',
 '13': 'M',
 '14': 'N',
 '15': 'O',
 '16': 'P'}

In [None]:
def summarize_tiff_img(tiff_image_path):
    """Read one TIFF image and return its mergeable intensity summary.

    Args:
        tiff_image_path: Path of the image file, passed straight to ``imread``.

    Returns:
        A tuple ``(n, s, ss, hist)``: the pixel count, the sum of pixel
        values, the sum of squared pixel values (both accumulated in
        float64) and a per-intensity count array from ``np.bincount`` that
        has at least 65536 entries.

    Note:
        ``np.bincount`` only accepts non-negative integer data, so this
        presumably expects unsigned-integer images (e.g. uint16) - confirm
        against the acquisition format.
    """
    pixels = imread(tiff_image_path).ravel()
    pixel_count = pixels.size
    # Accumulate in float64 so the totals cannot wrap around the image dtype.
    pixel_sum = pixels.sum(dtype=np.float64)
    pixel_sumsq = np.square(pixels.astype(np.float64)).sum()
    # One bin per intensity; minlength pads the histogram to the uint16 range.
    intensity_counts = np.bincount(pixels, minlength=65536)
    return pixel_count, pixel_sum, pixel_sumsq, intensity_counts


def _reduce_img_summaries(results):
    """Merge per-image (n, s, ss, hist) tuples into (median, mean, std); None if there are no pixels."""
    total_n = sum(r[0] for r in results)
    if total_n == 0:
        return None
    total_sum = sum(r[1] for r in results)
    total_sumsq = sum(r[2] for r in results)

    # Histograms can differ in length when an image holds values above the
    # padded range, so add each one into a buffer sized for the longest.
    hist_len = max(len(r[3]) for r in results)
    total_hist = np.zeros(hist_len, dtype=np.int64)
    for r in results:
        total_hist[:len(r[3])] += r[3]

    mean = total_sum / total_n
    # E[x^2] - E[x]^2 can dip just below zero through rounding; clamp it so
    # the square root does not turn a constant image group into NaN.
    variance = max(total_sumsq / total_n - mean ** 2, 0.0)
    std = np.sqrt(variance)

    # Lower median: the smallest intensity whose cumulative count reaches
    # ceil(N / 2). Using N // 2 here is off by one for odd N (for the values
    # [0, 5, 5] it would report 0 instead of 5).
    cum = np.cumsum(total_hist)
    median = np.searchsorted(cum, (total_n + 1) // 2)
    return median, mean, std


def summarize_img_group(tiff_imgs, output_dict, workers=1):
    """Add pooled median/mean/std pixel statistics of a group of TIFFs to a dict.

    Args:
        tiff_imgs: Either a list of image paths or a glob pattern string
            (expanded with ``recursive=True``).
        output_dict: Dict that is updated in place with the keys ``"median"``,
            ``"mean"`` and ``"std"``.
        workers: Number of worker processes used to summarize the images.
            With the default of 1 the images are summarized in the calling
            process, which avoids starting a nested process pool when this
            function itself runs inside a pool worker.

    Returns:
        ``output_dict`` (the same object). The three statistics are NaN when
        no image matched or the images contain no pixels. The median is the
        lower median taken from the pooled intensity histogram.
    """
    if isinstance(tiff_imgs, str):
        tiff_imgs = glob.glob(tiff_imgs, recursive=True)

    stats = None
    if (tiff_imgs):
        if workers == 1:
            results = [
                summarize_tiff_img(path)
                for path in tqdm(tiff_imgs, desc="Summarizing TIFFs")
            ]
        else:
            results = []
            # process in parallel with a progress bar
            with ProcessPoolExecutor(max_workers=workers) as exe:
                futures = {exe.submit(summarize_tiff_img, path): path for path in tiff_imgs}
                for fut in tqdm(as_completed(futures), total=len(futures), desc="Summarizing TIFFs"):
                    results.append(fut.result())
        stats = _reduce_img_summaries(results)

    if stats is None:
        output_dict.update({"median": np.nan, "mean": np.nan, "std": np.nan})
    else:
        median, mean, std = stats
        output_dict.update({"median": median, "mean": mean, "std": std})
    return output_dict

In [None]:
# Per-well background summary: for every batch, build one job per
# (plate, well, channel) and write the pooled median/mean/std of the matching
# TIFFs to <output_dir>/<batch>/plate_well_sum_stats.parquet.
TIFF_IMG_DIR = "../inputs/images"
print("Summarize the per-channel well-level summary statistics:")
output_dir = "../outputs/1.plate_bg_summary/"
batches = ["2025_01_27_Batch_13", "2025_01_28_Batch_14"]

for batch in batches:
    # (glob pattern, metadata dict) pairs; the pattern is expanded by
    # summarize_img_group inside the worker process.
    tiff_img_dict_mapper = []
    plates = os.listdir(f"{TIFF_IMG_DIR}/{batch}/images")
    for plate in plates:
        plate_img_dir = f"{TIFF_IMG_DIR}/{batch}/images/{plate}/Images"
        all_tiffs = glob.glob(f"{plate_img_dir}/*.tiff")
        # The first six characters of a file name are the well position,
        # e.g. "r01c01" in "r01c01f01p01-ch1sk1fk1fl1.tiff".
        unique_wells = sorted({os.path.basename(tiff)[:6] for tiff in all_tiffs})
        print(len(unique_wells)) ## 384
        for well in tqdm(unique_wells):
            well_letter = letter_dict_rev[re.search(r'(?<=r)(\d{2})(?=c)', well)[0]]
            well_num = re.search(r'(?<=c)(\d{2})', well)[0]
            for channel, channel_name in channel_dict_rev.items():
                result_dict = {"plate": plate.split("__")[0],
                               "well": f"{well_letter}{well_num}",
                               "channel": channel_name}
                channel_tiffs = f"{plate_img_dir}/{well}*-ch{channel}sk*.tiff"
                tiff_img_dict_mapper.append((channel_tiffs, result_dict))

    results_per_well = []
    with ProcessPoolExecutor(max_workers=384) as exe:
        # submit each (tiff_imgs, output_dict) pair as separate job
        futures = [
            exe.submit(summarize_img_group, tiff_imgs, output_dict)
            for tiff_imgs, output_dict in tiff_img_dict_mapper
        ]
        for fut in tqdm(as_completed(futures), total=len(futures), desc="Processing tiffs per well"):
            result = fut.result()
            results_per_well.append(result)

    df = pl.DataFrame(results_per_well)
    # write_parquet does not create missing directories, so make sure the
    # per-batch output folder exists first.
    batch_output_dir = os.path.join(output_dir, batch)
    os.makedirs(batch_output_dir, exist_ok=True)
    df.write_parquet(os.path.join(batch_output_dir, "plate_well_sum_stats.parquet"))

Summarize the per-channel well-level summary statistics:
384


100%|██████████| 384/384 [00:00<00:00, 174327.60it/s]


384


100%|██████████| 384/384 [00:00<00:00, 161805.58it/s]


384


100%|██████████| 384/384 [00:00<00:00, 197912.60it/s]


384


100%|██████████| 384/384 [00:00<00:00, 171105.15it/s]


384


100%|██████████| 384/384 [00:00<00:00, 174082.66it/s]


384


100%|██████████| 384/384 [00:00<00:00, 195178.47it/s]


384


100%|██████████| 384/384 [00:00<00:00, 170525.44it/s]


384


100%|██████████| 384/384 [00:00<00:00, 169022.22it/s]


In [24]:
# Per-plate background summary: for every batch, build one job per
# (plate, channel) and write the pooled median/mean/std of up to 100 TIFFs to
# <output_dir>/<batch>/plate_sum_stats.parquet.
TIFF_IMG_DIR = "../inputs/images"
output_dir = "../outputs/1.plate_bg_summary/"
batches = ["2025_01_27_Batch_13", "2025_01_28_Batch_14"]

print("Summarize the per-channel plate-level summary statistics:")
for batch in batches:
    tiff_img_dict_mapper = []
    plates = os.listdir(f"{TIFF_IMG_DIR}/{batch}/images")
    for plate in tqdm(plates):
        for channel, channel_name in channel_dict_rev.items():
            # Only the first 100 matches are used. glob returns paths in
            # arbitrary order, so this is a cap, not a random sample.
            channel_tiffs = glob.glob(f"{TIFF_IMG_DIR}/{batch}/images/{plate}/Images/*ch{channel}sk*.tiff", recursive=True)[:100]
            if channel_tiffs:
                result_dict = {"plate": plate.split("__")[0],
                               "channel": channel_name}
                tiff_img_dict_mapper.append((channel_tiffs, result_dict))

    results_per_plate = []
    with ProcessPoolExecutor(max_workers=384) as exe:
        # submit each (tiff_imgs, output_dict) pair as separate job
        futures = [
            exe.submit(summarize_img_group, tiff_imgs, output_dict)
            for tiff_imgs, output_dict in tiff_img_dict_mapper
        ]
        for fut in tqdm(as_completed(futures), total=len(futures), desc="Processing tiffs per plate"):
            result = fut.result()
            results_per_plate.append(result)

    df_plate = pl.DataFrame(results_per_plate)
    # write_parquet does not create missing directories, so make sure the
    # per-batch output folder exists first.
    batch_output_dir = os.path.join(output_dir, batch)
    os.makedirs(batch_output_dir, exist_ok=True)
    df_plate.write_parquet(os.path.join(batch_output_dir, "plate_sum_stats.parquet"))

Summarize the per-channel well-level summary statistics:
Summarize the per-channel plate-level summary statistics:


100%|██████████| 8/8 [00:00<00:00, 12.63it/s]
Summarizing TIFFs: 100%|██████████| 100/100 [07:14<00:00,  4.34s/it]
Summarizing TIFFs: 100%|██████████| 100/100 [06:24<00:00,  3.85s/it]
Summarizing TIFFs: 100%|██████████| 100/100 [07:07<00:00,  4.27s/it]
Summarizing TIFFs: 100%|██████████| 100/100 [06:59<00:00,  4.19s/it]3.38s/it]
Summarizing TIFFs: 100%|██████████| 100/100 [06:10<00:00,  3.70s/it]
Summarizing TIFFs: 100%|██████████| 100/100 [07:07<00:00,  4.27s/it]
Summarizing TIFFs: 100%|██████████| 100/100 [07:00<00:00,  4.20s/it]
Summarizing TIFFs: 100%|██████████| 100/100 [07:12<00:00,  4.32s/it]
Summarizing TIFFs: 100%|██████████| 100/100 [06:59<00:00,  4.19s/it]
Summarizing TIFFs: 100%|██████████| 100/100 [06:28<00:00,  3.89s/it]
Summarizing TIFFs: 100%|██████████| 100/100 [06:51<00:00,  4.11s/it]
Summarizing TIFFs: 100%|██████████| 100/100 [06:15<00:00,  3.75s/it]
Summarizing TIFFs: 100%|██████████| 100/100 [06:39<00:00,  4.00s/it]
Summarizing TIFFs: 100%|██████████| 100/100 [07:

In [None]:
# Load the per-image background percentiles of one batch.
plate_bg = pl.read_parquet("../outputs/1.plate_bg_summary/2025_01_27_Batch_13/plate_bg.parquet")

# One row per (plate, channel): the median of the per-image medians, plus the
# median absolute deviation of those group medians. The MAD expression has no
# grouping of its own, so it is taken over all rows of the aggregated frame
# and the same value is attached to every (plate, channel) row.
plate_channel_stats = (
    plate_bg
    .group_by("plate","channel")
    .agg(pl.col("perc_50").median().alias("plate_channel_median"))
    .with_columns(
        (pl.col("plate_channel_median")-pl.col("plate_channel_median").median())
        .abs().median()
        .alias("plate_channel_mad")
    )
)

# Keep GFP images whose 95th percentile stays below "median + MAD".
is_gfp = pl.col("channel")=="GFP"
below_bg_cutoff = pl.col("perc_95")<(pl.col("plate_channel_median")+pl.col("plate_channel_mad"))

# Final expression of the cell (displayed by the notebook): one row per well.
(
    plate_bg
    .join(plate_channel_stats, on=["plate","channel"])
    .filter((is_gfp)&(below_bg_cutoff))
    .sort(pl.col("well"))
    .unique(subset="well", keep="first")
)

plate,img_path,site,channel,well,perc_50,perc_60,perc_70,perc_75,perc_80,perc_90,perc_95,perc_99,plate_channel_median,plate_channel_mad
str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""2025_01_27_B13A7A8P2_T2""","""../inputs/images//2025_01_27_B…","""02""","""GFP""","""P16""",159.0,165.0,170.0,173.0,175.0,182.0,188.0,197.0,174.0,15.0
"""2025_01_27_B13A7A8P2_T2""","""../inputs/images//2025_01_27_B…","""09""","""GFP""","""D20""",156.0,162.0,167.0,170.0,172.0,179.0,184.0,194.0,174.0,15.0
"""2025_01_27_B13A7A8P2_T2""","""../inputs/images//2025_01_27_B…","""08""","""GFP""","""J24""",152.0,157.0,163.0,166.0,169.0,176.0,181.0,193.0,174.0,15.0
"""2025_01_27_B13A7A8P2_T2""","""../inputs/images//2025_01_27_B…","""04""","""GFP""","""F14""",156.0,161.0,167.0,169.0,172.0,179.0,184.0,193.0,174.0,15.0
"""2025_01_27_B13A7A8P2_T2""","""../inputs/images//2025_01_27_B…","""09""","""GFP""","""B24""",153.0,158.0,163.0,166.0,168.0,175.0,180.0,189.0,174.0,15.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2025_01_27_B13A7A8P1_T1""","""../inputs/images//2025_01_27_B…","""09""","""GFP""","""A23""",155.0,159.0,164.0,166.0,169.0,175.0,180.0,189.0,169.0,15.0
"""2025_01_27_B13A7A8P2_T2""","""../inputs/images//2025_01_27_B…","""06""","""GFP""","""B16""",159.0,164.0,169.0,172.0,175.0,182.0,187.0,196.0,174.0,15.0
"""2025_01_27_B13A7A8P2_T2""","""../inputs/images//2025_01_27_B…","""08""","""GFP""","""D14""",160.0,165.0,170.0,173.0,176.0,183.0,188.0,198.0,174.0,15.0
"""2025_01_27_B13A7A8P2_T2""","""../inputs/images//2025_01_27_B…","""03""","""GFP""","""I03""",156.0,162.0,167.0,170.0,173.0,181.0,186.0,196.0,174.0,15.0


In [None]:
## Example TIFF image (batch 13, well r01c01, field f01, channel ch1) used for single-image checks
tiff_image_path = "../inputs/images/2025_01_27_Batch_13/images/2025_01_27_B13A7A8P1_T1__2025_01_27T08_46_50_Measurement_1/Images/r01c01f01p01-ch1sk1fk1fl1.tiff"
def process_tiff_img(tiff_image_path):
    """Read one TIFF and return intensity percentiles plus metadata parsed from its path.

    The file name is expected to look like ``r01c01f01p01-ch1sk1fk1fl1.tiff``:
    the part before the dash carries row (``r``), column (``c``) and site
    (``f``) codes, the part after it the channel code (``ch``). The plate
    name is the text before ``"__"`` in the directory two levels above the
    file.

    Args:
        tiff_image_path: '/'-separated path of the image file.

    Returns:
        Dict with the keys ``img_path``, ``plate``, ``median``, ``perc_90``,
        ``perc_95``, ``perc_99``, ``site``, ``channel`` and ``well``.

    Side effects:
        Prints the image shape.
    """
    img = imread(tiff_image_path)
    print(img.shape)
    # 50th/90th/95th/99th percentiles over all pixels of the image.
    p50, p90, p95, p99 = np.percentile(img, q=np.array([50,90,95,99]))

    path_parts = tiff_image_path.split("/")
    name_fields = path_parts[-1].split('-')
    position_field = name_fields[0]

    site = re.search(r"(?<=f)(\d{2})(?=p)", position_field)[0]
    channel_code = re.search(r"(?<=ch)(\d+)(?=sk)", name_fields[1])[0]
    channel = channel_dict_rev[channel_code]
    row_code = re.search(r'(?<=r)(\d{2})(?=c)', position_field)[0]
    well_letter = letter_dict_rev[row_code]
    well_num = re.search(r'(?<=c)(\d{2})(?=f)', position_field)[0]

    record = {
        "img_path": tiff_image_path,
        "plate": path_parts[-3].split("__")[0],
        "median": p50,
        "perc_90": p90,
        "perc_95": p95,
        "perc_99": p99,
        "site": site,
        "channel": channel,
        "well": f"{well_letter}{well_num}",
    }
    return record

# Try the helper on the single example image; the notebook displays the returned dict.
process_tiff_img(tiff_image_path)

# NOTE: everything below is a disabled draft that would run process_tiff_img
# over the first 100 TIFFs found under the input directory in parallel and
# collect the records into a DataFrame. It is kept for reference only.
# find all TIFFs
# tiff_img_dir = "../inputs/images/"
# paths = glob.glob(f"{tiff_img_dir}/*/images/*/Images/*.tiff", recursive=True)[:100]
# records = []
# # process in parallel with a progress bar
# with ProcessPoolExecutor(max_workers=384) as exe:
#     futures = {exe.submit(process_tiff_img, p): p for p in paths}
#     for fut in tqdm(as_completed(futures), total=len(futures), desc="Processing TIFFs"):
#         rec = fut.result()
#         records.append(rec)

# # build DataFrame and write Parquet
# df = pl.DataFrame(records)

(1080, 1080)


{'img_path': '../inputs/images/2025_01_27_Batch_13/images/2025_01_27_B13A7A8P1_T1__2025_01_27T08_46_50_Measurement_1/Images/r01c01f01p01-ch1sk1fk1fl1.tiff',
 'plate': '2025_01_27_B13A7A8P1_T1',
 'median': 188.0,
 'perc_90': 223.0,
 'perc_95': 235.0,
 'perc_99': 401.0,
 'site': '01',
 'channel': 'DAPI',
 'well': 'A01'}

In [None]:
# Scratch check on the example image: the same per-image summary that
# summarize_tiff_img computes, done step by step at notebook level.
img = imread(tiff_image_path)
arr = img.reshape(-1)
n = arr.size
# float64 accumulators so the totals cannot overflow the image dtype
s = np.sum(arr, dtype=np.float64)
ss = np.square(arr.astype(np.float64)).sum()
# integer images -> one histogram bin per intensity value;
# minlength pads the histogram to the full uint16 range (0-65535)
hist = np.bincount(arr, minlength=65536)