# Explore single-cell filters and visualize cropped cells

Examine filters prior to curve fitting.

In [2]:
# Imports
import polars as pl
import os

# Paths (relative to the directory this notebook is run from)
# Root of the VarChAMP Cell Painting Gallery inputs; batch sub-folders are joined onto it later.
VARCHAMP_CP_DIR = "../../0_data_prep/inputs/varchamp_cellpainting_gallery"
# Output folder of zarr images. Plain string: there is nothing to interpolate, so no f-prefix.
ZARR_IMG_DIR = "../../0_data_prep/outputs/zarr_images"

In [4]:
# Batch under study; every batch-specific path below is derived from it.
BATCH_ID = "2024_01_23_Batch_7"

# Despite the "_DIR" suffix this is the full path of the batch's profiles.parquet file.
# The literal is split across lines for readability only; the resulting string is unchanged.
VARCHAMP_PROF_DIR = (
    "/home/shenrunx/igvf/varchamp/2021_09_01_VarChAMP"
    "/8.2_updated_snakemake_pipeline/outputs/batch_profiles"
    f"/{BATCH_ID}/profiles.parquet"
)

# Batch sub-folder inside the Cell Painting Gallery input directory.
IMAGECSV_DIR = "/".join([VARCHAMP_CP_DIR, BATCH_ID])

## Filter Cells

In [3]:
# Get metadata
# Lazily scan the batch profiles and materialise only the columns used by the filters below:
# well/plate/allele identifiers, nucleus and cell areas, nucleus centre coordinates and
# GFP intensity measurements.
# NOTE: the parquet path is read from VARCHAMP_PROF_DIR; the previously used name `prof_path`
# is not defined anywhere in this notebook and raised NameError on a fresh kernel.
profiles = (
    pl.scan_parquet(VARCHAMP_PROF_DIR)
    .select(
        [
            "Metadata_well_position",
            "Metadata_plate_map_name",
            "Metadata_ImageNumber",
            "Metadata_ObjectNumber",
            "Metadata_symbol",
            "Metadata_gene_allele",
            "Metadata_control_type",
            "Metadata_Plate",
            "Nuclei_AreaShape_Area",
            "Cells_AreaShape_Area",
            "Nuclei_AreaShape_Center_X",
            "Nuclei_AreaShape_Center_Y",
            "Cells_Intensity_MedianIntensity_GFP",
            "Cells_Intensity_IntegratedIntensity_GFP",
        ]
    )
    .collect()
)
profiles.shape

(1866461, 14)

In [4]:
# Filter on the nucleus-to-cell area ratio
# Two derived columns are added first:
#   Nucleus_Cell_Area - nucleus area divided by cell area
#   Metadata_CellID   - plate, well, image number and object number joined with "_"
area_ratio = (
    pl.col("Nuclei_AreaShape_Area") / pl.col("Cells_AreaShape_Area")
).alias("Nucleus_Cell_Area")
cell_id = pl.concat_str(
    [
        "Metadata_Plate",
        "Metadata_well_position",
        "Metadata_ImageNumber",
        "Metadata_ObjectNumber",
    ],
    separator="_",
).alias("Metadata_CellID")
profiles = profiles.with_columns(area_ratio, cell_id)

# Keep cells whose ratio lies strictly between 0.15 and 0.3.
ratio_in_range = (pl.col("Nucleus_Cell_Area") > 0.15) & (pl.col("Nucleus_Cell_Area") < 0.3)
profiles = profiles.filter(ratio_in_range)
profiles.shape

(1256972, 16)

In [5]:
# Drop cells too close to the image edge
# A cell is kept only when its nucleus centre is strictly between 50 and 1030 on both axes.
nucleus_x = pl.col("Nuclei_AreaShape_Center_X")
nucleus_y = pl.col("Nuclei_AreaShape_Center_Y")
inside_x = (nucleus_x > 50) & (nucleus_x < 1030)
inside_y = (nucleus_y > 50) & (nucleus_y < 1030)
profiles = profiles.filter(inside_x & inside_y)
profiles.shape

(1122944, 16)

In [6]:
# Per-well GFP intensity median, MAD and outlier thresholds
# Statistics are grouped by plate and well position (one group per well).
well_keys = ["Metadata_Plate", "Metadata_well_position"]
cell_gfp = pl.col("Cells_Intensity_MedianIntensity_GFP")

# Median of the per-cell GFP value within each well, attached back onto every cell,
# followed by each cell's absolute deviation from its well median.
medians = profiles.group_by(well_keys).agg(cell_gfp.median().alias("WellIntensityMedian"))
profiles = profiles.join(medians, on=well_keys).with_columns(
    (cell_gfp - pl.col("WellIntensityMedian")).abs().alias("Abs_dev")
)

# MAD = median of the absolute deviations within each well.
mad = profiles.group_by(well_keys).agg(pl.col("Abs_dev").median().alias("Intensity_MAD"))

# Thresholds sit 5 MADs above and below the well median.
well_median = pl.col("WellIntensityMedian")
mad_band = 5 * pl.col("Intensity_MAD")
profiles = profiles.join(mad, on=well_keys).with_columns(
    (well_median + mad_band).alias("Intensity_upper_threshold"),
    (well_median - mad_band).alias("Intensity_lower_threshold"),
)

In [7]:
# Filter by intensity MAD
# Keep cells whose GFP value lies within [lower, upper] of their well's thresholds;
# both bounds are inclusive.
within_band = pl.col("Cells_Intensity_MedianIntensity_GFP").is_between(
    pl.col("Intensity_lower_threshold"),
    pl.col("Intensity_upper_threshold"),
    closed="both",
)
profiles = profiles.filter(within_band)
profiles.shape

(1037703, 21)

In [8]:
# Remove every cell from one plate map
# (noted by the notebook author as allele set 5, which has mismatched metadata).
excluded_plate_map = "B7A2R1_P1"
not_excluded = pl.col("Metadata_plate_map_name") != excluded_plate_map
profiles = profiles.filter(not_excluded)
profiles.shape

(899375, 21)

In [9]:
# Keep only alleles that still have at least 250 cells after the filters above
# NOTE(review): recent polars releases appear to deprecate GroupBy.count() in favour of
# len(), which names its output column "len" - confirm against the installed version
# before switching.
cells_per_allele = profiles.group_by("Metadata_gene_allele").count()
well_sampled = cells_per_allele.filter(pl.col("count") >= 250)
keep_alleles = well_sampled.get_column("Metadata_gene_allele").to_list()

profiles = profiles.filter(pl.col("Metadata_gene_allele").is_in(keep_alleles))
profiles.shape

(869914, 21)