# Protein abundance change

This notebook analyses protein abundance changes between variant and reference alleles, using GFP intensity features from Cell Painting assays.

In [24]:
### imports
import os
import polars as pl
import numpy as np
from tqdm import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Relative paths into the upstream snakemake pipeline.
# PLATEMAP_DIR keeps a literal "{batch_id}" placeholder to be filled with str.format;
# NOTE(review): it is not referenced in the cells visible here — confirm it is still needed.
PLATEMAP_DIR = "../../../8.2_updated_snakemake_pipeline/inputs/metadata/platemaps/{batch_id}/platemap"
# Root folder of the per-batch single-cell profile parquet files.
PROF_DIR = "../../../8.2_updated_snakemake_pipeline/outputs/batch_profiles"
# Root folder of the per-batch classifier metrics (metrics.csv).
CLASS_DIR = "../../../8.2_updated_snakemake_pipeline/outputs/classification_analyses"

# Exclusive upper bound on a classifier's "Training_imbalance" for it to be kept as balanced.
TRN_IMBAL_THRES = 3
# NOTE(review): not referenced in the cells visible here — presumably a minimum
# class/replicate count used elsewhere; confirm or remove.
MIN_CLASS_NUM = 2

## Disable truncation globally
# pl.Config.set_tbl_rows(20)  # Show all rows
# pl.Config.set_tbl_cols(40)  # Show all columns

In [25]:
# Biological-replicate batches analysed together, and the folder name used for
# their combined outputs.
BIO_REP_BATCHES = ["2024_01_23_Batch_7", "2024_02_06_Batch_8"]
COMBINED_BIO_REP_DIR = "2024_02_Batch_7-8"

# Alternative batch sets: uncomment exactly one BIO_REP_BATCHES /
# COMBINED_BIO_REP_DIR pair. OUTPUT_DIR is always derived below, so the
# alternatives must set COMBINED_BIO_REP_DIR rather than OUTPUT_DIR.
# BIO_REP_BATCHES = ["2025_03_17_Batch_15", "2025_03_17_Batch_16"]
# COMBINED_BIO_REP_DIR = "2025_03_Batch_15-16"

# BIO_REP_BATCHES = ["2025_01_27_Batch_13", "2025_01_28_Batch_14"]
# COMBINED_BIO_REP_DIR = "2025_01_Batch_13-14"

OUTPUT_DIR = f"../../outputs/{COMBINED_BIO_REP_DIR}"
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(OUTPUT_DIR, exist_ok=True)

## 1. Get the comparable REF-VAR pairs

Only reference–variant pairs with comparable cell counts are kept: a classifier is retained when its training imbalance is below `TRN_IMBAL_THRES` (currently 3).

In [26]:
# Template: <CLASS_DIR>/<batch>/<profile name>; each such folder holds a metrics.csv
metrics_dir = "{}/{}/profiles_tcdropped_filtered_var_mad_outlier_featselect_filtcells"

# Read every batch's classifier metrics and stack them in a single concat.
metrics_df = pl.concat(
    [
        pl.read_csv(f"{metrics_dir.format(CLASS_DIR, batch)}/metrics.csv")
        for batch in BIO_REP_BATCHES
    ]
)
# metrics_summary.csv is not loaded here, so the summary frame stays empty.
metrics_wtvar = pl.DataFrame()

## keep the individual classifiers that pass the training imbalance threshold
is_not_control = ~pl.col("Metadata_Control")
is_balanced = pl.col("Training_imbalance") < TRN_IMBAL_THRES
# original author's note on this predicate: protein_localization detection
id_contains_true = pl.col("Full_Classifier_ID").str.contains("true")

balanced_classifiers = metrics_df.filter(is_not_control & is_balanced & id_contains_true)

# Every allele that appears on either side of a retained classifier, de-duplicated and sorted.
allele_pairs = balanced_classifiers.select(pl.col("allele_0","allele_1")).unique().to_numpy()
balanced_class_alleles = np.unique(allele_pairs.ravel())
len(balanced_class_alleles)

681

## 2. CellProfiler Features

### Get the CP features for cells that passed the QC

In [27]:
# Template: <PROF_DIR>/<batch>/<QC-filtered profile parquet>
pass_qc_prof_dir = "{}/{}/profiles_tcdropped_filtered_var_mad_outlier_featselect_filtcells.parquet"

# Per-cell identifier: plate, well, image number and object number joined by "_".
cell_id_expr = pl.concat_str(
    ["Metadata_Plate", "Metadata_Well", "Metadata_ImageNumber", "Metadata_ObjectNumber"],
    separator="_",
).alias("Metadata_CellID")

# Only the metadata needed to map each QC-passing cell to its allele, well and plate.
cell_meta_cols = ["Metadata_CellID", "Metadata_gene_allele", "Metadata_Well", "Metadata_Plate"]

# Scan each batch lazily, materialise just the metadata columns, then stack all batches.
cell_alleles = pl.concat(
    [
        pl.scan_parquet(pass_qc_prof_dir.format(PROF_DIR, batch_id))
        .with_columns(cell_id_expr)
        .select(cell_meta_cols)
        .collect()
        for batch_id in BIO_REP_BATCHES
    ]
)

### Get the GFP Cells_Intensity CP features for all cells

Left-join the GFP Cells_Intensity features onto the pass-QC cells by `Metadata_CellID`

In [28]:
# Same per-cell identifier as used for the QC-passing cells, so the two tables can be joined.
cell_id_parts = [
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_ImageNumber",
    "Metadata_ObjectNumber",
]

per_batch_gfp = []
for batch_id in BIO_REP_BATCHES:
    # Lazily scan the batch profiles and attach the cell identifier
    batch_gfp_prof = pl.scan_parquet(f"{PROF_DIR}/{batch_id}/profiles.parquet").with_columns(
        pl.concat_str(cell_id_parts, separator="_").alias("Metadata_CellID")
    )
    # Keep the cell ID plus every column whose name contains both "Cells_Intensity" and "GFP"
    gfp_int = ["Metadata_CellID"] + [
        name
        for name in batch_gfp_prof.collect_schema().names()
        if "Cells_Intensity" in name and "GFP" in name
    ]
    per_batch_gfp.append(batch_gfp_prof.select(gfp_int).collect())

combined_gfp_profiles = pl.concat(per_batch_gfp)

# Left join: every QC-passing cell is kept, with its GFP intensity features where available.
profiles = cell_alleles.join(combined_gfp_profiles, on="Metadata_CellID", how="left")

### Aggregate the cells to well profiles

In [29]:
# One profile per (plate, well, allele): the median over cells of every non-metadata feature.
well_keys = ["Metadata_Plate", "Metadata_Well", "Metadata_gene_allele"]
feature_medians = [
    pl.col(name).median()
    for name in profiles.columns
    if not name.startswith("Metadata_")
]

well_profiles = (
    profiles.group_by(well_keys)
    .agg(feature_medians)
    # keep only alleles that appear in a balanced classifier
    .filter(pl.col("Metadata_gene_allele").is_in(balanced_class_alleles))
    .unique()
)

well_profiles

Metadata_Plate,Metadata_Well,Metadata_gene_allele,Cells_Intensity_IntegratedIntensityEdge_GFP,Cells_Intensity_IntegratedIntensity_GFP,Cells_Intensity_LowerQuartileIntensity_GFP,Cells_Intensity_MADIntensity_GFP,Cells_Intensity_MassDisplacement_GFP,Cells_Intensity_MaxIntensityEdge_GFP,Cells_Intensity_MaxIntensity_GFP,Cells_Intensity_MeanIntensityEdge_GFP,Cells_Intensity_MeanIntensity_GFP,Cells_Intensity_MedianIntensity_GFP,Cells_Intensity_MinIntensityEdge_GFP,Cells_Intensity_MinIntensity_GFP,Cells_Intensity_StdIntensityEdge_GFP,Cells_Intensity_StdIntensity_GFP,Cells_Intensity_UpperQuartileIntensity_GFP
str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""2024_02_02_B8A1R2_P3T1""","""J13""","""PLA2G6_Arg600Gln""",1.010232,40.541913,0.003577,0.001655,2.797511,0.0046,0.154887,0.002229,0.006523,0.00517,0.001542,0.001542,0.000482,0.005692,0.006939
"""2024_01_23_B7A1R1_P4T2""","""E01""","""SH3BP2""",1.410219,42.471292,0.003372,0.001052,3.255952,0.013581,0.041271,0.003379,0.005038,0.004481,0.00166,0.001638,0.001657,0.002698,0.006069
"""2024_01_19_B7A1R1_P4T1""","""G14""","""TH_Pro218Leu""",1.043115,46.929632,0.004008,0.002468,2.220636,0.00772,0.018587,0.002728,0.00717,0.00673,0.001637,0.001615,0.00095,0.003339,0.009572
"""2024_01_19_B7A1R1_P3T4""","""D08""","""PSAP""",1.086332,28.940239,0.002446,0.000824,4.004424,0.009884,0.037701,0.002463,0.004384,0.003167,0.001594,0.001577,0.000988,0.003439,0.00461
"""2024_01_17_B7A1R1_P1T3""","""L05""","""COMP_Thr585Arg""",1.285733,88.390073,0.00363,0.003904,5.496468,0.014109,0.064159,0.003054,0.012199,0.007014,0.00157,0.001532,0.00217,0.010655,0.015623
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2024_01_17_B7A1R1_P2T2""","""H06""","""KRT6A_Asn171Asp""",1.326435,45.564932,0.003247,0.001415,4.791578,0.01652,0.074705,0.003076,0.005899,0.004561,0.001589,0.001577,0.002053,0.005386,0.006331
"""2024_01_22_B7A1R1_P4T4""","""F11""","""TTPA_Gly246Arg""",1.263591,36.312624,0.003625,0.001316,2.667939,0.007165,0.012551,0.002774,0.005081,0.004993,0.001682,0.00167,0.001041,0.001779,0.006287
"""2024_02_02_B8A1R2_P3T4""","""D13""","""PLA2G6""",0.960816,45.364024,0.0017,0.000116,4.517059,0.002095,0.024519,0.001708,0.003354,0.001795,0.001369,0.001336,0.000116,0.004367,0.001955
"""2024_02_01_B8A1R2_P2T3""","""P21""","""KLHL3_Cys164Phe""",5.01224,290.981525,0.014256,0.018798,5.30746,0.079724,0.124565,0.009487,0.03731,0.035263,0.001759,0.001751,0.012433,0.02456,0.054699


## 3. Calculate abundance hits

Use a paired t-test (reference and variant wells paired by plate) to call abundance hits

In [32]:
from scipy.stats import shapiro
import re
from scipy.stats import wilcoxon
from scipy.stats import ttest_rel
import pandas as pd


# Convert a well label such as "B03" into (label, row number, column number)
def well_to_coordinates(well):
    """Split a well label into 1-based row and column numbers.

    Parameters
    ----------
    well : str
        Label starting with a row letter A-P followed by a two-digit column,
        e.g. "B03". Anything after those three characters is ignored.

    Returns
    -------
    tuple
        (well, row_index, col_index) with 'A' -> 1, ..., 'P' -> 16 and the
        column parsed as an integer.

    Raises
    ------
    ValueError
        If the label does not start with a row letter A-P and two digits.
    """
    match = re.match(r"([A-P])(\d{2})", well)
    if match is None:
        # Fail with a readable message instead of AttributeError on None.groups()
        raise ValueError(
            f"Unrecognised well label {well!r}: expected a row letter A-P "
            "followed by a two-digit column, e.g. 'B03'"
        )
    row_letter, col_number = match.groups()
    row_index = ord(row_letter) - ord("A") + 1  # Convert 'A'->1, 'B'->2, ..., 'P'->16
    col_index = int(col_number)  # Convert string column to integer
    return well, row_index, col_index


# Distance of a well from the nearest plate edge (larger = more central)
def compute_distance(row, col, n_rows=16, n_cols=24):
    """Return how many wells separate (row, col) from the nearest plate edge.

    Parameters
    ----------
    row, col : int
        1-based row and column numbers of the well.
    n_rows, n_cols : int, optional
        Plate dimensions; the defaults (16 x 24) keep the original behaviour.

    Returns
    -------
    int
        0 for a well on the outer ring, 1 for the next ring in, and so on.
    """
    return min(row - 1, n_rows - row, col - 1, n_cols - col)  # Distance from nearest edge


## Abundance test: plate-paired t-test on one per-well GFP intensity feature
def paired_ttest(dat, reference: str, var: str, intensity_type: str="Cells_Intensity_IntegratedIntensity_GFP", min_plates: int=3):
    """Compare a variant with its reference allele using a plate-paired t-test.

    Parameters
    ----------
    dat : pandas.DataFrame
        Long-format table with the columns "Metadata_Plate",
        "Metadata_gene_allele" and `intensity_type`. It must hold at most one
        row per (plate, allele); pandas' pivot raises ValueError otherwise.
    reference : str
        Value of "Metadata_gene_allele" for the reference allele.
    var : str
        Value of "Metadata_gene_allele" for the variant allele.
    intensity_type : str
        Feature column to test (default: integrated GFP intensity of the cell).
    min_plates : int
        Minimum number of plates carrying both alleles for the t-test to be
        run; the default of 3 is the original hard-coded threshold.

    Returns
    -------
    polars.DataFrame
        One row with Float64 columns "t_stat", "p_val" and "cohen_d", followed
        by the string columns "Gene" (= reference) and "Variant" (= var).
        A statistic that cannot be computed is null.

    Notes
    -----
    The two statistics keep the sign conventions of the original code, which
    are opposite to each other: `t_stat` is reference minus variant (argument
    order of `ttest_rel`), whereas `cohen_d` is variant minus reference.
    Cohen's d is scaled by the pooled standard deviation of the two groups,
    not by the standard deviation of the paired differences.
    """
    # pivot to wide: one row per plate, one column per allele
    wide_gfp = dat.pivot(index="Metadata_Plate",
                        columns="Metadata_gene_allele",
                        values=intensity_type)
    # An allele without any row in `dat` has no column after the pivot; reindex
    # adds it as all-NaN so the dropna below empties the table instead of
    # raising KeyError.
    wide_gfp = wide_gfp.reindex(columns=[reference, var])
    # drop any plate that doesn't have both measurements
    wide_gfp = wide_gfp.dropna(subset=[reference, var])

    ref_vals = wide_gfp[reference].astype(float)
    var_vals = wide_gfp[var].astype(float)
    n_plates = len(wide_gfp)

    t_stat, p_val = None, None
    if n_plates >= min_plates:
        # paired by plate: both Series share the pivot's plate index
        t_res, p_res = ttest_rel(ref_vals, var_vals)
        t_stat, p_val = float(t_res), float(p_res)

    # Cohen's d needs a sample SD (>= 2 plates) and a non-zero pooled SD;
    # otherwise it is left null rather than NaN/inf.
    cohen_d = None
    if n_plates >= 2:
        pooled_std = np.sqrt((var_vals.var(ddof=1) + ref_vals.var(ddof=1)) / 2)
        if pooled_std > 0:
            cohen_d = float((var_vals.mean() - ref_vals.mean()) / pooled_std)

    # Explicit Float64 schema: missing statistics become typed nulls, so the
    # one-row results of different allele pairs always share the same schema.
    summary_df = pl.DataFrame(
        {
            "t_stat": [t_stat],
            "p_val": [p_val],
            "cohen_d": [cohen_d],
        },
        schema={"t_stat": pl.Float64, "p_val": pl.Float64, "cohen_d": pl.Float64},
    )
    summary_df = summary_df.with_columns(
        pl.lit(reference).alias("Gene"), pl.lit(var).alias("Variant")
    )
    return summary_df

In [33]:
# One paired-t-test summary per (variant, reference well, variant well) combination.
pair_results = []
alleles = well_profiles["Metadata_gene_allele"].unique(maintain_order=True).to_list()
for allele in tqdm(alleles):
    # Reference alleles carry no "_<change>" suffix; only variants are tested.
    if allele is None:
        continue
    reference = allele.split("_")[0]
    if reference == allele:
        continue

    temp_prof = well_profiles.filter(
        pl.col("Metadata_gene_allele").is_in([allele, reference])
    ).to_pandas()

    is_variant = temp_prof["Metadata_gene_allele"] == allele
    var_profiles = temp_prof[is_variant]
    # Reference wells only count on plates that also carry the variant.
    ref_profiles = temp_prof[
        (~is_variant)
        & temp_prof["Metadata_Plate"].isin(var_profiles["Metadata_Plate"].unique())
    ]
    if ref_profiles.empty:
        # no reference measured alongside this variant
        continue

    ## For each ref-var well pair, run the paired t-test across plates
    for ref_well in ref_profiles["Metadata_Well"].unique():
        for var_well in var_profiles["Metadata_Well"].unique():
            # Select each allele by its own well, so that a well position
            # holding the other allele on a different plate cannot leak into
            # the pair.
            df_sampled = pd.concat([
                var_profiles[var_profiles["Metadata_Well"] == var_well],
                ref_profiles[ref_profiles["Metadata_Well"] == ref_well],
            ]).dropna()
            if df_sampled["Metadata_gene_allele"].nunique() < 2:
                # one allele lost all its rows to missing values: nothing to pair
                continue

            pair_results.append(
                paired_ttest(
                    dat=df_sampled,
                    reference=reference,
                    var=allele
                ).with_columns(
                    pl.lit(ref_well).alias("Ref_well"),
                    pl.lit(var_well).alias("Var_well")
                )
            )

# "vertical_relaxed" coerces mismatched column dtypes to their common
# supertype, so a result whose statistics are all null still stacks with the
# Float64 results.
well_abun_stats = (
    pl.concat(pair_results, how="vertical_relaxed")
    .rename({"t_stat": "U2OS_t"})
    .sort(["Gene", "Variant", "U2OS_t", "p_val", "cohen_d"])
)
well_abun_stats

100%|██████████| 681/681 [00:03<00:00, 226.67it/s]


U2OS_t,p_val,cohen_d,Gene,Variant,Ref_well,Var_well
f64,f64,f64,str,str,str,str
-1.536429,0.168314,0.694536,"""ACACB""","""ACACB_Val1611Met""","""B08""","""F08"""
-0.243523,0.814583,0.08098,"""ACACB""","""ACACB_Val958Met""","""B08""","""D08"""
10.01713,0.00017,-2.347347,"""ACSF3""","""ACSF3_Ala197Thr""","""A01""","""C01"""
2.948178,0.021462,-0.875841,"""ACSF3""","""ACSF3_Arg10Trp""","""A01""","""C03"""
8.171615,0.00008,-2.546632,"""ACSF3""","""ACSF3_Arg471Trp""","""A01""","""G03"""
…,…,…,…,…,…,…
1.191583,0.272253,-0.57157,"""WRAP53""","""WRAP53_Arg68Gly""","""J17""","""N17"""
7.98476,0.000092,-1.878124,"""XPA""","""XPA_Cys108Phe""","""P17""","""B19"""
1.345042,0.220555,-0.57783,"""XRCC2""","""XRCC2_Phe270Val""","""D19""","""H19"""
-4.531025,0.002696,1.203445,"""ZMYND10""","""ZMYND10_Arg243Cys""","""L19""","""P19"""


In [34]:
# Save the per-well-pair statistics next to the other outputs of this batch set;
# OUTPUT_DIR is "../../outputs/<COMBINED_BIO_REP_DIR>", so the path is unchanged.
well_abun_stats.write_csv(f"{OUTPUT_DIR}/well-level_prot-abundance_changes.csv")