# Compute abundance change for each allele

This script takes in the single-cell FACS data as input, applies various filtering criteria, and then produces a z-score for the abundance of each allele relative to the reference on the same plate.

In [1]:
import pandas as pd
import numpy as np
from random import choice
import matplotlib.pyplot as plt
import polars as pl
import process_dualipa as proc

In [2]:
# Directory layout, relative to the notebook's working directory.
dualipa_inputs = "../1_inputs"  # not referenced in the cells visible in this part of the notebook
dualipa_outputs = "../3_outputs"  # facs_single_cell.parquet is read from here; the z-score CSV is written here
meta_outputs = "../../../1_allele_collection/3_outputs"  # location of the allele annotation TSV

# Rows are kept only when their `n_cells` value is strictly greater than this.
n_cell_threshold = 800
# NOTE(review): not used in the visible cells; presumably a minimum WT GFP signal - confirm where it is applied.
wt_gfp_threshold = 100

In [3]:
# Load the single-cell FACS measurements and, in the same step, keep only the
# rows whose `n_cells` value is strictly greater than `n_cell_threshold`.
pDEST_DUAL_df = pd.read_parquet(
    f"{dualipa_outputs}/facs_single_cell.parquet"
).loc[lambda cells: cells["n_cells"] > n_cell_threshold]

In [4]:
# Build one table of mean-based and one of median-based summaries by selecting
# the annotation columns plus the relevant statistics and dropping repeated rows.
# NOTE(review): this yields one row per well only if the avg_*/median_* columns
# are constant within a well in the single-cell table - confirm upstream.
keep_cols = [
    'symbol', 'node_type', 'nt_change', 'aa_change', 'pla', 'well',
    'coordinates', 'n_cells', 'orf_id', 'mut_id', 'valid_well',
]
pDEST_DUAL_avg_df = pDEST_DUAL_df.loc[
    :, [*keep_cols, 'avg_gfp', 'avg_mcherry', 'avg_GFP_mCherry_ratio']
].drop_duplicates()
pDEST_DUAL_median_df = pDEST_DUAL_df.loc[
    :, [*keep_cols, 'median_gfp', 'median_mcherry', 'median_GFP_mCherry_ratio']
].drop_duplicates()

## 1. Compute the mean and median scores

__The functions used for these computations are in ```process_dualipa.py```__

Maxime's notes:

    Instead of using Georges' approach to estimate the assay's variability, which relies on a step with random pairings,
    Luke suggested computing a standard deviation (STD) from the Log2FC of all individual WT measurements, relative to the mean of the WT of each gene.
    This is a more robust approach because it does not depend on random pairings.
    This function computes the STD of the Log2FC of all individual WT measurements, relative to the mean of the WT of each gene.
    It returns a tuple with the mean and the STD of the Log2FC of all individual WT measurements, relative to the mean of the WT of each gene.

In [5]:
# Summarise the WT wells twice: once from the mean-based table and once from the
# median-based table. Both helpers come from process_dualipa.py (imported as `proc`).
# _d objects: mean or median GFP_mCherry ratio of each unique (wt orf, plate) combination
# _l objects: list of WT:WT abundances (GFP:mCherry ratios) to estimate assay variability
wt_avg_d, wt_ratio_l = proc.get_wt_variability_d(pDEST_DUAL_avg_df, pDEST_DUAL_df)
wt_median_d, wt_ratio_l_median = proc.get_wt_variability_d_median(pDEST_DUAL_median_df, pDEST_DUAL_df)

In [6]:
# Assay variability estimated from the mean-based table; both values are passed
# on to proc.get_pDEST_DUAL_avg_allele_df in a later cell.
# NOTE(review): the notes in this notebook describe the helper as returning
# (mean, STD), but the result is unpacked here as (std, mean) - confirm the
# order against proc.wt_log2fc_variability before trusting the z-scores.
wt_std, wt_mean = proc.wt_log2fc_variability(pDEST_DUAL_avg_df, pDEST_DUAL_df)

In [7]:
# Build the allele-level score tables (mean-based and median-based) with the
# helpers from process_dualipa.py.
# Must be two WT replicates on same plate to compute the ratio, otherwise we return a NaN
# NOTE(review): only the mean-based helper receives wt_std / wt_mean, yet a later
# cell filters the median table on a `zscore_log2` column - confirm how the
# median helper derives it.
pDEST_DUAL_avg_allele_df = proc.get_pDEST_DUAL_avg_allele_df(pDEST_DUAL_avg_df, wt_avg_d, wt_ratio_l, wt_std, wt_mean)
pDEST_DUAL_median_allele_df = proc.get_pDEST_DUAL_median_allele_df(pDEST_DUAL_median_df, wt_median_d, wt_ratio_l_median)

In [32]:
# Hand the allele-level tables over to polars for the joins below.
mean_df = pl.DataFrame(pDEST_DUAL_avg_allele_df)
median_df = pl.DataFrame(pDEST_DUAL_median_allele_df)

# Load the allele annotation table (tab-separated). The large
# infer_schema_length makes polars scan up to 1,000,000 rows when inferring
# column types. The join keys are then cast to Int64.
metadata = pl.read_csv(
    f"{meta_outputs}/varchamp_alleles_with_pillar_annotations.tsv",
    separator="\t",
    infer_schema_length=1000000,
)
metadata = metadata.with_columns(pl.col("orf_id", "mut_id").cast(pl.Int64))

## 1.1 Merge the same allele together

In [33]:
# Attach allele annotations to the median-based scores:
#   1. cast the join keys to Int64,
#   2. keep only valid wells that have a non-null log2 z-score,
#   3. left-join the metadata columns that the score table does not already have.
# NOTE(review): assumes NaN z-scores from pandas arrive in polars as nulls;
# otherwise `is_not_null()` will not drop them - confirm.
median_meta_df = (
    median_df
    .with_columns(pl.col("orf_id", "mut_id").cast(pl.Int64))
    .filter(pl.col("valid_well") & pl.col("zscore_log2").is_not_null())
    .join(
        metadata.select(
            ["orf_id", "mut_id"]
            + [name for name in metadata.columns if name not in median_df.columns]
        ),
        on=["orf_id", "mut_id"],
        how="left",
    )
)

In [34]:
# Collapse alleles that occur in more than one row (same `gene_variant`) into a
# single row, then stack them back with the alleles that occur only once.
#
# Changes relative to the previous version of this cell:
#   * `maintain_order=True` on group_by/unique makes the row order reproducible.
#   * The metadata is de-duplicated with keep="first" BEFORE the join, so the
#     annotations kept for a merged allele always come from its first row in
#     `median_meta_df` rather than from an arbitrary one, and the join no longer
#     multiplies rows only to drop them again.
allele_keys = ["symbol", "nt_change", "aa_change", "gene_variant"]

# Score columns are selected by substring match on the column name.
# NOTE(review): "zcat_median" also matches "median", so the category column is
# averaged too and can take fractional values (e.g. -0.5) - confirm this is intended.
score_cols = [col for col in median_df.columns if "median" in col or "zscore_log2" in col]

# Split rows by whether their gene_variant appears more than once.
is_dup_allele = pl.col("gene_variant").is_duplicated()
median_df_dup = median_meta_df.filter(is_dup_allele)
median_df_no_dup = median_meta_df.filter(~is_dup_allele)

# Average the scores of the repeated alleles (one output row per allele).
median_df_dup = median_df_dup.group_by(allele_keys, maintain_order=True).agg(
    [pl.col(col).mean() for col in score_cols]
)

# Annotation columns: everything that came from the metadata join, i.e. columns
# that are neither keys/scores (already in the aggregate) nor per-row columns
# of the original score table.
meta_cols = [
    col
    for col in median_meta_df.columns
    if col not in median_df_dup.columns and col not in median_df.columns
]
allele_meta = median_meta_df.select(allele_keys + meta_cols).unique(
    subset=allele_keys, keep="first", maintain_order=True
)
median_df_dup = median_df_dup.join(allele_meta, on=allele_keys, how="left")

# Align the single-occurrence rows to the merged schema before stacking;
# "vertical_relaxed" lets polars reconcile differing column dtypes.
median_final_df = pl.concat(
    [median_df_no_dup.select(median_df_dup.columns), median_df_dup],
    how="vertical_relaxed",
)

In [35]:
# Display the merged per-allele table as the cell output for a visual check.
median_final_df

symbol,nt_change,aa_change,gene_variant,median_gfp,median_mcherry,median_GFP_mCherry_ratio,wt_GFP_mCherry_ratio_median,allele_wt_ratio_median,zscore_median,zcat_median,zscore_log2,ensembl_gene_id,ccsb_mutation_id,ccsb_allele_id,spdi,collection,entry_plate_orig,entry_well_orig,entry_plate_conso,entry_well_conso,entry_seq_pool,db_plate,db_well,n2h_plate,n2h_well,dualip_plate,dualip_well,mislocalization_plate,mislocalization_well,entry_sequenced,entry_sequence_confirmation_class,db_sequenced,db_sequence_confirmation_class,n2h_sequenced,n2h_sequence_confirmation_class,n2h_ref_sequence_confirmation_class,…,Naive_prior_prob_PMID:31131967,Calcualted_post_prob_PMID:31131967,calculated_classification_PMID:31131967,SGR LR_PMID:34273903,Personal and Family History LR (Combined)_PMID:34273903,Breast Tumor Pathology LR_PMID:34273903,Population Allele Frequency LR_PMID:34273903,BS2 LR_PMID:34273903,Calculated_combined_LR_PMID:34273903,Naive_prior_prob_PMID:34273903,Calculated_post_prob_PMID:34273903,calculated_classification_PMID:34273903,Reference Group*_presumed_PMID:34273903,Component_presumed_PMID:34273903,Rationale_presumed_PMID:34273903,ClinVar Variation Id_ClinGen_repo,Allele Registry Id_ClinGen_repo,Disease_ClinGen_repo,Mondo Id_ClinGen_repo,Mode of Inheritance_ClinGen_repo,Assertion_ClinGen_repo,Applied Evidence Codes (Met)_ClinGen_repo,Applied Evidence Codes (Not Met)_ClinGen_repo,Summary of interpretation_ClinGen_repo,PubMed Articles_ClinGen_repo,Expert Panel_ClinGen_repo,Guideline_ClinGen_repo,Approval Date_ClinGen_repo,Published Date_ClinGen_repo,Retracted_ClinGen_repo,Evidence Repo Link_ClinGen_repo,Uuid_ClinGen_repo,Updated_Classification_ClinGen_repo,Updated_Evidence Codes_ClinGen_repo,gene_variant_pp,aa_change_pp,pillar_tested
str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,…,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,str,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,bool,str,str,str,str,str,str,bool
"""KCNJ2""","""899G>A""","""Gly300Asp""","""KCNJ2_Gly300Asp""",693.679993,5069.049805,0.133067,0.841771,0.15808,-1.835174,-1.0,-5.882805,"""ENSG00000123700""","""CCSBVarC008343""","""ALE00008329""","""NC_000017.11:70175938:G:A""","""Edgotyping3""","""VUSMutGDEh010""","""G06""","""GDEhDisVCh_40005""","""F10""","""1""","""VUSMutGDDh010""","""G06""","""VUSN2Hmut_GDN2h_03""","""H09""","""VUSMutpDEST2_16""","""G04""","""VUSmut_GDEh03""","""H09""",1,2,1,2,1,5,1,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""HBD""","""174C>A""","""Asn58Lys""","""HBD_Asn58Lys""",33.84,2474.280029,0.012192,0.077713,0.156882,-1.837623,-1.0,-5.907103,"""ENSG00000223609""","""CCSBVarC006560""","""ALE00006559""","""NC_000011.10:5234132:G:T""","""CEGS2""","""CegsMutGDEh1047""","""D02""","""GDEhDisVCh_40036""","""D03""","""2""","""CegsMutGDDh1047""","""D02""","""VUSN2Hmut_GDN2h_10""","""C04""","""VUSMutpDEST2_11""","""H05""","""VUSmut_GDEh10""","""C04""",1,1,1,1,1,5,1,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""SMAD3""","""376C>T""","""His126Tyr""","""SMAD3_His126Tyr""",993.600037,3757.339966,0.267095,1.033889,0.25834,-1.630252,-1.0,-4.313822,"""ENSG00000166949""","""CCSBVarC007714""","""ALE00007700""","""NC_000015.10:67165064:C:T""","""Edgotyping3""","""VUSMutGDEh004""","""D11""","""GDEhDisVCh_40014""","""D06""","""2""","""VUSMutGDDh004""","""D11""","""NULL""","""NULL""","""VUSMutpDEST2_09""","""B02""","""NULL""","""NULL""",1,1,1,1,0,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""HBD""","""110C>A""","""Pro37His""","""HBD_Pro37His""",31.959999,3535.25,0.008351,0.077713,0.107457,-1.938643,-1.0,-7.115858,"""ENSG00000223609""","""CCSBVarC006559""","""ALE00006558""","""NC_000011.10:5234196:G:T""","""CEGS2""","""CegsMutGDEh1046""","""E11""","""GDEhDisVCh_40036""","""C03""","""2""","""CegsMutGDDh1046""","""E11""","""VUSN2Hmut_GDN2h_10""","""F03""","""VUSMutpDEST2_11""","""G05""","""VUSmut_GDEh10""","""F03""",1,1,1,1,1,1,1,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""KRT2""","""558C>A""","""Asn186Lys""","""KRT2_Asn186Lys""",214.76001,2279.899902,0.095686,0.304336,0.314409,-1.515652,-1.0,-3.686394,"""ENSG00000172867""","""CCSBVarC002475""","""ALE00002475""","""NC_000012.12:52651585:G:T""","""RC4""","""RC4_Mut_GDEh1028""","""F01""","""GDEhDisVCh_40073""","""E11""","""1""","""RC4_Mut_GDDh1028""","""F01""","""NULL""","""NULL""","""VUSMutpDEST2_13""","""C06""","""NULL""","""NULL""",1,1,1,1,0,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""SMAD3""","""335C>T""","""Ala112Val""","""SMAD3_Ala112Val""",963.700012,2708.780029,0.357096,1.033889,0.345391,-1.452326,-1.0,-3.396248,"""ENSG00000166949""","""CCSBVarC004692""","""ALE00004692""","""NC_000015.10:67165023:C:T""","""CEGS2""","""CegsMutGDEh1035""","""D01""","""NULL""","""NULL""","""NULL""","""CegsMutGDDh1035""","""D01""","""NULL""","""NULL""","""VUSMutpDEST2_09""","""G08""","""NULL""","""NULL""",0,,1,1,0,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""GCK""","""131G>A""","""Gly44Asp""","""GCK_Gly44Asp""",2746.079956,2448.382416,1.085445,1.730655,0.660002,-0.809289,-0.5,-1.357472,"""ENSG00000106633""","""CCSBVarC006686""","""ALE00006684""","""NC_000007.14:44153378:C:T""","""CEGS2""","""CegsMutGDEh1046""","""C02""","""GDEhDisVCh_40037""","""C06""","""1""","""CegsMutGDDh1046""","""C02""","""VUSN2Hmut_GDN2h_09""","""D12""","""VUSMutpDEST2_03""","""E08""","""VUSmut_GDEh09""","""D12""",1,1,1,1,1,1,7,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""PKP2""","""1114G>C""","""Ala372Pro""","""PKP2_Ala372Pro""",1100.090027,1425.280029,0.698059,0.629182,1.109471,0.109389,0.0,0.32963,"""ENSG00000057294""","""CCSBVarC003105""","""ALE00003105""","""NC_000012.12:32868983:C:G""","""CEGS2""","""CegsMutGDEh1040""","""A09""","""NULL""","""NULL""","""NULL""","""CegsMutGDDh1040""","""A09""","""VUSN2Hmut_GDN2h_08""","""B06""","""VUSMutpDEST2_13""","""C02""","""VUSmut_GDEh08""","""B06""",0,,1,1,1,1,1,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""STXBP1""","""539G>A""","""Cys180Tyr""","""STXBP1_Cys180Tyr""",612.869995,2037.875,0.2893,1.265654,0.228578,-1.691083,-1.0,-4.768856,"""ENSG00000136854""","""CCSBVarC001220""","""ALE00001220""","""NC_000009.12:127663314:G:A""","""RC4""","""RC4_Mut_GDEh1027""","""A07""","""GDEhDisVCh_40016""","""D04""","""2""","""RC4_Mut_GDDh1027""","""A07""","""VUSN2Hmut_GDN2h_10""","""H09""","""VUSMutpDEST2_06""","""D11""","""NULL""","""NULL""",1,1,1,6,1,6,1,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [36]:
# Persist the merged median-based table (scores plus annotations) as CSV in the outputs directory.
median_final_df.write_csv(f'{dualipa_outputs}/DUALIPA_median_zscore.csv')

In [45]:
# mean_meta_df = mean_df.with_columns(
#     pl.col("orf_id").cast(pl.Int64).alias("orf_id"),
#     pl.col("mut_id").cast(pl.Int64).alias("mut_id")
# ).filter(
#     pl.col("valid_well"),
#     pl.col("zscore_log2").is_not_null()
# ).join(
#     metadata.select(pl.col(["orf_id", "mut_id"]+[c for c in metadata.columns if c not in mean_df.columns])), on=["orf_id", "mut_id"], how="left"
# )
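# mean_df.write_csv(f'{dualipa_outputs}/DUALIPA_mean_zscore.csv')
# NOTE(review): if this cell is re-enabled, the line above writes `mean_df`, not the
# metadata-joined `mean_meta_df` built just before it - confirm which table is intended.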