# Compile single-cell FACS into minimal dataframe

This script compiles the raw FACS results from each well into one dataframe and writes it out as a compressed parquet file. 

In [1]:
from glob import glob
from os import stat
import pandas as pd
import numpy as np
import polars as pl
import os

In [2]:
# Relative paths: the raw FACS files are read from under dualipa_inputs; the layout CSV is
# read from, and the compiled parquet is written to, dualipa_outputs
dualipa_inputs = "../1_inputs"
dualipa_outputs = "../3_outputs"

# Minimum number of cells a well with mut_id > 0 must contain to be flagged valid
n_cell_threshold = 800
# Minimum mean GFP a well with mut_id == 0 (selected by the wt mask further down) must reach
# to be flagged valid
wt_gfp_threshold = 100

In [3]:
# Download the raw FACS data from Zenodo if you haven't done so already
# bash_file = f"{dualipa_inputs}/download_raw_FACS.sh"
# os.system(f"bash {bash_file}")

In [4]:
# Compile all per-well FACS exports into a single dataframe.
# Each file is read as a headerless, tab-separated table whose first two columns are named
# GFP and mCherry (one row per cell); plate id and well are taken from the file name.
facs_data = f"{dualipa_inputs}/raw_inputs/FACS_single_cell_data_mCherry_positive_threshold_500/*"
all_df = []
# glob returns paths in file-system order; sort so the compiled table has the same row order on every run
for file_name in sorted(glob(facs_data)):
    # Zero-byte exports cannot be parsed by read_csv, so skip them before doing anything else
    if stat(file_name).st_size == 0:
        continue
    # The third- and second-to-last '_'-separated fields of the file name are the plate id and the well
    name_fields = os.path.basename(file_name).split('_')
    pla_id, well = name_fields[-3], name_fields[-2]
    df = pd.read_csv(file_name, sep='\t', header=None).rename(columns={0: 'GFP', 1: 'mCherry'})
    df['pla'] = int(pla_id)
    df['well'] = well
    # Every row of a well carries the total number of cells measured in that well
    df['n_cells'] = len(df)
    all_df.append(df)

# Fail with an actionable message instead of pandas' generic "No objects to concatenate"
if not all_df:
    raise ValueError(f"No non-empty FACS files found matching {facs_data}")

# ignore_index gives the combined table a unique row index instead of repeating each file's 0..n-1
pDEST_DUAL_df = pd.concat(all_df, ignore_index=True)
# Replace GFP zeros with the smallest positive GFP value so a later log transform is defined.
# NOTE(review): negative GFP values, if any occur, are left unchanged - confirm none are expected.
min_nonzero_val = pDEST_DUAL_df.loc[pDEST_DUAL_df["GFP"] > 0, "GFP"].min()
pDEST_DUAL_df.loc[pDEST_DUAL_df["GFP"] == 0, "GFP"] = min_nonzero_val

# Hand the table over to polars for the remaining steps
pDEST_DUAL_df = pl.DataFrame(pDEST_DUAL_df)

In [5]:
# Per-cell GFP/mCherry ratio, computed on the GFP values whose zeros were replaced above
pDEST_DUAL_df = pDEST_DUAL_df.with_columns(
    GFP_mCherry_ratio=pl.col("GFP") / pl.col("mCherry")
)

In [6]:
# Per-well summary statistics: mean and median of GFP, mCherry and their ratio,
# computed for every (plate, well) pair
well_keys = ["pla", "well"]
agg_df = pDEST_DUAL_df.group_by(well_keys).agg(
    avg_gfp=pl.col("GFP").mean(),
    avg_mcherry=pl.col("mCherry").mean(),
    median_gfp=pl.col("GFP").median(),
    median_mcherry=pl.col("mCherry").median(),
    avg_GFP_mCherry_ratio=pl.col("GFP_mCherry_ratio").mean(),
    median_GFP_mCherry_ratio=pl.col("GFP_mCherry_ratio").median(),
)
# Attach the well-level summaries to every cell belonging to that well
pDEST_DUAL_df = pDEST_DUAL_df.join(agg_df, on=well_keys)

In [7]:
# Preview the per-cell table with the per-well summary columns joined on
pDEST_DUAL_df

GFP,mCherry,pla,well,n_cells,GFP_mCherry_ratio,avg_gfp,avg_mcherry,median_gfp,median_mcherry,avg_GFP_mCherry_ratio,median_GFP_mCherry_ratio
f64,f64,i64,str,i64,f64,f64,f64,f64,f64,f64,f64
8.190001,609.700012,3,"""C09""",9149,0.013433,41.933584,4860.99931,32.760002,3332.0,0.016161,0.008973
42.77,5862.5,3,"""C09""",9149,0.007296,41.933584,4860.99931,32.760002,3332.0,0.016161,0.008973
54.600002,2845.5,3,"""C09""",9149,0.019188,41.933584,4860.99931,32.760002,3332.0,0.016161,0.008973
27.300001,6203.399902,3,"""C09""",9149,0.004401,41.933584,4860.99931,32.760002,3332.0,0.016161,0.008973
49.140003,7377.299805,3,"""C09""",9149,0.006661,41.933584,4860.99931,32.760002,3332.0,0.016161,0.008973
…,…,…,…,…,…,…,…,…,…,…,…
747.960022,1572.900024,12,"""A10""",8933,0.475529,2397.877414,5726.17151,1781.119995,4270.699707,0.76086,0.447973
8532.080078,11872.0,12,"""A10""",8933,0.718673,2397.877414,5726.17151,1781.119995,4270.699707,0.76086,0.447973
4703.959961,2354.800049,12,"""A10""",8933,1.997605,2397.877414,5726.17151,1781.119995,4270.699707,0.76086,0.447973
1008.320007,1071.0,12,"""A10""",8933,0.941475,2397.877414,5726.17151,1781.119995,4270.699707,0.76086,0.447973


In [8]:
# Attach the experimental-layout metadata to every cell, matching on plate and well
pdest_layout_df = pl.read_csv(f"{dualipa_outputs}/dualipa_experimental_layout.csv")
pDEST_DUAL_df = pDEST_DUAL_df.join(
    pdest_layout_df,
    left_on=["pla", "well"],
    right_on=["dest_pla_id", "dest_well"],
    suffix="_sc",
)

# Threshold-only validity (sequence confirmation is not considered here):
#   mut_id > 0  -> the well must contain at least n_cell_threshold cells
#   mut_id == 0 -> the well's mean GFP must reach wt_gfp_threshold
mutant_passes = (pl.col("n_cells") >= n_cell_threshold) & (pl.col("mut_id") > 0)
wt_passes = (pl.col("mut_id") == 0) & (pl.col("avg_gfp") >= wt_gfp_threshold)

pDEST_DUAL_df = pDEST_DUAL_df.with_columns(
    # "<plate>_<well>" identifier for each well
    coordinates=pl.concat_str(
        [pl.col("pla").cast(pl.String), pl.col("well").cast(pl.String)],
        separator="_",
    ),
    # A null predicate falls through to the otherwise branch, so the flag is never null
    valid_well_no_seq_conf=pl.when(mutant_passes | wt_passes).then(True).otherwise(False),
)

In [9]:
# pdest_layout_df["dualip_ref_sequence_confirmation_class"].unique()
# Show only the cells from wells that pass the thresholds, before sequence confirmation is considered
pDEST_DUAL_df.filter(pl.col("valid_well_no_seq_conf"))

GFP,mCherry,pla,well,n_cells,GFP_mCherry_ratio,avg_gfp,avg_mcherry,median_gfp,median_mcherry,avg_GFP_mCherry_ratio,median_GFP_mCherry_ratio,orf_id,mut_id,node_type,dest_pla,symbol,ensembl_gene_id,ccsb_mutation_id,ccsb_allele_id,spdi,nt_change,aa_change,collection,entry_plate_orig,entry_well_orig,entry_plate_conso,entry_well_conso,entry_seq_pool,db_plate,db_well,n2h_plate,n2h_well,dualip_plate,dualip_well,mislocalization_plate,mislocalization_well,entry_sequenced,entry_sequence_confirmation_class,db_sequenced,db_sequence_confirmation_class,n2h_sequenced,n2h_sequence_confirmation_class,n2h_ref_sequence_confirmation_class,dualip_sequenced,dualip_sequence_confirmation_class,dualip_ref_sequence_confirmation_class,mislocalization_sequenced,mislocalization_sequence_confirmation_class,mislocalization_ref_sequence_confirmation_class,gene_variant,chr_num,nuc_loc,ref_allele,alt_allele,Chrom,coordinates,valid_well_no_seq_conf
f64,f64,i64,str,i64,f64,f64,f64,f64,f64,f64,f64,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,bool
8.190001,609.700012,3,"""C09""",9149,0.013433,41.933584,4860.99931,32.760002,3332.0,0.016161,0.008973,8151,202847,"""allele""","""VUSDUALXP_03""","""IL36RN""","""ENSG00000136695""","""CCSBVarC006168""","""ALE00006168""","""NC_000002.12:113062577:C:G""","""368C>G""","""Thr123Arg""","""CEGS2""","""CegsMutGDEh1036""","""E07""","""GDEhDisVCh_40084""","""B05""","""2.0""","""CegsMutGDDh1036""","""E07""","""VUSN2Hmut_GDN2h_07""","""E05""","""VUSMutpDEST2_07""","""F09""","""VUSmut_GDEh07""","""E05""","""1""","""1.0""","""1""","""1.0""","""1""","""1.0""","""1.0""","""1""","""1.0""","""1.0""","""1""","""1.0""","""6.0""","""IL36RN_Thr123Arg""","""2""","""113062577""","""C""","""G""","""2""","""3_C09""",true
42.77,5862.5,3,"""C09""",9149,0.007296,41.933584,4860.99931,32.760002,3332.0,0.016161,0.008973,8151,202847,"""allele""","""VUSDUALXP_03""","""IL36RN""","""ENSG00000136695""","""CCSBVarC006168""","""ALE00006168""","""NC_000002.12:113062577:C:G""","""368C>G""","""Thr123Arg""","""CEGS2""","""CegsMutGDEh1036""","""E07""","""GDEhDisVCh_40084""","""B05""","""2.0""","""CegsMutGDDh1036""","""E07""","""VUSN2Hmut_GDN2h_07""","""E05""","""VUSMutpDEST2_07""","""F09""","""VUSmut_GDEh07""","""E05""","""1""","""1.0""","""1""","""1.0""","""1""","""1.0""","""1.0""","""1""","""1.0""","""1.0""","""1""","""1.0""","""6.0""","""IL36RN_Thr123Arg""","""2""","""113062577""","""C""","""G""","""2""","""3_C09""",true
54.600002,2845.5,3,"""C09""",9149,0.019188,41.933584,4860.99931,32.760002,3332.0,0.016161,0.008973,8151,202847,"""allele""","""VUSDUALXP_03""","""IL36RN""","""ENSG00000136695""","""CCSBVarC006168""","""ALE00006168""","""NC_000002.12:113062577:C:G""","""368C>G""","""Thr123Arg""","""CEGS2""","""CegsMutGDEh1036""","""E07""","""GDEhDisVCh_40084""","""B05""","""2.0""","""CegsMutGDDh1036""","""E07""","""VUSN2Hmut_GDN2h_07""","""E05""","""VUSMutpDEST2_07""","""F09""","""VUSmut_GDEh07""","""E05""","""1""","""1.0""","""1""","""1.0""","""1""","""1.0""","""1.0""","""1""","""1.0""","""1.0""","""1""","""1.0""","""6.0""","""IL36RN_Thr123Arg""","""2""","""113062577""","""C""","""G""","""2""","""3_C09""",true
27.300001,6203.399902,3,"""C09""",9149,0.004401,41.933584,4860.99931,32.760002,3332.0,0.016161,0.008973,8151,202847,"""allele""","""VUSDUALXP_03""","""IL36RN""","""ENSG00000136695""","""CCSBVarC006168""","""ALE00006168""","""NC_000002.12:113062577:C:G""","""368C>G""","""Thr123Arg""","""CEGS2""","""CegsMutGDEh1036""","""E07""","""GDEhDisVCh_40084""","""B05""","""2.0""","""CegsMutGDDh1036""","""E07""","""VUSN2Hmut_GDN2h_07""","""E05""","""VUSMutpDEST2_07""","""F09""","""VUSmut_GDEh07""","""E05""","""1""","""1.0""","""1""","""1.0""","""1""","""1.0""","""1.0""","""1""","""1.0""","""1.0""","""1""","""1.0""","""6.0""","""IL36RN_Thr123Arg""","""2""","""113062577""","""C""","""G""","""2""","""3_C09""",true
49.140003,7377.299805,3,"""C09""",9149,0.006661,41.933584,4860.99931,32.760002,3332.0,0.016161,0.008973,8151,202847,"""allele""","""VUSDUALXP_03""","""IL36RN""","""ENSG00000136695""","""CCSBVarC006168""","""ALE00006168""","""NC_000002.12:113062577:C:G""","""368C>G""","""Thr123Arg""","""CEGS2""","""CegsMutGDEh1036""","""E07""","""GDEhDisVCh_40084""","""B05""","""2.0""","""CegsMutGDDh1036""","""E07""","""VUSN2Hmut_GDN2h_07""","""E05""","""VUSMutpDEST2_07""","""F09""","""VUSmut_GDEh07""","""E05""","""1""","""1.0""","""1""","""1.0""","""1""","""1.0""","""1.0""","""1""","""1.0""","""1.0""","""1""","""1.0""","""6.0""","""IL36RN_Thr123Arg""","""2""","""113062577""","""C""","""G""","""2""","""3_C09""",true
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
747.960022,1572.900024,12,"""A10""",8933,0.475529,2397.877414,5726.17151,1781.119995,4270.699707,0.76086,0.447973,11374,19295,"""allele""","""VUSDUALXP_12""","""RP2""","""ENSG00000102218""","""CCSBVarC001709""","""ALE00001709""","""NC_000023.11:46860063:C:T""","""844C>T""","""Arg282Trp""","""RC4""","""RC4_Mut_GDEh1017""","""B08""","""GDEhDisVCh_40085""","""C09""","""1.0""","""RC4_Mut_GDDh1017""","""B08""",,,"""VUSMutpDEST2_08""","""D11""","""VUSmut_GDEh13""","""H01""","""1""","""1.0""","""1""","""1.0""","""0""",,,"""1""","""1.0""","""1.0""","""1""","""1.0""","""1.0""","""RP2_Arg282Trp""","""23""","""46860063""","""C""","""T""","""X""","""12_A10""",true
8532.080078,11872.0,12,"""A10""",8933,0.718673,2397.877414,5726.17151,1781.119995,4270.699707,0.76086,0.447973,11374,19295,"""allele""","""VUSDUALXP_12""","""RP2""","""ENSG00000102218""","""CCSBVarC001709""","""ALE00001709""","""NC_000023.11:46860063:C:T""","""844C>T""","""Arg282Trp""","""RC4""","""RC4_Mut_GDEh1017""","""B08""","""GDEhDisVCh_40085""","""C09""","""1.0""","""RC4_Mut_GDDh1017""","""B08""",,,"""VUSMutpDEST2_08""","""D11""","""VUSmut_GDEh13""","""H01""","""1""","""1.0""","""1""","""1.0""","""0""",,,"""1""","""1.0""","""1.0""","""1""","""1.0""","""1.0""","""RP2_Arg282Trp""","""23""","""46860063""","""C""","""T""","""X""","""12_A10""",true
4703.959961,2354.800049,12,"""A10""",8933,1.997605,2397.877414,5726.17151,1781.119995,4270.699707,0.76086,0.447973,11374,19295,"""allele""","""VUSDUALXP_12""","""RP2""","""ENSG00000102218""","""CCSBVarC001709""","""ALE00001709""","""NC_000023.11:46860063:C:T""","""844C>T""","""Arg282Trp""","""RC4""","""RC4_Mut_GDEh1017""","""B08""","""GDEhDisVCh_40085""","""C09""","""1.0""","""RC4_Mut_GDDh1017""","""B08""",,,"""VUSMutpDEST2_08""","""D11""","""VUSmut_GDEh13""","""H01""","""1""","""1.0""","""1""","""1.0""","""0""",,,"""1""","""1.0""","""1.0""","""1""","""1.0""","""1.0""","""RP2_Arg282Trp""","""23""","""46860063""","""C""","""T""","""X""","""12_A10""",true
1008.320007,1071.0,12,"""A10""",8933,0.941475,2397.877414,5726.17151,1781.119995,4270.699707,0.76086,0.447973,11374,19295,"""allele""","""VUSDUALXP_12""","""RP2""","""ENSG00000102218""","""CCSBVarC001709""","""ALE00001709""","""NC_000023.11:46860063:C:T""","""844C>T""","""Arg282Trp""","""RC4""","""RC4_Mut_GDEh1017""","""B08""","""GDEhDisVCh_40085""","""C09""","""1.0""","""RC4_Mut_GDDh1017""","""B08""",,,"""VUSMutpDEST2_08""","""D11""","""VUSmut_GDEh13""","""H01""","""1""","""1.0""","""1""","""1.0""","""0""",,,"""1""","""1.0""","""1.0""","""1""","""1.0""","""1.0""","""RP2_Arg282Trp""","""23""","""46860063""","""C""","""T""","""X""","""12_A10""",true


In [11]:
# Flag valid wells, this time also taking sequence confirmation into account.

# Cast the confirmation-class columns to Float64; strict=False turns any value that
# cannot be parsed into null instead of raising
columns_to_cast = [
    "entry_sequence_confirmation_class",
    "dualip_sequence_confirmation_class",
    "dualip_ref_sequence_confirmation_class",
]
pDEST_DUAL_seq_confirm_df = pDEST_DUAL_df.with_columns(
    pl.col(columns_to_cast).cast(pl.Float64, strict=False)
)

entry_class = pl.col("entry_sequence_confirmation_class")
dualip_class = pl.col("dualip_sequence_confirmation_class")

# Entry confirmation class is 1 or 2 (the classes treated as good here); used as a
# fallback when the dualip class is uninformative
any_expression_good = entry_class.is_in([1, 2])

# Wells with mut_id > 0 must contain at least n_cell_threshold cells
mutant_mask = (pl.col("n_cells") >= n_cell_threshold) & (pl.col("mut_id") > 0)
# NOTE: despite the "entry_" prefix, the next two masks inspect the *dualip* confirmation class
entry_good = dualip_class.is_in([1, 2])
# Classes 7 and 99, or a missing class, are handled like "no information"
entry_null_like = dualip_class.is_null() | dualip_class.is_in([7, 99])
# Accept a mutant well when its dualip class is good, or when the dualip class is
# uninformative but the entry class is good
mutant_mask_all = mutant_mask & (entry_good | (entry_null_like & any_expression_good))

# Wells with mut_id == 0 are judged on mean GFP only; no sequence-confirmation filter is applied to them
wt_mask = (pl.col("mut_id") == 0) & (pl.col("avg_gfp") >= wt_gfp_threshold)

# Combine both cases; a null predicate falls through to the otherwise branch (False)
pDEST_DUAL_seq_confirm_df = pDEST_DUAL_seq_confirm_df.with_columns(
    valid_well=pl.when(mutant_mask_all | wt_mask).then(True).otherwise(False)
)

In [12]:
# Show only the cells from wells flagged valid once sequence confirmation is taken into account
pDEST_DUAL_seq_confirm_df.filter(pl.col("valid_well"))

GFP,mCherry,pla,well,n_cells,GFP_mCherry_ratio,avg_gfp,avg_mcherry,median_gfp,median_mcherry,avg_GFP_mCherry_ratio,median_GFP_mCherry_ratio,orf_id,mut_id,node_type,dest_pla,symbol,ensembl_gene_id,ccsb_mutation_id,ccsb_allele_id,spdi,nt_change,aa_change,collection,entry_plate_orig,entry_well_orig,entry_plate_conso,entry_well_conso,entry_seq_pool,db_plate,db_well,n2h_plate,n2h_well,dualip_plate,dualip_well,mislocalization_plate,mislocalization_well,entry_sequenced,entry_sequence_confirmation_class,db_sequenced,db_sequence_confirmation_class,n2h_sequenced,n2h_sequence_confirmation_class,n2h_ref_sequence_confirmation_class,dualip_sequenced,dualip_sequence_confirmation_class,dualip_ref_sequence_confirmation_class,mislocalization_sequenced,mislocalization_sequence_confirmation_class,mislocalization_ref_sequence_confirmation_class,gene_variant,chr_num,nuc_loc,ref_allele,alt_allele,Chrom,coordinates,valid_well_no_seq_conf,valid_well
f64,f64,i64,str,i64,f64,f64,f64,f64,f64,f64,f64,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,str,str,str,f64,f64,str,str,str,str,str,str,str,str,str,str,bool,bool
8.190001,609.700012,3,"""C09""",9149,0.013433,41.933584,4860.99931,32.760002,3332.0,0.016161,0.008973,8151,202847,"""allele""","""VUSDUALXP_03""","""IL36RN""","""ENSG00000136695""","""CCSBVarC006168""","""ALE00006168""","""NC_000002.12:113062577:C:G""","""368C>G""","""Thr123Arg""","""CEGS2""","""CegsMutGDEh1036""","""E07""","""GDEhDisVCh_40084""","""B05""","""2.0""","""CegsMutGDDh1036""","""E07""","""VUSN2Hmut_GDN2h_07""","""E05""","""VUSMutpDEST2_07""","""F09""","""VUSmut_GDEh07""","""E05""","""1""",1.0,"""1""","""1.0""","""1""","""1.0""","""1.0""","""1""",1.0,1.0,"""1""","""1.0""","""6.0""","""IL36RN_Thr123Arg""","""2""","""113062577""","""C""","""G""","""2""","""3_C09""",true,true
42.77,5862.5,3,"""C09""",9149,0.007296,41.933584,4860.99931,32.760002,3332.0,0.016161,0.008973,8151,202847,"""allele""","""VUSDUALXP_03""","""IL36RN""","""ENSG00000136695""","""CCSBVarC006168""","""ALE00006168""","""NC_000002.12:113062577:C:G""","""368C>G""","""Thr123Arg""","""CEGS2""","""CegsMutGDEh1036""","""E07""","""GDEhDisVCh_40084""","""B05""","""2.0""","""CegsMutGDDh1036""","""E07""","""VUSN2Hmut_GDN2h_07""","""E05""","""VUSMutpDEST2_07""","""F09""","""VUSmut_GDEh07""","""E05""","""1""",1.0,"""1""","""1.0""","""1""","""1.0""","""1.0""","""1""",1.0,1.0,"""1""","""1.0""","""6.0""","""IL36RN_Thr123Arg""","""2""","""113062577""","""C""","""G""","""2""","""3_C09""",true,true
54.600002,2845.5,3,"""C09""",9149,0.019188,41.933584,4860.99931,32.760002,3332.0,0.016161,0.008973,8151,202847,"""allele""","""VUSDUALXP_03""","""IL36RN""","""ENSG00000136695""","""CCSBVarC006168""","""ALE00006168""","""NC_000002.12:113062577:C:G""","""368C>G""","""Thr123Arg""","""CEGS2""","""CegsMutGDEh1036""","""E07""","""GDEhDisVCh_40084""","""B05""","""2.0""","""CegsMutGDDh1036""","""E07""","""VUSN2Hmut_GDN2h_07""","""E05""","""VUSMutpDEST2_07""","""F09""","""VUSmut_GDEh07""","""E05""","""1""",1.0,"""1""","""1.0""","""1""","""1.0""","""1.0""","""1""",1.0,1.0,"""1""","""1.0""","""6.0""","""IL36RN_Thr123Arg""","""2""","""113062577""","""C""","""G""","""2""","""3_C09""",true,true
27.300001,6203.399902,3,"""C09""",9149,0.004401,41.933584,4860.99931,32.760002,3332.0,0.016161,0.008973,8151,202847,"""allele""","""VUSDUALXP_03""","""IL36RN""","""ENSG00000136695""","""CCSBVarC006168""","""ALE00006168""","""NC_000002.12:113062577:C:G""","""368C>G""","""Thr123Arg""","""CEGS2""","""CegsMutGDEh1036""","""E07""","""GDEhDisVCh_40084""","""B05""","""2.0""","""CegsMutGDDh1036""","""E07""","""VUSN2Hmut_GDN2h_07""","""E05""","""VUSMutpDEST2_07""","""F09""","""VUSmut_GDEh07""","""E05""","""1""",1.0,"""1""","""1.0""","""1""","""1.0""","""1.0""","""1""",1.0,1.0,"""1""","""1.0""","""6.0""","""IL36RN_Thr123Arg""","""2""","""113062577""","""C""","""G""","""2""","""3_C09""",true,true
49.140003,7377.299805,3,"""C09""",9149,0.006661,41.933584,4860.99931,32.760002,3332.0,0.016161,0.008973,8151,202847,"""allele""","""VUSDUALXP_03""","""IL36RN""","""ENSG00000136695""","""CCSBVarC006168""","""ALE00006168""","""NC_000002.12:113062577:C:G""","""368C>G""","""Thr123Arg""","""CEGS2""","""CegsMutGDEh1036""","""E07""","""GDEhDisVCh_40084""","""B05""","""2.0""","""CegsMutGDDh1036""","""E07""","""VUSN2Hmut_GDN2h_07""","""E05""","""VUSMutpDEST2_07""","""F09""","""VUSmut_GDEh07""","""E05""","""1""",1.0,"""1""","""1.0""","""1""","""1.0""","""1.0""","""1""",1.0,1.0,"""1""","""1.0""","""6.0""","""IL36RN_Thr123Arg""","""2""","""113062577""","""C""","""G""","""2""","""3_C09""",true,true
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
747.960022,1572.900024,12,"""A10""",8933,0.475529,2397.877414,5726.17151,1781.119995,4270.699707,0.76086,0.447973,11374,19295,"""allele""","""VUSDUALXP_12""","""RP2""","""ENSG00000102218""","""CCSBVarC001709""","""ALE00001709""","""NC_000023.11:46860063:C:T""","""844C>T""","""Arg282Trp""","""RC4""","""RC4_Mut_GDEh1017""","""B08""","""GDEhDisVCh_40085""","""C09""","""1.0""","""RC4_Mut_GDDh1017""","""B08""",,,"""VUSMutpDEST2_08""","""D11""","""VUSmut_GDEh13""","""H01""","""1""",1.0,"""1""","""1.0""","""0""",,,"""1""",1.0,1.0,"""1""","""1.0""","""1.0""","""RP2_Arg282Trp""","""23""","""46860063""","""C""","""T""","""X""","""12_A10""",true,true
8532.080078,11872.0,12,"""A10""",8933,0.718673,2397.877414,5726.17151,1781.119995,4270.699707,0.76086,0.447973,11374,19295,"""allele""","""VUSDUALXP_12""","""RP2""","""ENSG00000102218""","""CCSBVarC001709""","""ALE00001709""","""NC_000023.11:46860063:C:T""","""844C>T""","""Arg282Trp""","""RC4""","""RC4_Mut_GDEh1017""","""B08""","""GDEhDisVCh_40085""","""C09""","""1.0""","""RC4_Mut_GDDh1017""","""B08""",,,"""VUSMutpDEST2_08""","""D11""","""VUSmut_GDEh13""","""H01""","""1""",1.0,"""1""","""1.0""","""0""",,,"""1""",1.0,1.0,"""1""","""1.0""","""1.0""","""RP2_Arg282Trp""","""23""","""46860063""","""C""","""T""","""X""","""12_A10""",true,true
4703.959961,2354.800049,12,"""A10""",8933,1.997605,2397.877414,5726.17151,1781.119995,4270.699707,0.76086,0.447973,11374,19295,"""allele""","""VUSDUALXP_12""","""RP2""","""ENSG00000102218""","""CCSBVarC001709""","""ALE00001709""","""NC_000023.11:46860063:C:T""","""844C>T""","""Arg282Trp""","""RC4""","""RC4_Mut_GDEh1017""","""B08""","""GDEhDisVCh_40085""","""C09""","""1.0""","""RC4_Mut_GDDh1017""","""B08""",,,"""VUSMutpDEST2_08""","""D11""","""VUSmut_GDEh13""","""H01""","""1""",1.0,"""1""","""1.0""","""0""",,,"""1""",1.0,1.0,"""1""","""1.0""","""1.0""","""RP2_Arg282Trp""","""23""","""46860063""","""C""","""T""","""X""","""12_A10""",true,true
1008.320007,1071.0,12,"""A10""",8933,0.941475,2397.877414,5726.17151,1781.119995,4270.699707,0.76086,0.447973,11374,19295,"""allele""","""VUSDUALXP_12""","""RP2""","""ENSG00000102218""","""CCSBVarC001709""","""ALE00001709""","""NC_000023.11:46860063:C:T""","""844C>T""","""Arg282Trp""","""RC4""","""RC4_Mut_GDEh1017""","""B08""","""GDEhDisVCh_40085""","""C09""","""1.0""","""RC4_Mut_GDDh1017""","""B08""",,,"""VUSMutpDEST2_08""","""D11""","""VUSmut_GDEh13""","""H01""","""1""",1.0,"""1""","""1.0""","""0""",,,"""1""",1.0,1.0,"""1""","""1.0""","""1.0""","""RP2_Arg282Trp""","""23""","""46860063""","""C""","""T""","""X""","""12_A10""",true,true


In [13]:
# Spot-check every cell recorded for a single mut_id (2638), including both validity flags
pDEST_DUAL_seq_confirm_df.filter(pl.col("mut_id")==2638)

GFP,mCherry,pla,well,n_cells,GFP_mCherry_ratio,avg_gfp,avg_mcherry,median_gfp,median_mcherry,avg_GFP_mCherry_ratio,median_GFP_mCherry_ratio,orf_id,mut_id,node_type,dest_pla,symbol,ensembl_gene_id,ccsb_mutation_id,ccsb_allele_id,spdi,nt_change,aa_change,collection,entry_plate_orig,entry_well_orig,entry_plate_conso,entry_well_conso,entry_seq_pool,db_plate,db_well,n2h_plate,n2h_well,dualip_plate,dualip_well,mislocalization_plate,mislocalization_well,entry_sequenced,entry_sequence_confirmation_class,db_sequenced,db_sequence_confirmation_class,n2h_sequenced,n2h_sequence_confirmation_class,n2h_ref_sequence_confirmation_class,dualip_sequenced,dualip_sequence_confirmation_class,dualip_ref_sequence_confirmation_class,mislocalization_sequenced,mislocalization_sequence_confirmation_class,mislocalization_ref_sequence_confirmation_class,gene_variant,chr_num,nuc_loc,ref_allele,alt_allele,Chrom,coordinates,valid_well_no_seq_conf,valid_well
f64,f64,i64,str,i64,f64,f64,f64,f64,f64,f64,f64,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,str,str,str,f64,f64,str,str,str,str,str,str,str,str,str,str,bool,bool
1830.800049,10242.399414,11,"""H06""",4976,0.178747,1736.815259,5002.388278,897.920013,2918.650024,0.851111,0.253414,100016069,2638,"""allele""","""VUSDUALXP_11""",,,,,,"""WT""","""WT""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""11_H06""",true,false
400.200012,9449.299805,11,"""H06""",4976,0.042352,1736.815259,5002.388278,897.920013,2918.650024,0.851111,0.253414,100016069,2638,"""allele""","""VUSDUALXP_11""",,,,,,"""WT""","""WT""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""11_H06""",true,false
83.720001,935.899963,11,"""H06""",4976,0.089454,1736.815259,5002.388278,897.920013,2918.650024,0.851111,0.253414,100016069,2638,"""allele""","""VUSDUALXP_11""",,,,,,"""WT""","""WT""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""11_H06""",true,false
1451.76001,2517.199951,11,"""H06""",4976,0.576736,1736.815259,5002.388278,897.920013,2918.650024,0.851111,0.253414,100016069,2638,"""allele""","""VUSDUALXP_11""",,,,,,"""WT""","""WT""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""11_H06""",true,false
824.320007,3024.699951,11,"""H06""",4976,0.27253,1736.815259,5002.388278,897.920013,2918.650024,0.851111,0.253414,100016069,2638,"""allele""","""VUSDUALXP_11""",,,,,,"""WT""","""WT""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""11_H06""",true,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
869.400024,1155.699951,11,"""H06""",4976,0.752271,1736.815259,5002.388278,897.920013,2918.650024,0.851111,0.253414,100016069,2638,"""allele""","""VUSDUALXP_11""",,,,,,"""WT""","""WT""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""11_H06""",true,false
2712.160156,7137.199707,11,"""H06""",4976,0.380003,1736.815259,5002.388278,897.920013,2918.650024,0.851111,0.253414,100016069,2638,"""allele""","""VUSDUALXP_11""",,,,,,"""WT""","""WT""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""11_H06""",true,false
1789.400024,673.399963,11,"""H06""",4976,2.657262,1736.815259,5002.388278,897.920013,2918.650024,0.851111,0.253414,100016069,2638,"""allele""","""VUSDUALXP_11""",,,,,,"""WT""","""WT""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""11_H06""",true,false
115.920006,1004.5,11,"""H06""",4976,0.115401,1736.815259,5002.388278,897.920013,2918.650024,0.851111,0.253414,100016069,2638,"""allele""","""VUSDUALXP_11""",,,,,,"""WT""","""WT""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"""11_H06""",true,false


In [14]:
# Persist the per-cell table (with both validity flags) as a zstd-compressed parquet file
single_cell_parquet = f"{dualipa_outputs}/facs_single_cell.parquet"
pDEST_DUAL_seq_confirm_df.write_parquet(single_cell_parquet, compression="zstd")