# Compile single-cell FACS into minimal dataframe

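This notebook compiles the raw single-cell FACS measurements from each well into one dataframe, annotates every cell with per-well summary statistics, plate-layout metadata and well-validity flags, and writes the result out as a zstd-compressed parquet file.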

In [1]:
from glob import glob
from os import stat
import pandas as pd
import numpy as np
import polars as pl
import os

In [2]:
# Relative paths to the input and output directories.
dualipa_inputs = "../1_inputs"
dualipa_outputs = "../3_outputs"

# Well-level quality-control cut-offs used when flagging valid wells:
# wells with mut_id > 0 are checked against the minimum cell count,
# wells with mut_id == 0 against the minimum mean GFP.
n_cell_threshold = 800
wt_gfp_threshold = 100

# Sequence-confirmation classes a well must belong to in order to be valid.
valid_dualip_sequence_confirmation_class = [1.0, 2.0, 7.0, 99.0]

In [3]:
# Download the raw FACS data from Zenodo if you haven't done so already
# bash_file = f"{dualipa_inputs}/download_raw_FACS.sh"
# os.system(f"bash {bash_file}")

In [4]:
# Compile all per-well FACS files into a single dataframe.
# Each file is read as tab-separated text without a header; column 0 is
# named GFP and column 1 mCherry. The plate id and the well are taken from
# the third- and second-to-last "_"-separated fields of the file name.
facs_data = f"{dualipa_inputs}/raw_inputs/FACS_single_cell_data_mCherry_positive_threshold_500/*"
all_df = []
# Sort the matches: glob() returns them in arbitrary order, which would make
# the row order of the compiled dataframe differ between runs and machines.
for file_name in sorted(glob(facs_data)):
    # Zero-byte files carry no cells and cannot be parsed by read_csv.
    if stat(file_name).st_size == 0:
        continue
    # basename() rather than split('/') so that the parsing does not depend
    # on the platform's path separator.
    name_fields = os.path.basename(file_name).split('_')
    pla_id, well = name_fields[-3], name_fields[-2]
    df = pd.read_csv(file_name, sep='\t', header=None).rename(columns={0: 'GFP', 1: 'mCherry'})
    df['pla'] = int(pla_id)
    df['well'] = well
    # Number of rows read for this well, repeated on each of its rows.
    df['n_cells'] = len(df)
    all_df.append(df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the raw data is missing.
if not all_df:
    raise ValueError(
        f"No non-empty FACS files matched {facs_data!r}; "
        "make sure the raw FACS data has been downloaded."
    )

# ignore_index gives the combined frame a unique RangeIndex instead of the
# repeated 0..n-1 index of every per-well frame.
pDEST_DUAL_df = pd.concat(all_df, ignore_index=True)

# Replace GFP zeros with the smallest positive GFP value so that a later log
# transform is defined. Only exact zeros are replaced; other values are
# left untouched.
min_nonzero_val = pDEST_DUAL_df.loc[pDEST_DUAL_df["GFP"] > 0, "GFP"].min()
pDEST_DUAL_df.loc[pDEST_DUAL_df["GFP"] == 0, "GFP"] = min_nonzero_val

# Hand over to polars for the column arithmetic and group-by that follow.
pDEST_DUAL_df = pl.DataFrame(pDEST_DUAL_df)

In [5]:
# Per-row GFP:mCherry ratio, computed on the GFP values in which zeros have
# already been replaced by the minimum positive value.
gfp_to_mcherry = pl.col("GFP") / pl.col("mCherry")
pDEST_DUAL_df = pDEST_DUAL_df.with_columns(gfp_to_mcherry.alias("GFP_mCherry_ratio"))

In [6]:
# Per-well summary statistics: mean and median of GFP, mCherry and their
# ratio, with one output row per (plate, well) pair.
well_keys = ["pla", "well"]
gfp_col = pl.col("GFP")
mcherry_col = pl.col("mCherry")
ratio_col = pl.col("GFP_mCherry_ratio")

agg_df = pDEST_DUAL_df.group_by(well_keys).agg(
    gfp_col.mean().alias("avg_gfp"),
    mcherry_col.mean().alias("avg_mcherry"),
    gfp_col.median().alias("median_gfp"),
    mcherry_col.median().alias("median_mcherry"),
    ratio_col.mean().alias("avg_GFP_mCherry_ratio"),
    ratio_col.median().alias("median_GFP_mCherry_ratio"),
)

# Attach the per-well statistics to every row of that well, then convert
# back to pandas for the remaining steps.
pDEST_DUAL_df = pDEST_DUAL_df.join(agg_df, on=well_keys).to_pandas()

In [7]:
# Annotate every row with the experimental-layout metadata of its well.
pdest_layout_df = pd.read_csv(f"{dualipa_outputs}/dualipa_experimental_layout.csv")
pDEST_DUAL_df = pd.merge(
    pDEST_DUAL_df,
    pdest_layout_df,
    how="inner",  # pandas' default, spelled out: rows without a layout entry are dropped
    left_on=["pla", "well"],
    right_on=["dest_pla_id", "dest_well"],
    suffixes=("", "_sc"),  # layout columns that clash with existing names get "_sc"
)

# "<plate>_<well>" identifier for each row's well.
plate_label = pDEST_DUAL_df["dest_pla_id"].astype(str)
well_label = pDEST_DUAL_df["well"].astype(str)
pDEST_DUAL_df["coordinates"] = plate_label + "_" + well_label

In [8]:
# pDEST_DUAL_df.head()
# pDEST_DUAL_df["dualip_sequenced"].unique()
# pDEST_DUAL_df["dualip_sequence_confirmation_class"].unique()

In [9]:
# Quality-control criteria shared by both validity flags:
#   - wells with mut_id > 0 need at least n_cell_threshold cells;
#   - wells with mut_id == 0 need a mean GFP of at least wt_gfp_threshold.
mutant_mask = (pDEST_DUAL_df["mut_id"] > 0) & (pDEST_DUAL_df["n_cells"] >= n_cell_threshold)
wt_mask = (pDEST_DUAL_df["mut_id"] == 0) & (pDEST_DUAL_df["avg_gfp"] >= wt_gfp_threshold)
passes_qc = mutant_mask | wt_mask

# Valid wells, ignoring sequence confirmation.
pDEST_DUAL_df["valid_well_no_seq_conf"] = passes_qc

# Valid wells: the same criteria, plus a non-missing sequence-confirmation
# class that is one of the accepted classes.
seq_class = pDEST_DUAL_df["dualip_sequence_confirmation_class"]
class_mask = seq_class.notna() & seq_class.isin(valid_dualip_sequence_confirmation_class)

pDEST_DUAL_df["valid_well"] = passes_qc & class_mask

In [10]:
# (rows, columns) of the single-cell table restricted to valid wells.
pDEST_DUAL_df.loc[pDEST_DUAL_df["valid_well"]].shape

(10294869, 28)

In [11]:
# (rows, columns) of the single-cell table restricted to wells that are
# valid when sequence confirmation is not taken into account.
pDEST_DUAL_df.loc[pDEST_DUAL_df["valid_well_no_seq_conf"]].shape

(15449152, 28)

In [12]:
# Persist the annotated single-cell table as a zstd-compressed parquet file.
facs_single_cell_path = f"{dualipa_outputs}/facs_single_cell.parquet"
pDEST_DUAL_df.to_parquet(facs_single_cell_path, compression="zstd")