# Summarize assayed alleles

In [9]:
import polars as pl
import numpy as np

In [2]:
# Paths
home_dir = "/dgx1nas1/storage/data/jess/repos/2021_09_01_VarChAMP"
corr_meta = (
    f"{home_dir}/6.downstream_analysis_snakemake/inputs/metadata/metadata_correction"
)
metrics_dir = "/dgx1nas1/storage/data/jess/varchamp/sc_data/classification_results/B7B8_1percent_updatedmeta"

In [3]:
# Read in files
meta = pl.read_csv(f"{corr_meta}/onepercent_metadata_update_20240814.csv")
metrics_wtvar = pl.read_csv(f"{metrics_dir}/metrics_summary.csv")

In [6]:
meta.columns

['symbol',
 'gene_allele',
 'imaging_well',
 'imaging_plate_R1',
 'imaging_plate_R2',
 'final_symbol',
 'final_gene_allele']

In [5]:
# total original count
total_alleles = meta.filter(pl.col("gene_allele").str.contains("_"))
assayed_alleles = total_alleles.select("gene_allele").to_series().unique().to_list()
assayed_genes = total_alleles.select("symbol").to_series().unique().to_list()

print(len(assayed_alleles))
print(len(assayed_genes))

1027
173


In [7]:
# total after metadata mishaps
corr_alleles = meta.filter(pl.col("final_gene_allele").is_in(assayed_alleles))
corr_assayed_alleles = corr_alleles.select("gene_allele").to_series().unique().to_list()
corr_assayed_genes = corr_alleles.select("symbol").to_series().unique().to_list()

print(len(corr_assayed_alleles))
print(len(corr_assayed_genes))

994
169


In [21]:
# Count with matching WT on same plate
pair_alleles = meta.filter(
    (pl.col("final_gene_allele").is_in(corr_assayed_alleles))
    | (pl.col("final_symbol").is_in(corr_assayed_genes))
)

paired_alleles = []
paired_symbols = []
for allele in corr_assayed_alleles:
    temp_plate = (
        pair_alleles.filter(pl.col("final_gene_allele") == allele)
        .select("imaging_plate_R1")
        .to_series()
        .unique()
        .to_list()
    )
    for plate in temp_plate:
        temp_symbol = (
            pair_alleles.filter(
                (pl.col("final_gene_allele") == allele)
                & (pl.col("imaging_plate_R1") == plate)
            )
            .select("final_symbol")
            .item(0, 0)
        )
        plate_alleles = (
            pair_alleles.filter(pl.col("imaging_plate_R1") == plate)
            .select("final_gene_allele")
            .to_series()
            .to_list()
        )

        if temp_symbol in plate_alleles:
            paired_alleles.append(allele)
            paired_symbols.append(temp_symbol)

paired_alleles = np.unique(paired_alleles)
paired_symbols = np.unique(paired_symbols)

print(len(paired_alleles))
print(len(paired_symbols))

752
138
