# Summary of Imaging Analysis Results

In [1]:
# imports
import os
import polars as pl
import pandas as pd
import pandas as pd
import numpy as np
from tqdm import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import umap
from functools import reduce
import operator
from adjustText import adjust_text

In [2]:
# Biological-replicate batch group -> the plate-level batches pooled into it.
# This is the single place where batches are switched on or off:
# BATCH_LIST_DICT below is derived from it, so the two mappings cannot drift
# apart when an entry is (un)commented.
BIO_REP_BATCHES_DICT = {
    # "2024_02_Batch_7-8": ("2024_01_23_Batch_7", "2024_02_06_Batch_8"),
    # "2024_12_Batch_11-12": ("2024_12_09_Batch_11", "2024_12_09_Batch_12"),
    "2025_01_Batch_13-14": ("2025_01_27_Batch_13", "2025_01_28_Batch_14"),
    # "2025_03_Batch_15-16": ("2025_03_17_Batch_15", "2025_03_17_Batch_16"),
}

# Plate-level batch -> biological-replicate batch group (inverse of the above).
BATCH_LIST_DICT = {
    plate_batch: bio_batch
    for bio_batch, plate_batches in BIO_REP_BATCHES_DICT.items()
    for plate_batch in plate_batches
}

# Directory the variant annotation table (varchamp_clinvar_gnomad.tsv) is read from.
METADATA_INPUT = "../../../../1_allele_collection/3_outputs"
# Feature-set names; they match the suffixes of the AUROC_* / Altered_* columns
# in the classification summary (presumably the feature groups used by the
# classifiers -- confirm against the pipeline).
FEAT_SETS = ["DNA", "Mito", "GFP", "AGP", "Morph"]
# Root directory of the per-batch result tables read by this notebook.
CLASS_RES_OUTDIR = "../../3_outputs/1_snakemake_pipeline/3.smp_results_analyses"
# Directory the merged imaging summary is written to.
INTEGRATIVE_INPUT = "../../../../3_integrated_assay_analyses/1_inputs/imaging"

In [3]:
# Read the tab-separated variant annotation table and add three alias columns
# (orf_id, mut_id, gene_allele) that copy orf_id_wt, mutation_id_old and
# gene_variant; the original columns are kept alongside them.
clinvar_tsv = f"{METADATA_INPUT}/varchamp_clinvar_gnomad.tsv"
clin_var_df = pl.read_csv(
    clinvar_tsv,
    separator="\t",
    infer_schema_length=10000,  # scan more rows than the default before fixing dtypes
).with_columns(
    orf_id=pl.col("orf_id_wt"),
    mut_id=pl.col("mutation_id_old"),
    gene_allele=pl.col("gene_variant"),
)
clin_var_df.head()

symbol,ensembl_gene_id,orf_id_wt,mutation_id_old,ccsb_mutation_id,ccsb_allele_id,spdi,nt_change,aa_change,collection,entry_plate_orig,entry_well_orig,entry_plate_conso,entry_well_conso,entry_seq_pool,db_plate,db_well,n2h_plate,n2h_well,dualip_plate,dualip_well,mislocalization_plate,mislocalization_well,entry_sequenced,entry_sequence_confirmation_class,db_sequenced,db_sequence_confirmation_class,n2h_sequenced,n2h_sequence_confirmation_class,dualip_sequenced,dualip_sequence_confirmation_class,mislocalization_sequenced,mislocalization_sequence_confirmation_class,gene_variant,chr_num,nuc_loc,ref_allele,…,ChromosomeAccession,Chromosome,Start,Stop,Cytogenetic,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories,VariationID,PositionVCF,ReferenceAlleleVCF,AlternateAlleleVCF,SomaticClinicalImpact,SomaticClinicalImpactLastEvaluated,ReviewStatusClinicalImpact,Oncogenicity,OncogenicityLastEvaluated,ReviewStatusOncogenicity,SCVsForAggregateGermlineClassification,SCVsForAggregateSomaticClinicalImpact,SCVsForAggregateOncogenicityClassification,clinvar_nt_change,clinvar_aa_change,RefSeq_mRNA,StarStatus,clinvar_clnsig_clean,chr,chr_pos_38,ref_right,alt_right,gnomad_af,orf_id,mut_id,gene_allele
str,str,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,i64,str,i64,str,i64,str,i64,str,str,i64,f64,str,…,str,str,i64,i64,str,str,i64,str,str,str,i64,i64,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,str,str,str,str,i64,i64,str
"""GBA1""","""ENSG00000177628""",2,6,"""CCSBVarC000001""","""ALE0000584""","""NC_000001.11:155240033:C:G""","""160G>C""","""Val54Leu""","""RC4""","""RC4_Mut_GDEh1026""","""H01""","""GDEhDisVCh_40054""","""F12""","""2""","""RC4_Mut_GDDh1026""","""H01""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""",1,"""1""",1,"""2""",0,"""NULL""",0,"""NULL""",0,"""NULL""","""GBA1_Val54Leu""",1,155240033.0,"""C""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,6,"""GBA1_Val54Leu"""
"""GBA1""","""ENSG00000177628""",2,73,"""CCSBVarC000002""","""ALE00000002""","""NC_000001.11:155238225:G:A""","""670C>T""","""Leu224Phe""","""RC4""","""RC4_Mut_GDEh1026""","""E01""","""GDEhDisVCh_40054""","""C12""","""2""","""RC4_Mut_GDDh1026""","""E01""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""",1,"""1""",1,"""1""",0,"""NULL""",0,"""NULL""",0,"""NULL""","""GBA1_Leu224Phe""",1,155238225.0,"""G""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,73,"""GBA1_Leu224Phe"""
"""GBA1""","""ENSG00000177628""",2,113,"""CCSBVarC000003""","""ALE00000003""","""NC_000001.11:155237453:C:T""","""887G>A""","""Arg296Gln""","""RC4""","""RC4_Mut_GDEh1026""","""F01""","""GDEhDisVCh_40054""","""D12""","""2""","""RC4_Mut_GDDh1026""","""F01""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""",1,"""1""",1,"""7""",0,"""NULL""",0,"""NULL""",0,"""NULL""","""GBA1_Arg296Gln""",1,155237453.0,"""C""",…,"""NC_000001.11""","""1""",155237453.0,155237453.0,"""1q22""","""criteria provided, multiple su…",15.0,"""-""","""N""","""ClinGen:CA221417,UniProtKB:P04…",3.0,4328.0,155237453.0,"""C""","""T""","""-""","""-""","""-""","""-""","""-""","""-""","""SCV000232587|SCV000321701|SCV0…","""-""","""-""","""887G>A ""","""Arg296Gln""","""NM_000157.4""",2.0,"""1_Pathogenic""",,,,,,2,113,"""GBA1_Arg296Gln"""
"""GBA1""","""ENSG00000177628""",2,231,"""CCSBVarC000004""","""ALE00000004""","""NC_000001.11:155235252:A:G""","""1448T>C""","""Leu483Pro""","""RC4""","""RC4_Mut_GDEh1026""","""G01""","""GDEhDisVCh_40054""","""E12""","""2""","""RC4_Mut_GDDh1026""","""G01""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""",1,"""1""",1,"""2""",0,"""NULL""",0,"""NULL""",0,"""NULL""","""GBA1_Leu483Pro""",1,155235252.0,"""A""",…,"""NC_000001.11""","""1""",155235252.0,155235252.0,"""1q22""","""criteria provided, multiple su…",36.0,"""-""","""Y""","""UniProtKB:P04062#VAR_003321,OM…",3.0,4288.0,155235252.0,"""A""","""G""","""-""","""-""","""-""","""-""","""-""","""-""","""SCV000111209|SCV000491300|SCV0…","""-""","""-""","""1448T>C ""","""Leu483Pro""","""NM_000157.4""",2.0,"""1_Pathogenic""",,,,,,2,231,"""GBA1_Leu483Pro"""
"""GBA1""","""ENSG00000177628""",2,213510,"""CCSBVarC003869""","""ALE00003869""","""NC_000001.11:155239934:G:A""","""259C>T""","""Arg87Trp""","""CEGS2""","""CegsMutGDEh1035""","""B03""","""GDEhDisVCh_40054""","""B02""","""2""","""CegsMutGDDh1035""","""B03""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""","""NULL""",1,"""1""",1,"""2""",0,"""NULL""",0,"""NULL""",0,"""NULL""","""GBA1_Arg87Trp""",1,155239934.0,"""G""",…,"""NC_000001.11""","""1""",155239934.0,155239934.0,"""1q22""","""criteria provided, multiple su…",13.0,"""-""","""N""","""ClinGen:CA253098,UniProtKB:P04…",3.0,4321.0,155239934.0,"""G""","""A""","""-""","""-""","""-""","""-""","""-""","""-""","""SCV000697586|SCV001422687|SCV0…","""-""","""-""","""259C>T ""","""Arg87Trp""","""NM_000157.4""",2.0,"""1_Pathogenic""",,,,,,2,213510,"""GBA1_Arg87Trp"""


In [4]:
# Columns carried forward from the annotation table into the imaging summary.
priority_col = [
    "orf_id",
    "mut_id",
    "symbol",
    "aa_change",
    "gene_allele",
    "gene_variant",
    "ensembl_gene_id",
    "clinvar_clnsig_clean",
    "gnomad_af",
    "StarStatus",
]

# Keep only those columns, drop rows without a mutation id, and make sure both
# id columns are 64-bit integers.
clin_var_df = (
    clin_var_df
    .select(priority_col)
    .filter(pl.col("mut_id").is_not_null())
    .with_columns(pl.col("mut_id", "orf_id").cast(pl.Int64))
)

In [5]:
# Biological-replicate batch whose result tables are summarised here. It is
# named once so that switching batches means editing a single line instead of
# three path strings.
bio_batch = "2025_01_Batch_13-14"
abundance_dir = f"{CLASS_RES_OUTDIR}/2.cell_count_abundance_change/{bio_batch}"
classification_dir = f"{CLASS_RES_OUTDIR}/3.classification_analyses/{bio_batch}"

# Well-level cell-count changes. The generic paired-t column names get a
# "cc" infix so they do not collide with the protein-abundance columns once
# the tables are joined; "Variant" becomes the shared join key "gene_allele".
cell_count_summary = pl.read_csv(
    f"{abundance_dir}/well-level_cell-count_changes.csv"
).rename(
    {
        "U2OS_paired_t_stat": "U2OS_cc_t_stat",
        "U2OS_paired_t_pval": "U2OS_cc_t_pval",
        "Variant": "gene_allele",
    }
)

# Well-level protein-abundance changes, renamed the same way with an "abun" infix.
prot_abund_summary = pl.read_csv(
    f"{abundance_dir}/well-level_prot-abundance_changes.csv"
).rename(
    {
        "U2OS_paired_t_stat": "U2OS_abun_t_stat",
        "U2OS_paired_t_pval": "U2OS_abun_t_pval",
        "Variant": "gene_allele",
    }
)

# Per-allele classification summary (AUROC_* and Altered_* columns per feature set).
auroc_summary_df = pl.read_csv(
    f"{classification_dir}/imaging_analyses_classification_summary.csv"
)
auroc_summary_df

gene_allele,Metadata_Bio_Batch,AUROC_BioRep1_Morph,AUROC_BioRep1_AGP,AUROC_BioRep1_GFP,AUROC_BioRep1_Mito,AUROC_BioRep1_DNA,AUROC_BioRep2_Morph,AUROC_BioRep2_AGP,AUROC_BioRep2_GFP,AUROC_BioRep2_Mito,AUROC_BioRep2_DNA,AUROC_Mean_Morph,AUROC_Mean_AGP,AUROC_Mean_GFP,AUROC_Mean_Mito,AUROC_Mean_DNA,Altered_95th_perc_Morph,Altered_95th_perc_AGP,Altered_95th_perc_GFP,Altered_95th_perc_Mito,Altered_95th_perc_DNA,Altered_99th_perc_Morph,Altered_99th_perc_AGP,Altered_99th_perc_GFP,Altered_99th_perc_Mito,Altered_99th_perc_DNA,Gene
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
"""F9_Arg75Gln""","""2025_01_Batch_13-14""",0.780674,0.572997,0.580592,0.664169,0.656791,0.606106,0.46497,0.547248,0.511962,0.615514,0.69339,0.518983,0.56392,0.588065,0.636153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""F9"""
"""RAF1_Asn140His""","""2025_01_Batch_13-14""",0.87685,0.800864,0.752669,0.856646,0.749831,0.727624,0.769805,0.740513,0.727092,0.658786,0.802237,0.785334,0.746591,0.791869,0.704308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""RAF1"""
"""BRAF_Leu245Phe""","""2025_01_Batch_13-14""",0.85424,0.824602,0.794776,0.80055,0.775687,0.638821,0.749189,0.778616,0.766429,0.607602,0.74653,0.786896,0.786696,0.783489,0.691645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""BRAF"""
"""SDHD_His102Leu""","""2025_01_Batch_13-14""",0.831368,0.947253,0.965809,0.93814,0.82791,0.945721,0.930364,0.957539,0.922561,0.868815,0.888544,0.938809,0.961674,0.930351,0.848362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""SDHD"""
"""CCM2_Gly188Arg""","""2025_01_Batch_13-14""",0.865411,0.735885,0.787176,0.828515,0.72239,0.767164,0.740903,0.739107,0.716027,0.707375,0.816288,0.738394,0.763142,0.772271,0.714882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""CCM2"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""BRCA1_Ala102Gly""","""2025_01_Batch_13-14""",0.879109,0.908061,0.979088,0.964904,0.825545,0.71799,0.800892,0.955645,0.917267,0.746292,0.79855,0.854476,0.967366,0.941085,0.785919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""BRCA1"""
"""RET_Gly321Arg""","""2025_01_Batch_13-14""",0.977325,0.913325,0.850919,0.951219,0.896996,0.674151,0.757551,0.752931,0.748862,0.621094,0.825738,0.835438,0.801925,0.850041,0.759045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""RET"""
"""RET_Arg313Gln""","""2025_01_Batch_13-14""",0.91614,0.80721,0.729536,0.737901,0.902768,0.793211,0.836292,0.835072,0.895138,0.644683,0.854675,0.821751,0.782304,0.816519,0.773726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""RET"""
"""F9_Val374Phe""","""2025_01_Batch_13-14""",0.667381,0.732072,0.829521,0.802352,0.758955,0.616664,0.791396,0.921977,0.888609,0.703312,0.642022,0.761734,0.875749,0.845481,0.731133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""F9"""


In [6]:
# Build one row per (Gene, gene_allele) from the three imaging result tables.
# Full joins keep alleles that are missing from any one table, and
# coalesce=True merges the join-key columns instead of emitting *_right copies.
imaging_summary = (
    cell_count_summary
    .join(prot_abund_summary, on=["Gene", "gene_allele"], how="full", coalesce=True)
    .join(auroc_summary_df, on=["Gene", "gene_allele"], how="full", coalesce=True)
    # Every row at this point comes from an imaging table; flag it before
    # the annotation columns are attached.
    .with_columns(pl.lit(True).alias("image_assayed"))
    # Left join: keep all imaged alleles, annotated or not.
    # NOTE(review): if clin_var_df holds several rows for one gene_allele this
    # join repeats the imaging row for each of them -- confirm the key is unique.
    .join(clin_var_df, on=["gene_allele"], how="left")
)

# Write into the configured integrative-analysis input directory rather than
# repeating its path as a literal, and create it first so a fresh checkout
# does not fail on a missing folder.
integrative_dir = Path(INTEGRATIVE_INPUT)
integrative_dir.mkdir(parents=True, exist_ok=True)
imaging_summary.write_csv(
    integrative_dir / "imaging_analyses_summary_clinvar.tsv",
    separator="\t",
)