# CCM2 Alleles

In [5]:
# Imports
import polars as pl
import pandas as pd
from tqdm import tqdm
import statsmodels.formula.api as smf
from scipy.stats import ttest_rel

In [9]:
# Load the corrected batch13-14 metadata and count the distinct alleles whose
# name contains "CCM2_".
corrected_meta_path = "../../../../8.1_upstream_analysis_runxi/1.metadata_qc/outputs/corrected_metadata/batch13-14/corrected_metadata.csv"
corrected_meta_df = pl.read_csv(corrected_meta_path)

ccm2_rows = corrected_meta_df.filter(
    pl.col("final_gene_allele").str.contains("CCM2_")
)
ccm2_allele_names = ccm2_rows["final_gene_allele"].unique().to_list()
len(ccm2_allele_names)

164

In [4]:
# Read the Batch 13 classification log and list the well entries that mention CCM2.
log_file = "../../../../8.2_updated_snakemake_pipeline/outputs/results/2025_01_27_Batch_13/profiles_tcdropped_filtered_var_mad_outlier_featselect/classify.log"
with open(log_file) as f:
    lines = f.readlines()

# Well entries are the log lines that start with "2025"
# (presumably one per dropped well, as the variable name suggests -- TODO confirm).
dropped_wells = [line for line in lines if line.startswith("2025")]

# Only the last expression of a cell is displayed, so the total count has to be
# printed explicitly; a bare `len(...)` in the middle of the cell is discarded.
print(f"{len(dropped_wells)} well entries found in the log")

# Cell output: the subset of entries that mention CCM2.
[well for well in dropped_wells if "CCM2" in well]

['2025_01_27_B13A7A8P2_T1, G22:CCM2_Glu108Lys\n',
 '2025_01_27_B13A7A8P2_T2, N09:CCM2_Phe217Leu\n',
 '2025_01_27_B13A7A8P2_T2, G22:CCM2_Glu108Lys\n',
 '2025_01_27_B13A7A8P2_T2, G24:CCM2_Glu128Ala\n',
 '2025_01_27_B13A7A8P2_T3, G22:CCM2_Glu108Lys\n',
 '2025_01_27_B13A7A8P2_T3, N09:CCM2_Phe217Leu\n',
 '2025_01_27_B13A7A8P2_T4, G22:CCM2_Glu108Lys\n',
 '2025_01_27_B13A7A8P2_T4, N09:CCM2_Phe217Leu\n']

In [18]:
# Thresholds (not used by the cells visible here; meaning is defined by
# whatever downstream code reads them -- TODO confirm).
thresh = 3  # previously 10
min_class_num = 2

# Input locations
metrics_dir = "../../output/classify_reimplement/classification_results/2025_01_Batch13-14/je_wAGP/"
prof_dir = "/home/shenrunx/igvf/varchamp/2021_09_01_VarChAMP/6.downstream_analysis_snakemake/outputs/batch_profiles"

# Both metrics tables are read with Metadata_Control forced to a string column.
control_as_text = {"Metadata_Control": pl.Utf8}
metrics_df = pl.read_csv(f"{metrics_dir}/metrics.csv", schema_overrides=control_as_text)
metrics_wtvar = pl.read_csv(f"{metrics_dir}/metrics_summary.csv", schema_overrides=control_as_text)

In [25]:
# Get meta features
def get_well_profiles(parquet_file):
    """Aggregate the CCM2 rows of a profile parquet file to per-well medians.

    Args:
        parquet_file: Path to a parquet file containing the columns
            ``Metadata_Plate``, ``Metadata_Well`` and ``Metadata_gene_allele``.

    Returns:
        A collected polars DataFrame with one row per (plate, well, allele)
        group. It holds the three group keys plus the median of every column
        whose name does not start with ``Metadata_``, sorted by plate, allele
        and well.
    """
    group_keys = ["Metadata_Plate", "Metadata_Well", "Metadata_gene_allele"]

    # Lazy scan; keep only rows whose allele name contains "CCM2".
    ccm2_rows = pl.scan_parquet(parquet_file).filter(
        pl.col("Metadata_gene_allele").str.contains_any(["CCM2"])
    )

    # One median expression per non-metadata column.
    feature_medians = [
        pl.col(name).median().alias(name)
        for name in ccm2_rows.collect_schema().names()
        if not name.startswith("Metadata_")
    ]

    per_well = ccm2_rows.group_by(group_keys).agg(feature_medians).collect()
    return per_well.sort(by=["Metadata_Plate", "Metadata_gene_allele", "Metadata_Well"])


# Export the per-well CCM2 profiles of each batch to its own CSV file.
profile_file_name = "profiles_tcdropped_filtered_var_mad_outlier_featselect_correct_meta.parquet"
for batch_id in ("2025_01_27_Batch_13", "2025_01_28_Batch_14"):
    parquet_file = f"{prof_dir}/{batch_id}/{profile_file_name}"
    well_profiles = get_well_profiles(parquet_file)
    # Show which alleles were found in this batch.
    print(well_profiles.select("Metadata_gene_allele").unique())
    well_profiles.write_csv(f"CCM2_alleles_{batch_id}_cp_profiles.csv")

shape: (165, 1)
┌──────────────────────┐
│ Metadata_gene_allele │
│ ---                  │
│ str                  │
╞══════════════════════╡
│ CCM2_Asp440Asn       │
│ CCM2_Gly103Arg       │
│ CCM2_Arg52Cys        │
│ CCM2_Gly24Asp        │
│ CCM2_Ala179Ser       │
│ …                    │
│ CCM2_Glu264Asp       │
│ CCM2_Phe91Ile        │
│ CCM2_Ile92Thr        │
│ CCM2_Arg412Gln       │
│ CCM2_Gly436Ser       │
└──────────────────────┘
shape: (165, 1)
┌──────────────────────┐
│ Metadata_gene_allele │
│ ---                  │
│ str                  │
╞══════════════════════╡
│ CCM2_Glu433Lys       │
│ CCM2_Pro77Gln        │
│ CCM2_Val45Gly        │
│ CCM2_Asp411Asn       │
│ CCM2_Glu108Lys       │
│ …                    │
│ CCM2_Gln72Arg        │
│ CCM2_Asn94Ser        │
│ CCM2_Arg423His       │
│ CCM2_Ile131Asn       │
│ CCM2_Asp110Asn       │
└──────────────────────┘


In [26]:
# b7_alleles.collect().sort(by=["Metadata_plate_map_name","Metadata_gene_allele"])
# b8_alleles.collect().sort(by=["Metadata_plate_map_name","Metadata_gene_allele"])

## Analysis Results

In [None]:
# Load the mislocalization AUROC summary, keep the CCM2 alleles sorted by
# allele name, and show those flagged as mislocalized in either batch.
misloc_summary_path = "/home/shenrunx/igvf/varchamp/2021_09_01_VarChAMP/7.analysis_runxi/output/classify_reimplement/classification_results/2025_01_Batch13-14/je_wAGP/misloc_summary_auroc.csv"
misloc_auroc_df = pd.read_csv(misloc_summary_path)

is_ccm2 = misloc_auroc_df["allele_0"].str.contains("CCM2")
misloc_auroc_df = (
    misloc_auroc_df[is_ccm2]
    .sort_values(by="allele_0")
    .reset_index(drop=True)
)
# misloc_auroc_df.to_csv("CCM2_alleles_misloc_summary_auroc.csv", index=False)

flagged_b13 = misloc_auroc_df["mislocalized_batch13"] == 1
flagged_b14 = misloc_auroc_df["mislocalized_batch14"] == 1
misloc_auroc_df[flagged_b14 | flagged_b13]

Unnamed: 0,allele_0,Allele_set,mislocalized_batch13,mislocalized_batch14,mislocalized_both_batches,auroc_batch13,auroc_batch14,mean_auroc
11,CCM2_Arg412Trp,A7A8P2,0,1,False,0.756599,0.817449,0.787024
17,CCM2_Asp376His,A7A8P2,0,1,False,0.852681,0.83844,0.845561
48,CCM2_Ile432Phe,A7A8P2,0,1,False,0.788307,0.820918,0.804613
49,CCM2_Ile432Thr,A7A8P2,1,1,True,0.986814,0.985951,0.986383
68,CCM2_Ser251Tyr,A7A8P2,0,1,False,0.767383,0.865958,0.816671
80,CCM2_Val190Met,A7A8P2,0,1,False,0.787347,0.819877,0.803612


In [None]:
# Load the morphology AUROC summary, keep the CCM2 alleles sorted by allele
# name, and display those flagged with a morphological change in either batch.
# NOTE(review): this rebinds `misloc_auroc_df` to the *morphology* table, so the
# mislocalization frame from the previous cell is no longer reachable under
# that name -- consider a dedicated name once no other cell relies on it.
morph_summary_path = "/home/shenrunx/igvf/varchamp/2021_09_01_VarChAMP/7.analysis_runxi/output/classify_reimplement/classification_results/2025_01_Batch13-14/je_wAGP/morph_summary_auroc.csv"
misloc_auroc_df = pd.read_csv(morph_summary_path)

is_ccm2 = misloc_auroc_df["allele_0"].str.contains("CCM2")
misloc_auroc_df = (
    misloc_auroc_df[is_ccm2]
    .sort_values(by="allele_0")
    .reset_index(drop=True)
)
# misloc_auroc_df.to_csv("CCM2_alleles_morph_summary_auroc.csv", index=False)

changed_b13 = misloc_auroc_df["morphological_change_batch13"] == 1
changed_b14 = misloc_auroc_df["morphological_change_batch14"] == 1
display(misloc_auroc_df[changed_b14 | changed_b13])

Unnamed: 0,allele_0,Allele_set,morphological_change_batch13,morphological_change_batch14,morphological_change_both_batches,auroc_batch13,auroc_batch14,mean_auroc
63,CCM2_Ser164Phe,A7A8P2,1,0,False,0.978929,0.742814,0.860872
79,CCM2_Tyr261Asp,A7A8P2,1,0,False,0.982638,0.861139,0.921889
80,CCM2_Val190Met,A7A8P2,1,0,False,0.941919,0.571079,0.756499


In [41]:
# Extract the CCM2 rows of the well-level abundance table, order them by
# variant, and save them next to the notebook.
abundance_path = "/home/shenrunx/igvf/varchamp/2021_09_01_VarChAMP/7.analysis_runxi/output/classify_reimplement/classification_results/2025_01_Batch13-14/je_wAGP/well-level_abundance_changes.csv"
abundance_df = pd.read_csv(abundance_path)

is_ccm2_gene = abundance_df["Gene"].str.contains("CCM2")
abundance_df = (
    abundance_df[is_ccm2_gene]
    .sort_values(by="Variant")
    .reset_index(drop=True)  # final index is a clean 0..n-1 range
)
abundance_df.to_csv("CCM2_alleles_protein_abundance_change.csv", index=False)

In [42]:
# Extract the CCM2 rows of the well-level cell-count table, drop rows with a
# missing `U2OS_t` value, order them by variant, and save them next to the notebook.
cell_count_path = "/home/shenrunx/igvf/varchamp/2021_09_01_VarChAMP/7.analysis_runxi/output/classify_reimplement/classification_results/2025_01_Batch13-14/je_wAGP/well-level_cell-count_changes.csv"
cc_df = pd.read_csv(cell_count_path)

is_ccm2_gene = cc_df["Gene"].str.contains("CCM2")
cc_df = (
    cc_df[is_ccm2_gene]
    .dropna(subset="U2OS_t")
    .sort_values(by="Variant")
    .reset_index(drop=True)  # final index is a clean 0..n-1 range
)
cc_df.to_csv("CCM2_alleles_cell_count_change.csv", index=False)