# Create edgotype profiles

In this step, we build one edgotype profile per gene: a matrix of interactors (rows) by alleles (columns). Later, we will explore metrics that summarize the overall perturbation of each allele as a single quantitative score (like the imaging / abundance scores).

In [1]:
import os

import polars as pl

In [2]:
# Relative locations of the sqY2H input and output folders used by this notebook.
inputs_dir, outputs_dir = "../1_inputs/sqY2H", "../3_outputs/sqY2H"

In [3]:
# Load the normalized consensus scores written to the outputs folder and
# display them as the cell result.
consensus_scores_csv = f"{outputs_dir}/4_normalized_consensus_scores.csv"
scores = pl.read_csv(consensus_scores_csv)
scores

ad_orf_id,db_orf_id,db_mut_id,retest_batch,normalized_score,assay_id,seq_ad,seq_db,db_sequenced,db_sequence_confirmation_class,ad_symbol,symbol,aa_change,nt_change,ensembl_gene_id,collection,clinvar_clnsig_clean,gnomad_af,StarStatus,allele_0
i64,i64,i64,str,f64,i64,i64,i64,f64,f64,str,str,str,str,str,str,str,f64,f64,str
14221,14221,218283,"""VUSAPWT1B2""",0.9375,1,1,1,1.0,1.0,"""KCTD1""","""KCTD1""","""Gly62Asp""","""185G>A""","""ENSG00000134504""","""CEGS2""","""1_Pathogenic""",,1.0,"""KCTD1_Gly62Asp"""
2098,71642,0,"""VUSAPWT1B2""",0.4375,1,1,1,,,"""EFHC2""","""CACNB4""",,,"""ENSG00000182389""",,,,,"""CACNB4"""
8357,71001,205675,"""VUSAPWT6B1""",0.75,6,1,1,1.0,1.0,"""ERP27""","""UBQLN2""","""Pro506Thr""","""1516C>A""","""ENSG00000188021""","""CEGS2""","""1_Pathogenic""",,1.0,"""UBQLN2_Pro506Thr"""
13850,71001,205675,"""VUSAPWT6B1""",0.5625,6,1,1,1.0,1.0,"""CSN2""","""UBQLN2""","""Pro506Thr""","""1516C>A""","""ENSG00000188021""","""CEGS2""","""1_Pathogenic""",,1.0,"""UBQLN2_Pro506Thr"""
100066551,7652,1806,"""VUSAPWT6B1""",0.0,6,1,1,1.0,2.0,"""MAPK1""","""STXBP1""","""Asp238Glu""","""714C>A""","""ENSG00000136854""","""Edgotyping3""","""4_VUS""",,1.0,"""STXBP1_Asp238Glu"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
53964,2713,200850,"""VUSAPWT1B1""",0.625,1,1,1,1.0,1.0,"""NFIX""","""MLH1""","""Ser295Thr""","""884G>C""","""ENSG00000076242""","""CEGS2""","""4_VUS""",,3.0,"""MLH1_Ser295Thr"""
100066360,11443,0,"""VUSAPWT1B2""",0.8125,1,1,1,,,"""KRT27""","""CCBE1""",,,"""ENSG00000183287""",,,,,"""CCBE1"""
71001,52920,2662,"""VUSAPWT1B1""",1.0,1,1,1,1.0,1.0,"""UBQLN2""","""LITAF""","""Arg160His""","""479G>A""","""ENSG00000189067""","""Edgotyping3""","""4_VUS""",,2.0,"""LITAF_Arg160His"""
2265,10122,0,"""VUSAPWT1B2""",0.75,1,1,1,,,"""CCDC36""","""EXOC8""",,,"""ENSG00000116903""",,,,,"""EXOC8"""


In [4]:
# Reload the consensus scores so this cell can be re-run on its own
# (`scores` is overwritten by the filter below).
scores = pl.read_csv(f"{outputs_dir}/4_normalized_consensus_scores.csv")

# Keep only DB ORFs with a matched reference-variant set, i.e. at least one
# WT row (db_mut_id == 0) and at least one allele row (db_mut_id != 0).
orfs_with_alleles = scores.filter(pl.col("db_mut_id") != 0).get_column("db_orf_id").unique().to_list()
orfs = scores.filter(pl.col("db_mut_id") == 0).get_column("db_orf_id").unique().to_list()

# A set lookup keeps the intersection linear instead of scanning a list per ORF.
allele_orf_ids = set(orfs_with_alleles)
complete_orf_pairs = [orf_id for orf_id in orfs if orf_id in allele_orf_ids]
scores = scores.filter(pl.col("db_orf_id").is_in(complete_orf_pairs))

# maintain_order=True keeps genes in first-appearance order, so everything
# built by iterating over `genes` is reproducible from run to run.
genes = scores.get_column("db_orf_id").unique(maintain_order=True).to_list()

In [6]:
# Create profiles for each profiled gene
#
# For every gene (db_orf_id) the scores are pivoted into a wide table with one
# row per interactor (ad_orf_id) and one column per db_mut_id holding the
# normalized score. Each profile is kept in `profiles` and written to its own CSV.

# Create the output folder up front so the first write_csv does not fail on a
# fresh checkout where the folder does not exist yet.
profiles_dir = f"{outputs_dir}/edgotype_profiles"
os.makedirs(profiles_dir, exist_ok=True)

profiles = {}
shape_records = []
for gene in genes:
    gene_scores = scores.filter(pl.col("db_orf_id") == gene)
    # No aggregate function is given, so this assumes at most one score per
    # (ad_orf_id, db_mut_id) pair within a gene — TODO confirm upstream.
    profile = gene_scores.pivot(
                values="normalized_score",
                index="ad_orf_id",
                on="db_mut_id"
            )
    profiles[gene] = profile

    shape_records.append({
        "gene": gene,
        "n_interactors": profile.height,
        # NOTE: width counts the ad_orf_id index column plus one column per
        # db_mut_id (the WT column, db_mut_id == 0, included). The summary
        # cells below subtract one per gene to drop the index column.
        "n_alleles": profile.width
    })

    profile.write_csv(f"{profiles_dir}/orf_id_{gene}.csv")

# One row per gene: profile height and width.
shape_summary = pl.DataFrame(shape_records)

In [7]:
## Add a perturbation category for each edge to use for the N2H validation

# An edge is identified by its (DB ORF, AD ORF) pair.
edge_keys = ["db_orf_id", "ad_orf_id"]

# Binary call: a non-WT row is "perturbed" when its allele/WT ratio is below this cutoff.
threshold = 0.5

# Ratio bins, each read as (lower, upper]; the label is the perturbation class.
bins = [
    (1.0, float("inf"), "increased edge"),
    (0.8, 1.0, "similar"),
    (0.6, 0.8, "slightly perturbed"),
    (0.4, 0.6, "perturbed"),
    (0.2, 0.4, "highly perturbed"),
    (0.0, 0.2, "very highly perturbed"),
]

ratio = pl.col("ratio")
is_wt_row = pl.col("db_mut_id") == 0

# WT score of every edge (rows with db_mut_id == 0).
# NOTE(review): if an edge has more than one WT row, the left join below
# duplicates its allele rows — confirm WT rows are unique per edge.
wt_scores = scores.filter(is_wt_row).select([*edge_keys, "normalized_score"])

# Ordered (condition, label) pairs; the first matching condition wins, so the
# exact-value cases are listed before the ratio bins. A ratio of exactly 1.0 is
# therefore labelled "WT-like" and never reaches the "similar" bin.
# NOTE(review): when the WT score is 0 the ratio is a division by zero
# (presumably inf/NaN); those rows are caught by the "WT edge lost" case first.
special_cases = [
    (pl.col("normalized_score_wt") == 0.0, "WT edge lost"),
    (ratio == 0.0, "edge lost"),
    (ratio == 1.0, "WT-like"),
]
binned_cases = [
    ((ratio > lower) & (ratio <= upper), label)
    for lower, upper, label in bins
]

perturbation_expr = pl.when(is_wt_row).then(pl.lit("WT"))
for condition, label in special_cases + binned_cases:
    perturbation_expr = perturbation_expr.when(condition).then(pl.lit(label))
# Anything matching none of the cases above falls through to "unclassified".
perturbation_expr = perturbation_expr.otherwise(pl.lit("unclassified"))

# Attach the WT score to every row, then derive the ratio, the binary flag
# and the categorical class. The ratio needs its own with_columns call because
# the two later columns are computed from it.
scores_with_wt = (
    scores
    .join(wt_scores, on=edge_keys, how="left", suffix="_wt")
    .with_columns(
        (pl.col("normalized_score") / pl.col("normalized_score_wt")).alias("ratio")
    )
    .with_columns(
        ((pl.col("db_mut_id") != 0) & (ratio < threshold)).alias("perturbed"),
        perturbation_expr.alias("perturbation_classes"),
    )
)

# Columns exported for the validation step.
cols_to_keep = [
    'ad_orf_id',
    'db_orf_id',
    'db_mut_id',
    'normalized_score',
    'ad_symbol',
    'symbol',
    'clinvar_clnsig_clean',
    'normalized_score_wt',
    'ratio',
    'perturbed',
    'perturbation_classes',
]

scores_with_perturbed = scores_with_wt.select(cols_to_keep)

scores_with_perturbed.write_csv(f"{outputs_dir}/7_edge_perturbation_categorical.csv")

## Examine profile characteristics

In [9]:
# Profiles with at least 5 interactors: first the number of such genes,
# then the number of allele columns they contain.
hq_orfs = shape_summary.filter(pl.col("n_interactors") >= 5)
num_orfs = hq_orfs.height
print(num_orfs)

# n_alleles is the profile width, which includes the ad_orf_id index column,
# so one is subtracted per gene.
# NOTE(review): the result still counts each gene's WT (db_mut_id == 0)
# column — confirm whether WT should be counted as an allele.
print(hq_orfs.select(pl.col("n_alleles").sum() - num_orfs))

26
shape: (1, 1)
┌───────────┐
│ n_alleles │
│ ---       │
│ i64       │
╞═══════════╡
│ 258       │
└───────────┘


In [10]:
# All profiles: first the number of profiled genes, then the total number of
# allele columns across them.
prof_genes = shape_summary.height
print(prof_genes)

# n_alleles is the profile width, which includes the ad_orf_id index column,
# so one is subtracted per gene.
# NOTE(review): the result still counts each gene's WT (db_mut_id == 0)
# column — confirm whether WT should be counted as an allele.
print(shape_summary.select(pl.col("n_alleles").sum() - prof_genes))

84
shape: (1, 1)
┌───────────┐
│ n_alleles │
│ ---       │
│ i64       │
╞═══════════╡
│ 659       │
└───────────┘
