In [1]:
#!pip install --user pyarrow

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from scipy.stats import spearmanr, mannwhitneyu, fisher_exact
from scipy.special import expit as sigmoid

In [3]:
# Load the per-position conservation table from the gzip-compressed, tab-separated
# file. The rendered output shows the columns `pos`, `PhastCons` and `PhyloP`, plus
# an `Unnamed: 0` column that looks like a saved row index.
# NOTE(review): the file name suggests this covers chromosome 5 only -- confirm.
conservation = pd.read_csv(
    "../data/vep/conservation/Chr5.tsv.gz",
    delimiter="\t",  # `delimiter` is pandas' alias for `sep`
)
# Bare last expression so the notebook renders the frame as the cell output.
conservation

Unnamed: 0,pos,PhastCons,PhyloP
0,73,0.137841,0.486723
1,74,0.120058,0.630370
2,75,0.082986,-0.672660
3,76,0.083501,0.630370
4,77,0.070315,0.410230
...,...,...,...
24279498,26975493,0.149043,0.383414
24279499,26975494,0.139126,0.383414
24279500,26975495,0.121064,-0.819180
24279501,26975496,0.144156,0.383414


In [None]:
def compute_vep_scores(vep_mode=None, model_name=None):
    if vep_mode in ["mlm", "chromatin"]:
        df = pd.read_parquet(f"../plantbert/{vep_mode}/vep_full_{model_name}.parquet")
    else:
        df = pd.read_parquet("../data/vep/variants/filt.parquet")
    print(df.shape)
    df = df[df.AN >= 2000]
    df = df[(df.AC != 0) & (df.AC != df.AN)]
    df["AF"] = df.AC / df.AN
    df = df[df.AF < 0.5]
    print(df.shape)
    df = df.merge(conservation, how="inner", on="pos")
    print(df.shape)
    df.consequence = df.consequence.apply(lambda x: ','.join(sorted(list(set(x.split(","))))))
    
    df["Status"] = "Neither"
    df.loc[df.AC <= 10, "Status"] = "Rare"
    df.loc[df.AC > 100, "Status"] = "Common"
    
    if vep_mode == "chromatin":
        pred_ref_cols = df.columns[df.columns.str.startswith("model_pred_ref_")]
        pred_alt_cols = df.columns[df.columns.str.startswith("model_pred_alt_")]
        pred_cols = np.concatenate([pred_ref_cols, pred_alt_cols])
        df[pred_cols] = sigmoid(df[pred_cols])
        feature_names = ["_".join(col.split("_")[3:]) for col in pred_ref_cols]
        delta_pred_cols = [f"delta_pred_{f}" for f in feature_names]
        df.loc[:, delta_pred_cols] = df[pred_alt_cols].values - df[pred_ref_cols].values
        df[delta_pred_cols] = df[delta_pred_cols].abs()
        df.loc[:, "model_score"] = np.linalg.norm(values, ord=2, axis=1)
    elif vep_mode == "mlm":
        df["model_score"] = df.model_llr
    elif vep_mode == "PhastCons":
        df["model_score"] = -df.PhastCons
    elif vep_mode == "PhyloP":
#df.model_llr = -(df.PhyloP.abs())