In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import polars as pl
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [13]:
# Display names for the subset identifiers used in this notebook. Identifiers
# without an entry are shown unchanged (lookups use `.get(subset, subset)`).

# Numeric-string identifiers (used as subsets of the Mendelian-traits dataset).
_mendelian_subset_names = dict(
    [
        ("600886", "Hyperferritinemia"),
        ("613985", "Beta-thalassemia"),
        ("614743", "Pulmonary fibrosis"),
        ("306900", "Hemophilia B"),
        ("250250", "Cartilage-hair hypoplasia"),
        ("174500", "Preaxial polydactyly II"),
        ("143890", "Hypercholesterolemia-1"),
        ("210710", "Dwarfism (MOPD1)"),
    ]
)

# Abbreviated identifiers (used as subsets of the complex-traits dataset).
_complex_subset_names = dict(
    [
        ("Mono", "Monocyte count"),
        ("HbA1c", "Hemoglobin A1c"),
        ("HDLC", "High density lipoprotein cholesterol"),
    ]
)

# Single lookup table; insertion order matches the two groups above.
subset_renaming = {**_mendelian_subset_names, **_complex_subset_names}

In [14]:
# Per-track metadata for Borzoi; only the columns needed to annotate tracks
# are loaded. `name` is the key later matched against the metrics' `feature`.
borzoi_metadata_path = "../../results/metadata/Borzoi.csv"
borzoi_metadata_columns = ["name", "description", "assay", "sample"]

borzoi_metadata = pd.read_csv(borzoi_metadata_path, usecols=borzoi_metadata_columns)
borzoi_metadata

Unnamed: 0,name,description,assay,sample
0,CNhs10608+,CAGE:Clontech Human Universal Reference Total ...,CAGE,"Clontech Human Universal Reference Total RNA, ..."
1,CNhs10608-,CAGE:Clontech Human Universal Reference Total ...,CAGE,"Clontech Human Universal Reference Total RNA, ..."
2,CNhs10610+,CAGE:SABiosciences XpressRef Human Universal T...,CAGE,SABiosciences XpressRef Human Universal Total ...
3,CNhs10610-,CAGE:SABiosciences XpressRef Human Universal T...,CAGE,SABiosciences XpressRef Human Universal Total ...
4,CNhs10612+,CAGE:Universal RNA - Human Normal Tissues Bioc...,CAGE,"Universal RNA - Human Normal Tissues Biochain,..."
...,...,...,...,...
7606,GTEX-13FTX-1026-SM-5J2O5.1,RNA:uterus,RNA,uterus
7607,GTEX-1MA7W-1526-SM-DHXKS.1,RNA:uterus,RNA,uterus
7608,GTEX-11EMC-1926-SM-5A5JU.1,RNA:vagina,RNA,vagina
7609,GTEX-12WSB-2426-SM-5EGJC.1,RNA:vagina,RNA,vagina


In [27]:
# Subsets analysed for each dataset. Pick the analysis by setting `dataset`
# below; entries that are commented out are currently excluded.
subsets_by_dataset = {
    "mendelian_traits_matched_9": [
        "613985",  # Beta-thalassemia
        # "614743",
        "306900",  # Hemophilia B
        # "250250",
        "143890",  # Hypercholesterolemia-1
    ],
    "complex_traits_matched_9": [
        "Mono",
        "HbA1c",
        "HDLC",
        # "Alb",
        # "DVT",
    ],
}

dataset = "complex_traits_matched_9"
# Copy so later edits to `subsets` do not alter the mapping above.
subsets = list(subsets_by_dataset[dataset])

base_dir = f"../../results/dataset/{dataset}"
# Test split of the selected dataset; merged with each subset further down
# on (chrom, pos, ref, alt).
V = pd.read_parquet(f"{base_dir}/test.parquet")

## Marginal performance

In [28]:
# One record per model: display name, the file stem of its metrics CSV
# (`path`), and the colour string assigned to it.
model_records = [
    {"Model": "Borzoi", "path": "Borzoi_L2", "color": "C1"},
]

models = pd.DataFrame(model_records).set_index("Model")
models

Unnamed: 0_level_0,path,color
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
Borzoi,Borzoi_L2,C1


In [29]:
# Collect, for every (subset, model) pair, the leading rows of the per-feature
# metrics table annotated with Borzoi track metadata, and record per-subset
# label counts.

# Number of leading rows kept per (subset, model) pair.
TOP_N_TRACKS = 50

dfs = []
sample_size = {}  # subset -> (sum of label, sum of ~label)

for subset in tqdm(subsets):
    s = pd.read_parquet(f"{base_dir}/subset/{subset}.parquet")
    V_s = s.merge(V, on=["chrom", "pos", "ref", "alt"], how="left")
    # NOTE(review): `~V_s.label` assumes a boolean `label` with no missing
    # values after the left join - confirm every subset row is present in V.
    sample_size[subset] = V_s.label.sum(), (~V_s.label).sum()
    for model in models.index:
        metrics = pd.read_csv(
            f"{base_dir}/unsupervised_metrics/{subset}/{models.loc[model, 'path']}.csv"
        )
        # Optional assay filter, currently disabled:
        #   metrics = metrics[metrics.assay.isin(["RNA", "CAGE"])]
        top_tracks = (
            metrics
            # Keep only features that have a matching Borzoi track name.
            .merge(borzoi_metadata, left_on="feature", right_on="name", how="inner")
            # One row per sample: the first occurrence in file order wins.
            .drop_duplicates("sample")
            # `head` keeps rows in file order; presumably the CSV is already
            # sorted by AUPRC descending - confirm.
            .head(TOP_N_TRACKS)
            # `.assign` returns a new frame, so no columns are written onto a
            # slice (the original in-place assignment could raise
            # SettingWithCopyWarning, hidden by the global warnings filter).
            .assign(subset=subset_renaming.get(subset, subset), Model=model)
        )
        dfs.append(top_tracks)

# Per-file row indices are kept as-is (not reset) in the combined frame.
df = pd.concat(dfs)
df

100%|██████████████████████████████████████████████| 3/3 [00:00<00:00, 28.03it/s]


Unnamed: 0,AUPRC,feature,sign,name,description,assay,sample,subset,Model
0,0.559428,ENCFF909JRL,plus,ENCFF909JRL,CHIP:H3K27ac:neutrophil,CHIP,H3K27ac:neutrophil,Monocyte count,Borzoi
1,0.559420,CNhs13543-,plus,CNhs13543-,"CAGE:CD14+ monocytes - treated with BCG,",CAGE,"CD14+ monocytes - treated with BCG,",Monocyte count,Borzoi
2,0.559317,ENCFF634ZUJ,plus,ENCFF634ZUJ,DNASE:HL-60,DNASE,HL-60,Monocyte count,Borzoi
3,0.556510,CNhs13544-,plus,CNhs13544-,CAGE:CD14+ monocytes - treated with Trehalose ...,CAGE,CD14+ monocytes - treated with Trehalose dimyc...,Monocyte count,Borzoi
4,0.555901,CNhs11959-,plus,CNhs11959-,"CAGE:Neutrophils,",CAGE,"Neutrophils,",Monocyte count,Borzoi
...,...,...,...,...,...,...,...,...,...
45,0.386305,kai17,plus,kai17,ATAC:Cardiac Pericyte 1,ATAC,Cardiac Pericyte 1,High density lipoprotein cholesterol,Borzoi
46,0.386263,ENCFF869EBO,plus,ENCFF869EBO,CHIP:NR2F2:liver female child (4 years),CHIP,NR2F2:liver female child (4 years),High density lipoprotein cholesterol,Borzoi
47,0.386036,kai41,plus,kai41,ATAC:Fasciculata,ATAC,Fasciculata,High density lipoprotein cholesterol,Borzoi
48,0.385797,kai158,plus,kai158,ATAC:Fibro Nerve,ATAC,Fibro Nerve,High density lipoprotein cholesterol,Borzoi


In [30]:
# Print, for each subset in order of first appearance, the AUPRC and sample
# name of every retained row.
for subset, df2 in df.groupby("subset", sort=False):
    print(subset)
    auprc_and_sample = df2.loc[:, ["AUPRC", "sample"]]
    print(auprc_and_sample.values)

Monocyte count
[[0.5594275679632358 'H3K27ac:neutrophil']
 [0.5594197731790993 'CD14+ monocytes - treated with BCG,']
 [0.5593168759372902 'HL-60']
 [0.5565096595616823
  'CD14+ monocytes - treated with Trehalose dimycolate (TDM),']
 [0.5559008678844445 'Neutrophils,']
 [0.5547687078340207
  'CD14+ monocytes - treated with Group A streptococci,']
 [0.5539842593614636 'CD14+ monocytes - treated with Candida,']
 [0.551645784835633 'CD14-positive monocyte female']
 [0.5513570010354896 'CD14+ Monocytes,']
 [0.5509086358972433 'CD14+ monocytes - treated with Salmonella,']
 [0.5493412296336347 'CD14-positive monocyte female adult (34 years)']
 [0.5472064606151904 'CD14+ monocytes - treated with lipopolysaccharide,']
 [0.5459859946116948 'CD14+ monocytes - treated with Cryptococcus,']
 [0.5449150549395325 'CD14-positive monocyte male adult (21 year)']
 [0.5442359008530452 'CD14-positive monocyte male adult (37 years)']
 [0.5442307897286576 'CD14+ monocytes - treated with B-glucan,']
 [0.54213