In [1]:
import numpy as np
import polars as pl

## Mendelian traits

In [2]:
# Load the Mendelian-traits test split, keep the rows whose boolean "label" is
# true, and replace the "OMIM" column with a "trait" column taken from its
# second space-delimited field.
dataset = "mendelian_traits_matched_9"
test_path = f"../../results/dataset/{dataset}/test.parquet"

# Split "OMIM" on the first space into a two-field struct; the first field is
# named "dummy" only so that it can be dropped after unnesting.
omim_fields = (
    pl.col("OMIM")
    .str.split_exact(" ", 1)
    .struct.rename_fields(["dummy", "trait"])
)

V = pl.read_parquet(test_path)
V = V.filter(pl.col("label"))
V = V.with_columns(omim_fields)
V = V.unnest("OMIM").drop("dummy")
V

chrom,pos,ref,alt,trait,consequence,label,tss_dist,match_group
str,i64,str,str,str,str,bool,i64,str
"""1""",7961859,"""C""","""G""","""606324""","""PLS""",true,34,"""PLS_0"""
"""1""",9943502,"""A""","""T""","""608553""","""5_prime_UTR_variant""",true,26,"""5_prime_UTR_variant_0"""
"""1""",9943503,"""C""","""T""","""608553""","""5_prime_UTR_variant""",true,27,"""5_prime_UTR_variant_1"""
"""1""",11023351,"""G""","""A""","""612069""","""3_prime_UTR_variant""",true,1206,"""3_prime_UTR_variant_0"""
"""1""",21509427,"""C""","""T""","""241500""","""5_prime_UTR_variant""",true,0,"""5_prime_UTR_variant_2"""
…,…,…,…,…,…,…,…,…
"""X""",155022770,"""A""","""G""","""306700""","""PLS""",true,46,"""PLS_57"""
"""X""",155022771,"""G""","""A""","""306700""","""PLS""",true,47,"""PLS_62"""
"""X""",155022773,"""A""","""T""","""306700""","""PLS""",true,49,"""PLS_58"""
"""X""",155022807,"""T""","""C""","""306700""","""PLS""",true,83,"""PLS_59"""


In [3]:
# Number of distinct traits among the retained variants.
V.get_column("trait").n_unique()

113

In [4]:
# Variants per trait, largest count first.
(
    V.get_column("trait")
    .value_counts()
    .sort(by="count", descending=True)
)

trait,count
str,u32
"""600886""",25
"""613985""",23
"""614743""",22
"""306900""",21
"""250250""",20
…,…
"""612069""",1
"""604625""",1
"""176100""",1
"""606324""",1


In [5]:
# Traits with at least MIN_VARIANTS_PER_TRAIT retained variants, largest first.
MIN_VARIANTS_PER_TRAIT = 10

df = (
    V["trait"]
    .value_counts()
    # value_counts() returns its rows in no guaranteed order, so sorting on
    # "count" alone leaves tied traits in arbitrary order. Break ties on the
    # trait label so the ranking (and the trait list later written from `df`)
    # is reproducible across runs.
    .sort(["count", "trait"], descending=[True, False])
    .filter(pl.col("count") >= MIN_VARIANTS_PER_TRAIT)
)
df

trait,count
str,u32
"""600886""",25
"""613985""",23
"""614743""",22
"""306900""",21
"""250250""",20
"""174500""",14
"""143890""",12
"""210710""",10


In [30]:
# Guard against stale notebook state: the name `df` is rebound further down in
# this notebook to the complex-trait summary (columns trait / n_hits /
# n_chroms). If this cell runs after that one, the wrong trait list would be
# written silently, so check that `df` is still the per-trait count table.
assert df.columns == ["trait", "count"], (
    f"`df` is not the Mendelian trait-count table (columns: {df.columns}); "
    "re-run the cell that builds it before writing"
)
df.select("trait").write_csv("../../config/omim/filtered_traits.txt", include_header=False)

## Complex traits

In [6]:
# Load the complex-traits test split, keep the rows whose boolean "label" is
# true, turn the comma-separated "trait" string into a list, and add a variant
# identifier of the form chrom_pos_ref_alt.
dataset = "complex_traits_matched_9"

variant_id = pl.concat_str(
    [
        pl.col("chrom"),
        pl.col("pos").cast(str),
        pl.col("ref"),
        pl.col("alt"),
    ],
    separator="_",
)

V = (
    pl.read_parquet(f"../../results/dataset/{dataset}/test.parquet")
    .filter(pl.col("label"))
    .with_columns(
        pl.col("trait").str.split(","),
        id=variant_id,
    )
)
V

chrom,pos,ref,alt,pip,trait,label,maf,ld_score,consequence,tss_dist,match_group,id
str,i64,str,str,f64,list[str],bool,f64,f64,str,i64,str,str
"""1""",2293397,"""G""","""A""",0.999932,"[""Height""]",true,0.37057,32.302,"""dELS""",65077,"""dELS_0""","""1_2293397_G_A"""
"""1""",3080038,"""T""","""C""",0.999895,"[""MCH"", ""MCV"", … ""RBC""]",true,0.23272,31.606,"""dELS""",10826,"""dELS_1""","""1_3080038_T_C"""
"""1""",3774964,"""A""","""G""",0.999973,"[""Hb"", ""HbA1c"", … ""RBC""]",true,0.23057,95.317,"""dELS""",2138,"""dELS_2""","""1_3774964_A_G"""
"""1""",9181780,"""G""","""A""",1.0,"[""AST"", ""Mono""]",true,0.074322,35.472,"""non_coding_transcript_exon_var…",52677,"""non_coding_transcript_exon_var…","""1_9181780_G_A"""
"""1""",9295877,"""G""","""T""",0.993319,"[""DVT""]",true,0.26506,46.307,"""dELS""",1348,"""dELS_3""","""1_9295877_G_T"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""22""",45969257,"""G""","""A""",0.999905,"[""eBMD""]",true,0.0026413,9.5349,"""dELS""",3457,"""dELS_207""","""22_45969257_G_A"""
"""22""",45979731,"""C""","""T""",0.98026,"[""eBMD""]",true,0.16714,37.776,"""dELS""",2568,"""dELS_208""","""22_45979731_C_T"""
"""22""",46219479,"""G""","""A""",0.945802,"[""BW""]",true,0.12871,164.85,"""dELS_flank""",28805,"""dELS_flank_108""","""22_46219479_G_A"""
"""22""",47990921,"""C""","""T""",0.997536,"[""BMI""]",true,0.43761,53.514,"""intron_variant""",498631,"""intron_variant_124""","""22_47990921_C_T"""


In [7]:
def n_common_hits(x, y):
    """Count the distinct variant ids associated with both trait `x` and trait `y`.

    Reads the notebook-level frame `V`: a row counts towards a trait when that
    trait is an element of the row's "trait" list, and rows are identified by
    their "id" value.
    """

    def ids_for(trait):
        # Distinct ids of the rows whose trait list contains `trait`.
        matching_rows = V.filter(pl.col("trait").list.contains(trait))
        return set(matching_rows["id"])

    ids_x = ids_for(x)
    ids_y = ids_for(y)
    return len(ids_x.intersection(ids_y))

In [8]:
# Flatten the per-variant trait lists into one array, then reduce it to the
# sorted set of distinct trait labels.
all_trait_labels = np.concatenate(V["trait"])
traits = np.unique(all_trait_labels)
traits

array(['AFib', 'AG', 'AID_Combined', 'ALP', 'ALT', 'AST',
       'Age_at_Menarche', 'Age_at_Menopause', 'Alb', 'Alzheimer_LTFH',
       'ApoA', 'ApoB', 'Asthma', 'BFP', 'BMI', 'BW', 'Balding_Type4',
       'Baso', 'BrC', 'CAD', 'CRC', 'CRP', 'Ca', 'Cholelithiasis', 'DBP',
       'DVT', 'Eosino', 'FEV1FVC', 'Fibroblastic_Disorders', 'GGT',
       'Glaucoma_Combined', 'Glucose', 'HDLC', 'Hb', 'HbA1c', 'Height',
       'Ht', 'Hypothyroidism', 'IBD', 'IGF1', 'Insomnia', 'Irritability',
       'LDLC', 'LOY', 'LipoA', 'Lym', 'MAP', 'MCH', 'MCHC', 'MCP', 'MCV',
       'Migraine_Self', 'Miserableness', 'Mono', 'Mood_Swings',
       'Morning_Person', 'Neutro', 'PP', 'Plt', 'PrC', 'RBC',
       'Risk_Taking', 'SBP', 'SHBG', 'Sensitivity', 'Smoking_Ever_Never',
       'Suffer_from_Nerves', 'T2D', 'T2D_BMI', 'TBil', 'TC', 'TG', 'TP',
       'Testosterone', 'UA', 'Urea', 'VitD', 'WBC', 'WHRadjBMI',
       'Worrier', 'eBMD', 'eGFR', 'eGFRcys'], dtype='<U22')

In [9]:
# Number of distinct trait labels (`traits` is the 1-D output of np.unique).
traits.size

83

In [10]:
# Per-trait summary: how many variants list the trait ("n_hits") and how many
# distinct chromosomes those variants fall on ("n_chroms").
# NOTE: this rebinds `df`, which earlier in the notebook held the Mendelian
# per-trait counts.

# Select each trait's variants once and reuse the result for both statistics,
# instead of filtering `V` twice per trait.
hits_per_trait = [
    V.filter(pl.col("trait").list.contains(trait))
    for trait in traits
]
n_hits = [len(hits) for hits in hits_per_trait]
n_chroms = [hits["chrom"].n_unique() for hits in hits_per_trait]

df = pl.DataFrame(
    {
        "trait": traits,
        "n_hits": n_hits,
        "n_chroms": n_chroms,
    }
)
df

trait,n_hits,n_chroms
str,i64,i64
"""AFib""",4,3
"""AG""",40,15
"""AID_Combined""",2,2
"""ALP""",38,17
"""ALT""",23,16
…,…,…
"""WHRadjBMI""",23,12
"""Worrier""",1,1
"""eBMD""",67,21
"""eGFR""",28,16


In [11]:
# Traits with at least MIN_HITS_PER_TRAIT variants, ranked by number of hits.
MIN_HITS_PER_TRAIT = 10

df2 = (
    df.filter(pl.col("n_hits") >= MIN_HITS_PER_TRAIT)
    # The greedy overlap pruning that consumes `df2` walks the rows in order
    # and lets earlier traits exclude later ones, so the order of traits with
    # equal n_hits matters. maintain_order=True keeps tied rows in their
    # incoming order rather than leaving it unspecified.
    .sort("n_hits", descending=True, maintain_order=True)
)
df2

trait,n_hits,n_chroms
str,i64,i64
"""Height""",77,18
"""Plt""",67,17
"""eBMD""",67,21
"""MCV""",66,20
"""Mono""",65,19
…,…,…
"""TBil""",14,8
"""SBP""",13,9
"""Baso""",12,7
"""BFP""",11,8


In [12]:
# Greedy pruning of overlapping traits. Walk the traits in `df2` row order;
# every trait that is still kept excludes each later trait that shares too
# many variants with it. `exclude[k]` ends up True for the rows of `df2` to
# drop.

# Two traits overlap too much when their shared variants exceed this fraction
# of the hit count of either trait (i.e. of the smaller of the two counts).
MAX_SHARED_FRACTION = 0.1

# Pull both columns out once; row k of `df2` is (ranked_traits[k], ranked_n_hits[k]).
ranked_traits = df2["trait"].to_list()
ranked_n_hits = df2["n_hits"].to_list()

exclude = np.full(len(df2), False)

for i, trait1 in enumerate(ranked_traits):
    if exclude[i]:
        continue
    n1 = ranked_n_hits[i]
    # Only later rows can be excluded by row i.
    for j in range(i + 1, len(ranked_traits)):
        if exclude[j]:
            continue
        trait2 = ranked_traits[j]
        n2 = ranked_n_hits[j]
        n = n_common_hits(trait1, trait2)
        if n > MAX_SHARED_FRACTION * min(n1, n2):
            exclude[j] = True
                

In [13]:
# Keep the rows of `df2` that were not flagged in the `exclude` mask.
keep = np.logical_not(exclude)
df3 = df2.filter(keep)
df3

trait,n_hits,n_chroms
str,i64,i64
"""Height""",77,18
"""Plt""",67,17
"""eBMD""",67,21
"""MCV""",66,20
"""Mono""",65,19
…,…,…
"""Ca""",22,11
"""Alb""",20,15
"""BMI""",17,13
"""Balding_Type4""",15,10


In [14]:
# Write the retained trait labels, one per line and without a header row.
independent_traits = df3.select("trait")
independent_traits.write_csv(
    "../../config/gwas/independent_traits_filtered.csv",
    include_header=False,
)

In [15]:
# Stricter subset of `df3`: only traits with at least 30 hits.
has_30_hits = pl.col("n_hits").ge(30)
df4 = df3.filter(has_30_hits)
df4

trait,n_hits,n_chroms
str,i64,i64
"""Height""",77,18
"""Plt""",67,17
"""eBMD""",67,21
"""MCV""",66,20
"""Mono""",65,19
…,…,…
"""AG""",40,15
"""HDLC""",39,16
"""eGFRcys""",39,17
"""ALP""",38,17


In [16]:
# Write the trait labels of the >= 30-hit subset, one per line, no header row.
independent_traits_n30 = df4.select("trait")
independent_traits_n30.write_csv(
    "../../config/gwas/independent_traits_filtered_n30.csv",
    include_header=False,
)