# Load Allele Collection

In [2]:
import polars as pl

In [2]:
# Numeric codes that do not translate to a same-numbered chromosome label.
# 12920 is presumably what the accession regex used in this notebook yields
# for the mitochondrial RefSeq accession (NC_012920) - confirm against the data.
_SPECIAL_CHROMOSOMES = {23: "X", 24: "Y", 12920: "MT"}


def map_chr(num):
    """Translate a numeric chromosome code into its chromosome label.

    Parameters
    ----------
    num : int or None
        Numeric code extracted from an accession (e.g. 1..22, 23, 24, 12920).

    Returns
    -------
    str or None
        "X" for 23, "Y" for 24, "MT" for 12920, otherwise ``str(num)``.
        ``None`` is passed through unchanged so that a missing value stays
        missing instead of turning into the literal string "None".
    """
    if num is None:
        return None
    return _SPECIAL_CHROMOSOMES.get(num, str(num))

## 1. Loading CAVA PPI dataset

In [3]:
# The "spdi" column is split on ":"; field 1 is used as the position,
# field 2 as the reference allele and field 3 as the alternate allele.
spdi_parts = pl.col("spdi").str.split(":")

cava_ppi_df = (
    pl.read_csv("../1_inputs/Edgotyping_CAVA_alleles.csv")
    # Stage 1: pull the numeric chromosome code and the allele fields out of the SPDI string.
    .with_columns(
        pl.col("spdi").str.extract(r"NC_0*([0-9]+)", 1).cast(pl.Int32).alias("chr_num"),
        spdi_parts.list.get(1).str.strip_chars().cast(pl.Int64).alias("nuc_loc"),
        spdi_parts.list.get(2).str.strip_chars().alias("ref_allele"),
        spdi_parts.list.get(3).str.strip_chars().alias("alt_allele"),
    )
    # Stage 2: needs chr_num from stage 1, hence a separate with_columns call.
    .with_columns(
        pl.col("chr_num")
        .map_elements(map_chr, return_dtype=pl.String)
        .str.strip_chars()
        .alias("Chrom"),
        # "<symbol>_<aa_change>" key
        pl.concat_str([pl.col("symbol"), pl.col("aa_change")], separator="_").alias("gene_variant"),
    )
)
cava_ppi_df.shape

(438, 16)

In [4]:
## load clinvar
# NOTE(review): both locations below are absolute, user-specific paths; they must
# be adjusted (or made configurable) before this cell can run on another machine.
CLINVAR_DBNSFP_PARQUET = "/home/shenrunx/igvf/varchamp/2025_laval_submitted/4_compare_ai_scores/3_outputs/processed_data/dbnsfp/dbNSFP5.0a_variant.clin_var_re-annot_pdb_variants.parquet"
GNOMAD_AF_DIR = "/home/shenrunx/igvf/varchamp/2025_laval_submitted/1_allele_collection/3_outputs/processed_data/gnomad_allele_frequencies"

# Lazily scan the parquet file and add two string keys:
#   varid        -> "<#chr>-<pos(1-based)>-<ref>-<alt>"
#   gene_variant -> "<genename>_<clinvar_aa_change>"
df_clinvar_dedup_struct = pl.scan_parquet(CLINVAR_DBNSFP_PARQUET).with_columns(
    pl.concat_str(
        [
            pl.col("#chr"),
            pl.col("pos(1-based)").cast(pl.String),
            pl.col("ref"),
            pl.col("alt"),
        ],
        separator="-",
    ).alias("varid"),
    pl.concat_str(
        [pl.col("genename"), pl.col("clinvar_aa_change")],
        separator="_",
    ).alias("gene_variant"),
)

## load gnomad
CHROMOSOMES = [str(c) for c in range(1, 23)] + ["X", "Y"]
allele_freq_dfs = []
for chrom in CHROMOSOMES:
    try:
        allele_freq = (
            pl.read_csv(
                f"{GNOMAD_AF_DIR}/chr{chrom}_allele_frequencies.txt",
                separator="\t",
                has_header=False,
                new_columns=["chr", "chr_pos_38", "ref", "alt", "allele_freq"],
            )
            .with_columns(
                # Strip every single quote, then the first "chr" occurrence,
                # so the column holds the bare chromosome label.
                pl.col("chr").str.replace_all("'", "", literal=True).str.replace("chr", "")
            )
            # Drops every row that has a null in any column.
            .drop_nulls()
        )
    except Exception as exc:
        # Loading stays best-effort per chromosome, but a skipped table is now
        # reported instead of vanishing silently, and KeyboardInterrupt /
        # SystemExit are no longer swallowed.
        print(f"WARNING: skipping gnomAD allele frequencies for chr{chrom}: {exc!r}")
        continue
    allele_freq_dfs.append(allele_freq)

if not allele_freq_dfs:
    raise RuntimeError(
        f"No gnomAD allele-frequency table could be loaded from {GNOMAD_AF_DIR}"
    )

allele_freq_all = pl.concat(allele_freq_dfs)
allele_freq_gnomad = allele_freq_all.with_columns(
    # Same "<chr>-<pos>-<ref>-<alt>" key layout as the ClinVar varid above.
    pl.concat_str(
        [
            pl.col("chr"),
            pl.col("chr_pos_38").cast(pl.String),
            pl.col("ref"),
            pl.col("alt"),
        ],
        separator="-",
    ).alias("varid"),
    pl.col("allele_freq").alias("gnomad_af"),
).drop("allele_freq")

## merge them
# Left join: every ClinVar row is kept; gnomAD columns are null where no varid matches.
clinvar_gnomad_df = df_clinvar_dedup_struct.join(
    allele_freq_gnomad.select(["chr", "chr_pos_38", "varid", "gnomad_af"]).lazy(),
    on="varid",
    how="left",
)

In [5]:
# Keep only the ClinVar/gnomAD columns whose names are not already present on
# the CAVA table, so the CAVA columns are the ones that survive the join.
cava_column_names = cava_ppi_df.collect_schema().names()
clinvar_only_columns = [
    name
    for name in clinvar_gnomad_df.collect_schema().names()
    if name not in cava_column_names
]

# Left join on the SPDI string ("spdi" on the CAVA side, "SPDI" on the ClinVar
# side); CAVA rows without an SPDI are removed before joining.
cava_with_spdi = cava_ppi_df.lazy().filter(pl.col("spdi").is_not_null())
cava_ppi_clinvar_spdi = cava_with_spdi.join(
    clinvar_gnomad_df.select(clinvar_only_columns),
    left_on="spdi",
    right_on="SPDI",
    how="left",
)
cava_ppi_clinvar_spdi.collect().shape

(438, 448)

## 2. Loading VarChAMP 1% data

In [6]:
# Tab-separated lookup table read for reference; displayed as the cell output.
varchamp_seq_code = pl.read_csv(
    "../1_inputs/sequence_confirmation_class_code.tsv",
    separator="\t",
)
varchamp_seq_code

id,description
i64,str
1,"""perfectly validated"""
2,"""partially validated, >= 50% co…"
3,"""wild type"""
4,"""partial wild type, target not …"
5,"""target and off-target mutation…"
6,"""off-target mutation, >= 50% co…"
7,"""truncated, < 50% coverage"""
99,"""no reads"""


In [7]:
# The "spdi" column is split on ":"; field 1 is used as the position,
# field 2 as the reference allele and field 3 as the alternate allele.
spdi_parts = pl.col("spdi").str.split(":")

varchamp_allele_df = (
    pl.read_csv(
        "../1_inputs/varchamp_seq_confirm_alleles_clinvar_gnomad_dbnsfp_struc_scores_OMIM_MOI_disease_modules_with_protein_features.tsv",
        separator="\t",
        infer_schema_length=100000,
    )
    # "<symbol>_<aa_change>" key
    .with_columns(
        pl.concat_str([pl.col("symbol"), pl.col("aa_change")], separator="_").alias("gene_variant")
    )
    # Parse the SPDI string and make sure mut_id is a 64-bit integer.
    .with_columns(
        pl.col("spdi").str.extract(r"NC_0*([0-9]+)", 1).cast(pl.Int32).alias("chr_num"),
        spdi_parts.list.get(1).str.strip_chars().cast(pl.Int64).alias("nuc_loc"),
        spdi_parts.list.get(2).str.strip_chars().alias("ref_allele"),
        spdi_parts.list.get(3).str.strip_chars().alias("alt_allele"),
        pl.col("mut_id").cast(pl.Int64).alias("mut_id"),
    )
    # Needs chr_num from the previous stage, hence a separate with_columns call.
    .with_columns(
        pl.col("chr_num")
        .map_elements(map_chr, return_dtype=pl.String)
        .str.strip_chars()
        .alias("Chrom")
    )
)
# varchamp_allele_df.shape

In [8]:
# Preview the annotated VarChAMP allele table (the rendered output is truncated).
varchamp_allele_df

symbol,ensembl_gene_id,orf_id,mut_id,ccsb_mutation_id,ccsb_allele_id,spdi,nt_change,aa_change,collection,entry_plate_orig,entry_well_orig,entry_plate_conso,entry_well_conso,entry_seq_pool,db_plate,db_well,n2h_plate,n2h_well,dualip_plate,dualip_well,mislocalization_plate,mislocalization_well,entry_sequenced,entry_sequence_confirmation_class,db_sequenced,db_sequence_confirmation_class,n2h_sequenced,n2h_sequence_confirmation_class,n2h_ref_sequence_confirmation_class,dualip_sequenced,dualip_sequence_confirmation_class,dualip_ref_sequence_confirmation_class,mislocalization_sequenced,mislocalization_sequence_confirmation_class,mislocalization_ref_sequence_confirmation_class,gene_variant,…,protein_00000963-a-1 outliers⁺⁺,protein_00000739-a-1 avg,protein_00000739-a-1 outliers⁺⁺,protein_00001087-a-1 avg,protein_00001087-a-1 outliers⁺⁺,protein_00001053-a-1 avg,protein_00001053-a-1 outliers⁺⁺,protein_00001032-a-1 avg,protein_00001032-a-1 outliers⁺⁺,protein_00001033-a-1 avg,protein_00001033-a-1 outliers⁺⁺,protein_00001034-a-1 avg,protein_00001034-a-1 outliers⁺⁺,protein_00000959-a-1 avg,protein_00000959-a-1 outliers⁺⁺,protein_00001177-a-1 avg,protein_00001177-a-1 outliers⁺⁺,protein_00000001-d-2 avg,protein_00000001-d-2 outliers⁺⁺,protein_00000966-a-1 avg,protein_00000966-a-1 outliers⁺⁺,protein_00000013-a-1 avg,protein_00000013-a-1 outliers⁺⁺,protein_00000101-a-1 avg,protein_00000101-a-1 outliers⁺⁺,protein_00000102-0-1 avg,protein_00000102-0-1 outliers⁺⁺,protein_00000745-a-1 avg,protein_00000745-a-1 outliers⁺⁺,protein_00000746-a-1 avg,protein_00000746-a-1 outliers⁺⁺,protein_00001065-a-1 avg,protein_00001065-a-1 outliers⁺⁺,protein_00000686-a-1 avg,protein_00000686-a-1 outliers⁺⁺,protein_00001112-a-1 avg,protein_00001112-a-1 outliers⁺⁺
str,str,i64,i64,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,…,str,str,str,str,str,f64,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,f64,str,f64,str,f64,str,f64,str,f64,str,f64,str,str,str,f64,str
"""GBA1""","""ENSG00000177628""",2,6,"""CCSBVarC000001""","""ALE0000584""","""NC_000001.11:155240033:C:G""","""160G>C""","""Val54Leu""","""RC4""","""RC4_Mut_GDEh1026""","""H01""","""GDEhDisVCh_40054""","""F12""",2.0,"""RC4_Mut_GDDh1026""","""H01""",,,,,,,1.0,1.0,1.0,2.0,0.0,,,0.0,,,0.0,,,"""GBA1_Val54Leu""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""GBA1""","""ENSG00000177628""",2,73,"""CCSBVarC000002""","""ALE00000002""","""NC_000001.11:155238225:G:A""","""670C>T""","""Leu224Phe""","""RC4""","""RC4_Mut_GDEh1026""","""E01""","""GDEhDisVCh_40054""","""C12""",2.0,"""RC4_Mut_GDDh1026""","""E01""",,,,,,,1.0,1.0,1.0,1.0,0.0,,,0.0,,,0.0,,,"""GBA1_Leu224Phe""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""GBA1""","""ENSG00000177628""",2,113,"""CCSBVarC000003""","""ALE00000003""","""NC_000001.11:155237453:C:T""","""887G>A""","""Arg296Gln""","""RC4""","""RC4_Mut_GDEh1026""","""F01""","""GDEhDisVCh_40054""","""D12""",2.0,"""RC4_Mut_GDDh1026""","""F01""",,,,,,,1.0,1.0,1.0,7.0,0.0,,,0.0,,,0.0,,,"""GBA1_Arg296Gln""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""GBA1""","""ENSG00000177628""",2,231,"""CCSBVarC000004""","""ALE00000004""","""NC_000001.11:155235252:A:G""","""1448T>C""","""Leu483Pro""","""RC4""","""RC4_Mut_GDEh1026""","""G01""","""GDEhDisVCh_40054""","""E12""",2.0,"""RC4_Mut_GDDh1026""","""G01""",,,,,,,1.0,1.0,1.0,2.0,0.0,,,0.0,,,0.0,,,"""GBA1_Leu483Pro""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""GBA1""","""ENSG00000177628""",2,213510,"""CCSBVarC003869""","""ALE00003869""","""NC_000001.11:155239934:G:A""","""259C>T""","""Arg87Trp""","""CEGS2""","""CegsMutGDEh1035""","""B03""","""GDEhDisVCh_40054""","""B02""",2.0,"""CegsMutGDDh1035""","""B03""",,,,,,,1.0,1.0,1.0,2.0,0.0,,,0.0,,,0.0,,,"""GBA1_Arg87Trp""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""AGXT""","""ENSG00000172482""",70497,205588,"""CCSBVarC004886""","""ALE00004886""","""NC_000002.12:240873995:T:C""","""613T>C""","""Ser205Pro""","""CEGS2""","""CegsMutGDEh1037""","""G06""",,,,"""CegsMutGDDh1037""","""G06""",,,"""VUSMutpDEST2_15""","""A02""",,,0.0,,1.0,1.0,0.0,,,1.0,1.0,1.0,0.0,,,"""AGXT_Ser205Pro""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""AGXT""","""ENSG00000172482""",70497,205589,"""CCSBVarC004888""","""ALE00004888""","""NC_000002.12:240875126:G:A""","""698G>A""","""Arg233His""","""CEGS2""","""CegsMutGDEh1037""","""C12""",,,,"""CegsMutGDDh1037""","""C12""",,,"""VUSMutpDEST2_15""","""B02""",,,0.0,,1.0,1.0,0.0,,,1.0,1.0,1.0,0.0,,,"""AGXT_Arg233His""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""AGXT""","""ENSG00000172482""",70497,205590,"""CCSBVarC004887""","""ALE00004887""","""NC_000002.12:240875125:C:T""","""697C>T""","""Arg233Cys""","""CEGS2""","""CegsMutGDEh1037""","""A08""",,,,"""CegsMutGDDh1037""","""A08""",,,"""VUSMutpDEST2_15""","""C02""",,,0.0,,1.0,5.0,0.0,,,1.0,1.0,1.0,0.0,,,"""AGXT_Arg233Cys""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""AGXT""","""ENSG00000172482""",70497,205591,"""CCSBVarC004889""","""ALE00004889""","""NC_000002.12:240875159:T:C""","""731T>C""","""Ile244Thr""","""CEGS2""","""CegsMutGDEh1038""","""F04""",,,,"""CegsMutGDDh1038""","""F04""",,,"""VUSMutpDEST2_15""","""D02""",,,0.0,,1.0,2.0,0.0,,,1.0,1.0,1.0,0.0,,,"""AGXT_Ile244Thr""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## 3. Merge the VarChAMP and CAVA PPI allele tables

In [9]:
# overlapped_cols = set(varchamp_allele_df.collect_schema().names()).intersection(set(cava_ppi_clinvar_spdi.collect_schema().names()))
# select_cols = list(varchamp_allele_df.columns) + [col for col in overlapped_cols if col not in varchamp_allele_df.columns]

# Materialise the lazy CAVA/ClinVar join, then stack it under the VarChAMP table.
# "diagonal_relaxed" is used because the two frames do not share the same columns.
# NOTE: this rebinds varchamp_allele_df to the stacked result, so running the
# cell a second time appends the CAVA rows again.
cava_ppi_clinvar_rows = cava_ppi_clinvar_spdi.collect()
varchamp_allele_df = pl.concat(
    [varchamp_allele_df, cava_ppi_clinvar_rows],
    how="diagonal_relaxed",
)

In [10]:
# Show the first rows of the table after the CAVA PPI rows were appended.
varchamp_allele_df.head()#.sort("gene_variant")["gene_variant"].unique()

symbol,ensembl_gene_id,orf_id,mut_id,ccsb_mutation_id,ccsb_allele_id,spdi,nt_change,aa_change,collection,entry_plate_orig,entry_well_orig,entry_plate_conso,entry_well_conso,entry_seq_pool,db_plate,db_well,n2h_plate,n2h_well,dualip_plate,dualip_well,mislocalization_plate,mislocalization_well,entry_sequenced,entry_sequence_confirmation_class,db_sequenced,db_sequence_confirmation_class,n2h_sequenced,n2h_sequence_confirmation_class,n2h_ref_sequence_confirmation_class,dualip_sequenced,dualip_sequence_confirmation_class,dualip_ref_sequence_confirmation_class,mislocalization_sequenced,mislocalization_sequence_confirmation_class,mislocalization_ref_sequence_confirmation_class,gene_variant,…,protein_00001053-a-1 avg,protein_00001053-a-1 outliers⁺⁺,protein_00001032-a-1 avg,protein_00001032-a-1 outliers⁺⁺,protein_00001033-a-1 avg,protein_00001033-a-1 outliers⁺⁺,protein_00001034-a-1 avg,protein_00001034-a-1 outliers⁺⁺,protein_00000959-a-1 avg,protein_00000959-a-1 outliers⁺⁺,protein_00001177-a-1 avg,protein_00001177-a-1 outliers⁺⁺,protein_00000001-d-2 avg,protein_00000001-d-2 outliers⁺⁺,protein_00000966-a-1 avg,protein_00000966-a-1 outliers⁺⁺,protein_00000013-a-1 avg,protein_00000013-a-1 outliers⁺⁺,protein_00000101-a-1 avg,protein_00000101-a-1 outliers⁺⁺,protein_00000102-0-1 avg,protein_00000102-0-1 outliers⁺⁺,protein_00000745-a-1 avg,protein_00000745-a-1 outliers⁺⁺,protein_00000746-a-1 avg,protein_00000746-a-1 outliers⁺⁺,protein_00001065-a-1 avg,protein_00001065-a-1 outliers⁺⁺,protein_00000686-a-1 avg,protein_00000686-a-1 outliers⁺⁺,protein_00001112-a-1 avg,protein_00001112-a-1 outliers⁺⁺,seq_confirmed_wt_ppi,seq_confirmed_allele_ppi,seq_confirmed_final,chr,chr_pos_38
str,str,i64,i64,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,…,f64,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,f64,str,f64,str,f64,str,f64,str,f64,str,f64,str,str,str,f64,str,i64,i64,i64,str,i64
"""GBA1""","""ENSG00000177628""",2,6,"""CCSBVarC000001""","""ALE0000584""","""NC_000001.11:155240033:C:G""","""160G>C""","""Val54Leu""","""RC4""","""RC4_Mut_GDEh1026""","""H01""","""GDEhDisVCh_40054""","""F12""",2.0,"""RC4_Mut_GDDh1026""","""H01""",,,,,,,1.0,1.0,1.0,2.0,0.0,,,0.0,,,0.0,,,"""GBA1_Val54Leu""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""GBA1""","""ENSG00000177628""",2,73,"""CCSBVarC000002""","""ALE00000002""","""NC_000001.11:155238225:G:A""","""670C>T""","""Leu224Phe""","""RC4""","""RC4_Mut_GDEh1026""","""E01""","""GDEhDisVCh_40054""","""C12""",2.0,"""RC4_Mut_GDDh1026""","""E01""",,,,,,,1.0,1.0,1.0,1.0,0.0,,,0.0,,,0.0,,,"""GBA1_Leu224Phe""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""GBA1""","""ENSG00000177628""",2,113,"""CCSBVarC000003""","""ALE00000003""","""NC_000001.11:155237453:C:T""","""887G>A""","""Arg296Gln""","""RC4""","""RC4_Mut_GDEh1026""","""F01""","""GDEhDisVCh_40054""","""D12""",2.0,"""RC4_Mut_GDDh1026""","""F01""",,,,,,,1.0,1.0,1.0,7.0,0.0,,,0.0,,,0.0,,,"""GBA1_Arg296Gln""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""GBA1""","""ENSG00000177628""",2,231,"""CCSBVarC000004""","""ALE00000004""","""NC_000001.11:155235252:A:G""","""1448T>C""","""Leu483Pro""","""RC4""","""RC4_Mut_GDEh1026""","""G01""","""GDEhDisVCh_40054""","""E12""",2.0,"""RC4_Mut_GDDh1026""","""G01""",,,,,,,1.0,1.0,1.0,2.0,0.0,,,0.0,,,0.0,,,"""GBA1_Leu483Pro""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""GBA1""","""ENSG00000177628""",2,213510,"""CCSBVarC003869""","""ALE00003869""","""NC_000001.11:155239934:G:A""","""259C>T""","""Arg87Trp""","""CEGS2""","""CegsMutGDEh1035""","""B03""","""GDEhDisVCh_40054""","""B02""",2.0,"""CegsMutGDDh1035""","""B03""",,,,,,,1.0,1.0,1.0,2.0,0.0,,,0.0,,,0.0,,,"""GBA1_Arg87Trp""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [11]:
# (rows, columns) of the table after the CAVA PPI rows were appended.
varchamp_allele_df.shape

(9591, 875)

In [12]:
# varchamp_allele_df.write_csv("../3_outputs/varchamp_bms_allele_collection.tsv", separator="\t")

## 4. Pillar alleles

In [3]:
# Read the Pillar table, forcing three columns to be read as strings.
pillar_clinvar_df = (
    pl.read_csv(
        "../../1_allele_collection/1_inputs/raw_inputs/pillar_data/final_pillar_data_with_clinvar_gnomad_wREVEL_wAM_gold_standards_080425.csv",
        schema_overrides={
            "Chrom": pl.String,
            "aa_pos": pl.String,
            "auth_reported_score": pl.String,
        },
        infer_schema_length=10000,
    )
    .with_columns(
        # Drop non-alphanumeric characters from the gene name, then rename CHK2 -> CHEK2.
        pl.col("Gene").str.replace_all(r"[^a-zA-Z0-9]", "").str.replace_all("CHK2", "CHEK2"),
        # Keep only letters, digits and "." in the protein HGVS string.
        pl.col("hgvs_p").str.replace_all(r"[^a-zA-Z0-9\.]", ""),
        pl.col("Chrom").cast(pl.String).str.strip_chars(),
        # NOTE(review): nuc_loc is a plain copy of hg38_start, so it keeps that
        # column's dtype (float in the rendered output) - confirm this is intended.
        pl.col("hg38_start").alias("nuc_loc"),
        pl.col("ref_allele").cast(pl.String).str.strip_chars(),
        pl.col("alt_allele").cast(pl.String).str.strip_chars(),
    )
    .with_columns(
        # "<Gene>_<hgvs_p without the leading 'p.'>"
        pl.concat_str(
            [pl.col("Gene"), pl.col("hgvs_p").str.replace(r"^p\.", "")],
            separator="_",
        ).alias("gene_variant")
    )
)

## filter the SNP variants only
# NOTE(review): the substring "sense" matches "missense" but would also match
# any other consequence label containing it (e.g. "nonsense") - confirm intent.
is_sense_consequence = pl.col("consequence").str.contains("sense")
is_single_base_change = (
    (pl.col("ref_allele").str.len_chars() == 1)
    & (pl.col("alt_allele").str.len_chars() == 1)
)
has_hg38_coordinates = (
    pl.col("hg38_start").is_not_null() & pl.col("hg38_end").is_not_null()
)

pillar_clinvar_single_aa_df = (
    pillar_clinvar_df
    .filter(is_sense_consequence & is_single_base_change & has_hg38_coordinates)
    .drop_nulls(subset="hgvs_p")
    .with_columns(
        # Text after the last "." of hgvs_p
        pl.col("hgvs_p").str.split(".").list.get(-1).alias("aa_change"),
        pl.lit(True).alias("pillar_tested"),
    )
)
print("Filtered missense/SNP pillar dataset shape:", pillar_clinvar_single_aa_df.shape)

# Ascending sort on both keys.
pillar_clinvar_single_aa_df = pillar_clinvar_single_aa_df.sort(
    ["gene_variant", "gnomad_MAF"], descending=[False, False]
)
print("Number of unique genes Pillar ALL:", pillar_clinvar_df.unique(subset="Gene").height)
print("Number of unique genes Pillar SNP:", pillar_clinvar_single_aa_df.unique(subset="Gene").height)
print("Number of unique gene variants:", pillar_clinvar_single_aa_df.unique(subset="gene_variant").height)
display(pillar_clinvar_single_aa_df)

Filtered missense/SNP pillar dataset shape: (176277, 109)
Number of unique genes Pillar ALL: 40
Number of unique genes Pillar SNP: 40
Number of unique gene variants: 94200


ID,Dataset,Gene,HGNC_id,Chrom,STRAND,hg19_pos,hg38_start,hg38_end,ref_allele,alt_allele,auth_transcript_id,transcript_pos,transcript_ref,transcript_alt,aa_pos,aa_ref,aa_alt,hgvs_c,hgvs_p,consequence,simplified_consequence,auth_reported_score,auth_reported_rep_score,auth_reported_func_class,splice_measure,gnomad_MAF,clinvar_sig,clinvar_star,clinvar_date_last_reviewed,nucleotide_or_aa,MaveDB URN (score set),Ensembl_transcript_ID,Ref_seq_transcript_ID,Model_system,Assay Type ontology term,Phenotype Measured ontology term,…,calculated_classification_PMID:31131967,SGR LR_PMID:34273903,Personal and Family History LR (Combined)_PMID:34273903,Breast Tumor Pathology LR_PMID:34273903,Population Allele Frequency LR_PMID:34273903,BS2 LR_PMID:34273903,Classification by multifactorial model_PMID:34273903,Calculated_combined_LR_PMID:34273903,Naive_prior_prob_PMID:34273903,Calculated_post_prob_PMID:34273903,calculated_classification_PMID:34273903,Reference Group*_presumed_PMID:34273903,Component_presumed_PMID:34273903,Rationale_presumed_PMID:34273903,ClinVar Variation Id_ClinGen_repo,Allele Registry Id_ClinGen_repo,Disease_ClinGen_repo,Mondo Id_ClinGen_repo,Mode of Inheritance_ClinGen_repo,Assertion_ClinGen_repo,Applied Evidence Codes (Met)_ClinGen_repo,Applied Evidence Codes (Not Met)_ClinGen_repo,Summary of interpretation_ClinGen_repo,PubMed Articles_ClinGen_repo,Expert Panel_ClinGen_repo,Guideline_ClinGen_repo,Approval Date_ClinGen_repo,Published Date_ClinGen_repo,Retracted_ClinGen_repo,Evidence Repo Link_ClinGen_repo,Uuid_ClinGen_repo,Updated_Classification_ClinGen_repo,Updated_Evidence Codes_ClinGen_repo,nuc_loc,gene_variant,aa_change,pillar_tested
str,str,str,i64,str,f64,str,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,bool
"""ASPA_Grønbæk-Thygesen_2024_abu…","""ASPA_Grønbæk-Thygesen_2024_abu…","""ASPA""",756,"""17""",1.0,,3.483509e6,3.483509e6,"""C""","""A""",,"""443""","""C""","""A""","""148.0""","""A""","""D""","""c.443C>A""","""p.Ala148Asp""","""missense_variant""","""missense_variant""","""0.9495""",,,"""No""",6.1958e-7,,,,"""aa""","""urn:mavedb:00000657-a-1""","""ENST00000263080.3""","""NM_000049.4""","""immortalized human cells""","""protein stability assay (BAO_0…","""fluorescence intensity (BAO_00…",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.483509e6,"""ASPA_Ala148Asp""","""Ala148Asp""",true
"""ASPA_Grønbæk-Thygesen_2024_tox…","""ASPA_Grønbæk-Thygesen_2024_tox…","""ASPA""",756,"""17""",1.0,,3.483509e6,3.483509e6,"""C""","""A""",,"""443""","""C""","""A""","""148.0""","""A""","""D""","""c.443C>A""","""p.Ala148Asp""","""missense_variant""","""missense_variant""","""-0.1191""",,,"""No""",6.1958e-7,,,,"""aa""","""urn:mavedb:00000657-b-1""","""ENST00000263080.3""","""NM_000049.4""","""immortalized human cells""","""cell viability assay (BAO_0003…","""survival rate (OBI_0000789)""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.483509e6,"""ASPA_Ala148Asp""","""Ala148Asp""",true
"""ASPA_Grønbæk-Thygesen_2024_abu…","""ASPA_Grønbæk-Thygesen_2024_abu…","""ASPA""",756,"""17""",1.0,,3.483509e6,3.483509e6,"""C""","""G""",,"""443""","""C""","""G""","""148.0""","""A""","""G""","""c.443C>G""","""p.Ala148Gly""","""missense_variant""","""missense_variant""","""0.2662""",,,"""No""",,,,,"""aa""","""urn:mavedb:00000657-a-1""","""ENST00000263080.3""","""NM_000049.4""","""immortalized human cells""","""protein stability assay (BAO_0…","""fluorescence intensity (BAO_00…",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.483509e6,"""ASPA_Ala148Gly""","""Ala148Gly""",true
"""ASPA_Grønbæk-Thygesen_2024_tox…","""ASPA_Grønbæk-Thygesen_2024_tox…","""ASPA""",756,"""17""",1.0,,3.483509e6,3.483509e6,"""C""","""G""",,"""443""","""C""","""G""","""148.0""","""A""","""G""","""c.443C>G""","""p.Ala148Gly""","""missense_variant""","""missense_variant""","""0.1248""",,,"""No""",,,,,"""aa""","""urn:mavedb:00000657-b-1""","""ENST00000263080.3""","""NM_000049.4""","""immortalized human cells""","""cell viability assay (BAO_0003…","""survival rate (OBI_0000789)""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.483509e6,"""ASPA_Ala148Gly""","""Ala148Gly""",true
"""ASPA_Grønbæk-Thygesen_2024_tox…","""ASPA_Grønbæk-Thygesen_2024_tox…","""ASPA""",756,"""17""",1.0,,3.483508e6,3.483508e6,"""G""","""C""",,"""442""","""G""","""C""","""148.0""","""A""","""P""","""c.442G>C""","""p.Ala148Pro""","""missense_variant""","""missense_variant""","""0.73""",,,"""No""",6.1961e-7,,,,"""aa""","""urn:mavedb:00000657-b-1""","""ENST00000263080.3""","""NM_000049.4""","""immortalized human cells""","""cell viability assay (BAO_0003…","""survival rate (OBI_0000789)""",…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.483508e6,"""ASPA_Ala148Pro""","""Ala148Pro""",true
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""XRCC2_unpublished_var148031""","""XRCC2_unpublished""","""XRCC2""",12829,"""7""",-1.0,,1.52660706e8,1.52660706e8,"""A""","""T""",,"""116""","""T""","""A""","""39.0""","""V""","""E""","""c.116T>A""","""p.Val39Glu""","""missense_variant""","""missense_variant""","""-0.000703847""",,,"""Yes""",,,,,"""nucleotide""",,"""ENST00000359321.2""","""NM_005431.2""","""immortalized human cells""",,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.52660706e8,"""XRCC2_Val39Glu""","""Val39Glu""",true
"""XRCC2_unpublished_var148029""","""XRCC2_unpublished""","""XRCC2""",12829,"""7""",-1.0,,1.52660706e8,1.52660706e8,"""A""","""C""",,"""116""","""T""","""G""","""39.0""","""V""","""G""","""c.116T>G""","""p.Val39Gly""","""missense_variant""","""missense_variant""","""0.00258373""",,,"""Yes""",6.2110e-7,,,,"""nucleotide""",,"""ENST00000359321.2""","""NM_005431.2""","""immortalized human cells""",,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.52660706e8,"""XRCC2_Val39Gly""","""Val39Gly""",true
"""XRCC2_unpublished_var148032""","""XRCC2_unpublished""","""XRCC2""",12829,"""7""",-1.0,,1.52660707e8,1.52660707e8,"""C""","""A""",,"""115""","""G""","""T""","""39.0""","""V""","""L""","""c.115G>T""","""p.Val39Leu""","""missense_variant""","""missense_variant""","""-0.0122043""",,,"""Yes""",,,,,"""nucleotide""",,"""ENST00000359321.2""","""NM_005431.2""","""immortalized human cells""",,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.52660707e8,"""XRCC2_Val39Leu""","""Val39Leu""",true
"""XRCC2_unpublished_var148033""","""XRCC2_unpublished""","""XRCC2""",12829,"""7""",-1.0,,1.52660707e8,1.52660707e8,"""C""","""G""",,"""115""","""G""","""C""","""39.0""","""V""","""L""","""c.115G>C""","""p.Val39Leu""","""missense_variant""","""missense_variant""","""-0.00863187""",,,"""Yes""",,,,,"""nucleotide""",,"""ENST00000359321.2""","""NM_005431.2""","""immortalized human cells""",,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.52660707e8,"""XRCC2_Val39Leu""","""Val39Leu""",true


In [4]:
# pillar_clinvar_single_aa_df.write_csv("../3_outputs/pillar_snp_alleles.tsv", separator="\t")
# pillar_clinvar_df.filter(
#     pl.col("Dataset").str.contains("_unpublished")
# ).write_csv("../3_outputs/pillar_alleles.tsv", separator="\t")