# Clean and process Pillar allele collection

Downloaded from: https://zenodo.org/records/17796333

In [1]:
import polars as pl
import pooch

In [None]:
## Integrity check for the manually downloaded Pillar dataset.
## Automated download from Zenodo is currently not working due to restricted
## downloading access, so the archive has to be placed at `pillar_data_file` by hand
## and only its MD5 checksum is verified here.
pillar_data_file = "../1_inputs/raw_inputs/pillar_data/final_pillar_data_with_clinvar_18_25_gnomad_wREVEL_wAM_wspliceAI_wMutpred2_wtrainvar_gold_standards_expanded_111225.csv.gz"
# Expected MD5 checksum of the archive above.
md5 = "2c1d161bd3d767be7d5baab8a6f85e16"

## Template for a future automated download with hash validation (kept disabled until
## the record can be downloaded programmatically).
# zenodo_url = "https://zenodo.org/records/17796333"
# file_name = "final_pillar_data_with_clinvar_18_25_gnomad_wREVEL_wAM_wspliceAI_wMutpred2_wtrainvar_gold_standards_expanded_111225.csv.gz"
# pillar_data_condensed = pooch.retrieve(
#     url=f"{zenodo_url}/files/{file_name}?download=1",
#     known_hash=f"md5:{md5}",  # let pooch validate the download against the expected checksum
#     fname=file_name,
#     path="raw_inputs/pillar_data", 
#     # processor=Decompress(),
#     progressbar=True
# )

# Calculate the MD5 hash of the local file
file_hash = pooch.file_hash(pillar_data_file, alg="md5")

print(f"The MD5 hash check is: {file_hash==md5}")

# Fail fast: every later cell assumes this exact file, so a mismatching checksum
# must stop the notebook instead of only being reported.
if file_hash != md5:
    raise ValueError(
        f"MD5 mismatch for {pillar_data_file}: expected {md5}, got {file_hash}"
    )

The MD5 hash check is: True


In [6]:
## Load the Pillar table and normalise its identifier columns
# These columns are read as strings instead of relying on schema inference.
string_overrides = {
    column: pl.String for column in ("Chrom", "aa_pos", "auth_reported_score")
}

pillar_clinvar_df = (
    pl.read_csv(
        pillar_data_file,
        schema_overrides=string_overrides,
        infer_schema_length=10000,
    )
    # Strip every non-alphanumeric character from Gene; hgvs_p additionally keeps "."
    .with_columns(
        Gene=pl.col("Gene").str.replace_all(r"[^a-zA-Z0-9]", ""),
        hgvs_p=pl.col("hgvs_p").str.replace_all(r"[^a-zA-Z0-9\.]", ""),
    )
    # Rename CHK2 -> CHEK2, trim whitespace around chromosome/allele values and
    # expose the hg38 start coordinate under the name nuc_loc
    .with_columns(
        Gene=pl.col("Gene").str.replace_all("CHK2", "CHEK2"),
        Chrom=pl.col("Chrom").cast(pl.String).str.strip_chars(),
        nuc_loc=pl.col("hg38_start"),
        ref_allele=pl.col("ref_allele").cast(pl.String).str.strip_chars(),
        alt_allele=pl.col("alt_allele").cast(pl.String).str.strip_chars(),
    )
    # gene_variant = "<Gene>_<hgvs_p without the leading 'p.'>"
    .with_columns(
        gene_variant=pl.concat_str(
            [
                pl.col("Gene"),
                pl.col("hgvs_p").str.replace(r"^p\.", ""),
            ],
            separator="_",
        )
    )
)

## Keep single-nucleotide substitutions with hg38 coordinates and a protein-level HGVS
# NOTE(review): "sense" is a substring match, so it would also match a "nonsense"
# consequence label if the data contains one - confirm this is intended.
is_sense_consequence = pl.col("consequence").str.contains("sense")
is_single_base_change = (
    (pl.col("ref_allele").str.len_chars() == 1)
    & (pl.col("alt_allele").str.len_chars() == 1)
)
has_hg38_coords = (
    pl.col("hg38_start").is_not_null() & pl.col("hg38_end").is_not_null()
)

pillar_clinvar_single_aa_df = (
    pillar_clinvar_df
    .filter(is_sense_consequence & is_single_base_change & has_hg38_coords)
    .drop_nulls(subset="hgvs_p")
    .with_columns(
        # last "."-separated piece of hgvs_p, e.g. "p.Ala148Asp" -> "Ala148Asp"
        aa_change=pl.col("hgvs_p").str.split(".").list.get(-1),
        pillar_tested=pl.lit(True),
    )
)
print("Filtered missense/SNP pillar dataset shape:", pillar_clinvar_single_aa_df.shape)

# Order rows by variant key, then by gnomAD MAF (both ascending)
pillar_clinvar_single_aa_df = pillar_clinvar_single_aa_df.sort(
    ["gene_variant", "gnomad_MAF"], descending=False
)

n_genes_all = pillar_clinvar_df["Gene"].n_unique()
n_genes_snp = pillar_clinvar_single_aa_df["Gene"].n_unique()
n_gene_variants = pillar_clinvar_single_aa_df["gene_variant"].n_unique()
print("Number of unique genes Pillar ALL:", n_genes_all)
print("Number of unique genes Pillar SNP:", n_genes_snp)
print("Number of unique gene variants:", n_gene_variants)
display(pillar_clinvar_single_aa_df) #.filter(pl.col("Ref_seq_transcript_ID").str.contains("NC_"))

Filtered missense/SNP pillar dataset shape: (177099, 98)
Number of unique genes Pillar ALL: 40
Number of unique genes Pillar SNP: 40
Number of unique gene variants: 92941


ID,Dataset,Gene,HGNC_id,Chrom,STRAND,hg19_pos,hg38_start,hg38_end,ref_allele,alt_allele,auth_transcript_id,transcript_pos,transcript_ref,transcript_alt,aa_pos,aa_ref,aa_alt,hgvs_c,hgvs_p,consequence,simplified_consequence,auth_reported_score,auth_reported_rep_score,auth_reported_func_class,splice_measure,gnomad_MAF,clinvar_sig_2025,clinvar_star_2025,clinvar_date_last_reviewed_2025,nucleotide_or_aa,MaveDB Score Set URN,Ensembl_transcript_ID,Ref_seq_transcript_ID,Model_system,Assay Type,Phenotype Measured ontology term,…,spliceAI_DS_AG,spliceAI_DS_AL,spliceAI_DS_DG,spliceAI_DS_DL,spliceAI_DP_AG,spliceAI_DP_AL,spliceAI_DP_DG,spliceAI_DP_DL,MutPred2,MP2_train,REVEL_train,clinvar_sig_2018,clinvar_star_2018,clinvar_date_last_reviewed_2018,ClinVar Variation Id_ClinGen_repo,Allele Registry Id_ClinGen_repo,Disease_ClinGen_repo,Mondo Id_ClinGen_repo,Mode of Inheritance_ClinGen_repo,Assertion_ClinGen_repo,Applied Evidence Codes (Met)_ClinGen_repo,Applied Evidence Codes (Not Met)_ClinGen_repo,Summary of interpretation_ClinGen_repo,PubMed Articles_ClinGen_repo,Expert Panel_ClinGen_repo,Guideline_ClinGen_repo,Approval Date_ClinGen_repo,Published Date_ClinGen_repo,Retracted_ClinGen_repo,Evidence Repo Link_ClinGen_repo,Uuid_ClinGen_repo,Updated_Classification_ClinGen_repo,Updated_Evidence Codes_ClinGen_repo,nuc_loc,gene_variant,aa_change,pillar_tested
str,str,str,i64,str,f64,f64,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,str,str,str,str,str,str,str,str,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,bool,bool,str,str,str,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,bool,str,str,str,str,f64,str,str,bool
"""ASPA_Grønbæk-Thygesen_2024_abu…","""ASPA_Grønbæk-Thygesen_2024_abu…","""ASPA""",756,"""17""",1.0,3.386803e6,3.483509e6,3.483509e6,"""C""","""A""",,"""443""","""C""","""A""","""148.0""","""A""","""D""","""c.443C>A""","""p.Ala148Asp""","""missense_variant""","""missense_variant""","""0.9495""",,,"""No""",6.1958e-7,,,,"""aa""","""urn:mavedb:00000657-a-1""","""ENST00000263080.3""","""NM_000049.4""","""immortalized human cells""","""Reporter""","""fluorescence intensity (BAO_00…",…,0.0,0.0,0.0,0.0,-10.0,-30.0,-7.0,-10.0,0.783,false,false,,,,,,,,,,,,,,,,,,,,,,,3.483509e6,"""ASPA_Ala148Asp""","""Ala148Asp""",true
"""ASPA_Grønbæk-Thygesen_2024_tox…","""ASPA_Grønbæk-Thygesen_2024_tox…","""ASPA""",756,"""17""",1.0,3.386803e6,3.483509e6,3.483509e6,"""C""","""A""",,"""443""","""C""","""A""","""148.0""","""A""","""D""","""c.443C>A""","""p.Ala148Asp""","""missense_variant""","""missense_variant""","""-0.1191""",,,"""No""",6.1958e-7,,,,"""aa""","""urn:mavedb:00000657-b-1""","""ENST00000263080.3""","""NM_000049.4""","""immortalized human cells""","""Cell Fitness""","""survival rate (OBI_0000789)""",…,0.0,0.0,0.0,0.0,-10.0,-30.0,-7.0,-10.0,0.783,false,false,,,,,,,,,,,,,,,,,,,,,,,3.483509e6,"""ASPA_Ala148Asp""","""Ala148Asp""",true
"""ASPA_Grønbæk-Thygesen_2024_abu…","""ASPA_Grønbæk-Thygesen_2024_abu…","""ASPA""",756,"""17""",1.0,3.386803e6,3.483509e6,3.483509e6,"""C""","""G""",,"""443""","""C""","""G""","""148.0""","""A""","""G""","""c.443C>G""","""p.Ala148Gly""","""missense_variant""","""missense_variant""","""0.2662""",,,"""No""",,,,,"""aa""","""urn:mavedb:00000657-a-1""","""ENST00000263080.3""","""NM_000049.4""","""immortalized human cells""","""Reporter""","""fluorescence intensity (BAO_00…",…,0.0014,0.0129,0.0,0.0,-30.0,-10.0,-1.0,12.0,0.535,false,false,,,,,,,,,,,,,,,,,,,,,,,3.483509e6,"""ASPA_Ala148Gly""","""Ala148Gly""",true
"""ASPA_Grønbæk-Thygesen_2024_tox…","""ASPA_Grønbæk-Thygesen_2024_tox…","""ASPA""",756,"""17""",1.0,3.386803e6,3.483509e6,3.483509e6,"""C""","""G""",,"""443""","""C""","""G""","""148.0""","""A""","""G""","""c.443C>G""","""p.Ala148Gly""","""missense_variant""","""missense_variant""","""0.1248""",,,"""No""",,,,,"""aa""","""urn:mavedb:00000657-b-1""","""ENST00000263080.3""","""NM_000049.4""","""immortalized human cells""","""Cell Fitness""","""survival rate (OBI_0000789)""",…,0.0014,0.0129,0.0,0.0,-30.0,-10.0,-1.0,12.0,0.535,false,false,,,,,,,,,,,,,,,,,,,,,,,3.483509e6,"""ASPA_Ala148Gly""","""Ala148Gly""",true
"""ASPA_Grønbæk-Thygesen_2024_tox…","""ASPA_Grønbæk-Thygesen_2024_tox…","""ASPA""",756,"""17""",1.0,3.386802e6,3.483508e6,3.483508e6,"""G""","""C""",,"""442""","""G""","""C""","""148.0""","""A""","""P""","""c.442G>C""","""p.Ala148Pro""","""missense_variant""","""missense_variant""","""0.73""",,,"""No""",6.1961e-7,,,,"""aa""","""urn:mavedb:00000657-b-1""","""ENST00000263080.3""","""NM_000049.4""","""immortalized human cells""","""Cell Fitness""","""survival rate (OBI_0000789)""",…,0.0027,0.0078,0.0,0.0,-29.0,-9.0,-9.0,-6.0,0.867,false,false,,,,,,,,,,,,,,,,,,,,,,,3.483508e6,"""ASPA_Ala148Pro""","""Ala148Pro""",true
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""XRCC2_unpublished_var147784""","""XRCC2_unpublished""","""XRCC2""",12829,"""7""",-1.0,1.52346292e8,1.52649207e8,1.52649207e8,"""A""","""T""",,,"""T""","""A""","""93.0""","""V""","""D""","""nan""","""p.Val93Asp""","""missense_variant""","""missense_variant""","""-0.149949""",,"""functionally_abnormal""","""Yes""",,,,,"""nucleotide""",,"""ENST00000359321.2""","""NM_005431.2""","""immortalized human cells""","""Cell Fitness""","""survival rate (OBI_0000789)""",…,0.0003,0.0,0.0,0.0,0.0,-19.0,-48.0,6.0,0.814,false,false,,,,,,,,,,,,,,,,,,,,,,,1.52649207e8,"""XRCC2_Val93Asp""","""Val93Asp""",true
"""XRCC2_unpublished_var147782""","""XRCC2_unpublished""","""XRCC2""",12829,"""7""",-1.0,1.52346292e8,1.52649207e8,1.52649207e8,"""A""","""C""",,,"""T""","""G""","""93.0""","""V""","""G""","""nan""","""p.Val93Gly""","""missense_variant""","""missense_variant""","""-0.00798471""",,"""functionally_normal""","""Yes""",,,,,"""nucleotide""",,"""ENST00000359321.2""","""NM_005431.2""","""immortalized human cells""","""Cell Fitness""","""survival rate (OBI_0000789)""",…,0.0014,0.0,0.0041,0.0,0.0,27.0,1.0,-48.0,0.605,false,false,,,,,,,,,,,,,,,,,,,,,,,1.52649207e8,"""XRCC2_Val93Gly""","""Val93Gly""",true
"""XRCC2_unpublished_var147787""","""XRCC2_unpublished""","""XRCC2""",12829,"""7""",-1.0,1.52346293e8,1.52649208e8,1.52649208e8,"""C""","""T""",,,"""G""","""A""","""93.0""","""V""","""I""","""nan""","""p.Val93Ile""","""missense_variant""","""missense_variant""","""0.0214945""",,"""functionally_normal""","""Yes""",,,,,"""nucleotide""",,"""ENST00000359321.2""","""NM_005431.2""","""immortalized human cells""","""Cell Fitness""","""survival rate (OBI_0000789)""",…,0.0001,0.0,0.0,0.0,-20.0,-1.0,-49.0,5.0,0.116,false,false,,,,,,,,,,,,,,,,,,,,,,,1.52649208e8,"""XRCC2_Val93Ile""","""Val93Ile""",true
"""XRCC2_unpublished_var147786""","""XRCC2_unpublished""","""XRCC2""",12829,"""7""",-1.0,1.52346293e8,1.52649208e8,1.52649208e8,"""C""","""G""",,,"""G""","""C""","""93.0""","""V""","""L""","""nan""","""p.Val93Leu""","""missense_variant""","""missense_variant""","""0.0330522""",,"""functionally_normal""","""Yes""",,,,,"""nucleotide""",,"""ENST00000359321.2""","""NM_005431.2""","""immortalized human cells""","""Cell Fitness""","""survival rate (OBI_0000789)""",…,0.0001,0.0,0.0,0.0,26.0,-1.0,15.0,5.0,0.302,false,false,,,,,,,,,,,,,,,,,,,,,,,1.52649208e8,"""XRCC2_Val93Leu""","""Val93Leu""",true


In [7]:
# Export the filtered variant table as a tab-separated file.
pillar_clinvar_single_aa_df.write_csv(
    file="../3_outputs/pillar_snp_alleles.tsv",
    separator="\t",
)