In [1]:
import polars as pl

In [2]:
#!wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/submission_summary.txt.gz

In [3]:
!zcat submission_summary.txt.gz | head -n 18

##Overview of interpretation, phenotypes, observations, and methods reported in each current submission 
##Explanation of the columns in this report
#VariationID:                   the identifier assigned by ClinVar and used to build the URL, namely https://ncbi.nlm.nih.gov/clinvar/VariationID
#ClinicalSignificance:          the germline classification on this submitted record
#DateLastEvaluated:             the last date the classification on this record was evaluated by the submitter
#Description:                   an optional free text description comment describing the rationale for the classification
#SubmittedPhenotypeInfo:        the name(s) or identifier(s) submitted as the condition for the classification 
#ReportedPhenotypeInfo:         the MedGen identifier/name combinations that the submitted condition for the classification maps to. 'na' means there is no public identifer in MedGen for the condition.
#ReviewStatus:                  the level of review for this submitted re

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

or by setting POLARS_ALLOW_FORKING_THREAD=1.

  pid, fd = os.forkpty()


In [4]:
df = pl.read_csv(
    "submission_summary.txt.gz", separator="\t", skip_rows=18,
    columns=[
        "#VariationID",
        "ClinicalSignificance",
        "ReportedPhenotypeInfo",
        "CollectionMethod",
        "Submitter",
    ]
)
df

#VariationID,ClinicalSignificance,ReportedPhenotypeInfo,CollectionMethod,Submitter
i64,str,str,str,str
2,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""clinical testing""","""Paris Brain Institute, Inserm …"
2,"""Pathogenic""","""C3661900:not provided""","""clinical testing""","""Athena Diagnostics"""
2,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""literature only""","""OMIM"""
2,"""Likely pathogenic""","""na:Macular dystrophy with or w…","""research""","""Ophthalmic Genetics Group, Ins…"
3,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""literature only""","""OMIM"""
…,…,…,…,…
3900693,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900694,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900695,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900696,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""


In [5]:
df = df.filter(Submitter="OMIM")
df

#VariationID,ClinicalSignificance,ReportedPhenotypeInfo,CollectionMethod,Submitter
i64,str,str,str,str
2,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""literature only""","""OMIM"""
3,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""literature only""","""OMIM"""
4,"""Uncertain significance""","""C4551772:Galloway-Mowat syndro…","""literature only""","""OMIM"""
5,"""Pathogenic""","""C4748791:Mitochondrial complex…","""literature only""","""OMIM"""
6,"""Pathogenic""","""C4748791:Mitochondrial complex…","""literature only""","""OMIM"""
…,…,…,…,…
3900693,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900694,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900695,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900696,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""


In [6]:
unambiguous_ids = df.group_by("#VariationID").n_unique().filter(ClinicalSignificance=1)["#VariationID"]
df = df.filter(pl.col("#VariationID").is_in(unambiguous_ids))
df

#VariationID,ClinicalSignificance,ReportedPhenotypeInfo,CollectionMethod,Submitter
i64,str,str,str,str
2,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""literature only""","""OMIM"""
3,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""literature only""","""OMIM"""
4,"""Uncertain significance""","""C4551772:Galloway-Mowat syndro…","""literature only""","""OMIM"""
5,"""Pathogenic""","""C4748791:Mitochondrial complex…","""literature only""","""OMIM"""
6,"""Pathogenic""","""C4748791:Mitochondrial complex…","""literature only""","""OMIM"""
…,…,…,…,…
3900693,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900694,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900695,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900696,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""


In [7]:
df = df.filter(pl.col("ClinicalSignificance") == "Pathogenic")
df

#VariationID,ClinicalSignificance,ReportedPhenotypeInfo,CollectionMethod,Submitter
i64,str,str,str,str
2,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""literature only""","""OMIM"""
3,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""literature only""","""OMIM"""
5,"""Pathogenic""","""C4748791:Mitochondrial complex…","""literature only""","""OMIM"""
6,"""Pathogenic""","""C4748791:Mitochondrial complex…","""literature only""","""OMIM"""
7,"""Pathogenic""","""C4748792:Mitochondrial complex…","""literature only""","""OMIM"""
…,…,…,…,…
3900693,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900694,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900695,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900696,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""


In [8]:
# TODO: should check that the ClinicalSignificance is the one from OMIM, not the aggregate one from ClinVar
# yes, I think it's working

In [9]:
df.filter(pl.col("#VariationID") == 4907)

#VariationID,ClinicalSignificance,ReportedPhenotypeInfo,CollectionMethod,Submitter
i64,str,str,str,str
4907,"""Pathogenic""","""C1868114:Polydactyly of a trip…","""literature only""","""OMIM"""


In [10]:
df.filter(pl.col("#VariationID") == 3370406)

#VariationID,ClinicalSignificance,ReportedPhenotypeInfo,CollectionMethod,Submitter
i64,str,str,str,str
3370406,"""Pathogenic""","""C5975503:Spermatogenic failure…","""literature only""","""OMIM"""
3370406,"""Pathogenic""","""C5975510:Premature ovarian fai…","""literature only""","""OMIM"""


In [11]:
df["CollectionMethod"].value_counts()

CollectionMethod,count
str,u32
"""literature only""",32910


In [12]:
# comma is not good, need an escape character, maybe "|"

In [13]:
df.filter(pl.col("ReportedPhenotypeInfo").str.contains("|", literal=True))

#VariationID,ClinicalSignificance,ReportedPhenotypeInfo,CollectionMethod,Submitter
i64,str,str,str,str


In [14]:
df = df.group_by("#VariationID").agg(pl.col("ReportedPhenotypeInfo").unique()).with_columns(pl.col("ReportedPhenotypeInfo").list.join("|"))
df

#VariationID,ReportedPhenotypeInfo
i64,str
5282,"""C2931875:Hermansky-Pudlak synd…"
11168,"""C0398689:Hyper-IgM syndrome ty…"
2443903,"""C5830283:Congenital myopathy 1…"
430614,"""C4478383:Intellectual disabili…"
144068,"""C3539506:Hereditary spastic pa…"
…,…
547999,"""C5562061:Rauch-Steindl syndrom…"
120184,"""C4040739:3-methylglutaconic ac…"
4701,"""C5201146:Blau syndrome"""
545022,"""C4693870:Ehlers-Danlos syndrom…"


In [15]:
df.filter(pl.col("ReportedPhenotypeInfo").str.contains("|", literal=True))

#VariationID,ReportedPhenotypeInfo
i64,str
7911,"""C1858806:Cone-rod dystrophy 3|…"
11549,"""C2678061:X-linked scapuloperon…"
14440,"""C3150943:Long QT syndrome 2|C3…"
11930,"""C0398691:Hyperimmunoglobulin D…"
979212,"""C5436637:Spastic paraplegia 83…"
…,…
1333342,"""C5830453:Congenital myopathy 2…"
9396,"""C1832680:Dilated cardiomyopath…"
13594,"""C4551985:Arterial calcificatio…"
9283,"""C1851920:Dystonia 5|CN322657:D…"


In [30]:
df = df.rename({"#VariationID": "clinvar_id"})
df

clinvar_id,ReportedPhenotypeInfo
i64,str
5282,"""C2931875:Hermansky-Pudlak synd…"
11168,"""C0398689:Hyper-IgM syndrome ty…"
2443903,"""C5830283:Congenital myopathy 1…"
430614,"""C4478383:Intellectual disabili…"
144068,"""C3539506:Hereditary spastic pa…"
…,…
547999,"""C5562061:Rauch-Steindl syndrom…"
120184,"""C4040739:3-methylglutaconic ac…"
4701,"""C5201146:Blau syndrome"""
545022,"""C4693870:Ehlers-Danlos syndrom…"


In [16]:
# TODO: merge ids with those in the vcf (hopefully there)

In [18]:
#!wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/weekly/clinvar_20250601.vcf.gz -O clinvar.vcf.gz

In [37]:
NUCLEOTIDES = list("ACGT")
CHROMS = [str(i) for i in range(1, 23)] + ['X', 'Y']
COORDINATES = ["chrom", "pos", "ref", "alt"]

In [29]:
V = V.filter(pl.col("chrom").is_in(CHROMS), pl.col("ref").is_in(NUCLEOTIDES), pl.col("alt").is_in(NUCLEOTIDES))
V

chrom,pos,ref,alt,clinvar_id
str,i64,str,str,i64
"""1""",69134,"""A""","""G""",2205837
"""1""",69314,"""T""","""G""",3205580
"""1""",69423,"""G""","""A""",3205581
"""1""",69581,"""C""","""G""",2252161
"""1""",69682,"""G""","""A""",2396347
…,…,…,…,…
"""Y""",14830121,"""C""","""A""",391879
"""Y""",14840423,"""C""","""T""",2689591
"""Y""",14840785,"""C""","""T""",770316
"""Y""",14840887,"""C""","""T""",2661892


In [31]:
V = V.join(df, how="inner", on="clinvar_id")
V

chrom,pos,ref,alt,clinvar_id,ReportedPhenotypeInfo
str,i64,str,str,i64,str
"""1""",1014143,"""C""","""T""",183381,"""C4015293:Mendelian susceptibil…"
"""1""",1014359,"""G""","""T""",161454,"""C4015293:Mendelian susceptibil…"
"""1""",1041582,"""C""","""T""",126556,"""C3808739:Congenital myasthenic…"
"""1""",1050575,"""G""","""C""",18241,"""C3808739:Congenital myasthenic…"
"""1""",1050763,"""G""","""T""",126555,"""C3808739:Congenital myasthenic…"
…,…,…,…,…,…
"""Y""",2787426,"""C""","""G""",9739,"""C2748896:46,XY sex reversal 1"""
"""Y""",2787551,"""C""","""T""",9754,"""C2748896:46,XY sex reversal 1"""
"""Y""",2787592,"""A""","""T""",9751,"""C2748896:46,XY sex reversal 1"""
"""Y""",2787600,"""G""","""A""",9753,"""C2748896:46,XY sex reversal 1"""


In [43]:
len(V), len(df)  # TODO: we are dropping too many, need to understand why
# start by analyzing some specific examples, are there in clinvar database?

(23401, 31677)

In [34]:
V.filter(clinvar_id=4907)

chrom,pos,ref,alt,clinvar_id,ReportedPhenotypeInfo
str,i64,str,str,i64,str
"""7""",156791480,"""G""","""A""",4907,"""C1868114:Polydactyly of a trip…"


In [36]:
traitgym = pl.read_parquet('hf://datasets/songlab/TraitGym/mendelian_traits_matched_9/test.parquet').filter(pl.col("label"))
traitgym

chrom,pos,ref,alt,OMIM,consequence,label,tss_dist,match_group
str,i64,str,str,str,str,bool,i64,str
"""1""",7961859,"""C""","""G""","""MIM 606324""","""PLS""",true,34,"""PLS_0"""
"""1""",9943502,"""A""","""T""","""MIM 608553""","""5_prime_UTR_variant""",true,26,"""5_prime_UTR_variant_0"""
"""1""",9943503,"""C""","""T""","""MIM 608553""","""5_prime_UTR_variant""",true,27,"""5_prime_UTR_variant_1"""
"""1""",11023351,"""G""","""A""","""MIM 612069""","""3_prime_UTR_variant""",true,1206,"""3_prime_UTR_variant_0"""
"""1""",21509427,"""C""","""T""","""MIM 241500""","""5_prime_UTR_variant""",true,0,"""5_prime_UTR_variant_2"""
…,…,…,…,…,…,…,…,…
"""X""",155022770,"""A""","""G""","""MIM 306700""","""PLS""",true,46,"""PLS_57"""
"""X""",155022771,"""G""","""A""","""MIM 306700""","""PLS""",true,47,"""PLS_62"""
"""X""",155022773,"""A""","""T""","""MIM 306700""","""PLS""",true,49,"""PLS_58"""
"""X""",155022807,"""T""","""C""","""MIM 306700""","""PLS""",true,83,"""PLS_59"""


In [39]:
x = V.join(traitgym, how="inner", on=COORDINATES)  # dropping too many, seems random though, no obvious pattern
x

chrom,pos,ref,alt,clinvar_id,ReportedPhenotypeInfo,OMIM,consequence,label,tss_dist,match_group
str,i64,str,str,i64,str,str,str,bool,i64,str
"""1""",11023351,"""G""","""A""",5239,"""C3148872:FRONTOTEMPORAL DEMENT…","""MIM 612069""","""3_prime_UTR_variant""",true,1206,"""3_prime_UTR_variant_0"""
"""1""",90916206,"""C""","""T""",31109,"""C3279997:Myopia 21, autosomal …","""MIM 614167""","""3_prime_UTR_variant""",true,105250,"""3_prime_UTR_variant_3"""
"""1""",112956192,"""C""","""T""",8916,"""C1864902:Exercise-induced hype…","""MIM 610021""","""5_prime_UTR_variant""",true,3,"""5_prime_UTR_variant_5"""
"""1""",155301478,"""C""","""G""",1515,"""C0340968:Pyruvate kinase defic…","""MIM 266200""","""PLS""",true,39,"""PLS_2"""
"""1""",160032009,"""G""","""C""",1288,"""C5201145:Hypercoagulability sy…","""MIM 610293""","""PLS""",true,18,"""PLS_3"""
…,…,…,…,…,…,…,…,…,…,…
"""X""",139530730,"""G""","""A""",641767,"""C5848256:Hemophilia B leyden""","""MIM 306900""","""upstream_gene_variant""",true,8,"""upstream_gene_variant_17"""
"""X""",139530731,"""A""","""T""",10645,"""C5848256:Hemophilia B leyden""","""MIM 306900""","""upstream_gene_variant""",true,7,"""upstream_gene_variant_19"""
"""X""",139530743,"""T""","""C""",10644,"""C5848256:Hemophilia B leyden""","""MIM 306900""","""5_prime_UTR_variant""",true,3,"""5_prime_UTR_variant_107"""
"""X""",139530748,"""A""","""G""",10646,"""C5848256:Hemophilia B leyden""","""MIM 306900""","""5_prime_UTR_variant""",true,8,"""5_prime_UTR_variant_110"""


In [41]:
x["consequence"].value_counts()

consequence,count
str,u32
"""pELS_flank""",3
"""5_prime_UTR_variant""",41
"""dELS""",9
"""dELS_flank""",3
"""3_prime_UTR_variant""",11
"""PLS""",16
"""upstream_gene_variant""",5
"""intron_variant""",1
"""non_coding_transcript_exon_var…",31


In [42]:
# maybe some are no longer pathogenic according to OMIM?
# or maybe, Smedley et al. did their own literature review rather than take them from OMIM
# TODO: check if these missing variants are perhaps not in UCSC genome browser OMIM
traitgym.join(V, how="anti", on=COORDINATES)

chrom,pos,ref,alt,OMIM,consequence,label,tss_dist,match_group
str,i64,str,str,str,str,bool,i64,str
"""1""",7961859,"""C""","""G""","""MIM 606324""","""PLS""",true,34,"""PLS_0"""
"""1""",9943502,"""A""","""T""","""MIM 608553""","""5_prime_UTR_variant""",true,26,"""5_prime_UTR_variant_0"""
"""1""",9943503,"""C""","""T""","""MIM 608553""","""5_prime_UTR_variant""",true,27,"""5_prime_UTR_variant_1"""
"""1""",21509427,"""C""","""T""","""MIM 241500""","""5_prime_UTR_variant""",true,0,"""5_prime_UTR_variant_2"""
"""1""",25816825,"""T""","""C""","""MIM 602771""","""3_prime_UTR_variant""",true,1814,"""3_prime_UTR_variant_1"""
…,…,…,…,…,…,…,…,…
"""X""",155022770,"""A""","""G""","""MIM 306700""","""PLS""",true,46,"""PLS_57"""
"""X""",155022771,"""G""","""A""","""MIM 306700""","""PLS""",true,47,"""PLS_62"""
"""X""",155022773,"""A""","""T""","""MIM 306700""","""PLS""",true,49,"""PLS_58"""
"""X""",155022807,"""T""","""C""","""MIM 306700""","""PLS""",true,83,"""PLS_59"""
