In [1]:
import polars as pl

In [2]:
#!wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/submission_summary.txt.gz

In [8]:
!zcat submission_summary.txt.gz | head -n 18

##Overview of interpretation, phenotypes, observations, and methods reported in each current submission 
##Explanation of the columns in this report
#VariationID:                   the identifier assigned by ClinVar and used to build the URL, namely https://ncbi.nlm.nih.gov/clinvar/VariationID
#ClinicalSignificance:          the germline classification on this submitted record
#DateLastEvaluated:             the last date the classification on this record was evaluated by the submitter
#Description:                   an optional free text description comment describing the rationale for the classification
#SubmittedPhenotypeInfo:        the name(s) or identifier(s) submitted as the condition for the classification 
#ReportedPhenotypeInfo:         the MedGen identifier/name combinations that the submitted condition for the classification maps to. 'na' means there is no public identifer in MedGen for the condition.
#ReviewStatus:                  the level of review for this submitted re

In [47]:
df = pl.read_csv(
    "submission_summary.txt.gz", separator="\t", skip_rows=18,
    columns=[
        "#VariationID",
        "ClinicalSignificance",
        "ReportedPhenotypeInfo",
        "CollectionMethod",
        "Submitter",
    ]
)
df

#VariationID,ClinicalSignificance,ReportedPhenotypeInfo,CollectionMethod,Submitter
i64,str,str,str,str
2,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""clinical testing""","""Paris Brain Institute, Inserm …"
2,"""Pathogenic""","""C3661900:not provided""","""clinical testing""","""Athena Diagnostics"""
2,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""literature only""","""OMIM"""
2,"""Likely pathogenic""","""na:Macular dystrophy with or w…","""research""","""Ophthalmic Genetics Group, Ins…"
3,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""literature only""","""OMIM"""
…,…,…,…,…
3900693,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900694,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900695,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900696,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""


In [48]:
df = df.filter(Submitter="OMIM")
df

#VariationID,ClinicalSignificance,ReportedPhenotypeInfo,CollectionMethod,Submitter
i64,str,str,str,str
2,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""literature only""","""OMIM"""
3,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""literature only""","""OMIM"""
4,"""Uncertain significance""","""C4551772:Galloway-Mowat syndro…","""literature only""","""OMIM"""
5,"""Pathogenic""","""C4748791:Mitochondrial complex…","""literature only""","""OMIM"""
6,"""Pathogenic""","""C4748791:Mitochondrial complex…","""literature only""","""OMIM"""
…,…,…,…,…
3900693,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900694,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900695,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900696,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""


In [49]:
unambiguous_ids = df.group_by("#VariationID").n_unique().filter(ClinicalSignificance=1)["#VariationID"]
df = df.filter(pl.col("#VariationID").is_in(unambiguous_ids))
df

#VariationID,ClinicalSignificance,ReportedPhenotypeInfo,CollectionMethod,Submitter
i64,str,str,str,str
2,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""literature only""","""OMIM"""
3,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""literature only""","""OMIM"""
4,"""Uncertain significance""","""C4551772:Galloway-Mowat syndro…","""literature only""","""OMIM"""
5,"""Pathogenic""","""C4748791:Mitochondrial complex…","""literature only""","""OMIM"""
6,"""Pathogenic""","""C4748791:Mitochondrial complex…","""literature only""","""OMIM"""
…,…,…,…,…
3900693,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900694,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900695,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900696,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""


In [50]:
df = df.filter(pl.col("ClinicalSignificance") == "Pathogenic")
df

#VariationID,ClinicalSignificance,ReportedPhenotypeInfo,CollectionMethod,Submitter
i64,str,str,str,str
2,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""literature only""","""OMIM"""
3,"""Pathogenic""","""C3150901:Hereditary spastic pa…","""literature only""","""OMIM"""
5,"""Pathogenic""","""C4748791:Mitochondrial complex…","""literature only""","""OMIM"""
6,"""Pathogenic""","""C4748791:Mitochondrial complex…","""literature only""","""OMIM"""
7,"""Pathogenic""","""C4748792:Mitochondrial complex…","""literature only""","""OMIM"""
…,…,…,…,…
3900693,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900694,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900695,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""
3900696,"""Pathogenic""","""na:GUILLOUET-GORDON SYNDROME""","""literature only""","""OMIM"""


In [13]:
# TODO: should check that the ClinicalSignificance is the one from OMIM, not the aggregate one from ClinVar
# yes, I think it's working

In [51]:
df.filter(pl.col("#VariationID") == 4907)

#VariationID,ClinicalSignificance,ReportedPhenotypeInfo,CollectionMethod,Submitter
i64,str,str,str,str
4907,"""Pathogenic""","""C1868114:Polydactyly of a trip…","""literature only""","""OMIM"""


In [52]:
df.filter(pl.col("#VariationID") == 3370406)

#VariationID,ClinicalSignificance,ReportedPhenotypeInfo,CollectionMethod,Submitter
i64,str,str,str,str
3370406,"""Pathogenic""","""C5975503:Spermatogenic failure…","""literature only""","""OMIM"""
3370406,"""Pathogenic""","""C5975510:Premature ovarian fai…","""literature only""","""OMIM"""


In [53]:
df["CollectionMethod"].value_counts()

CollectionMethod,count
str,u32
"""literature only""",32910


In [None]:
# comma is not good, need an escape character, maybe "|"

In [65]:
df.filter(pl.col("ReportedPhenotypeInfo").str.contains("|", literal=True))

#VariationID,ClinicalSignificance,ReportedPhenotypeInfo,CollectionMethod,Submitter
i64,str,str,str,str


In [75]:
df = df.group_by("#VariationID").agg(pl.col("ReportedPhenotypeInfo").unique()).with_columns(pl.col("ReportedPhenotypeInfo").list.join("|"))
df

#VariationID,ReportedPhenotypeInfo
i64,str
869368,"""C1845543:Syndromic X-linked in…"
3746,"""C0745103:Hypercholesterolemia,…"
14774,"""C1862939:Amyotrophic lateral s…"
6086,"""C0013364:Familial dysautonomia"""
224330,"""C4225180:Even-plus syndrome"""
…,…
1834,"""C1836517:Senior-Loken syndrome…"
218104,"""C3275447:Ogden syndrome"""
10614,"""C0008533:Hereditary factor IX …"
3346,"""C0268297:3-Oxo-5 alpha-steroid…"


In [76]:
df.filter(pl.col("ReportedPhenotypeInfo").str.contains("|", literal=True))

#VariationID,ReportedPhenotypeInfo
i64,str
9641,"""C1328349:NARP syndrome|C327568…"
916034,"""C5394441:46,xx sex reversal 5|…"
7402,"""C2750452:Waardenburg syndrome …"
1385,"""C3150796:Nephronophthisis 11|C…"
17581,"""C0206711:Pilomatrixoma|C020662…"
…,…
374882,"""C4310634:Dystonia, childhood-o…"
11772,"""C1844696:Oto-palato-digital sy…"
14591,"""C1721007:Pachyonychia congenit…"
9634,"""C1838854:Aminoglycoside-induce…"


In [None]:
# TODO: merge ids with those in the vcf (hopefully there)