## ClinVar Import

Artifact Import for Release `2020-06`.

In [1]:
import io

import fsspec
import pandas as pd
from data_source import catalog

#### Download 

In [2]:
!wget -P /tmp https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/submission_summary.txt.gz

--2020-06-10 22:30:54--  https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/submission_summary.txt.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.7, 2607:f220:41e:250::13
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 57029754 (54M) [application/x-gzip]
Saving to: ‘/tmp/submission_summary.txt.gz.1’


2020-06-10 22:31:08 (3.81 MB/s) - ‘/tmp/submission_summary.txt.gz.1’ saved [57029754/57029754]



In [3]:
# Parse the gzipped, tab-separated download. The first 15 lines are skipped and
# the next line supplies the column names (it starts with `#VariationID`).
# NOTE(review): the skipped lines are presumably a comment preamble; re-check
# the count if the upstream file layout changes.
df = pd.read_csv(
    '/tmp/submission_summary.txt.gz',
    sep='\t',
    skiprows=15,
)
df.head()

Unnamed: 0,#VariationID,ClinicalSignificance,DateLastEvaluated,Description,SubmittedPhenotypeInfo,ReportedPhenotypeInfo,ReviewStatus,CollectionMethod,OriginCounts,Submitter,SCV,SubmittedGeneSymbol,ExplanationOfInterpretation
0,2,Pathogenic,"Jun 29, 2010",-,"SPASTIC PARAPLEGIA 48, AUTOSOMAL RECESSIVE","C3150901:Spastic paraplegia 48, autosomal rece...",no assertion criteria provided,literature only,germline:na,OMIM,SCV000020155.3,AP5Z1,-
1,3,Pathogenic,"Jun 29, 2010",-,SPASTIC PARAPLEGIA 48,"C3150901:Spastic paraplegia 48, autosomal rece...",no assertion criteria provided,literature only,germline:na,OMIM,SCV000020156.5,AP5Z1,-
2,4,Uncertain significance,"Jun 29, 2015",-,RECLASSIFIED - VARIANT OF UNKNOWN SIGNIFICANCE,C4551772:Galloway-Mowat syndrome 1,no assertion criteria provided,literature only,germline:na,OMIM,SCV000020157.2,ZNF592,-
3,5,Pathogenic,"Oct 01, 2010",-,"MITOCHONDRIAL COMPLEX I DEFICIENCY, NUCLEAR TY...","C4748791:Mitochondrial complex 1 deficiency, n...",no assertion criteria provided,literature only,germline:na,OMIM,SCV000020158.5,FOXRED1,-
4,5,Pathogenic,"Dec 07, 2017",The Q232X variant in the FOXRED1 gene has been...,Not Provided,CN517202:not provided,"criteria provided, single submitter",clinical testing,germline:na,GeneDx,SCV000680696.2,FOXRED1,-


In [4]:
# Capture the summary that `DataFrame.info` would normally print into a string,
# so it can be stored as catalog metadata when the entry is created, and echo
# it here for inspection. (`io` is imported in the notebook's top import cell.)
buf = io.StringIO()
df.info(buf=buf)
info = buf.getvalue()
print(info)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1224708 entries, 0 to 1224707
Data columns (total 13 columns):
 #   Column                       Non-Null Count    Dtype 
---  ------                       --------------    ----- 
 0   #VariationID                 1224708 non-null  int64 
 1   ClinicalSignificance         1224708 non-null  object
 2   DateLastEvaluated            1224708 non-null  object
 3   Description                  1224708 non-null  object
 4   SubmittedPhenotypeInfo       1224708 non-null  object
 5   ReportedPhenotypeInfo        1224708 non-null  object
 6   ReviewStatus                 1224708 non-null  object
 7   CollectionMethod             1224708 non-null  object
 8   OriginCounts                 1224708 non-null  object
 9   Submitter                    1224708 non-null  object
 10  SCV                          1224708 non-null  object
 11  SubmittedGeneSymbol          1224703 non-null  object
 12  ExplanationOfInterpretation  1224708 non-null  object
dt

#### Convert

In [5]:
# Write the parsed frame to a local parquet file; this is the file that gets
# uploaded to the catalog location later in the notebook.
df.to_parquet('/tmp/submission_summary.parquet')

In [6]:
# Report the on-disk size of the converted parquet file.
!du -sh /tmp/submission_summary.parquet

84M	/tmp/submission_summary.parquet


#### Create Catalog Entry

In [7]:
# Describe the converted artifact for the catalog. The attached metadata holds
# the captured `df.info()` text and the number of rows in the frame.
created_at = pd.to_datetime('2020-06-01').to_pydatetime()
entry_metadata = {'info': info, 'nrow': len(df)}

entry = catalog.create_entry(
    source='clinvar',
    slug='submission_summary',
    version='v2020-06',
    created=created_at,
    format='parquet',
    type='file',
    metadata=entry_metadata,
)

# Display the entry as a plain dict to check it before it is added.
entry.dict()

{'source': {'slug': 'clinvar', 'name': None, 'description': None},
 'artifact': {'slug': 'submission_summary',
  'version': 'v2020-06',
  'created': datetime.datetime(2020, 6, 1, 0, 0),
  'formats': [{'name': 'parquet',
    'type': 'file',
    'default': True,
    'properties': None}],
  'name': None,
  'metadata': {'info': "<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 1224708 entries, 0 to 1224707\nData columns (total 13 columns):\n #   Column                       Non-Null Count    Dtype \n---  ------                       --------------    ----- \n 0   #VariationID                 1224708 non-null  int64 \n 1   ClinicalSignificance         1224708 non-null  object\n 2   DateLastEvaluated            1224708 non-null  object\n 3   Description                  1224708 non-null  object\n 4   SubmittedPhenotypeInfo       1224708 non-null  object\n 5   ReportedPhenotypeInfo        1224708 non-null  object\n 6   ReviewStatus                 1224708 non-null  object\n 7   CollectionMe

In [8]:
# This will add the entry and save it to the default
# catalog location ($REPO/catalog.yaml)
# NOTE(review): `overwrite=True` presumably replaces an existing entry for the
# same artifact/version instead of failing; confirm against `catalog.add_entry`.
catalog.add_entry(entry, overwrite=True)

#### Upload

In [9]:
# Ask the entry for its storage URL; the upload step writes the parquet file
# to this location.
url = entry.url()
url

'gs://public-data-source/catalog/clinvar/submission_summary/v2020-06/20200601T000000/data.parquet'

In [10]:
# Filesystem object exposed by the entry (displayed as a gcsfs GCSFileSystem in
# the cell output); it is used to perform the upload.
fs = entry.fs
fs

<gcsfs.core.GCSFileSystem at 0x7f8e976da290>

In [11]:
# Copy the locally converted parquet file to the entry's storage URL.
fs.upload('/tmp/submission_summary.parquet', url)

#### Test Read

If this entry were used in another project, usage would look like this:

In [12]:
# Imports are repeated here on purpose: this section mimics a separate project
# that consumes the catalog.
from data_source import catalog
from pyspark.sql.session import SparkSession
# Reuse the active Spark session if one exists, otherwise create one.
spark = SparkSession.builder.getOrCreate()
# Load the catalog as a pandas frame. Note that `df` is rebound here: it no
# longer refers to the ClinVar submission frame built earlier in the notebook.
df = catalog.load().to_pandas()
# Show only the catalog rows that belong to the ClinVar source.
df.query('source_slug == "clinvar"')

Unnamed: 0,source_slug,artifact_slug,artifact_version,artifact_created,artifact_formats,artifact_metadata_nrow,artifact_metadata_schema,storage_slug,storage_scheme,storage_bucket,storage_root,storage_project,artifact_metadata_info,entry
2,clinvar,submission_summary,v2020-06,2020-06-01,"[{'name': 'parquet', 'type': 'file', 'default'...",1224708,,gcs,gs,public-data-source,catalog,target-ranking,<class 'pandas.core.frame.DataFrame'>\nRangeIn...,"source=Source(slug='clinvar', name=None, descr..."


In [13]:
# Take the first ClinVar row of the catalog frame, pull out its `entry` object
# and ask it for the storage URL of the artifact.
url = df.query('source_slug == "clinvar"')['entry'].iloc[0].url()
url

'gs://public-data-source/catalog/clinvar/submission_summary/v2020-06/20200601T000000/data.parquet'

In [14]:
import fsspec

# Resolve the remote URL through fsspec's `simplecache::` protocol, with
# /tmp/simplecache configured as the cache directory, and pass the resulting
# path to Spark.
path = fsspec.open_local(
    'simplecache::' + url,
    simplecache={'cache_storage': '/tmp/simplecache'},
)

# `df` is rebound once more, this time to a Spark DataFrame.
df = spark.read.parquet(path)

# Preview five rows of three columns, truncating cell text at 50 characters.
preview_columns = ('#VariationID', 'ClinicalSignificance', 'SubmittedPhenotypeInfo')
df.select(*preview_columns).show(n=5, truncate=50)

+------------+----------------------+--------------------------------------------------+
|#VariationID|  ClinicalSignificance|                            SubmittedPhenotypeInfo|
+------------+----------------------+--------------------------------------------------+
|           2|            Pathogenic|        SPASTIC PARAPLEGIA 48, AUTOSOMAL RECESSIVE|
|           3|            Pathogenic|                             SPASTIC PARAPLEGIA 48|
|           4|Uncertain significance|    RECLASSIFIED - VARIANT OF UNKNOWN SIGNIFICANCE|
|           5|            Pathogenic|MITOCHONDRIAL COMPLEX I DEFICIENCY, NUCLEAR TYP...|
|           5|            Pathogenic|                                      Not Provided|
+------------+----------------------+--------------------------------------------------+
only showing top 5 rows

