# Scheduled Integration of ClinVar Gene Variant-Disease Data into WikiData

ClinVar aggregates information about genomic variation and its relationship to human health <br>
CC0 https://www.ncbi.nlm.nih.gov/clinvar/

This scheduled bot operates through WDI to integrate ClinVar Gene Variant-Disease Data <br>
https://github.com/SuLab/GeneWikiCentral/issues/50 <br>

Python script contributions, in order: Sabah Ul-Hasan, Andrew I Su

In [2]:
# Download data from NCBI

# Make sure os has wget installed, or the command wont work
import os
os.system('wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz') 

# Create time stamp of when downloaded (error if isoformat() used)
from datetime import datetime
timeStringNow = datetime.now().strftime("+%Y-%m-%dT00:00:00Z")

# Unzip the file
import gzip
import shutil
with gzip.open('variant_summary.txt.gz', 'rb') as f_in:
    with open('variant_summary.txt', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
        
# Convert .txt file to .csv
import pandas as pd 
import csv

txt_file = r"variant_summary.txt"
csv_file = r"variant_summary.csv"

with open(txt_file, "r") as in_text:
    in_reader = csv.reader(in_text, delimiter = '\t')
    with open(csv_file, "w") as out_csv:
        out_writer = csv.writer(out_csv)
        for row in in_reader:
            out_writer.writerow(row)

# Import .csv file and read first 5 rows
data = pd.read_csv("variant_summary.csv") 
data.shape # 31 columns, 1319815 rows
data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,ReferenceAllele,AlternateAllele,Cytogenetic,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories,VariationID
0,191195,duplication,NM_006920.6(SCN1A):c.2011-13dup,6323,SCN1A,HGNC:10585,Conflicting interpretations of pathogenicity,0,"Feb 15, 2019",549232924,...,T,TA,2q24.3,"criteria provided, conflicting interpretations",3,,N,-,2,194032
1,191196,single nucleotide variant,NM_001182.5(ALDH7A1):c.1093+1G>A,501,ALDH7A1,HGNC:877,Pathogenic,1,"Feb 23, 2015",794727058,...,C,T,5q23.2,"criteria provided, single submitter",1,,N,-,2,194033
2,191196,single nucleotide variant,NM_001182.5(ALDH7A1):c.1093+1G>A,501,ALDH7A1,HGNC:877,Pathogenic,1,"Feb 23, 2015",794727058,...,C,T,5q23.2,"criteria provided, single submitter",1,,N,-,2,194033
3,191197,single nucleotide variant,NM_001195263.2(PDZD7):c.1752T>C (p.Tyr584=),79955,PDZD7,HGNC:26257,Likely benign,0,"Mar 16, 2015",368563439,...,A,G,10q24.31,"criteria provided, single submitter",1,,N,-,2,194034
4,191197,single nucleotide variant,NM_001195263.2(PDZD7):c.1752T>C (p.Tyr584=),79955,PDZD7,HGNC:26257,Likely benign,0,"Mar 16, 2015",368563439,...,A,G,10q24.31,"criteria provided, single submitter",1,,N,-,2,194034


In [3]:
# Clean-up the data for identification and integration

# Columns to keep
new = data[['Type', 'Name','GeneID','GeneSymbol','HGNC_ID', 'ClinicalSignificance','VariationID', 
            'ChromosomeAccession', 'Start', 'ReferenceAllele', 'AlternateAllele',
            'ReviewStatus']]
new.shape # 12 columns, 1319815 rows
new.head()

Unnamed: 0,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,VariationID,ChromosomeAccession,Start,ReferenceAllele,AlternateAllele,ReviewStatus
0,duplication,NM_006920.6(SCN1A):c.2011-13dup,6323,SCN1A,HGNC:10585,Conflicting interpretations of pathogenicity,194032,NC_000002.12,166042429,T,TA,"criteria provided, conflicting interpretations"
1,single nucleotide variant,NM_001182.5(ALDH7A1):c.1093+1G>A,501,ALDH7A1,HGNC:877,Pathogenic,194033,NC_000005.9,125891622,C,T,"criteria provided, single submitter"
2,single nucleotide variant,NM_001182.5(ALDH7A1):c.1093+1G>A,501,ALDH7A1,HGNC:877,Pathogenic,194033,NC_000005.10,126555930,C,T,"criteria provided, single submitter"
3,single nucleotide variant,NM_001195263.2(PDZD7):c.1752T>C (p.Tyr584=),79955,PDZD7,HGNC:26257,Likely benign,194034,NC_000010.10,102772013,A,G,"criteria provided, single submitter"
4,single nucleotide variant,NM_001195263.2(PDZD7):c.1752T>C (p.Tyr584=),79955,PDZD7,HGNC:26257,Likely benign,194034,NC_000010.11,101012256,A,G,"criteria provided, single submitter"


In [6]:
# Create new column that converts 'ReviewStatus' to star rating
## https://www.ncbi.nlm.nih.gov/clinvar/docs/review_status/

new['Rating'] = "" # Create empty column for gold star rating

## Convert strings from 'ReviewStatus' to 'Rating' 
new.loc[new['ReviewStatus'].str.contains('no assertion provided'), 'Rating'] = 'none'
new.loc[new['ReviewStatus'].str.contains('no assertion criteria provided'), 'Rating'] = 'none'
new.loc[new['ReviewStatus'].str.contains('no assertion for the individual variant'), 'Rating'] = 'none'
new.loc[new['ReviewStatus'].str.contains('criteria provided, single submitter'), 'Rating'] = 'one'
new.loc[new['ReviewStatus'].str.contains('criteria provided, conflicting interpretations'), 'Rating'] = 'one'
new.loc[new['ReviewStatus'].str.contains('criteria provided, conflicting interpretations'), 'Rating'] = 'one'
new.loc[new['ReviewStatus'].str.contains('criteria provided, multiple submitters, no conflicts'), 'Rating'] = 'two'
new.loc[new['ReviewStatus'].str.contains('reviewed by expert panel'), 'Rating'] = 'three'
new.loc[new['ReviewStatus'].str.contains('practice guideline'), 'Rating'] = 'four'

## Create a report that automatically outputs a summary of these each download?
# Keep anything with 'four' or more stars in 'Rating' column
four = new[new['Rating'].str.contains('four')] # excludes both one and none
four.shape # 13 columns, 63 rows
# Keep anything noted as 'single nucleotide variant' in 'Type' column
snv = four[four['Type'].str.contains('single nucleotide variant')]
snv.shape # 54 rows
# Keep anything with 'Pathogenic' in the 'ClinicalSignificance' column
patho = snv[snv['ClinicalSignificance'].str.contains('Pathogenic')]
patho.shape # 38 rows 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


(38, 13)

In [85]:
# Create HGVS Column and IDs
snv['HGVS'] = "" # Create empty column for HGVS nomenclature

test = snv
test['HGVS']=test['ChromosomeAccession']+':g.'+test['Start'].astype(str)+test['ReferenceAllele']+'>'+test['AlternateAllele']
test.head()
# NC_000007.13:g.140453136A>T

# ChromosomeAccession 
# NC_000007.13

# Start
# 140453136

#ReferenceAllele
# A

#AlternateAllele
# T

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,VariationID,ChromosomeAccession,Start,ReferenceAllele,AlternateAllele,ReviewStatus,Rating,HGVS
5,single nucleotide variant,NM_001369.2(DNAH5):c.1585C>T (p.Arg529Trp),1767,DNAH5,HGNC:2950,Uncertain significance,194035,NC_000005.9,13911554,G,A,"criteria provided, multiple submitters, no con...",two,NC_000005.9:g.13911554G>A
6,single nucleotide variant,NM_001369.2(DNAH5):c.1585C>T (p.Arg529Trp),1767,DNAH5,HGNC:2950,Uncertain significance,194035,NC_000005.10,13911445,G,A,"criteria provided, multiple submitters, no con...",two,NC_000005.10:g.13911445G>A
7,single nucleotide variant,NM_001429.4(EP300):c.2240C>T (p.Pro747Leu),2033,EP300,HGNC:3373,Benign/Likely benign,194036,NC_000022.10,41543949,C,T,"criteria provided, multiple submitters, no con...",two,NC_000022.10:g.41543949C>T
8,single nucleotide variant,NM_001429.4(EP300):c.2240C>T (p.Pro747Leu),2033,EP300,HGNC:3373,Benign/Likely benign,194036,NC_000022.11,41147945,C,T,"criteria provided, multiple submitters, no con...",two,NC_000022.11:g.41147945C>T
11,single nucleotide variant,NM_001458.4(FLNC):c.1902G>A (p.Glu634=),2318,FLNC,HGNC:3756,Benign,194038,NC_000007.13,128481312,G,A,"criteria provided, multiple submitters, no con...",two,NC_000007.13:g.128481312G>A


In [None]:
# Jan 6
# Keep anything with 'two' or more stars in 'Rating' column
twoplus = new[~new['Rating'].str.contains('one')] # excludes both one and none
twoplus.shape # 13 columns, 196530 rows: 1123285 removed, 14.9 % of all data usable
# Keep anything noted as 'single nucleotide variant' in 'Type' column
snv = twoplus[twoplus['Type'].str.contains('single nucleotide variant')]
snv.shape # 174733 rows, or 88.9% of data with two or more stars (13.2% of all data)
## 83.2 % of total data, prior to star rating filter, are SNVs
# Keep anything with 'Pathogenic' in the 'ClinicalSignificance' column
patho = snv[snv['ClinicalSignificance'].str.contains('Pathogenic')]
patho.shape # 20040 rows, or 10.2% of data with two or more stars that are snvs (1.5% of all data)
## 12.8 % of total data, prior to star rating filter, are Pathogenic

In [1]:
# Installations by shell 
!pip install --upgrade pip # Installs pip, ensures it's up-to-date
!pip install tqdm # Visualizes installation progress (progress bar)
!pip install termcolor # For color-coding printed output
!pip install wikidataintegrator # For wikidata

Requirement already up-to-date: pip in /Users/sulhasan/anaconda3/lib/python3.7/site-packages (19.3.1)


In [2]:
# Installations by python
from wikidataintegrator import wdi_core, wdi_login # Core and login from wikidataintegrator module
from wikidataintegrator.ref_handlers import update_retrieved_if_new_multiple_refs # For retrieving references
import copy # Copies references needed in the .csv for uploading to wikidata
from datetime import datetime # For identifying the current date and time
import time # For keeping track of total for loop run time

import os # OS package to ensure interaction between the modules (ie WDI) and current OS being used

import pandas as pd # Pandas for data organization, then abbreviated to pd
import numpy as np # Another general purpose package

### ClinVar Gen gene-disease validity data

In [55]:
# Automate this from ftp file later...
import urllib 

file = urllib.request.urlopen('ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz', 'variant_summary.txt.gz')

with open('variant_summary.txt') as f:
    mylist = list(f)

print(mylist)

[]


### Login for running WDI

In [7]:
print("Logging in...") 

# Enter your own username and password ** to be updated to ProteinBoxBot
os.environ["WDUSER"] = "username" # Uses os package to call and set the environment for wikidata username
os.environ["WDPASS"] = "password"

# Conditional that outputs error command if not in the local python environment
if "WDUSER" in os.environ and "WDPASS" in os.environ: 
    WDUSER = os.environ['WDUSER']
    WDPASS = os.environ['WDPASS']
else: 
    raise ValueError("WDUSER and WDPASS must be specified in local.py or as environment variables")      

# Sets attributed username and password as 'login'
login = wdi_login.WDLogin(WDUSER, WDPASS) 

Logging in...
https://www.wikidata.org/w/api.php
Successfully logged in as Sulhasan
