# Scheduled Integration of ClinVar Gene Variant-Disease Data into WikiData

ClinVar aggregates information about genomic variation and its relationship to human health <br>
CC0 https://www.ncbi.nlm.nih.gov/clinvar/

This scheduled bot operates through WDI to integrate ClinVar Gene Variant-Disease Data <br>
https://github.com/SuLab/GeneWikiCentral/issues/50 <br>

Python script contributions, in order: Sabah Ul-Hasan, Andrew I Su

In [1]:
# Download data from NCBI

# Make sure os has wget installed, or the command wont work
import os
os.system('wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz') 

# Create time stamp of when downloaded (error if isoformat() used)
from datetime import datetime
timeStringNow = datetime.now().strftime("+%Y-%m-%dT00:00:00Z")

# Unzip the file
import gzip
import shutil
with gzip.open('variant_summary.txt.gz', 'rb') as f_in:
    with open('variant_summary.txt', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
        
# Convert .txt file to .csv
import pandas as pd 
import csv

txt_file = r"variant_summary.txt"
csv_file = r"variant_summary.csv"

with open(txt_file, "r") as in_text:
    in_reader = csv.reader(in_text, delimiter = '\t')
    with open(csv_file, "w") as out_csv:
        out_writer = csv.writer(out_csv)
        for row in in_reader:
            out_writer.writerow(row)

# Import .csv file and read first 5 rows
data = pd.read_csv("variant_summary.csv") 
data.shape # 31 columns, 1319815 rows
data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,ReferenceAllele,AlternateAllele,Cytogenetic,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories,VariationID
0,191195,duplication,NM_006920.6(SCN1A):c.2011-13dup,6323,SCN1A,HGNC:10585,Conflicting interpretations of pathogenicity,0,"Feb 15, 2019",549232924,...,T,TA,2q24.3,"criteria provided, conflicting interpretations",3,,N,-,2,194032
1,191196,single nucleotide variant,NM_001182.5(ALDH7A1):c.1093+1G>A,501,ALDH7A1,HGNC:877,Pathogenic,1,"Feb 23, 2015",794727058,...,C,T,5q23.2,"criteria provided, single submitter",1,,N,-,2,194033
2,191196,single nucleotide variant,NM_001182.5(ALDH7A1):c.1093+1G>A,501,ALDH7A1,HGNC:877,Pathogenic,1,"Feb 23, 2015",794727058,...,C,T,5q23.2,"criteria provided, single submitter",1,,N,-,2,194033
3,191197,single nucleotide variant,NM_001195263.2(PDZD7):c.1752T>C (p.Tyr584=),79955,PDZD7,HGNC:26257,Likely benign,0,"Mar 16, 2015",368563439,...,A,G,10q24.31,"criteria provided, single submitter",1,,N,-,2,194034
4,191197,single nucleotide variant,NM_001195263.2(PDZD7):c.1752T>C (p.Tyr584=),79955,PDZD7,HGNC:26257,Likely benign,0,"Mar 16, 2015",368563439,...,A,G,10q24.31,"criteria provided, single submitter",1,,N,-,2,194034


In [2]:
# Clean-up the data for identification and integration

# Columns to keep
new = data[['Type', 'Name','GeneID','GeneSymbol','HGNC_ID', 'ClinicalSignificance','VariationID', 
            'ChromosomeAccession', 'Chromosome', 'Start', 'ReferenceAllele', 'AlternateAllele',
            'ReviewStatus']]
new.shape # 12 columns, 1319815 rows
new.head()

Unnamed: 0,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,VariationID,ChromosomeAccession,Chromosome,Start,ReferenceAllele,AlternateAllele,ReviewStatus
0,duplication,NM_006920.6(SCN1A):c.2011-13dup,6323,SCN1A,HGNC:10585,Conflicting interpretations of pathogenicity,194032,NC_000002.12,2,166042429,T,TA,"criteria provided, conflicting interpretations"
1,single nucleotide variant,NM_001182.5(ALDH7A1):c.1093+1G>A,501,ALDH7A1,HGNC:877,Pathogenic,194033,NC_000005.9,5,125891622,C,T,"criteria provided, single submitter"
2,single nucleotide variant,NM_001182.5(ALDH7A1):c.1093+1G>A,501,ALDH7A1,HGNC:877,Pathogenic,194033,NC_000005.10,5,126555930,C,T,"criteria provided, single submitter"
3,single nucleotide variant,NM_001195263.2(PDZD7):c.1752T>C (p.Tyr584=),79955,PDZD7,HGNC:26257,Likely benign,194034,NC_000010.10,10,102772013,A,G,"criteria provided, single submitter"
4,single nucleotide variant,NM_001195263.2(PDZD7):c.1752T>C (p.Tyr584=),79955,PDZD7,HGNC:26257,Likely benign,194034,NC_000010.11,10,101012256,A,G,"criteria provided, single submitter"


In [3]:
# Create new column that converts 'ReviewStatus' to star rating
## https://www.ncbi.nlm.nih.gov/clinvar/docs/review_status/

new['Rating'] = "" # Create empty column for gold star rating

## Convert strings from 'ReviewStatus' to 'Rating' 
new.loc[new['ReviewStatus'].str.contains('no assertion provided'), 'Rating'] = 'none'
new.loc[new['ReviewStatus'].str.contains('no assertion criteria provided'), 'Rating'] = 'none'
new.loc[new['ReviewStatus'].str.contains('no assertion for the individual variant'), 'Rating'] = 'none'
new.loc[new['ReviewStatus'].str.contains('criteria provided, single submitter'), 'Rating'] = 'one'
new.loc[new['ReviewStatus'].str.contains('criteria provided, conflicting interpretations'), 'Rating'] = 'one'
new.loc[new['ReviewStatus'].str.contains('criteria provided, conflicting interpretations'), 'Rating'] = 'one'
new.loc[new['ReviewStatus'].str.contains('criteria provided, multiple submitters, no conflicts'), 'Rating'] = 'two'
new.loc[new['ReviewStatus'].str.contains('reviewed by expert panel'), 'Rating'] = 'three'
new.loc[new['ReviewStatus'].str.contains('practice guideline'), 'Rating'] = 'four'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [13]:
# Keep anything with 'three' or more stars in 'Rating' column
threeplus = new[new['Rating'].str.contains('three|four')] # excludes both one and none
threeplus.shape # 14 columns, 1299197 rows
# Keep anything noted as 'single nucleotide variant' in 'Type' column
snv = threeplus[threeplus['Type'].str.contains('single nucleotide variant')]
snv.shape # 1087133 rows
# Keep anything with 'Pathogenic' in the 'ClinicalSignificance' column
patho = snv[snv['ClinicalSignificance'].str.contains('Pathogenic')]
patho.shape # 3778 rows

(3778, 14)

In [21]:
# Create HGVS Column and IDs
patho['HGVS_NC'] = "" # Create empty column for HGVS nomenclature with NC
patho['HGVS_chr'] = "" # Create empty column for HGVS nomenclature with chr
# Naming nomenclature: https://varnomen.hgvs.org/bg-material/numbering/


# Filtering only for SNV addresses any instances of 'del'
# Insertion of GC for Wikidata... address this later
test = patho
test['HGVS_NC']=test['ChromosomeAccession']+':g.'+test['Start'].astype(str)+test['ReferenceAllele']+'>'+test['AlternateAllele']
test['HGVS_chr']='chr'+test['Chromosome'].astype(str)+':g.'+test['Start'].astype(str)+test['ReferenceAllele']+'>'+test['AlternateAllele']
test.head()

test.to_csv("test.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

Unnamed: 0,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,VariationID,ChromosomeAccession,Chromosome,Start,ReferenceAllele,AlternateAllele,ReviewStatus,Rating,HGVS,HGVS_chr,HGVS_NC
6692,single nucleotide variant,NM_206933.3(USH2A):c.8682-9A>G,7399,USH2A,HGNC:12601,Pathogenic,197510,NC_000001.10,1,216040521,T,C,reviewed by expert panel,three,NC_000001.10:g.216040521T>C,chr1:g.216040521T>C,NC_000001.10:g.216040521T>C
6693,single nucleotide variant,NM_206933.3(USH2A):c.8682-9A>G,7399,USH2A,HGNC:12601,Pathogenic,197510,NC_000001.11,1,215867179,T,C,reviewed by expert panel,three,NC_000001.11:g.215867179T>C,chr1:g.215867179T>C,NC_000001.11:g.215867179T>C
25569,single nucleotide variant,NM_000492.3(CFTR):c.1117-1G>A,1080,CFTR,HGNC:1884,Pathogenic,209059,NC_000007.13,7,117182069,G,A,reviewed by expert panel,three,NC_000007.13:g.117182069G>A,chr7:g.117182069G>A,NC_000007.13:g.117182069G>A
25570,single nucleotide variant,NM_000492.3(CFTR):c.1117-1G>A,1080,CFTR,HGNC:1884,Pathogenic,209059,NC_000007.14,7,117542015,G,A,reviewed by expert panel,three,NC_000007.14:g.117542015G>A,chr7:g.117542015G>A,NC_000007.14:g.117542015G>A
25577,single nucleotide variant,NM_000492.3(CFTR):c.2658-1G>C,1080,CFTR,HGNC:1884,Pathogenic,209045,NC_000007.13,7,117243585,G,C,reviewed by expert panel,three,NC_000007.13:g.117243585G>C,chr7:g.117243585G>C,NC_000007.13:g.117243585G>C
25578,single nucleotide variant,NM_000492.3(CFTR):c.2658-1G>C,1080,CFTR,HGNC:1884,Pathogenic,209045,NC_000007.14,7,117603531,G,C,reviewed by expert panel,three,NC_000007.14:g.117603531G>C,chr7:g.117603531G>C,NC_000007.14:g.117603531G>C
25579,single nucleotide variant,NM_000492.3(CFTR):c.3294G>A (p.Trp1098Ter),1080,CFTR,HGNC:1884,Pathogenic,209057,NC_000007.13,7,117251789,G,A,reviewed by expert panel,three,NC_000007.13:g.117251789G>A,chr7:g.117251789G>A,NC_000007.13:g.117251789G>A
25580,single nucleotide variant,NM_000492.3(CFTR):c.3294G>A (p.Trp1098Ter),1080,CFTR,HGNC:1884,Pathogenic,209057,NC_000007.14,7,117611735,G,A,reviewed by expert panel,three,NC_000007.14:g.117611735G>A,chr7:g.117611735G>A,NC_000007.14:g.117611735G>A
39533,single nucleotide variant,NM_000059.3(BRCA2):c.6022A>T (p.Lys2008Ter),675,BRCA2,HGNC:1101,Pathogenic,216029,NC_000013.11,13,32340377,A,T,reviewed by expert panel,three,NC_000013.11:g.32340377A>T,chr13:g.32340377A>T,NC_000013.11:g.32340377A>T
39534,single nucleotide variant,NM_000059.3(BRCA2):c.6022A>T (p.Lys2008Ter),675,BRCA2,HGNC:1101,Pathogenic,216029,NC_000013.10,13,32914514,A,T,reviewed by expert panel,three,NC_000013.10:g.32914514A>T,chr13:g.32914514A>T,NC_000013.10:g.32914514A>T


In [23]:
query = pd.read_csv("/Users/sulhasan/Desktop/Su Lab Projects/ClinVar-Bot_GeneWikiCentral-Issue50/query.csv")  
query = query[query['HGVS'].str.contains('chr|NC_',  na=False)] # subset for NC or chr
query.head()

Unnamed: 0,gene,HGVS
0,http://www.wikidata.org/entity/Q28599633,NC_000003.11:g.37038139T>C
1,http://www.wikidata.org/entity/Q28599639,NC_000003.11:g.37081770A>C
2,http://www.wikidata.org/entity/Q28599624,NC_000003.11:g.37089094_37089095del
3,http://www.wikidata.org/entity/Q28599634,NC_000003.11:g.37090406del
11,http://www.wikidata.org/entity/Q29938348,chr1:g.115258745C>A
12,http://www.wikidata.org/entity/Q29938352,chr1:g.115258748C>T
13,http://www.wikidata.org/entity/Q29938697,chr11:g.108186827C>T
14,http://www.wikidata.org/entity/Q29938644,chr12:g.25380277G>C
15,http://www.wikidata.org/entity/Q29938740,chr13:g.32900751G>C
16,http://www.wikidata.org/entity/Q29938724,chr17:g.41276111C>A


In [19]:
mergedStuff = pd.merge(test, query, on=['HGVS'], how='inner')
mergedStuff.shape 

# 365 of all ClinVar (no filter) match NC_ HGVS of 574 possible in Wikidata
## 80 with two or more stars, SNV, and Pathogenic
### 13 three or more stars (0 with 4)
# 130 of all ClinVar (no filter) match chr HGVS of 244 possible in Wikidata
## 24 with two or more stars, SNV, and Pathogenic
### 0 with three or more stars

(13, 17)

In [None]:
# Keep anything with 'two' or more stars in 'Rating' column
twoplus = new[~new['Rating'].str.contains('one')] # excludes both one and none
twoplus.shape # 13 columns, 196530 rows: 1123285 removed, 14.9 % of all data usable
# Keep anything noted as 'single nucleotide variant' in 'Type' column
snv = twoplus[twoplus['Type'].str.contains('single nucleotide variant')]
snv.shape # 174733 rows, or 88.9% of data with two or more stars (13.2% of all data)
## 83.2 % of total data, prior to star rating filter, are SNVs
# Keep anything with 'Pathogenic' in the 'ClinicalSignificance' column
patho = snv[snv['ClinicalSignificance'].str.contains('Pathogenic')]
patho.shape # 20040 rows, or 10.2% of data with two or more stars that are snvs (1.5% of all data)
## 12.8 % of total data, prior to star rating filter, are Pathogenic

In [None]:
# Keep anything with 'four' or more stars in 'Rating' column
four = new[new['Rating'].str.contains('four')] # excludes both one and none
four.shape # 13 columns, 63 rows
# Keep anything noted as 'single nucleotide variant' in 'Type' column
snv = four[four['Type'].str.contains('single nucleotide variant')]
snv.shape # 54 rows
# Keep anything with 'Pathogenic' in the 'ClinicalSignificance' column
patho = snv[snv['ClinicalSignificance'].str.contains('Pathogenic')]
patho.shape # 38 rows 