# Scheduled Integration of ClinVar Gene Variant-Disease Data into WikiData

ClinVar aggregates information about genomic variation and its relationship to human health <br>
CC0 https://www.ncbi.nlm.nih.gov/clinvar/

This scheduled bot operates monthly through WDI to integrate ClinVar Gene Variant-Disease Data <br>
https://www.ncbi.nlm.nih.gov/clinvar/docs/ftp_primer/ (variant_summary) <br>
https://github.com/SuLab/GeneWikiCentral/issues/50 <br>
http://jenkins.sulab.org/ <br>

Python script contributions, in order: Sabah Ul-Hasan, Andrew I Su, Tong Shu Li

## Checks and Tests

- x

In [76]:
# Relevant Modules and Libraries

import os # OS package to ensure interaction between the modules (ie WDI) and current OS being used

from datetime import datetime # For identifying the current date and time
import time # Keep track of total for loop run time

import gzip # For unzip of files
import shutil # Copies content of source file(s)
import csv # For converting file(s) to csv format

import pandas as pd # For data organization, abbreviated to pd
import numpy as np # For data organization, abbreviated as np

from wikidataintegrator import wdi_core, wdi_login # Core and login from wikidataintegrator module
from wikidataintegrator.ref_handlers import update_retrieved_if_new_multiple_refs # For retrieving references
from wikidataintegrator.wdi_helpers import try_write # For writing new item pages

In [5]:
# Download data from NCBI

## Make sure os has wget installed, or the following command wont work
os.system('wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz') 
timeStringNow = datetime.now().strftime("+%Y-%m-%dT00:00:00Z") # time stamp of download 

## Unzip the file
with gzip.open('variant_summary.txt.gz', 'rb') as f_in:
    with open('variant_summary.txt', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
        
## Convert .txt file to .csv
txt_file = r"variant_summary.txt"
csv_file = r"variant_summary.csv"

with open(txt_file, "r") as in_text:
    in_reader = csv.reader(in_text, delimiter = '\t')
    with open(csv_file, "w") as out_csv:
        out_writer = csv.writer(out_csv)
        for row in in_reader:
            out_writer.writerow(row)

(1339084, 31)

In [108]:
## Import .csv file and read first 5 rows
df = pd.read_csv("variant_summary.csv") 
df.shape # 31 columns, 1339084 rows
df.head()

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,ReferenceAllele,AlternateAllele,Cytogenetic,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories,VariationID
0,156440,copy number loss,GRCh38/hg38 13q12.12(chr13:22822881-23533846)x1,-1,LINC00327;LINC00621;SACS;SACS-AS1;SGCG,-,Uncertain significance,0,"Feb 25, 2011",-1,...,na,na,13q12.12,no assertion criteria provided,1,,N,"dbVar:nssv706495,dbVar:nsv533231",2,146689
1,156440,copy number loss,GRCh38/hg38 13q12.12(chr13:22822881-23533846)x1,-1,LINC00327;LINC00621;SACS;SACS-AS1;SGCG,-,Uncertain significance,0,"Feb 25, 2011",-1,...,na,na,13q12.12,no assertion criteria provided,1,,N,"dbVar:nssv706495,dbVar:nsv533231",2,146689
2,156440,copy number loss,GRCh38/hg38 13q12.12(chr13:22822881-23533846)x1,-1,LINC00327;LINC00621;SACS;SACS-AS1;SGCG,-,Uncertain significance,0,"Feb 25, 2011",-1,...,na,na,13q12.12,no assertion criteria provided,1,,N,"dbVar:nssv706495,dbVar:nsv533231",2,146689
3,156441,copy number loss,GRCh38/hg38 18q23(chr18:79754409-79877194)x1,-1,CTDP1;KCNG2,-,Likely benign,0,"May 06, 2011",-1,...,na,na,18q23,no assertion criteria provided,1,,N,"dbVar:nssv706496,dbVar:nsv533232",2,146690
4,156441,copy number loss,GRCh38/hg38 18q23(chr18:79754409-79877194)x1,-1,CTDP1;KCNG2,-,Likely benign,0,"May 06, 2011",-1,...,na,na,18q23,no assertion criteria provided,1,,N,"dbVar:nssv706496,dbVar:nsv533232",2,146690


In [109]:
# Clean-up of data for identification and integration

## Columns to keep
new = df[['Type', 'Name', 'GeneSymbol','HGNC_ID', 'ClinicalSignificance', 'PhenotypeIDS', 'PhenotypeList','VariationID', 
            'ChromosomeAccession', 'Chromosome', 'Start', 'ReferenceAllele', 'AlternateAllele',
            'ReviewStatus']]

## Create new column that converts 'ReviewStatus' to star rating
### https://www.ncbi.nlm.nih.gov/clinvar/docs/review_status/
new['Rating'] = "" # Create empty column for gold star rating
## Convert strings from 'ReviewStatus' to 'Rating' 
new.loc[new['ReviewStatus'].str.contains('no assertion provided'), 'Rating'] = 'none'
new.loc[new['ReviewStatus'].str.contains('no assertion criteria provided'), 'Rating'] = 'none'
new.loc[new['ReviewStatus'].str.contains('no assertion for the individual variant'), 'Rating'] = 'none'
new.loc[new['ReviewStatus'].str.contains('criteria provided, single submitter'), 'Rating'] = 'one'
new.loc[new['ReviewStatus'].str.contains('criteria provided, conflicting interpretations'), 'Rating'] = 'one'
new.loc[new['ReviewStatus'].str.contains('criteria provided, conflicting interpretations'), 'Rating'] = 'one'
new.loc[new['ReviewStatus'].str.contains('criteria provided, multiple submitters, no conflicts'), 'Rating'] = 'two'
new.loc[new['ReviewStatus'].str.contains('reviewed by expert panel'), 'Rating'] = 'three'
new.loc[new['ReviewStatus'].str.contains('practice guideline'), 'Rating'] = 'four'

new.shape # 15 columns, 1339084 rows 

## Training dataset, based on the following criteria
threeplus=new[new['Rating'].str.contains('three|four')]
snv=threeplus[threeplus['Type'].str.contains('single nucleotide variant')]
patho=snv[snv['ClinicalSignificance'].str.contains('Pathogenic')]

patho.shape # 15 columns, 3870 rows (0.29% of original dataset)

## Create HGVS columns and IDs (based on genomic position)
### Naming nomenclature: https://varnomen.hgvs.org/bg-material/numbering/
patho['HGVS_NC'] = "" # Create empty column for HGVS nomenclature with NC
patho['HGVS_chr'] = "" # Create empty column for HGVS nomenclature with chr

patho['HGVS_NC']=patho['ChromosomeAccession']+':g.'+patho['Start'].astype(str)+patho['ReferenceAllele']+'>'+patho['AlternateAllele']
patho['HGVS_chr']='chr'+patho['Chromosome'].astype(str)+':g.'+patho['Start'].astype(str)+patho['ReferenceAllele']+'>'+patho['AlternateAllele']

df=patho
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/in

Unnamed: 0,Type,Name,GeneSymbol,HGNC_ID,ClinicalSignificance,PhenotypeIDS,PhenotypeList,VariationID,ChromosomeAccession,Chromosome,Start,ReferenceAllele,AlternateAllele,ReviewStatus,Rating,HGVS_NC,HGVS_chr
23254,single nucleotide variant,NM_000059.3(BRCA2):c.8490G>A (p.Trp2830Ter),BRCA2,HGNC:1101,Pathogenic,"MedGen:C2675520,OMIM:612555;MedGen:C0006142,OM...","Breast-ovarian cancer, familial 2;Familial can...",155722,NC_000013.10,13,32945095,G,A,reviewed by expert panel,three,NC_000013.10:g.32945095G>A,chr13:g.32945095G>A
23255,single nucleotide variant,NM_000059.3(BRCA2):c.8490G>A (p.Trp2830Ter),BRCA2,HGNC:1101,Pathogenic,"MedGen:C2675520,OMIM:612555;MedGen:C0006142,OM...","Breast-ovarian cancer, familial 2;Familial can...",155722,NC_000013.11,13,32370958,G,A,reviewed by expert panel,three,NC_000013.11:g.32370958G>A,chr13:g.32370958G>A
24036,single nucleotide variant,NM_000059.3(BRCA2):c.6044T>A (p.Leu2015Ter),BRCA2,HGNC:1101,Pathogenic,"MedGen:C2675520,OMIM:612555;MedGen:C0027672,SN...","Breast-ovarian cancer, familial 2;Hereditary c...",156172,NC_000013.11,13,32340399,T,A,reviewed by expert panel,three,NC_000013.11:g.32340399T>A,chr13:g.32340399T>A
24037,single nucleotide variant,NM_000059.3(BRCA2):c.6044T>A (p.Leu2015Ter),BRCA2,HGNC:1101,Pathogenic,"MedGen:C2675520,OMIM:612555;MedGen:C0027672,SN...","Breast-ovarian cancer, familial 2;Hereditary c...",156172,NC_000013.10,13,32914536,T,A,reviewed by expert panel,three,NC_000013.10:g.32914536T>A,chr13:g.32914536T>A
24358,single nucleotide variant,NM_004360.5(CDH1):c.1023T>G (p.Tyr341Ter),CDH1,HGNC:1748,Pathogenic,"MedGen:C1708349,OMIM:137215,Orphanet:ORPHA26106",Hereditary diffuse gastric cancer,156374,NC_000016.10,16,68812149,T,G,reviewed by expert panel,three,NC_000016.10:g.68812149T>G,chr16:g.68812149T>G


In [137]:
# Login for running WDI
print("Logging in...") 

# **remove lines when scheduling to Jenkins** Enter your own username and password 
os.environ["WDUSER"] = "username" # Uses os package to call and set the environment for wikidata username
os.environ["WDPASS"] = "password"

## Conditional that outputs error command if not in the local python environment
if "WDUSER" in os.environ and "WDPASS" in os.environ: 
    WDUSER = os.environ['WDUSER']
    WDPASS = os.environ['WDPASS']
else: 
    raise ValueError("WDUSER and WDPASS must be specified in local.py or as environment variables")      

## Sets attributed username and password as 'login'
login = wdi_login.WDLogin(WDUSER, WDPASS) 

Logging in...
https://www.wikidata.org/w/api.php
Successfully logged in as Sulhasan


In [94]:
df = df[(df['HGVS_NC'].str.contains("NC_000012.11:g.103234177C>T|NC_000002.11:g.47656951C>T|NC_000003.11:g.37038192G>A|NC_000003.11:g.37042536C>T|NC_000003.11:g.37045935C>T|NC_000003.11:g.37048546C>T|NC_000003.11:g.37053589C>T|NC_000003.11:g.37056036G>A|NC_000017.10:g.7577539G>A|NC_000017.10:g.7577548C>T|NC_000017.10:g.7578190T>C|NC_000021.8:g.36171704G>T|NC_000021.8:g.36252962C>G|NC_000021.8:g.36259163T>C|NC_000010.10:g.89711899C>T|NC_000012.11:g.103310908T>C|NC_000012.12:g.102917130T>C|NC_000017.10:g.7577120C>T|NC_000017.10:g.7577538C>T"))|(df['HGVS_chr'].str.contains("chr17:g.41228590G>A|chr17:g.41234451G>A"))]
df.shape # 21 x 17

# First 1 NC does not match to Wikidata, chr (2) do not match to Wikidata *18 possible writes of 21
df.head(21)

Unnamed: 0,Type,Name,GeneSymbol,HGNC_ID,ClinicalSignificance,PhenotypeIDS,PhenotypeList,VariationID,ChromosomeAccession,Chromosome,Start,ReferenceAllele,AlternateAllele,ReviewStatus,Rating,HGVS_NC,HGVS_chr
63376,single nucleotide variant,NM_000314.7(PTEN):c.517C>T (p.Arg173Cys),PTEN,HGNC:9588,Pathogenic,"MedGen:CN072330,OMIM:158350;MedGen:CN072330,OM...",Cowden syndrome 1;Cowden syndrome 1;Glioma sus...,189500,NC_000010.10,10,89711899,C,T,reviewed by expert panel,three,NC_000010.10:g.89711899C>T,chr10:g.89711899C>T
141137,single nucleotide variant,NM_000277.3(PAH):c.1A>G (p.Met1Val),PAH,HGNC:8582,Pathogenic,"Human Phenotype Ontology:HP:0004923,MedGen:C07...","Hyperphenylalaninemia, non-pku;Phenylketonuria...",586,NC_000012.11,12,103310908,T,C,reviewed by expert panel,three,NC_000012.11:g.103310908T>C,chr12:g.103310908T>C
141138,single nucleotide variant,NM_000277.3(PAH):c.1A>G (p.Met1Val),PAH,HGNC:8582,Pathogenic,"Human Phenotype Ontology:HP:0004923,MedGen:C07...","Hyperphenylalaninemia, non-pku;Phenylketonuria...",586,NC_000012.12,12,102917130,T,C,reviewed by expert panel,three,NC_000012.12:g.102917130T>C,chr12:g.102917130T>C
162457,single nucleotide variant,NM_000546.5(TP53):c.742C>T (p.Arg248Trp),TP53,HGNC:11998,Pathogenic,"Human Phenotype Ontology:HP:0000157,MedGen:C08...",Abnormality of the tongue;Acute myeloid leukem...,12347,NC_000017.10,17,7577539,G,A,reviewed by expert panel,three,NC_000017.10:g.7577539G>A,chr17:g.7577539G>A
172160,single nucleotide variant,NM_007299.4(BRCA1):c.1018C>T (p.Arg340Ter),BRCA1,HGNC:1100,Pathogenic,"MedGen:CN221562;MedGen:C2676676,OMIM:604370;Me...",Breast and/or ovarian cancer;Breast-ovarian ca...,17675,NC_000017.10,17,41234451,G,A,reviewed by expert panel,three,NC_000017.10:g.41234451G>A,chr17:g.41234451G>A
242498,single nucleotide variant,NM_000251.2(MSH2):c.1147C>T (p.Arg383Ter),MSH2,HGNC:7325,Pathogenic,"MedGen:C0027672,SNOMED CT:699346009;MedGen:C00...",Hereditary cancer-predisposing syndrome;Heredi...,90554,NC_000002.11,2,47656951,C,T,reviewed by expert panel,three,NC_000002.11:g.47656951C>T,chr2:g.47656951C>T


In [111]:
# Training data, Feb 11

# No match (need item page)
## NC_000012.11:g.103234177C>T

# Single NC match
## NC_000010.10:g.89711899C>T

# Single chr match
## chr17:g.41234451G>A

# Two NC matches in for same Wikidata item page, but different entries
## NC_000012.11:g.103310908T>C
## NC_000012.12:g.102917130T>C

# NC matches to two different Wikidata item pages (remove from https://www.wikidata.org/wiki/Q28371040 later)
## NC_000002.11:g.47656951C>T
### Removed after testing, but still keeping in training dataset

# One NC match, one chr match manually entered (Feb 11)
## NC_000017.10:g.7577539G>A
## chr17:g.7577539G>A


df = df[(df['HGVS_NC'].str.contains("NC_000012.11:g.103234177C>T|NC_000010.10:g.89711899C>T|NC_000012.11:g.103310908T>C|NC_000012.12:g.102917130T>C|NC_000002.11:g.47656951C>T|NC_000017.10:g.7577539G>A"))|(df['HGVS_chr'].str.contains("chr17:g.41234451G>A|chr17:g.7577539G>A"))]
df.shape # 7 x 17
df.head(7)

(7, 17)

In [149]:
df['Variant Qid'] = "" # To be replaced with 'absent' or 'multiple' 

In [157]:
# range has to take an integer (can't be a string, object, etc)

for i in range("string"):
    print(i)

TypeError: 'str' object cannot be interpreted as an integer

In [153]:
# For loop on training dataset
start_time = time.time() # Keep track of how long it takes loop to run

for index, row in df.iterrows(): # Index is a row number, row is all variables and values for that row
    
    # Identify string for a given data table row to retrive the variant name and item page Qid from Wikidata
    HGVS_NC = df.loc[index, 'HGVS_NC']
    HGVS_chr = df.loc[index, 'HGVS_chr']
    
    # SparQL query to search HGVS Identifier (P3331) in Wikidata 
    sparqlQuery_HGVSNC = "SELECT * WHERE {?variant wdt:P3331 \""+HGVS_NC+"\"}" 
    result_HGVSNC = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery_HGVSNC) 
    sparqlQuery_HGVSchr = "SELECT * WHERE {?variant wdt:P3331 \""+HGVS_chr+"\"}" 
    result_HGVSchr = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery_HGVSchr)   
    
    # Assign the results to respective lists *we know these are lists by using the type() function
    NC = result_HGVSNC["results"]["bindings"]
    CHR = result_HGVSchr["results"]["bindings"]
    
    # Concatenate lists as a combined array based on being different (2+), empty (0), or one or duplicate (1)
    HGVSarray = np.concatenate((NC,CHR), axis=0)
    print(HGVSarray)
    
    HGVSlist = [] # create empty list
    # Successful execution of code doesn't mean it's working as it should.
    
    for i in range(len(HGVSarray)): # assigning each list an integer
        print(HGVSlist)
        if HGVSarray[i] not in HGVSarray[i + 1:]: # if 0 is not in 1-[however long it is], and goes through all
            HGVSlist.append(HGVSarray[i]) # then add to the list (shows whatever is unique)
            
    print(HGVSlist)
            
            
            # Scenario 1: Flag if HGVS identifier(s) have multiple wikidata item pages (works, tested NC_000002.11:g.47656951C>T)
    if len(HGVSlist) > 1: 
        print("error") 
        continue
                
            # Scenario 2: Go forward with write if only one or if duplicate (combine)     
    Variant_Qid = HGVSlist[0]["variant"]["value"].replace("http://www.wikidata.org/entity/", "")
    df.at[index, 'Variant Qid'] = Variant_Qid  
            
            # Add missing NC or chr (HGVS nomenclature: P3331)
            
            # Add VariationID (ClinVar Variation ID: P3329)
            
        # Scenario 3: Write item page if no identifiers available   
        ## How to create this?
        if len(HGVSlist) == 0:
            print("need item page")
            df.at[index, 'Variant Qid'] = "absent"  
            #try_write(HGVS_NC, 
            #          edit_summary=("gene variant in human gene ", df.loc[index, 'GeneSymbol']),
            #          record_id = Qid,
            #          record_prop = 'human gene variant',
            #          login = login)
            #print("item page created")
            
        
        
end_time = time.time() # Captures when loop run ends
print("The total time of this loop is:", end_time - start_time, "seconds, or", (end_time - start_time)/60, "minutes")

df.head()

[{'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q29938054'}}]
[{'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q29938054'}}]
[]
[]
[{'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q64401263'}}]
[{'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q64401263'}}]
[{'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q64401263'}}]
[{'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q64401263'}}]
[{'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q28371040'}}]
[{'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q28371040'}}
 {'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q28371040'}}]
need item page
[]
[{'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q29938735'}}]
[{'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q28599622'}}]
[{'variant': {'type': 'uri', 'value': 'http://www.wik

Unnamed: 0,Type,Name,GeneSymbol,HGNC_ID,ClinicalSignificance,PhenotypeIDS,PhenotypeList,VariationID,ChromosomeAccession,Chromosome,Start,ReferenceAllele,AlternateAllele,ReviewStatus,Rating,HGVS_NC,HGVS_chr,Variant Qid
63376,single nucleotide variant,NM_000314.7(PTEN):c.517C>T (p.Arg173Cys),PTEN,HGNC:9588,Pathogenic,"MedGen:CN072330,OMIM:158350;MedGen:CN072330,OM...",Cowden syndrome 1;Cowden syndrome 1;Glioma sus...,189500,NC_000010.10,10,89711899,C,T,reviewed by expert panel,three,NC_000010.10:g.89711899C>T,chr10:g.89711899C>T,Q29938054
141117,single nucleotide variant,NM_000277.3(PAH):c.1315+1G>A,PAH,HGNC:8582,Pathogenic,"MeSH:D030342,MedGen:C0950123;MedGen:C0751434,O...",Inborn genetic diseases;Phenylketonuria;not pr...,576,NC_000012.11,12,103234177,C,T,reviewed by expert panel,three,NC_000012.11:g.103234177C>T,chr12:g.103234177C>T,
141137,single nucleotide variant,NM_000277.3(PAH):c.1A>G (p.Met1Val),PAH,HGNC:8582,Pathogenic,"Human Phenotype Ontology:HP:0004923,MedGen:C07...","Hyperphenylalaninemia, non-pku;Phenylketonuria...",586,NC_000012.11,12,103310908,T,C,reviewed by expert panel,three,NC_000012.11:g.103310908T>C,chr12:g.103310908T>C,Q64401263
141138,single nucleotide variant,NM_000277.3(PAH):c.1A>G (p.Met1Val),PAH,HGNC:8582,Pathogenic,"Human Phenotype Ontology:HP:0004923,MedGen:C07...","Hyperphenylalaninemia, non-pku;Phenylketonuria...",586,NC_000012.12,12,102917130,T,C,reviewed by expert panel,three,NC_000012.12:g.102917130T>C,chr12:g.102917130T>C,Q64401263
162457,single nucleotide variant,NM_000546.5(TP53):c.742C>T (p.Arg248Trp),TP53,HGNC:11998,Pathogenic,"Human Phenotype Ontology:HP:0000157,MedGen:C08...",Abnormality of the tongue;Acute myeloid leukem...,12347,NC_000017.10,17,7577539,G,A,reviewed by expert panel,three,NC_000017.10:g.7577539G>A,chr17:g.7577539G>A,Q28371040


In [112]:
# For loop on training dataset
start_time = time.time() # Keep track of how long it takes loop to run


for index, row in df.iterrows(): # Index is a row number, row is all variables and values for that row
    
    
    # Identify string for a given row to retrive variant name and item page Qid from Wikidata
    HGVS_NC = df.loc[index, 'HGVS_NC']
    HGVS_chr = df.loc[index, 'HGVS_chr']
    
    # SparQL query to search HGVS Identifier in Wikidata based on P3331 
    sparqlQuery_HGVSNC = "SELECT * WHERE {?variant wdt:P3331 \""+HGVS_NC+"\"}" 
    result_HGVSNC = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery_HGVSNC) 
    sparqlQuery_HGVSchr = "SELECT * WHERE {?variant wdt:P3331 \""+HGVS_chr+"\"}" 
    result_HGVSchr = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery_HGVSchr)   
    
    # Assign a length to the resultant dictionary for either NC or chr (number of item pages by Qid)
    NC_qlength = len(result_HGVSNC["results"]["bindings"]) 
    chr_qlength = len(result_HGVSchr["results"]["bindings"]) 
    # Create a unique id based on the joined length
    combo_qlength = NC_qlength + chr_qlength # concatenate the values and then get the unique values over the number
    # create an array with both together over having integers
        # then 2, 1, or 0 situation... but may be tricky to assign a set here 
        # an array of objects
    
    
    
    
    # If there is more than one Wikidata item page that matches with the HGVS identifiers
    ## Assumption: HGVS identifier is unique to that variant
    if combo_qlength > 2:
        print("error: multiple wikidata item pages for HGVS identifier") # log in output
        continue
    
    # If there is no Wikidata item page that matches with the HGVS identifiers
    ## How to create the item page, if there isn't a variant name automatically in data table?
        # Use the NC name, and people will update later
        # Need name (NC)
        # Description: "gene variant in human gene X"
    ## How to best re-work through loop after adding HGVS item page?
        # Only go through the loop once for every line
        
    if combo_qlength == 0:
        print("error: need item page") # Assume no item page exists
        
        
        
    # If there 
    ## How to make this all less repetitive? (adding identifier and clinvar id)
    if combo_qlength > 0 & combo_qlength < 3:
        if NC_qlength == chr_qlength: 
        ## How to best address if they're not equal?
            
            NC_qid = result_HGVSNC["results"]["bindings"][0]["variant"]["value"].replace("http://www.wikidata.org/entity/", "")
            chr_qid = result_HGVSchr["results"]["bindings"][0]["variant"]["value"].replace("http://www.wikidata.org/entity/", "")    
            print("same")
        if NC_qlength == 1:
        ## Note there are two NCs for Q64401263
            # Non-write
            # Additional disease, if applicable
            NC_qid = result_HGVSNC["results"]["bindings"][0]["variant"]["value"].replace("http://www.wikidata.org/entity/", "")
            print("Item page:", NC_qid, ", for", HGVS_NC) 
            # Add chr identifier
        if chr_qlength == 1:
            chr_qid = result_HGVSchr["results"]["bindings"][0]["variant"]["value"].replace("http://www.wikidata.org/entity/", "")    
            print("Item page:", chr_qid, ", for", HGVS_chr)
            ## Do this later
            # Add nc identifier
            #identifier_NC = [wdi_core.WDItemID(value=chr_qid, prop_nr="P3331")] 
            #wikidata_NCidentifer = wdi_core.WDItemEngine(wd_item_id=chr_qid, 
             #                                     data=identifier_NC, 
              #                                    append_value=["P3331"])  
            #wikidata_NCidentifer.get_wd_json_representation() 
            #wikidata_NCidentifer.write(login) 
        ## Edits to reference

        ## Add VariationID - example, but no reference url in data... leave out? https://www.wikidata.org/wiki/Q21851559
        

        
        
# look at diseases (add genetic association)
# Symmetry for diseases
        
# Separate out Phenotype List (;) for disease 
    # Split one line into two (ie if there are two diseases)
        # Split by commas for identifiers
            # For example, if there are four identifiers
            # Take all identifiers for that disease, and search against Wikidata
            # Take union of results
                # If only one QiD found, then good
                # If there are multiple, then flag (don't write anything) *dont write anything for whole line
                # Add identifiers from dataset that are missing in Wikidata
                

# array of variances, with one variant shown here   
# concatenate to get this below (with nc and chr)
# then write something that removes duplicates: 0, 1, or 2
    
    
        
end_time = time.time() # Captures when loop run ends
print("The total time of this loop is:", end_time - start_time, "seconds, or", (end_time - start_time)/60, "minutes")

Item page: Q29938054 , for NC_000010.10:g.89711899C>T
error: need item page
Item page: Q64401263 , for NC_000012.11:g.103310908T>C
Item page: Q64401263 , for NC_000012.12:g.102917130T>C
same
Item page: Q28371040 , for NC_000017.10:g.7577539G>A
Item page: Q28371040 , for chr17:g.7577539G>A
Item page: Q29938735 , for chr17:g.41234451G>A
Item page: Q28599622 , for NC_000002.11:g.47656951C>T
The total time of this loop is: 3.8079538345336914 seconds, or 0.06346589724222819 minutes


In [67]:
## Download HGVS Wikidata query results as query.csv 
### https://query.wikidata.org/#SELECT%20%2a%20WHERE%20%7B%3Fgene%20wdt%3AP3331%20%3FHGVS%7D

query = pd.read_csv("/Users/sulhasan/Desktop/Su Lab Projects/ClinVar-Bot_GeneWikiCentral-Issue50/query.csv")  

## Subset for NC or chr
query = query[query['HGVS'].str.contains('chr|NC_',  na=False)] # 818 with NC or chr
query.head(5) 

# How many HGVS IDs match for Wikidata query (818) vs. manually created in ClinVar (3778 x 2 for NC or chr)
len(set(query["HGVS"]) & (set(df["HGVS_chr"]) | set(df["HGVS_NC"]))) # 17
set(query["HGVS"]) & (set(df["HGVS_chr"]) | set(hasHGVS["HGVS_NC"])) # 4 chr, 13 NC *chr dont match...

{'NC_000002.11:g.47656951C>T',
 'NC_000003.11:g.37038192G>A',
 'NC_000003.11:g.37042536C>T',
 'NC_000003.11:g.37045935C>T',
 'NC_000003.11:g.37048546C>T',
 'NC_000003.11:g.37053589C>T',
 'NC_000003.11:g.37056036G>A',
 'NC_000010.10:g.89711899C>T',
 'NC_000012.11:g.103310908T>C',
 'NC_000012.12:g.102917130T>C',
 'NC_000017.10:g.7577120C>T',
 'NC_000017.10:g.7577538C>T',
 'NC_000017.10:g.7577539G>A',
 'NC_000017.10:g.7577548C>T',
 'NC_000017.10:g.7578190T>C',
 'NC_000021.8:g.36171704G>T',
 'NC_000021.8:g.36252962C>G',
 'NC_000021.8:g.36259163T>C'}

In [None]:
# Keep anything with 'two' or more stars in 'Rating' column
twoplus = new[~new['Rating'].str.contains('one')] # excludes both one and none
twoplus.shape # 13 columns, 196530 rows: 1123285 removed, 14.9 % of all data usable
# Keep anything noted as 'single nucleotide variant' in 'Type' column
snv = twoplus[twoplus['Type'].str.contains('single nucleotide variant')]
snv.shape # 174733 rows, or 88.9% of data with two or more stars (13.2% of all data)
## 83.2 % of total data, prior to star rating filter, are SNVs
# Keep anything with 'Pathogenic' in the 'ClinicalSignificance' column
patho = snv[snv['ClinicalSignificance'].str.contains('Pathogenic')]
patho.shape # 20040 rows, or 10.2% of data with two or more stars that are snvs (1.5% of all data)
## 12.8 % of total data, prior to star rating filter, are Pathogenic