# Scheduled Integration of ClinVar Gene Variant-Disease Data into WikiData

ClinVar aggregates information about genomic variation and its relationship to human health <br>
CC0 https://www.ncbi.nlm.nih.gov/clinvar/

This scheduled bot operates monthly through WDI to integrate ClinVar Gene Variant-Disease Data <br>
https://www.ncbi.nlm.nih.gov/clinvar/docs/ftp_primer/ (variant_summary) <br>
https://github.com/SuLab/GeneWikiCentral/issues/50 <br>
http://jenkins.sulab.org/ <br>

Python script contributions, in order: Sabah Ul-Hasan, Andrew I Su, Tong Shu Li

In [1]:
# Relevant Modules and Libraries

import os # OS package to ensure interaction between the modules (ie WDI) and current OS being used

from datetime import datetime # For identifying the current date and time
import time # Keep track of total for loop run time

import gzip # For unzip of files
import shutil # Copies content of source file(s)
import csv # For converting file(s) to csv format

import pandas as pd # For data organization, abbreviated to pd
import numpy as np # For data organization, abbreviated as np

from wikidataintegrator import wdi_core, wdi_login # Core and login from wikidataintegrator module
from wikidataintegrator.ref_handlers import update_retrieved_if_new_multiple_refs # For retrieving references
from wikidataintegrator.wdi_helpers import try_write # For writing new item pages

In [5]:
# Download data from NCBI

## Make sure os has wget installed, or the following command wont work
os.system('wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz') 
timeStringNow = datetime.now().strftime("+%Y-%m-%dT00:00:00Z") # time stamp of download 

## Unzip the file
with gzip.open('variant_summary.txt.gz', 'rb') as f_in:
    with open('variant_summary.txt', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
        
## Convert .txt file to .csv
txt_file = r"variant_summary.txt"
csv_file = r"variant_summary.csv"

with open(txt_file, "r") as in_text:
    in_reader = csv.reader(in_text, delimiter = '\t')
    with open(csv_file, "w") as out_csv:
        out_writer = csv.writer(out_csv)
        for row in in_reader:
            out_writer.writerow(row)

(1339084, 31)

In [48]:
## Import .csv file and read first 5 rows
df = pd.read_csv("variant_summary.csv") 
df.shape # 31 columns, 1339084 rows
df.head()

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,ReferenceAllele,AlternateAllele,Cytogenetic,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories,VariationID
0,156440,copy number loss,GRCh38/hg38 13q12.12(chr13:22822881-23533846)x1,-1,LINC00327;LINC00621;SACS;SACS-AS1;SGCG,-,Uncertain significance,0,"Feb 25, 2011",-1,...,na,na,13q12.12,no assertion criteria provided,1,,N,"dbVar:nssv706495,dbVar:nsv533231",2,146689
1,156440,copy number loss,GRCh38/hg38 13q12.12(chr13:22822881-23533846)x1,-1,LINC00327;LINC00621;SACS;SACS-AS1;SGCG,-,Uncertain significance,0,"Feb 25, 2011",-1,...,na,na,13q12.12,no assertion criteria provided,1,,N,"dbVar:nssv706495,dbVar:nsv533231",2,146689
2,156440,copy number loss,GRCh38/hg38 13q12.12(chr13:22822881-23533846)x1,-1,LINC00327;LINC00621;SACS;SACS-AS1;SGCG,-,Uncertain significance,0,"Feb 25, 2011",-1,...,na,na,13q12.12,no assertion criteria provided,1,,N,"dbVar:nssv706495,dbVar:nsv533231",2,146689
3,156441,copy number loss,GRCh38/hg38 18q23(chr18:79754409-79877194)x1,-1,CTDP1;KCNG2,-,Likely benign,0,"May 06, 2011",-1,...,na,na,18q23,no assertion criteria provided,1,,N,"dbVar:nssv706496,dbVar:nsv533232",2,146690
4,156441,copy number loss,GRCh38/hg38 18q23(chr18:79754409-79877194)x1,-1,CTDP1;KCNG2,-,Likely benign,0,"May 06, 2011",-1,...,na,na,18q23,no assertion criteria provided,1,,N,"dbVar:nssv706496,dbVar:nsv533232",2,146690


In [49]:
# Clean-up of data for identification and integration

## Columns to keep
new = df[['Type', 'Name', 'GeneSymbol','HGNC_ID', 'ClinicalSignificance', 'PhenotypeIDS', 'PhenotypeList','VariationID', 
            'ChromosomeAccession', 'Chromosome', 'Start', 'ReferenceAllele', 'AlternateAllele',
            'ReviewStatus']]

## Create new column that converts 'ReviewStatus' to star rating
### https://www.ncbi.nlm.nih.gov/clinvar/docs/review_status/
new['Rating'] = "" # Create empty column for gold star rating
## Convert strings from 'ReviewStatus' to 'Rating' 
new.loc[new['ReviewStatus'].str.contains('no assertion provided'), 'Rating'] = 'none'
new.loc[new['ReviewStatus'].str.contains('no assertion criteria provided'), 'Rating'] = 'none'
new.loc[new['ReviewStatus'].str.contains('no assertion for the individual variant'), 'Rating'] = 'none'
new.loc[new['ReviewStatus'].str.contains('criteria provided, single submitter'), 'Rating'] = 'one'
new.loc[new['ReviewStatus'].str.contains('criteria provided, conflicting interpretations'), 'Rating'] = 'one'
new.loc[new['ReviewStatus'].str.contains('criteria provided, conflicting interpretations'), 'Rating'] = 'one'
new.loc[new['ReviewStatus'].str.contains('criteria provided, multiple submitters, no conflicts'), 'Rating'] = 'two'
new.loc[new['ReviewStatus'].str.contains('reviewed by expert panel'), 'Rating'] = 'three'
new.loc[new['ReviewStatus'].str.contains('practice guideline'), 'Rating'] = 'four'

new.shape # 15 columns, 1339084 rows 

## Training dataset, based on the following criteria
threeplus=new[new['Rating'].str.contains('three|four')]
snv=threeplus[threeplus['Type'].str.contains('single nucleotide variant')]
patho=snv[snv['ClinicalSignificance'].str.contains('Pathogenic')]

patho.shape # 15 columns, 3870 rows (0.29% of original dataset)

## Create HGVS columns and IDs (based on genomic position)
### Naming nomenclature: https://varnomen.hgvs.org/bg-material/numbering/
patho['HGVS_NC'] = "" # Create empty column for HGVS nomenclature with NC
patho['HGVS_chr'] = "" # Create empty column for HGVS nomenclature with chr

patho['HGVS_NC']=patho['ChromosomeAccession']+':g.'+patho['Start'].astype(str)+patho['ReferenceAllele']+'>'+patho['AlternateAllele']
patho['HGVS_chr']='chr'+patho['Chromosome'].astype(str)+':g.'+patho['Start'].astype(str)+patho['ReferenceAllele']+'>'+patho['AlternateAllele']

df=patho
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/in

Unnamed: 0,Type,Name,GeneSymbol,HGNC_ID,ClinicalSignificance,PhenotypeIDS,PhenotypeList,VariationID,ChromosomeAccession,Chromosome,Start,ReferenceAllele,AlternateAllele,ReviewStatus,Rating,HGVS_NC,HGVS_chr
23254,single nucleotide variant,NM_000059.3(BRCA2):c.8490G>A (p.Trp2830Ter),BRCA2,HGNC:1101,Pathogenic,"MedGen:C2675520,OMIM:612555;MedGen:C0006142,OM...","Breast-ovarian cancer, familial 2;Familial can...",155722,NC_000013.10,13,32945095,G,A,reviewed by expert panel,three,NC_000013.10:g.32945095G>A,chr13:g.32945095G>A
23255,single nucleotide variant,NM_000059.3(BRCA2):c.8490G>A (p.Trp2830Ter),BRCA2,HGNC:1101,Pathogenic,"MedGen:C2675520,OMIM:612555;MedGen:C0006142,OM...","Breast-ovarian cancer, familial 2;Familial can...",155722,NC_000013.11,13,32370958,G,A,reviewed by expert panel,three,NC_000013.11:g.32370958G>A,chr13:g.32370958G>A
24036,single nucleotide variant,NM_000059.3(BRCA2):c.6044T>A (p.Leu2015Ter),BRCA2,HGNC:1101,Pathogenic,"MedGen:C2675520,OMIM:612555;MedGen:C0027672,SN...","Breast-ovarian cancer, familial 2;Hereditary c...",156172,NC_000013.11,13,32340399,T,A,reviewed by expert panel,three,NC_000013.11:g.32340399T>A,chr13:g.32340399T>A
24037,single nucleotide variant,NM_000059.3(BRCA2):c.6044T>A (p.Leu2015Ter),BRCA2,HGNC:1101,Pathogenic,"MedGen:C2675520,OMIM:612555;MedGen:C0027672,SN...","Breast-ovarian cancer, familial 2;Hereditary c...",156172,NC_000013.10,13,32914536,T,A,reviewed by expert panel,three,NC_000013.10:g.32914536T>A,chr13:g.32914536T>A
24358,single nucleotide variant,NM_004360.5(CDH1):c.1023T>G (p.Tyr341Ter),CDH1,HGNC:1748,Pathogenic,"MedGen:C1708349,OMIM:137215,Orphanet:ORPHA26106",Hereditary diffuse gastric cancer,156374,NC_000016.10,16,68812149,T,G,reviewed by expert panel,three,NC_000016.10:g.68812149T>G,chr16:g.68812149T>G


In [50]:
# Login for running WDI
print("Logging in...") 

# **remove lines when scheduling to Jenkins** Enter your own username and password 
os.environ["WDUSER"] = "username" # Uses os package to call and set the environment for wikidata username
os.environ["WDPASS"] = "password"

## Conditional that outputs error command if not in the local python environment
if "WDUSER" in os.environ and "WDPASS" in os.environ: 
    WDUSER = os.environ['WDUSER']
    WDPASS = os.environ['WDPASS']
else: 
    raise ValueError("WDUSER and WDPASS must be specified in local.py or as environment variables")      

## Sets attributed username and password as 'login'
login = wdi_login.WDLogin(WDUSER, WDPASS) 

Logging in...
https://www.wikidata.org/w/api.php
Successfully logged in as Sulhasan


In [5]:
df = df[(df['HGVS_NC'].str.contains("NC_000012.11:g.103234177C>T|NC_000002.11:g.47656951C>T|NC_000003.11:g.37038192G>A|NC_000003.11:g.37042536C>T|NC_000003.11:g.37045935C>T|NC_000003.11:g.37048546C>T|NC_000003.11:g.37053589C>T|NC_000003.11:g.37056036G>A|NC_000017.10:g.7577539G>A|NC_000017.10:g.7577548C>T|NC_000017.10:g.7578190T>C|NC_000021.8:g.36171704G>T|NC_000021.8:g.36252962C>G|NC_000021.8:g.36259163T>C|NC_000010.10:g.89711899C>T|NC_000012.11:g.103310908T>C|NC_000012.12:g.102917130T>C|NC_000017.10:g.7577120C>T|NC_000017.10:g.7577538C>T"))|(df['HGVS_chr'].str.contains("chr17:g.41228590G>A|chr17:g.41234451G>A"))]
df.shape # 21 x 17

# First 1 NC does not match to Wikidata, chr (2) do not match to Wikidata *18 possible writes of 21
df.head(21)

Unnamed: 0,Type,Name,GeneSymbol,HGNC_ID,ClinicalSignificance,PhenotypeIDS,PhenotypeList,VariationID,ChromosomeAccession,Chromosome,Start,ReferenceAllele,AlternateAllele,ReviewStatus,Rating,HGVS_NC,HGVS_chr
63376,single nucleotide variant,NM_000314.7(PTEN):c.517C>T (p.Arg173Cys),PTEN,HGNC:9588,Pathogenic,"MedGen:CN072330,OMIM:158350;MedGen:CN072330,OM...",Cowden syndrome 1;Cowden syndrome 1;Glioma sus...,189500,NC_000010.10,10,89711899,C,T,reviewed by expert panel,three,NC_000010.10:g.89711899C>T,chr10:g.89711899C>T
141117,single nucleotide variant,NM_000277.3(PAH):c.1315+1G>A,PAH,HGNC:8582,Pathogenic,"MeSH:D030342,MedGen:C0950123;MedGen:C0751434,O...",Inborn genetic diseases;Phenylketonuria;not pr...,576,NC_000012.11,12,103234177,C,T,reviewed by expert panel,three,NC_000012.11:g.103234177C>T,chr12:g.103234177C>T
141137,single nucleotide variant,NM_000277.3(PAH):c.1A>G (p.Met1Val),PAH,HGNC:8582,Pathogenic,"Human Phenotype Ontology:HP:0004923,MedGen:C07...","Hyperphenylalaninemia, non-pku;Phenylketonuria...",586,NC_000012.11,12,103310908,T,C,reviewed by expert panel,three,NC_000012.11:g.103310908T>C,chr12:g.103310908T>C
141138,single nucleotide variant,NM_000277.3(PAH):c.1A>G (p.Met1Val),PAH,HGNC:8582,Pathogenic,"Human Phenotype Ontology:HP:0004923,MedGen:C07...","Hyperphenylalaninemia, non-pku;Phenylketonuria...",586,NC_000012.12,12,102917130,T,C,reviewed by expert panel,three,NC_000012.12:g.102917130T>C,chr12:g.102917130T>C
162457,single nucleotide variant,NM_000546.5(TP53):c.742C>T (p.Arg248Trp),TP53,HGNC:11998,Pathogenic,"Human Phenotype Ontology:HP:0000157,MedGen:C08...",Abnormality of the tongue;Acute myeloid leukem...,12347,NC_000017.10,17,7577539,G,A,reviewed by expert panel,three,NC_000017.10:g.7577539G>A,chr17:g.7577539G>A
162484,single nucleotide variant,NM_000546.5(TP53):c.743G>A (p.Arg248Gln),TP53,HGNC:11998,Pathogenic,"Human Phenotype Ontology:HP:0004808,MeSH:D0154...",Acute myeloid leukemia;Adenocarcinoma of prost...,12356,NC_000017.10,17,7577538,C,T,reviewed by expert panel,three,NC_000017.10:g.7577538C>T,chr17:g.7577538C>T
162512,single nucleotide variant,NM_000546.5(TP53):c.733G>A (p.Gly245Ser),TP53,HGNC:11998,Pathogenic,MedGen:C0001418;MedGen:C0007112;MedGen:C027870...,Adenocarcinoma;Adenocarcinoma of prostate;Aden...,12365,NC_000017.10,17,7577548,C,T,reviewed by expert panel,three,NC_000017.10:g.7577548C>T,chr17:g.7577548C>T
162517,single nucleotide variant,NM_000546.5(TP53):c.818G>A (p.Arg273His),TP53,HGNC:11998,Pathogenic,"Human Phenotype Ontology:HP:0004808,MeSH:D0154...",Acute myeloid leukemia;Adenocarcinoma of prost...,12366,NC_000017.10,17,7577120,C,T,reviewed by expert panel,three,NC_000017.10:g.7577120C>T,chr17:g.7577120C>T
166485,single nucleotide variant,NM_001754.4(RUNX1):c.328A>G (p.Lys110Glu),RUNX1,HGNC:10471,Pathogenic,"MedGen:C1832388,OMIM:601399,Orphanet:ORPHA7129...",Familial platelet disorder with associated mye...,14465,NC_000021.8,21,36259163,T,C,reviewed by expert panel,three,NC_000021.8:g.36259163T>C,chr21:g.36259163T>C
166489,single nucleotide variant,NM_001754.4(RUNX1):c.861C>A (p.Tyr287Ter),RUNX1,HGNC:10471,Pathogenic,"MedGen:C1832388,OMIM:601399,Orphanet:ORPHA71290",Familial platelet disorder with associated mye...,14467,NC_000021.8,21,36171704,G,T,reviewed by expert panel,three,NC_000021.8:g.36171704G>T,chr21:g.36171704G>T


In [51]:
# Training data, Feb 11

# No match (need item page)
## NC_000012.11:g.103234177C>T

# Single NC match
## NC_000010.10:g.89711899C>T

# Single chr match
## chr17:g.41234451G>A

# Two NC matches in for same Wikidata item page, but different entries
## NC_000012.11:g.103310908T>C
## NC_000012.12:g.102917130T>C

# NC matches to two different Wikidata item pages (remove from Sandbox https://www.wikidata.org/wiki/Q4115189)
## NC_000002.11:g.47656951C>T

# One NC match, one chr match manually entered (Feb 11)
## NC_000017.10:g.7577539G>A
## chr17:g.7577539G>A


df = df[(df['HGVS_NC'].str.contains("NC_000012.11:g.103234177C>T|NC_000010.10:g.89711899C>T|NC_000012.11:g.103310908T>C|NC_000012.12:g.102917130T>C|NC_000002.11:g.47656951C>T|NC_000017.10:g.7577539G>A"))|(df['HGVS_chr'].str.contains("chr17:g.41234451G>A|chr17:g.7577539G>A"))]
df.shape # 7 x 17
df.head(7)

Unnamed: 0,Type,Name,GeneSymbol,HGNC_ID,ClinicalSignificance,PhenotypeIDS,PhenotypeList,VariationID,ChromosomeAccession,Chromosome,Start,ReferenceAllele,AlternateAllele,ReviewStatus,Rating,HGVS_NC,HGVS_chr
63376,single nucleotide variant,NM_000314.7(PTEN):c.517C>T (p.Arg173Cys),PTEN,HGNC:9588,Pathogenic,"MedGen:CN072330,OMIM:158350;MedGen:CN072330,OM...",Cowden syndrome 1;Cowden syndrome 1;Glioma sus...,189500,NC_000010.10,10,89711899,C,T,reviewed by expert panel,three,NC_000010.10:g.89711899C>T,chr10:g.89711899C>T
141117,single nucleotide variant,NM_000277.3(PAH):c.1315+1G>A,PAH,HGNC:8582,Pathogenic,"MeSH:D030342,MedGen:C0950123;MedGen:C0751434,O...",Inborn genetic diseases;Phenylketonuria;not pr...,576,NC_000012.11,12,103234177,C,T,reviewed by expert panel,three,NC_000012.11:g.103234177C>T,chr12:g.103234177C>T
141137,single nucleotide variant,NM_000277.3(PAH):c.1A>G (p.Met1Val),PAH,HGNC:8582,Pathogenic,"Human Phenotype Ontology:HP:0004923,MedGen:C07...","Hyperphenylalaninemia, non-pku;Phenylketonuria...",586,NC_000012.11,12,103310908,T,C,reviewed by expert panel,three,NC_000012.11:g.103310908T>C,chr12:g.103310908T>C
141138,single nucleotide variant,NM_000277.3(PAH):c.1A>G (p.Met1Val),PAH,HGNC:8582,Pathogenic,"Human Phenotype Ontology:HP:0004923,MedGen:C07...","Hyperphenylalaninemia, non-pku;Phenylketonuria...",586,NC_000012.12,12,102917130,T,C,reviewed by expert panel,three,NC_000012.12:g.102917130T>C,chr12:g.102917130T>C
162457,single nucleotide variant,NM_000546.5(TP53):c.742C>T (p.Arg248Trp),TP53,HGNC:11998,Pathogenic,"Human Phenotype Ontology:HP:0000157,MedGen:C08...",Abnormality of the tongue;Acute myeloid leukem...,12347,NC_000017.10,17,7577539,G,A,reviewed by expert panel,three,NC_000017.10:g.7577539G>A,chr17:g.7577539G>A
172160,single nucleotide variant,NM_007299.4(BRCA1):c.1018C>T (p.Arg340Ter),BRCA1,HGNC:1100,Pathogenic,"MedGen:CN221562;MedGen:C2676676,OMIM:604370;Me...",Breast and/or ovarian cancer;Breast-ovarian ca...,17675,NC_000017.10,17,41234451,G,A,reviewed by expert panel,three,NC_000017.10:g.41234451G>A,chr17:g.41234451G>A
242498,single nucleotide variant,NM_000251.2(MSH2):c.1147C>T (p.Arg383Ter),MSH2,HGNC:7325,Pathogenic,"MedGen:C0027672,SNOMED CT:699346009;MedGen:C00...",Hereditary cancer-predisposing syndrome;Heredi...,90554,NC_000002.11,2,47656951,C,T,reviewed by expert panel,three,NC_000002.11:g.47656951C>T,chr2:g.47656951C>T


In [52]:
df['Variant Qid'] = "" # To be replaced with Qid, 'absent', or 'multiple' 
df['Disease Qid'] = "" 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [31]:
# Start time of for loop run 
start_time = time.time() 

# For loop on training dataset
for index, row in df.iterrows(): 
    
    # Assign string for a given row in the data table to retrieve the Wikidata Qid for item page 
    ## Criterion: HGVS identifier for genomic ids only (exclude cDNA *ENST and NM* and protein *NP*)
    HGVS_NC = row['HGVS_NC'] 
    HGVS_chr = row['HGVS_chr'] 
    
    # SparQL query to search HGVS Identifier (P3331) in Wikidata 
    sparqlQuery_HGVSNC = "SELECT * WHERE {?variant wdt:P3331 \""+HGVS_NC+"\"}" 
    result_HGVSNC = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery_HGVSNC) # python wrapper for WDI
    print("sparql results NC:", result_HGVSNC) # dictionary
    sparqlQuery_HGVSchr = "SELECT * WHERE {?variant wdt:P3331 \""+HGVS_chr+"\"}" 
    result_HGVSchr = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery_HGVSchr)   
    
    # Assign the results to respective lists *we know these are lists by using the type() function
    NC = result_HGVSNC["results"]["bindings"]
    print("NC list:", NC) # list
    CHR = result_HGVSchr["results"]["bindings"]
    
    # Distinguish 3 scenarios for flagging (2+), a new item page (0), or an association statement (1)
    HGVSarray = np.concatenate((NC,CHR), axis=0) # Concatenate lists as combined array
    print("HGVS array:", HGVSarray) # 0, 1, or 2 (or more) elements in array "variant"
    # Axis = 0, 1, 2... https://docs.scipy.org/doc/numpy/reference/generated/numpy.concatenate.html (2D arrays)
    ## [[]] = 2D, upper limit of axis possible generally # dimensions -1 
    ### I have a 1D array, axis = 0
    HGVSlist = [] # create empty list
    
    # For loop assigns each list as an integer in array
    for i in range(len(HGVSarray)): # if array is 0, then it skips loop entirely (empty)
        # If 0 not in 1-[however long it is], and goes through all
        if HGVSarray[i] not in HGVSarray[i + 1:]: # evaluated as T/F, want entry to append (be TRUE) even if i = 0
            ## HGVSarray[i] = something
            ## HGVSarray[i + 1:] == empty
            ## something not in* empty, true (then append) different from !=
            HGVSlist.append(HGVSarray[i]) # outputs only unique from conditional
            ## if [1,1] then it wont do anything on first 1 (false), and will append on second 1 (true)
                ### what will happen if there's 3, 4, 5...
        ### comparison: true/false (==)
        ### assignment: = 
        # remove this here, equivalent of no statement
        # else:
            # HGVSlist == HGVSarray[i] # works here, but still need to remove it because nothing happens
  
    ### need to know what's going on with logic, variables, loops, and flow
    #### write a test if it doesn't work out (ie cell above)
    ############ come to Andrew and say, I think this is happening but is this what's happening??
    
    print("HGVS list:", HGVSlist)
            
    # Scenario 1: Flag if HGVS identifier(s) have multiple wikidata item pages (works, tested NC_000002.11:g.47656951C>T)
    if len(HGVSlist) > 1: 
        df.at[index, 'Variant Qid'] = "flag"  
        continue
            
    # Scenario 2: Write item page if no identifiers available   
    ## How to create this?
    if len(HGVSlist) == 0:
        print("need item page")
        df.at[index, 'Variant Qid'] = "absent"  
        #try_write(HGVS_NC, 
        #          edit_summary=("gene variant in human gene ", df.loc[index, 'GeneSymbol']),
        #          record_id = Qid,
        #          record_prop = 'human gene variant',
        #          login = login)
        #print("item page created")
        continue
        
    # Scenario 3: Go forward with write if only one or if duplicate (combine)     
    Variant_Qid = HGVSlist[0]["variant"]["value"].replace("http://www.wikidata.org/entity/", "")
    df.at[index, 'Variant Qid'] = Variant_Qid      
    # Add missing NC or chr (HGVS nomenclature: P3331)   
    # Add VariationID (ClinVar Variation ID: P3329)
    
    # Look at Disease
    # Separate out by phenotype (;), split line in two+ based on diseases *concatenate
        ## Do in loop, or outside of it?
        # If only one Qid for that line, then continue with associatin write
        # If two or more, then don't write
        # Add identifiers that are missing in wikidata  
    
    # Write for both variant and disease(s)
        
end_time = time.time() # Captures when loop run ends
print("The total time of this loop is:", end_time - start_time, "seconds, or", (end_time - start_time)/60, "minutes")

df.head(7)

sparql results NC: {'head': {'vars': ['variant']}, 'results': {'bindings': [{'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q29938054'}}]}}
NC list: [{'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q29938054'}}]
HGVS array: [{'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q29938054'}}]
HGVS list: [{'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q29938054'}}]
sparql results NC: {'head': {'vars': ['variant']}, 'results': {'bindings': []}}
NC list: []
HGVS array: []
HGVS list: []
need item page
sparql results NC: {'head': {'vars': ['variant']}, 'results': {'bindings': [{'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q64401263'}}]}}
NC list: [{'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q64401263'}}]
HGVS array: [{'variant': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q64401263'}}]
HGVS list: [{'variant': {'type': 'uri', 'value': 'http://www.wikidata

Unnamed: 0,Type,Name,GeneSymbol,HGNC_ID,ClinicalSignificance,PhenotypeIDS,PhenotypeList,VariationID,ChromosomeAccession,Chromosome,Start,ReferenceAllele,AlternateAllele,ReviewStatus,Rating,HGVS_NC,HGVS_chr,Variant Qid
63376,single nucleotide variant,NM_000314.7(PTEN):c.517C>T (p.Arg173Cys),PTEN,HGNC:9588,Pathogenic,"MedGen:CN072330,OMIM:158350;MedGen:CN072330,OM...",Cowden syndrome 1;Cowden syndrome 1;Glioma sus...,189500,NC_000010.10,10,89711899,C,T,reviewed by expert panel,three,NC_000010.10:g.89711899C>T,chr10:g.89711899C>T,Q29938054
141117,single nucleotide variant,NM_000277.3(PAH):c.1315+1G>A,PAH,HGNC:8582,Pathogenic,"MeSH:D030342,MedGen:C0950123;MedGen:C0751434,O...",Inborn genetic diseases;Phenylketonuria;not pr...,576,NC_000012.11,12,103234177,C,T,reviewed by expert panel,three,NC_000012.11:g.103234177C>T,chr12:g.103234177C>T,absent
141137,single nucleotide variant,NM_000277.3(PAH):c.1A>G (p.Met1Val),PAH,HGNC:8582,Pathogenic,"Human Phenotype Ontology:HP:0004923,MedGen:C07...","Hyperphenylalaninemia, non-pku;Phenylketonuria...",586,NC_000012.11,12,103310908,T,C,reviewed by expert panel,three,NC_000012.11:g.103310908T>C,chr12:g.103310908T>C,Q64401263
141138,single nucleotide variant,NM_000277.3(PAH):c.1A>G (p.Met1Val),PAH,HGNC:8582,Pathogenic,"Human Phenotype Ontology:HP:0004923,MedGen:C07...","Hyperphenylalaninemia, non-pku;Phenylketonuria...",586,NC_000012.12,12,102917130,T,C,reviewed by expert panel,three,NC_000012.12:g.102917130T>C,chr12:g.102917130T>C,Q64401263
162457,single nucleotide variant,NM_000546.5(TP53):c.742C>T (p.Arg248Trp),TP53,HGNC:11998,Pathogenic,"Human Phenotype Ontology:HP:0000157,MedGen:C08...",Abnormality of the tongue;Acute myeloid leukem...,12347,NC_000017.10,17,7577539,G,A,reviewed by expert panel,three,NC_000017.10:g.7577539G>A,chr17:g.7577539G>A,Q28371040
172160,single nucleotide variant,NM_007299.4(BRCA1):c.1018C>T (p.Arg340Ter),BRCA1,HGNC:1100,Pathogenic,"MedGen:CN221562;MedGen:C2676676,OMIM:604370;Me...",Breast and/or ovarian cancer;Breast-ovarian ca...,17675,NC_000017.10,17,41234451,G,A,reviewed by expert panel,three,NC_000017.10:g.41234451G>A,chr17:g.41234451G>A,Q29938735
242498,single nucleotide variant,NM_000251.2(MSH2):c.1147C>T (p.Arg383Ter),MSH2,HGNC:7325,Pathogenic,"MedGen:C0027672,SNOMED CT:699346009;MedGen:C00...",Hereditary cancer-predisposing syndrome;Heredi...,90554,NC_000002.11,2,47656951,C,T,reviewed by expert panel,three,NC_000002.11:g.47656951C>T,chr2:g.47656951C>T,flag


In [47]:
list = [1,2,3,4]
print(list)

# if i = 1, and we want to get everything 2 and after
## How slices work
list[0:3] # giving elements: 0, 1, 2
list[1:3] # giving elements: 1, 2
list[1:4] # giving elements: 1, 2, 3
list[1:5] # giving elements: 1, 2, 3
list[1:50] # giving elements: 1, 2, 3
list[10:50] # nothing in this section of the list (empty)

list = [1]
list[0 + 1:] # empty, so that makes if not in true

[1, 2, 3, 4]


[]

In [67]:
## Download HGVS Wikidata query results as query.csv 
### https://query.wikidata.org/#SELECT%20%2a%20WHERE%20%7B%3Fgene%20wdt%3AP3331%20%3FHGVS%7D

query = pd.read_csv("/Users/sulhasan/Desktop/Su Lab Projects/ClinVar-Bot_GeneWikiCentral-Issue50/query.csv")  

## Subset for NC or chr
query = query[query['HGVS'].str.contains('chr|NC_',  na=False)] # 818 with NC or chr
query.head(5) 

# How many HGVS IDs match for Wikidata query (818) vs. manually created in ClinVar (3778 x 2 for NC or chr)
len(set(query["HGVS"]) & (set(df["HGVS_chr"]) | set(df["HGVS_NC"]))) # 17
set(query["HGVS"]) & (set(df["HGVS_chr"]) | set(hasHGVS["HGVS_NC"])) # 4 chr, 13 NC *chr dont match...

{'NC_000002.11:g.47656951C>T',
 'NC_000003.11:g.37038192G>A',
 'NC_000003.11:g.37042536C>T',
 'NC_000003.11:g.37045935C>T',
 'NC_000003.11:g.37048546C>T',
 'NC_000003.11:g.37053589C>T',
 'NC_000003.11:g.37056036G>A',
 'NC_000010.10:g.89711899C>T',
 'NC_000012.11:g.103310908T>C',
 'NC_000012.12:g.102917130T>C',
 'NC_000017.10:g.7577120C>T',
 'NC_000017.10:g.7577538C>T',
 'NC_000017.10:g.7577539G>A',
 'NC_000017.10:g.7577548C>T',
 'NC_000017.10:g.7578190T>C',
 'NC_000021.8:g.36171704G>T',
 'NC_000021.8:g.36252962C>G',
 'NC_000021.8:g.36259163T>C'}

In [None]:
# Keep anything with 'two' or more stars in 'Rating' column
twoplus = new[~new['Rating'].str.contains('one')] # excludes both one and none
twoplus.shape # 13 columns, 196530 rows: 1123285 removed, 14.9 % of all data usable
# Keep anything noted as 'single nucleotide variant' in 'Type' column
snv = twoplus[twoplus['Type'].str.contains('single nucleotide variant')]
snv.shape # 174733 rows, or 88.9% of data with two or more stars (13.2% of all data)
## 83.2 % of total data, prior to star rating filter, are SNVs
# Keep anything with 'Pathogenic' in the 'ClinicalSignificance' column
patho = snv[snv['ClinicalSignificance'].str.contains('Pathogenic')]
patho.shape # 20040 rows, or 10.2% of data with two or more stars that are snvs (1.5% of all data)
## 12.8 % of total data, prior to star rating filter, are Pathogenic