# Example bot to replicate the statement added to Wikidata
Note that these sections are created using the 'Markdown' dropdown option whereas the others are created using 'Code'

### Load the relevant modules and packages

In [1]:
# Script contributions (in order): Sabah Ul-Hasan, Andra Waagmeester, Andrew Su

# Installations by shell (! command)
# pip installations needed based on environment used
# If ran in PyCharm, for example, then directly import (vs needed for notebook, colab, and jenkins)
# Bot will be run in jenkins: http://jenkins.sulab.org/

!pip install --upgrade pip # Install pip, then ensure it's up-to-date for installing python packages 
!pip3 install tqdm # Library to visualize installation progress (progress bar)
!pip3 install termcolor # Module to call on color-coding for the printed output
!pip3 install wikidataintegrator # Module for wikidata

# Brief terminology overview
# Library or Package: A collection of modules
# Module: File that contains python functions and global variables

Collecting pip
  Downloading https://files.pythonhosted.org/packages/00/b6/9cfa56b4081ad13874b0c6f96af8ce16cfbc1cb06bedf8e9164ce5551ec1/pip-19.3.1-py2.py3-none-any.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 318kB/s eta 0:00:01
[?25hInstalling collected packages: pip
  Found existing installation: pip 9.0.1
    Uninstalling pip-9.0.1:
      Successfully uninstalled pip-9.0.1
Successfully installed pip-19.3.1
Collecting tqdm
  Using cached https://files.pythonhosted.org/packages/e1/c1/bc1dba38b48f4ae3c4428aea669c5e27bd5a7642a74c8348451e0bd8ff86/tqdm-4.36.1-py2.py3-none-any.whl
Installing collected packages: tqdm
Successfully installed tqdm-4.36.1
Collecting termcolor
  Using cached https://files.pythonhosted.org/packages/8a/48/a76be51647d0eb9f10e2a4511bf3ffb8cc1e6b14e9e4fab46173aa79f981/termcolor-1.1.0.tar.gz
Installing collected packages: termcolor
    Running setup.py install for termcolor ... [?25ldone
[?25hSuccessfully installed termcolor-1.1.0
Collecting wik

In [2]:
from wikidataintegrator import wdi_core, wdi_login # Imports core and login packages from wikidataintegrator module
from wikidataintegrator.ref_handlers import update_retrieved_if_new_multiple_refs # For retrieving references
from datetime import datetime # For identifying the current date and time
import copy # Copies references needed in the .csv for uploading to wikidata

import pandas as pd # Pandas is a data organization package, then abbreviated to pd
import numpy as np # NumPy is another general purpose package
from termcolor import colored # Imports colored package from termcolor

import os # OS package ensures interaction between the modules (ie WDI) and current operating system is being used

### Upload the ClinGen data (source:  https://search.clinicalgenome.org/kb/gene-validity.csv)
Issue posted on Github: https://github.com/SuLab/GeneWikiCentral/issues/116

In [27]:
# Set data frame, skip first 6 rows to avoid error
df = pd.read_csv('https://search.clinicalgenome.org/kb/gene-validity.csv', skiprows=6, header=None)   
# pandas is the library with function read.csv -> relabeled as pd to abbreviate

# Input current date and time of file download as a string for ref function later
timeStringNow = datetime.now().strftime("+%Y-%m-%dT00:00:00Z") # String that represents a time (put it as 'midnight')

# Relabel column headings (variables)
df.columns = ['Gene', 'HGNC Gene ID', 'Disease', 'MONDO Disease ID','SOP','Classification','Report Reference URL','Report Date']
# Diseases with similar names but separate MONDO IDs are unique diseases. Enter as unique values within the 'gene association' statement accordingly
# Diseases with the same MONDO IDs associated to the same gene, but with multiple reports should be entered as multiple references
# There are complications with 'Classifications' and associated URLs for which there are mutliple references possible => See ClinGenBot_Status-Output...csv

# View dataframe
df

Unnamed: 0,Gene,HGNC Gene ID,Disease,MONDO Disease ID,SOP,Classification,Report Reference URL,Report Date
0,A2ML1,HGNC:23336,Noonan syndrome with multiple lentigines,MONDO_0007893,SOP5,No Reported Evidence,https://search.clinicalgenome.org/kb/gene-vali...,2018-06-07T14:37:47.175Z
1,A2ML1,HGNC:23336,cardiofaciocutaneous syndrome,MONDO_0015280,SOP5,No Reported Evidence,https://search.clinicalgenome.org/kb/gene-vali...,2018-06-07T14:31:03.696Z
2,A2ML1,HGNC:23336,Costello syndrome,MONDO_0009026,SOP5,No Reported Evidence,https://search.clinicalgenome.org/kb/gene-vali...,2018-06-07T14:34:05.324Z
3,A2ML1,HGNC:23336,Noonan syndrome,MONDO_0018997,SOP5,Disputed,https://search.clinicalgenome.org/kb/gene-vali...,2018-06-07T14:23:53.157Z
4,A2ML1,HGNC:23336,Noonan syndrome-like disorder with loose anage...,MONDO_0011899,SOP5,No Reported Evidence,https://search.clinicalgenome.org/kb/gene-vali...,2018-06-07T14:40:11.599Z
5,AARS,HGNC:20,undetermined early-onset epileptic encephalopathy,MONDO_0018614,SOP6,Limited,https://search.clinicalgenome.org/kb/gene-vali...,2018-11-20T17:00:00.000Z
6,ABCC9,HGNC:60,hypertrichotic osteochondrodysplasia Cantu type,MONDO_0009406,SOP4,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2017-09-27T00:00:00
7,ABCD1,HGNC:61,X-linked cerebral adrenoleukodystrophy,MONDO_0010247,SOP4,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2018-02-07T14:00:00
8,ABHD12,HGNC:15868,PHARC syndrome,MONDO_0012984,SOP5,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2018-06-28T16:45:15.791Z
9,ACAD8,HGNC:87,isobutyryl-CoA dehydrogenase deficiency,MONDO_0012648,SOP6,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2019-04-26T16:00:00.000Z


If we do isoformat, it doesnt like it.

WDApiError: {'error': {'code': 'modification-failed', 'info': 'Data value corrupt: $timestamp must resemble ISO 8601, given 2019-10-18T16:32:34.420821', 'messages': [{'name': 'wikibase-validator-bad-value', 'parameters': ['$timestamp must resemble ISO 8601, given 2019-10-18T16:32:34.420821'], 'html': {'*': 'Data value corrupt: $timestamp must resemble ISO 8601, given 2019-10-18T16:32:34.420821'}}], '*': 'See https://www.wikidata.org/w/api.php for API usage. Subscribe to the mediawiki-api-announce mailing list at &lt;https://lists.wikimedia.org/mailman/listinfo/mediawiki-api-announce&gt; for notice of API deprecations and breaking changes.'}, 'servedby': 'mw1347'}

### Delete section after automated
#### Adjust ClinGen data file for practice run before implementing all entries
Include a row that has already been entered: Row 1 <br/> 
Include a row that hasn't yet been entered: Row 0

### Delete section after automated

In [28]:
subsetdf = df[15:30] # Subset for first two rows of the df and rename it as subsetdf
subsetdf

Unnamed: 0,Gene,HGNC Gene ID,Disease,MONDO Disease ID,SOP,Classification,Report Reference URL,Report Date
15,ACADVL,HGNC:92,very long chain acyl-CoA dehydrogenase deficiency,MONDO_0008723,SOP6,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2018-02-20T17:00:00.000Z
16,ACAT1,HGNC:93,beta-ketothiolase deficiency,MONDO_0008760,SOP6,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2018-05-22T16:00:00.000Z
17,ACSL4,HGNC:3571,non-syndromic X-linked intellectual disability,MONDO_0019181,SOP4,Moderate,https://search.clinicalgenome.org/kb/gene-vali...,2017-10-20T00:00:00
18,ACTA1,HGNC:129,hypertrophic cardiomyopathy,MONDO_0005045,SOP4,No Reported Evidence,https://search.clinicalgenome.org/kb/gene-vali...,false
19,ACTA2,HGNC:130,familial thoracic aortic aneurysm and aortic d...,MONDO_0019625,SOP4,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2016-09-27T00:00:00
20,ACTC1,HGNC:143,hypertrophic cardiomyopathy,MONDO_0005045,SOP4,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2017-10-23T14:00:00
21,ACTG1,HGNC:144,nonsyndromic genetic deafness,MONDO_0019497,SOP6,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2019-01-07T17:00:00.000Z
22,ACTG1,HGNC:144,Baraitser-winter syndrome 2,MONDO_0013812,SOP6,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2019-01-07T17:00:00.000Z
23,ACTN2,HGNC:164,intrinsic cardiomyopathy,MONDO_0000591,SOP5,Moderate,https://search.clinicalgenome.org/kb/gene-vali...,2018-08-06T13:12:55.615Z
24,ACVRL1,HGNC:175,"telangiectasia, hereditary hemorrhagic, type 2",MONDO_0010880,SOP6,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2019-08-28T16:00:00.000Z


### Log into Wikidata for executing the task 

In [5]:
print("Logging in...") # Outputs 'Logging in' as you wait to have your username and password recognized

# Enter your own username and password ** to be updated to ProteinBoxBot
os.environ["WDUSER"] = "username" # Uses os package to call and set the environment for wikidata username
os.environ["WDPASS"] = "password"

# If else conditional for both username and password being in the local environment
if "WDUSER" in os.environ and "WDPASS" in os.environ: 
    WDUSER = os.environ['WDUSER']
    WDPASS = os.environ['WDPASS']
else: # Else statement outputs error command if not in the local python environment
    raise ValueError("WDUSER and WDPASS must be specified in local.py or as environment variables")
    
login = wdi_login.WDLogin(WDUSER, WDPASS) # wdi_login package calls upon login function with atributed username and password and sets as 'login'

Logging in...
https://www.wikidata.org/w/api.php
Successfully logged in as Sulhasan


### For loop that iterates across dataframe and uploads to WikiData

In [29]:
# Create a function for adding references to then be iterated in the loop
def create_reference(): # Function defined as 'create_reference()'
        refStatedIn = wdi_core.WDItemID(value="Q64403342", prop_nr="P248", is_reference=True) # Q64403342 is the ClinGen item Q# and P248 is 'stated in' property P#
        refRetrieved = wdi_core.WDTime(timeStringNow, prop_nr="P813", is_reference=True) # calls on previous string with time stamp from file download, P813 is 'retrieved'
        refURL = wdi_core.WDUrl((subsetdf.loc[index, 'Report Reference URL']), prop_nr="P854", is_reference=True) # P854 is 'reference URL'
        return [refStatedIn, refRetrieved, refURL]

In [30]:
subsetdf['Status'] = "pending" # Create "Status" column with 'pending' for all cells ('error' or 'complete' or 'previously logged' downstream)
subsetdf['Definitive'] = "" # Empty cell to be replaced with 'yes' or 'no' string
subsetdf['Gene QID'] = "" # To be replaced with 'absent' or 'multiple'
subsetdf['Disease QID'] = "" # To be replaced with 'absent' or 'multiple'

subsetdf
# Python output shows concern that we're entering the same string for all cells, but that's what we want so it's fine

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

Unnamed: 0,Gene,HGNC Gene ID,Disease,MONDO Disease ID,SOP,Classification,Report Reference URL,Report Date,Status,Definitive,Gene QID,Disease QID
15,ACADVL,HGNC:92,very long chain acyl-CoA dehydrogenase deficiency,MONDO_0008723,SOP6,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2018-02-20T17:00:00.000Z,pending,,,
16,ACAT1,HGNC:93,beta-ketothiolase deficiency,MONDO_0008760,SOP6,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2018-05-22T16:00:00.000Z,pending,,,
17,ACSL4,HGNC:3571,non-syndromic X-linked intellectual disability,MONDO_0019181,SOP4,Moderate,https://search.clinicalgenome.org/kb/gene-vali...,2017-10-20T00:00:00,pending,,,
18,ACTA1,HGNC:129,hypertrophic cardiomyopathy,MONDO_0005045,SOP4,No Reported Evidence,https://search.clinicalgenome.org/kb/gene-vali...,false,pending,,,
19,ACTA2,HGNC:130,familial thoracic aortic aneurysm and aortic d...,MONDO_0019625,SOP4,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2016-09-27T00:00:00,pending,,,
20,ACTC1,HGNC:143,hypertrophic cardiomyopathy,MONDO_0005045,SOP4,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2017-10-23T14:00:00,pending,,,
21,ACTG1,HGNC:144,nonsyndromic genetic deafness,MONDO_0019497,SOP6,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2019-01-07T17:00:00.000Z,pending,,,
22,ACTG1,HGNC:144,Baraitser-winter syndrome 2,MONDO_0013812,SOP6,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2019-01-07T17:00:00.000Z,pending,,,
23,ACTN2,HGNC:164,intrinsic cardiomyopathy,MONDO_0000591,SOP5,Moderate,https://search.clinicalgenome.org/kb/gene-vali...,2018-08-06T13:12:55.615Z,pending,,,
24,ACVRL1,HGNC:175,"telangiectasia, hereditary hemorrhagic, type 2",MONDO_0010880,SOP6,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2019-08-28T16:00:00.000Z,pending,,,


In [31]:
# For loop that executes the following through each row of the dataframe 
# index is row number, row is all variables and values for that row
for index, row in subsetdf.iterrows():
    
    # Write for rows only for where the Classification is 'Definitive'
    if row['Classification']!='Definitive': # If the string is anything other than 'Definitive' for the Classification column
        subsetdf.at[index, 'Status'] = "error" # Then input "error" in the Status column
        subsetdf.at[index, 'Definitive'] = "no" # And 'no' for Definitive classification
        continue # And skip the rest of the for loop
    else:
        subsetdf.at[index, 'Definitive'] = "yes" 
        
    # Identifies the string in the Gene or Disease column for a given row
    HGNC = subsetdf.loc[index, 'Gene'] 
    MONDO = subsetdf.loc[index, 'MONDO Disease ID'].replace("_", ":") # .replace() changes _ to : for SparQL query
    
    # SparQL query to search for Gene or Diseasae in Wikidata based on HGNC (P353) or MONDO (P5270)
    sparqlQuery_HGNC = "SELECT * WHERE {?gene wdt:P353 \""+HGNC+"\"}" 
    result_HGNC = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery_HGNC) # Resultant query
    sparqlQuery_MONDO = "SELECT * WHERE {?disease wdt:P5270 \""+MONDO+"\"}" 
    result_MONDO = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery_MONDO)
    
    # Conditional that utilizes length function to call upon the result dictionary for either Gene or Disease
    if len(result_HGNC["results"]["bindings"])==1: # We only want one Q# result 
        HGNC_qid = result_HGNC["results"]["bindings"][0]["gene"]["value"].replace("http://www.wikidata.org/entity/", "") 
    elif len(result_HGNC["results"]["bindings"])>1: # If the value is greater than 1 (if the Gene shows up with more than one Q#)
        subsetdf.at[index, 'Status'] = "error" 
        subsetdf.at[index, 'Gene QID'] = "multiple" 
        continue 
    else: # If the value is 0
        subsetdf.at[index, 'Status'] = "error" 
        subsetdf.at[index, 'Gene QID'] = "absent" 
        continue 
        
    # Note that we DONT want to nest, otherwise it will log the disease qid from the previously successful run   
    if len(result_MONDO["results"]["bindings"])==1: 
        MONDO_qid = result_MONDO["results"]["bindings"][0]["disease"]["value"].replace("http://www.wikidata.org/entity/", "") 
    elif len(result_MONDO["results"]["bindings"])>1: 
        subsetdf.at[index, 'Status'] = "error" 
        subsetdf.at[index, 'Disease QID'] = "multiple" 
        continue
    else:
        subsetdf.at[index, 'Status'] = "error" 
        subsetdf.at[index, 'Disease QID'] = "absent" 
        continue
        
    # Call upon create_reference() function created, note that function will to existing references (not overwrite)     
    reference = create_reference() 
 
    # Add disease value to gene item page, and gene value to disease item page

    statement_HGNC = [wdi_core.WDItemID(value=MONDO_qid, prop_nr="P2293", references=[copy.deepcopy(reference)])] # Creates 'gene assocation' statement (P2293) whether or not it's already there, and includes the references
    wikidata_HGNCitem = wdi_core.WDItemEngine(wd_item_id=HGNC_qid,data=statement_HGNC, append_value=["P2293"])
    wikidata_HGNCitem.get_wd_json_representation() # Gives json structure that submitted to API, can be helpful for debugging 
    statement_MONDO = [wdi_core.WDItemID(value=HGNC_qid, prop_nr="P2293", references=[copy.deepcopy(reference)])] # Symmetry for disease item page
    wikidata_MONDOitem = wdi_core.WDItemEngine(wd_item_id=MONDO_qid,data=statement_MONDO, append_value=["P2293"])
    wikidata_MONDOitem.get_wd_json_representation()
    
    subsetdf.at[index, 'Status'] = "complete" 
    print(colored(HGNC,"blue"), "Gene successfully logged as", colored(wikidata_HGNCitem.write(login),"blue"), "and", colored(MONDO,"green"), "Disease successfully logged as", colored(wikidata_MONDOitem.write(login),"green"))

# Write output to a .csv file
now = datetime.now() # Retrieves current time and saves it as 'now'
# Includes hour:minute:second_dd-mm-yyyy time stamp (https://en.wikipedia.org/wiki/ISO_8601)
subsetdf.to_csv("ClinGenBot_Status-Output_" + now.isoformat() + ".csv")  # isoformat
subsetdf

# Brief terminology overview

# Types of data structures (Data Structure: How we store and retrieve data)
# We can easily figure out what type of data structure something with 'print(type())'
# String: A mutable list of characters in order (keystroke, letter, number...)
# List: denoted by [], a changeable/mutable and sequence of objects/elements where each value inside of the element(s) within a list is considera an item
# Tuple: denoted by (), an unchangeable sequence of objects (similar to lists, except with parentheses and cannot be changed)
# Array: Stores values of the same data type, which is its main distinction from a list 
# Dictionary: denoted by {}, maps mutable set of objects referred to as 'keys' to another set referred to as 'values' 
# Set: An unordered collection of data that is mutable with no duplicate elements
# Frozen set: An unmutable set

# Conditionals and loops
# Conditionals: if, if else, chained vs nested 
# Iterations (loops): reassignment, variable update, for vs while

[34mACADVL[0m Gene successfully logged as [34mQ15996541[0m and [32mMONDO:0008723[0m Disease successfully logged as [32mQ7923095[0m
[34mACAT1[0m Gene successfully logged as [34mQ14913201[0m and [32mMONDO:0008760[0m Disease successfully logged as [32mQ4897218[0m
[34mACTG1[0m Gene successfully logged as [34mQ17709277[0m and [32mMONDO:0019497[0m Disease successfully logged as [32mQ9079046[0m
[34mADGRV1[0m Gene successfully logged as [34mQ18047368[0m and [32mMONDO:0011558[0m Disease successfully logged as [32mQ32143643[0m
[34mAFF2[0m Gene successfully logged as [34mQ17928899[0m and [32mMONDO:0010659[0m Disease successfully logged as [32mQ21051307[0m


Unnamed: 0,Gene,HGNC Gene ID,Disease,MONDO Disease ID,SOP,Classification,Report Reference URL,Report Date,Status,Definitive,Gene QID,Disease QID
15,ACADVL,HGNC:92,very long chain acyl-CoA dehydrogenase deficiency,MONDO_0008723,SOP6,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2018-02-20T17:00:00.000Z,complete,yes,,
16,ACAT1,HGNC:93,beta-ketothiolase deficiency,MONDO_0008760,SOP6,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2018-05-22T16:00:00.000Z,complete,yes,,
17,ACSL4,HGNC:3571,non-syndromic X-linked intellectual disability,MONDO_0019181,SOP4,Moderate,https://search.clinicalgenome.org/kb/gene-vali...,2017-10-20T00:00:00,error,no,,
18,ACTA1,HGNC:129,hypertrophic cardiomyopathy,MONDO_0005045,SOP4,No Reported Evidence,https://search.clinicalgenome.org/kb/gene-vali...,false,error,no,,
19,ACTA2,HGNC:130,familial thoracic aortic aneurysm and aortic d...,MONDO_0019625,SOP4,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2016-09-27T00:00:00,error,yes,,absent
20,ACTC1,HGNC:143,hypertrophic cardiomyopathy,MONDO_0005045,SOP4,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2017-10-23T14:00:00,error,yes,,absent
21,ACTG1,HGNC:144,nonsyndromic genetic deafness,MONDO_0019497,SOP6,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2019-01-07T17:00:00.000Z,complete,yes,,
22,ACTG1,HGNC:144,Baraitser-winter syndrome 2,MONDO_0013812,SOP6,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2019-01-07T17:00:00.000Z,error,yes,,absent
23,ACTN2,HGNC:164,intrinsic cardiomyopathy,MONDO_0000591,SOP5,Moderate,https://search.clinicalgenome.org/kb/gene-vali...,2018-08-06T13:12:55.615Z,error,no,,
24,ACVRL1,HGNC:175,"telangiectasia, hereditary hemorrhagic, type 2",MONDO_0010880,SOP6,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2019-08-28T16:00:00.000Z,error,yes,,absent


In [None]:
strftime("%H:%M:%S_%Y-%m-%d")

# Troubleshooting SparQL query

# A brand new statement
# Statement that is ClinGen
# Statement that isn't ClinGen
# Create explicit tests (output)
# ref_handlers (ginger's code) -- handful of commands in wdi
# No, we don't want to overwrite the statements (assume that the data is always good) *email ClinGen

# if x != reference:
# write statement
    # if x != reference (mondo)
    # write statement
    # else ...how to do look at both at the same time
# else:
# subsetdf.at[index, 'Status'] = "previously logged" 
# continue

# For gene

```
SELECT ?gene ?geneLabel ?reference_stated_in ?reference_URL WHERE{
    VALUES ?gene {HGNC_qid} 
    ?disease p:P2293 ?statement .
    ?statement ps:P2293 ?MONDO_qid.
    ?statement prov:wasDerivedFrom/pr:P248 ?reference_stated_in . 
    ?statement prov:wasDerivedFrom/pr:P854 ?reference_URL 
    SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
```

# For disease 
SELECT ?disease ?diseaseLabel ?reference_stated_in ?reference_URL WHERE{
    VALUES ?disease {MONDO_qid} 
    ?disease p:P2293 ?statement .
    ?statement ps:P2293 ?HGNC_qid.
    ?statement prov:wasDerivedFrom/pr:P248 ?reference_stated_in . 
    ?statement prov:wasDerivedFrom/pr:P854 ?reference_URL 
    SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}