# Example queries run on COVID-19 Knowledge Graph on Server
[Work in progress]

This notebook demonstrates how to run Cypher queries in a Jupyter Notebook by connecting to a database server.

## 1. Accepting parameters from KG Client 

The following cell will retrieve parameters from the URL. Execute it to continue 

In [1]:
%%javascript
/**
 * Look up one parameter in the page's query string (window.location.search).
 *
 * The key is matched case-insensitively. The result is the unescaped text
 * after "key=", up to the next "&"; an empty string is returned when the key
 * is absent or has no value.
 */
function getQueryStringValue (key)
{
    // Escape the key, then backslash-protect ". + *" so they are literal in the regex.
    var keyPattern = escape(key).replace(/[\.\+\*]/g, "\\$&");
    // The whole query string is matched; group 1 (when present) is the value of the key.
    var wholeQuery = new RegExp("^(?:.*[&\\?]" + keyPattern + "(?:\\=([^&]*))?)?.*$", "i");
    var rawValue = window.location.search.replace(wholeQuery, "$1");
    return unescape(rawValue);
}
// Copy each URL parameter into the Python kernel as a string variable.
//
// The assignment is sent to the kernel as Python source, so the value must be
// a properly escaped literal: JSON.stringify yields a double-quoted string in
// which quotes, backslashes and control characters are escaped. Wrapping the
// raw value in single quotes (the previous approach) let a "'" or "\" in the
// URL terminate the literal and run arbitrary Python in the kernel.
function setKernelString(name, value)
{
    IPython.notebook.kernel.execute(name + "=" + JSON.stringify(String(value)));
}
setKernelString("survey_url", getQueryStringValue("surveyurl"));
setKernelString("views", getQueryStringValue("views"));
setKernelString("view", getQueryStringValue("view"));
setKernelString("user", getQueryStringValue("user"));
setKernelString("csv_file", getQueryStringValue("csv"));
setKernelString("dzc_file", getQueryStringValue("dzc"));
setKernelString("params", getQueryStringValue("params"));
setKernelString("active_object", getQueryStringValue("activeobject"));
// window.location is stringified to the full address of this notebook page.
setKernelString("full_notebook_url", window.location);

<IPython.core.display.Javascript object>

## 2. Check if the passed parameters are correct 

In [2]:
# Check if the parameters are correct
import webbrowser
import ntpath
import os
from IPython.display import Markdown, display
def printmd(string):
    """Show ``string`` in the cell output, rendered as Markdown."""
    rendered = Markdown(string)
    display(rendered)

# Everything before '/SuaveDispatch' is treated as the hub's base URL; when the
# marker is absent, partition() returns the whole notebook URL unchanged.
url_partitioned = full_notebook_url.partition('/SuaveDispatch')
base_url = url_partitioned[0]

# Bind the defaults first so localdzc / full_images always exist. Previously a
# dzc_file that was neither "undefined" nor longer than 20 characters (for
# example an empty string) left both names unassigned and the report below
# failed with NameError.
images_available = False
localdzc = ""
full_images = "full images not available on NFS storage"

if dzc_file == "undefined":
    dzc_file = ""

# Only a reasonably long value is treated as a real tile-collection URL.
if len(dzc_file) > 20:
    if "lib-staging-uploads" in dzc_file:
        # Map the staging-upload URL onto its path under /lib-nfs/dzgen.
        localdzc = dzc_file.replace("https://maxim.ucsd.edu/dzgen/lib-staging-uploads","/lib-nfs/dzgen")
        full_images = localdzc.replace("/content.dzc","/full_images/")
    else:
        localdzc = "dzc not available on NFS storage"
        full_images = "full images not available on NFS storage"
        # NOTE(review): the flag is raised in the branch that reports the
        # images as NOT available, which looks inverted. Behaviour is kept
        # as-is because the flag's intended meaning is not visible here - confirm.
        images_available = True

# Echo every received and derived value so it can be checked by eye before
# the rest of the notebook is run.
printmd("<b><span style='color:red'>Verify survey parameters: </span></b>")

received_parameters = [
    ("Base Survey URL: ", survey_url),
    ("Enabled Views: ", views),
    ("Default View: ", view),
    ("User ID: ", user),
    ("Additional Parameters: ", params),
    ("Data File: ", csv_file),
    ("Image Tile Collection URL: ", dzc_file),
    ("Active Object: ", active_object),
    ("Jupyter Hub URL: ", base_url),
    ("Local Tile Collection Path : ", localdzc),
    ("Local Full-size Image Path: ", full_images),
]
for label, value in received_parameters:
    print(label, value)

# The derived path only counts if it actually exists on this filesystem.
print(
    "Full-size Images Available"
    if os.path.exists(full_images)
    else "Full-size Images Not Available"
)
    

<b><span style='color:red'>Verify survey parameters: </span></b>

Base Survey URL:  http://suave2.sdsc.edu/main/file=sdhhsa_Selected_Risks_by_Tracts.csv
Enabled Views:  
Default View:  map
User ID:  sdhhsa
Additional Parameters:  
Data File:  sdhhsa_Selected_Risks_by_Tracts.csv
Image Tile Collection URL:  https://maxim.ucsd.edu/dzgen/uploads/9cf8cb16ca1c3888e7ec0ffea5d982fb/content.dzc
Active Object:  6073019803
Jupyter Hub URL:  https://datahub.ucsd.edu/user/izaslavsky/notebooks/jupyter-suave/operations/kg/kg_query.ipynb?surveyurl=http://suave2.sdsc.edu/main/file=sdhhsa_Selected_Risks_by_Tracts.csv&views=&view=map&user=sdhhsa&csv=sdhhsa_Selected_Risks_by_Tracts.csv&dzc=https://maxim.ucsd.edu/dzgen/uploads/9cf8cb16ca1c3888e7ec0ffea5d982fb/content.dzc&activeobject=6073019803
Local Tile Collection Path :  dzc not available on NFS storage
Local Full-size Image Path:  full images not available on NFS storage
Full-size Images Not Available


In [3]:
import os
import time
import pandas as pd
from py2neo import Graph

### Examining the input against the KG capabilities:

If zip, census tract, SRA ==> "location"
 - what are the "location" parameters in this KG? ==> country, admin1, admin2
 - can we find a mapping between the submitted location and the location in the KG?
 - assume that we found that ZIP == 92093 ==> admin2 = "San Diego County"
 
What is your target_var :: death rate
 - Find "death rate" in an ontology, and then find the closest neighbor or parent that exists in the KG
 
 - term expansion
 

In [4]:
# Lift pandas' display truncation on both axes so query results are shown in
# full (all rows and all columns).
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, None)

In [5]:
# Connect to the knowledge-graph server. The defaults reproduce the endpoint
# and credentials that were previously hard-coded here; set COVID19_KG_URI,
# COVID19_KG_USER or COVID19_KG_PASSWORD in the environment to target another
# server or account without editing (and committing) the notebook.
graph = Graph(
    os.environ.get("COVID19_KG_URI", "bolt://132.249.238.185:7687"),
    user=os.environ.get("COVID19_KG_USER", "reader"),
    password=os.environ.get("COVID19_KG_PASSWORD", "demo"),
)

### List Organisms in KG

In [6]:
# Every Organism node: its name, scientific name and id (reported as "taxonomy").
query = """
MATCH (p:Organism)
RETURN p.name as name, p.scientificName as scientificName, p.id as taxonomy
"""
organism_cursor = graph.run(query)
organism_cursor.to_data_frame()

Unnamed: 0,name,scientificName,taxonomy
0,SARS-CoV-2,,taxonomy:2697049
1,MERS-CoV,,taxonomy:1335626
2,SARS-CoV,,taxonomy:694009
3,human,,taxonomy:9606
4,house mouse,,taxonomy:10090
5,intermediate horseshoe bat,,taxonomy:59477
6,Malayan horseshoe bat,,taxonomy:608659
7,horseshoe bat,,taxonomy:49442
8,Malayan pangolin,,taxonomy:9974
9,palm civet,,taxonomy:71116


### List Coronavirus Outbreaks

In [7]:
# Organisms linked to an Outbreak through a CAUSES relationship, together with
# the outbreak id and its start date.
query = """
MATCH (p:Organism)-[:CAUSES]->(o:Outbreak)
RETURN p.name as name, p.scientificName as scientificName, p.id as taxonomy, o.id as outbreak, o.startDate as startDate
"""
outbreak_cursor = graph.run(query)
outbreak_cursor.to_data_frame()

### List Strains that are mentioned in PubMed Central Articles

In [8]:
# Strains mentioned by a Publication, with the Organism that carries each
# strain, ordered by collection date; only the first 20 rows are displayed.
# NOTE(review): the "host_id" column is filled from s.id (the Strain node),
# not from the organism - confirm whether o.id was intended.
# TODO where do the 2013 bat strains come from??
query = """
MATCH (p:Publication)-[:MENTIONS]->(s:Strain)<-[:CARRIES]-(o:Organism)
RETURN p.id as pmc, s.name as name, s.collectionDate  as collectionDate, o.name as host, s.id as host_id
ORDER by s.collectionDate
"""
mentioned_strains = graph.run(query).to_data_frame()
mentioned_strains.head(20)

### List Gene and Protein information for Reference Genome
This query lists the genes and proteins encoded by the SARS-CoV-2 reference genome. This is the first genome of SARS-CoV-2 collected in Wuhan on Dec. 5, 2019.

In [23]:
# Follow Strain -[:HAS]-> Gene -[:ENCODES]-> Protein and report one row per
# gene/protein pair, ordered by the strain's collection date.
# NOTE(review): the pattern does not restrict the strain itself; it returns
# every strain that has gene links - confirm that is only the reference genome.
query = """
MATCH (s:Strain)-[:HAS]->(g:Gene)-[:ENCODES]->(p:Protein)
RETURN s.id as referenceGenome, s.name as name, s.collectionDate  as collectionDate, 
       g.name as gene, g.id as geneId, p.name as protein, p.id as protein_id 
ORDER by s.collectionDate
"""
gene_protein_cursor = graph.run(query)
gene_protein_cursor.to_data_frame()

In [27]:
# target_var is not assigned by any cell in this notebook, so on a fresh
# kernel this cell raised NameError. Fall back to the value recorded in the
# saved output, but keep any value that was already set earlier in the session.
if "target_var" not in globals():
    target_var = 'deathrate'
target_var

'deathrate'

### Cases in a specific County (Admin2)

In [6]:


# County to report on; passed to the query as the $admin2 parameter rather
# than being spliced into the Cypher text.
admin2 = 'San Diego County'

# Cases reported on 2020-06-15 in the Admin2 region with that name. The
# "cummulative..." property names are spelled as they are queried here.
query = """
MATCH (c:Cases{date: date("2020-06-15")})-[:REPORTED_IN]->(a:Admin2{name: $admin2})
RETURN a.name as name, c.cummulativeConfirmed as confirmed, c.cummulativeDeaths as deaths
"""
county_cursor = graph.run(query, admin2=admin2)
county_cursor.to_data_frame()

Unnamed: 0,name,confirmed,deaths
0,San Diego County,9610,320


### Aggregate cases by State (Admin1)

In [None]:
# COVID-19 cases dated 2020-05-04, summed over each Admin1 (state) that the
# reporting Admin2 regions belong to; rows are ordered by deaths, ascending.
query = """
MATCH (o:Outbreak{id: "COVID-19"})<-[:RELATED_TO]-(c:Cases{date: date("2020-05-04")})-[:REPORTED_IN]->(a:Admin2)-[:IN]->(a1:Admin1)
RETURN a1.name as state, sum(c.cummulativeConfirmed) as confirmed, sum(c.cummulativeDeaths) as deaths
ORDER BY deaths
"""
state_cursor = graph.run(query)
state_cursor.to_data_frame()