# Create Knowledge Graph for Novel Coronavirus (COVID-19) Outbreak

In [1]:
import os
from py2neo import Graph

### Open the graph database

In [2]:
# Open the py2neo connection used by every later cell. Only the password is
# passed; all other connection settings are left to py2neo's defaults.
# The password is read from the NEO4J_PASSWORD environment variable when it is
# set, so real credentials do not have to be stored in the notebook; if the
# variable is absent the original value "neo4jbinder" is used.
graph = Graph(password=os.environ.get("NEO4J_PASSWORD", "neo4jbinder"))

Remove any existing nodes and relationships

In [3]:
# Start from an empty database: match every node and delete it together with
# the relationships attached to it.
delete_everything = "MATCH (n) DETACH DELETE n"
graph.run(delete_everything)

<py2neo.database.Cursor at 0x1057e9250>

Make sure that the core entities are unique

In [4]:
# Declare a uniqueness constraint on the `name` property of each core label.
uniqueness_constraints = (
    "CREATE CONSTRAINT ON (c:City) ASSERT c.name IS UNIQUE",
    "CREATE CONSTRAINT ON (s:State) ASSERT s.name IS UNIQUE",
    "CREATE CONSTRAINT ON (k:Country) ASSERT k.name IS UNIQUE",
    "CREATE CONSTRAINT ON (r:Strain) ASSERT r.name IS UNIQUE",
    "CREATE CONSTRAINT ON (i:InfectiousDiseaseOutbreak) ASSERT i.name IS UNIQUE",
    "CREATE CONSTRAINT ON (p:Pathogen) ASSERT p.name IS UNIQUE",
)

# Run the statements one by one, in the order listed above.
cursor = None
for constraint in uniqueness_constraints:
    cursor = graph.run(constraint)
cursor  # last expression, so the cell still displays the final Cursor

<py2neo.database.Cursor at 0x1087ee5e0>

Get absolute path for data directory (LOAD CSV requires absolute path)

In [5]:
# Absolute form of the sibling "data" directory (one level above the current
# working directory); it is interpolated into the LOAD CSV file URLs below.
data_dir = os.path.abspath(os.path.join(os.pardir, "data"))

### Create City-level nodes and relationships

In [6]:
# Cypher prefix: read the city-level CSV with headers, binding each row to `e`.
load_city = f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/city_COVID-19.csv' AS e"
# Per row:
#   * merge the City, State, Country and InfectiousDiseaseOutbreak nodes by name
#     and store the city's coordinates as a point,
#   * link City -> State -> Country with LOCATED_IN,
#   * merge one OCCURED_IN relationship per `Last Update` value and store the
#     case counts on it.
# toInteger() is used for the counts: toInt() is deprecated and was removed in
# Neo4j 4.0, whereas toInteger() is accepted by 3.x and 4.x alike.
query = """
MERGE (c:City {name:e.City})
SET c.location = point({longitude: toFloat(e.Long), latitude: toFloat(e.Lat)})   
MERGE (s:State {name:e.State})
MERGE (k:Country {name:e.Country})
MERGE (o:InfectiousDiseaseOutbreak {name:e.Outbreak})
        
MERGE (c)-[:LOCATED_IN]->(s)
MERGE (s)-[:LOCATED_IN]->(k)
MERGE (o)-[i:OCCURED_IN {update:e.`Last Update`}]->(c)
SET i.confirmed = toInteger(e.Confirmed),
    i.deaths = toInteger(e.Deaths), 
    i.recovered = toInteger(e.Recovered)
"""
graph.run(load_city + query)

<py2neo.database.Cursor at 0x1087ee550>

#### Run some test queries

In [7]:
# Sanity check: follow City -> State -> Country and tabulate each city's
# location together with the state and country it is located in.
city_hierarchy_query = """
MATCH (c:City)-[l1:LOCATED_IN]->(s:State)-[l2:LOCATED_IN]->(k:Country)
RETURN c.name as City, c.location as Location, s.name as State, k.name as Country
"""
city_hierarchy = graph.run(city_hierarchy_query)
city_hierarchy.to_data_frame()

Unnamed: 0,City,Location,State,Country
0,Toronto,"(-79.3832, 43.6532)",Ontario,Canada
1,London,"(-81.2453, 42.9849)",Ontario,Canada
2,Boston,"(-71.0589, 42.3601)",Massachusetts,USA
3,Seattle,"(-120.74, 47.7511)",Washington,USA
4,Los Angeles,"(-118.2437, 34.0522)",California,USA
5,Orange,"(-117.8531, 33.7879)",California,USA
6,Santa Clara,"(-121.9552, 37.3541)",California,USA
7,San Benito,"(-120.9876, 36.5761)",California,USA
8,San Diego County,"(-117.1611, 32.7157)",California,USA
9,Tempe,"(-111.094, 34.0489)",Arizona,USA


In [8]:
# Sanity check: case counts stored on the OCCURED_IN relationship between an
# outbreak and a city, shown alongside the city's state.
city_cases_query = """
MATCH (o:InfectiousDiseaseOutbreak)-[l:OCCURED_IN]->(c:City)-[:LOCATED_IN]->(s:State)
RETURN  c.name as City, s.name as State, o.name as Outbreak, l.confirmed as Confirmed, l.deaths as Deaths, l.recovered as Recovered
"""
city_cases = graph.run(city_cases_query)
city_cases.to_data_frame()

Unnamed: 0,City,State,Outbreak,Confirmed,Deaths,Recovered
0,Orange,California,COVID-19,1,0,0
1,Boston,Massachusetts,COVID-19,1,0,0
2,London,Ontario,COVID-19,1,0,1
3,Santa Clara,California,COVID-19,2,0,0
4,Chicago,Illinois,COVID-19,2,0,2
5,Tempe,Arizona,COVID-19,1,0,0
6,Madison,Wisconsin,COVID-19,1,0,0
7,Toronto,Ontario,COVID-19,2,0,0
8,San Diego County,California,COVID-19,1,0,0
9,San Benito,California,COVID-19,2,0,0


## Create State-level nodes and relationships

In [9]:
# Cypher prefix: read the state-level CSV with headers, binding each row to `e`.
load_state = f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/state_COVID-19.csv' AS e"
# Per row:
#   * merge the State, Country and InfectiousDiseaseOutbreak nodes by name and
#     store the state's coordinates as a point,
#   * link State -> Country with LOCATED_IN,
#   * merge one OCCURED_IN relationship per `Last Update` value and store the
#     case counts on it.
# toInteger() is used for the counts: toInt() is deprecated and was removed in
# Neo4j 4.0, whereas toInteger() is accepted by 3.x and 4.x alike.
query = """
MERGE (s:State {name:e.State}) 
SET s.location = point({longitude: toFloat(e.Long), latitude: toFloat(e.Lat)})  
MERGE (k:Country {name:e.Country})
MERGE (o:InfectiousDiseaseOutbreak {name:e.Outbreak})
        
MERGE (s)-[:LOCATED_IN]->(k)
MERGE (o)-[i:OCCURED_IN {update:e.`Last Update`}]->(s)
SET i.confirmed = toInteger(e.Confirmed),
    i.deaths = toInteger(e.Deaths), 
    i.recovered = toInteger(e.Recovered)
"""
graph.run(load_state + query)

<py2neo.database.Cursor at 0x108823460>

#### Run a test query

In [10]:
# Sanity check: case counts per state, with the state's location and country.
state_cases_query = """
MATCH (o:InfectiousDiseaseOutbreak)-[l:OCCURED_IN]->(s:State)-[:LOCATED_IN]->(k:Country)
RETURN  s.name as State, s.location as Location, k.name as Country, o.name as Outbreak, l.confirmed as Confirmed, l.deaths as Deaths, l.recovered as Recovered
"""
state_cases = graph.run(state_cases_query)
state_cases.to_data_frame()

Unnamed: 0,State,Location,Country,Outbreak,Confirmed,Deaths,Recovered
0,Diamond Princess cruise ship,"(129.638, 35.4437)",Others,COVID-19,175,0,0
1,Hong Kong,"(114.1694, 22.3193)",Hong Kong,COVID-19,50,1,1
2,Macau,"(113.5439, 22.1987)",Macau,COVID-19,10,0,2
3,South Australia,"(138.6007, -34.9285)",Australia,COVID-19,2,0,0
4,New South Wales,"(151.2093, -33.8688)",Australia,COVID-19,4,0,2
5,Queensland,"(153.0251, -27.4698)",Australia,COVID-19,5,0,0
6,Victoria,"(144.9631, -37.8136)",Australia,COVID-19,4,0,0
7,Taiwan,"(120.9605, 23.6978)",Taiwan,COVID-19,18,0,1
8,British Columbia,"(-123.121, 49.2827)",Canada,COVID-19,4,0,0
9,Ontario,"(-80.31425, 43.319050000000004)",Canada,COVID-19,3,0,1


## Create Country-level nodes and relationships

In [11]:
# Cypher prefix: read the country-level CSV with headers, binding each row to `e`.
load_country = f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/country_COVID-19.csv' AS e"
# Per row:
#   * merge the Country and InfectiousDiseaseOutbreak nodes by name and store
#     the country's coordinates as a point,
#   * merge one OCCURED_IN relationship per `Last Update` value and store the
#     case counts on it.
# toInteger() is used for the counts: toInt() is deprecated and was removed in
# Neo4j 4.0, whereas toInteger() is accepted by 3.x and 4.x alike.
query = """  
MERGE (k:Country {name:e.Country})
SET k.location = point({longitude: toFloat(e.Long), latitude: toFloat(e.Lat)})
MERGE (o:InfectiousDiseaseOutbreak {name:e.Outbreak})
        
MERGE (o)-[i:OCCURED_IN {update:e.`Last Update`}]->(k)
SET i.confirmed = toInteger(e.Confirmed),
    i.deaths = toInteger(e.Deaths), 
    i.recovered = toInteger(e.Recovered)
"""
graph.run(load_country + query)

<py2neo.database.Cursor at 0x117f33e50>

#### Run a test query

In [12]:
# Sanity check: case counts per country, with the country's location.
country_cases_query = """
MATCH (o:InfectiousDiseaseOutbreak)-[l:OCCURED_IN]->(k:Country)
RETURN k.name as Country,k.location as Location, o.name as Outbreak, l.confirmed as Confirmed, l.deaths as Deaths, l.recovered as Recovered
"""
country_cases = graph.run(country_cases_query)
country_cases.to_data_frame()

Unnamed: 0,Country,Location,Outbreak,Confirmed,Deaths,Recovered
0,Others,"(129.638, 35.4437)",COVID-19,175,0,0
1,Hong Kong,"(114.1694, 22.3193)",COVID-19,50,1,1
2,Macau,"(113.5439, 22.1987)",COVID-19,10,0,2
3,Australia,"(146.94955000000002, -33.520174999999995)",COVID-19,15,0,2
4,Taiwan,"(120.9605, 23.6978)",COVID-19,18,0,1
5,Belgium,"(4.4699, 50.5039)",COVID-19,1,0,0
6,Cambodia,"(104.991, 12.5657)",COVID-19,1,0,1
7,Finland,"(25.7482, 61.9241)",COVID-19,1,0,1
8,France,"(2.2137, 46.2276)",COVID-19,11,0,2
9,Germany,"(10.4515, 51.1657)",COVID-19,16,0,0


### Create nodes for the SARS and MERS outbreaks
Only the outbreak nodes are created here; the corresponding data will be added in the future.

In [13]:
# Add placeholder outbreak nodes for SARS and MERS.
# MERGE is used instead of CREATE: InfectiousDiseaseOutbreak.name carries a
# uniqueness constraint, so a second CREATE (e.g. when this cell is re-run
# without wiping the database first) would fail, whereas MERGE simply reuses
# the existing node.
query = """
MERGE (s:InfectiousDiseaseOutbreak {name: 'SARS'})
MERGE (m:InfectiousDiseaseOutbreak {name: 'MERS'})
"""
graph.run(query)

<py2neo.database.Cursor at 0x117f9d220>

In [14]:
# List the names of all outbreak nodes currently in the graph.
outbreak_names_query = """
MATCH (o:InfectiousDiseaseOutbreak)
RETURN o.name as Outbreak
"""
outbreak_names = graph.run(outbreak_names_query)
outbreak_names.to_data_frame()

Unnamed: 0,Outbreak
0,SARS
1,MERS
2,COVID-19


### Add taxonomy data

In [15]:
# Merge the Pathogen node, record its taxonomy id, and link it to the COVID-19
# outbreak node with a CAUSES relationship.
# The id is corrected to '2697049' (the NCBI Taxonomy ID of this virus, now
# named SARS-CoV-2); the earlier value '269749' was missing a digit.
query = """
MERGE (p: Pathogen{name: 'Wuhan seafood market pneumonia virus'})
SET p.taxonomyId = '2697049'
MERGE (o:InfectiousDiseaseOutbreak {name: 'COVID-19'})

MERGE (p)-[:CAUSES]->(o)
"""
graph.run(query)

<py2neo.database.Cursor at 0x117f9dfa0>

### Add strain data at City level

In [16]:
# Cypher prefix: read the city-level strain CSV with headers, binding each row to `e`.
load_strains_city = f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/strains_city_COVID-19.csv' AS e"
# Per row: merge the City and Strain nodes, store the GenBank accession on the
# strain, then link Strain -[:FOUND_IN]-> City and Pathogen -[:HAS_STRAIN]-> Strain.
# The outbreak is merged under the name 'COVID-19', as in the other cells;
# the earlier name '2019-nCoV' created a second, unconnected outbreak node.
# NOTE: `o` is merged but not linked to anything in this statement.
query = """  
MERGE (c:City {name:e.City})
MERGE (s:Strain {name:e.strain})
SET s.genbankId = e.genbank_accession
MERGE (p: Pathogen{name: 'Wuhan seafood market pneumonia virus'})
MERGE (o:InfectiousDiseaseOutbreak {name: 'COVID-19'})
        
MERGE (s)-[:FOUND_IN]->(c)
MERGE (p)-[:HAS_STRAIN]->(s)
"""
graph.run(load_strains_city + query)

<py2neo.database.Cursor at 0x117f9d670>

In [17]:
# Sanity check: which strains were found in which cities.
city_strains_query = """
MATCH (s: Strain)-[:FOUND_IN]->(c:City)
RETURN c.name as City, s.name as Strain
"""
city_strains = graph.run(city_strains_query)
city_strains.to_data_frame()

Unnamed: 0,City,Strain
0,Chicago,USA/IL1/2020
1,Boston,USA-MA1/2020
2,Los Angeles,USA/CA1/2020
3,Seattle,USA-WA1/2020


### Add strain data at State level

In [18]:
# Cypher prefix: read the state-level strain CSV with headers, binding each row to `e`.
# Named `load_strains_state` because it loads strains_state_COVID-19.csv; the
# earlier name `load_strains_country` was a copy-paste slip.
load_strains_state = f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/strains_state_COVID-19.csv' AS e"
# Per row: merge the State and Strain nodes, then link
# Strain -[:FOUND_IN]-> State and Pathogen -[:HAS_STRAIN]-> Strain.
# NOTE: `o` is merged but not linked to anything in this statement.
query = """  
MERGE (st:State {name:e.State})
MERGE (s:Strain {name:e.strain})
MERGE (p: Pathogen{name: 'Wuhan seafood market pneumonia virus'})
MERGE (o:InfectiousDiseaseOutbreak {name: 'COVID-19'})
        
MERGE (s)-[:FOUND_IN]->(st)
MERGE (p)-[:HAS_STRAIN]->(s)
"""
graph.run(load_strains_state + query)

<py2neo.database.Cursor at 0x117f9ddf0>

In [19]:
# Sanity check: which strains were found in which states.
state_strains_query = """
MATCH (s: Strain)-[:FOUND_IN]->(st:State)
RETURN st.name as State, s.name as Strain
"""
state_strains = graph.run(state_strains_query)
state_strains.to_data_frame()

Unnamed: 0,State,Strain
0,Beijing,Beijing/IVDC-BJ-005/2020
1,Chongqing,Chongqing/IVDC-CQ-001/2020
2,Chongqing,Chongqing/ZX01/2020
3,Chongqing,Chongqing/YC01/2020
4,Guangdong,Shenzhen/SZTH-002/2020
...,...,...
75,California,USA/CA4/2020
76,Massachusetts,USA-MA1/2020
77,Wisconsin,USA/WI1/2020
78,Washington,USA-WA1/2020


### Add strain data at Country level

In [20]:
# Cypher prefix: read the country-level strain CSV with headers, binding each row to `e`.
load_strains_country = (
    f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/strains_country_COVID-19.csv' AS e"
)

# Per row: merge the Country and Strain nodes, then link
# Strain -[:FOUND_IN]-> Country and Pathogen -[:HAS_STRAIN]-> Strain.
# The outbreak node `o` is merged but not linked to anything in this statement.
query = """  
MERGE (k:Country {name:e.Country})
MERGE (s:Strain {name:e.strain})
MERGE (p: Pathogen{name: 'Wuhan seafood market pneumonia virus'})
MERGE (o:InfectiousDiseaseOutbreak {name: 'COVID-19'})
        
MERGE (s)-[:FOUND_IN]->(k)
MERGE (p)-[:HAS_STRAIN]->(s)
"""

country_strain_statement = load_strains_country + query
graph.run(country_strain_statement)

<py2neo.database.Cursor at 0x117fa3340>

In [21]:
# Sanity check: which strains were found in which countries, along with the
# strain's genbankId property.
country_strains_query = """
MATCH (s: Strain)-[:FOUND_IN]->(k:Country)
RETURN k.name as Country, s.name as Strain, s.genbankId as GenbankId
"""
country_strains = graph.run(country_strains_query)
country_strains.to_data_frame()

Unnamed: 0,Country,Strain,GenbankId
0,Australia,Australia/NSW01/2020,
1,Australia,Sydney/3/2020,
2,Australia,Australia/QLD02/2020,
3,Australia,Sydney/2/2020,
4,Australia,Australia/QLD01/2020,
...,...,...,...
96,China,WHU02,
97,China,HKU-SZ-007b_2020,
98,China,Wuhan/IPBCAMS-WH-01/2019,
99,China,Wuhan/IPBCAMS-WH-03/2019,


In [22]:
# Sanity check: names of all strains connected to a Pathogen node through a
# HAS_STRAIN relationship (matched without regard to direction).
pathogen_strains_query = """
MATCH (p: Pathogen)-[:HAS_STRAIN]-(s:Strain)
RETURN s.name as Strain
"""
pathogen_strains = graph.run(pathogen_strains_query)
pathogen_strains.to_data_frame()

Unnamed: 0,Strain
0,Nonthaburi/74/2020
1,Singapore/3/2020
2,Japan/KY-V-029/2020
3,France/IDF0373/2020
4,Singapore/1/2020
...,...
96,HKU-SZ-004_2020
97,Wuhan-Hu-1/2019
98,HKU-SZ-005b_2020
99,USA/AZ1/2020
