# Create Knowledge Graph for Novel Coronavirus (COVID-19) Outbreak

In [1]:
import os
from py2neo import Graph

### Open the graph database

In [2]:
# Open the graph database connection. No host or user is passed, so py2neo's
# own defaults apply. The password is read from the NEO4J_PASSWORD environment
# variable when it is set, so credentials do not have to live in the notebook;
# otherwise the literal default below is used.
graph = Graph(password=os.environ.get("NEO4J_PASSWORD", "neo4jbinder"))

Remove any existing nodes and relationships

In [3]:
# Start from an empty database: match every node and delete it together with
# any relationships attached to it.
graph.run(
    "MATCH (n) "
    "DETACH DELETE n"
)

<py2neo.database.Cursor at 0x111cdc3a0>

Make sure that the core entities are unique

In [4]:
# Each core entity label gets a uniqueness constraint on its `name` property.
# The (alias, label) pairs are listed in the order the constraints are created.
unique_name_labels = [
    ("c", "City"),
    ("s", "State"),
    ("k", "Country"),
    ("r", "Strain"),
    ("i", "InfectiousDiseaseOutbreak"),
    ("p", "Pathogen"),
]
for alias, label in unique_name_labels:
    cursor = graph.run(f"CREATE CONSTRAINT ON ({alias}:{label}) ASSERT {alias}.name IS UNIQUE")
# Show the cursor of the final statement as the cell output.
cursor

<py2neo.database.Cursor at 0x111ceff40>

Get absolute path for data directory (LOAD CSV requires absolute path)

In [5]:
# Resolve the sibling "data" folder to an absolute path; the LOAD CSV
# statements below embed this path in their file:// URLs.
data_dir = os.path.abspath(os.path.join(os.pardir, "data"))

## Create City-level nodes and relationships

In [6]:
# Load city-level case counts from city_COVID-19.csv.
#
# For every CSV row the query:
#   * merges the City (and sets its point location), State, Country and
#     InfectiousDiseaseOutbreak nodes by name,
#   * links City -> State -> Country with LOCATED_IN,
#   * merges an OCCURED_IN relationship from the outbreak to the city, keyed
#     by the row's `Last Update` value, and stores the counts on it.
#
# toInteger() is used for the counts: toInt() is deprecated and no longer
# exists in Neo4j 4.0+, where it makes the whole statement fail.
load_city = f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/city_COVID-19.csv' AS e"
query = """
MERGE (c:City {name:e.City})
SET c.location = point({longitude: toFloat(e.Long), latitude: toFloat(e.Lat)})
MERGE (s:State {name:e.State})
MERGE (k:Country {name:e.Country})
MERGE (o:InfectiousDiseaseOutbreak {name:e.Outbreak})

MERGE (c)-[:LOCATED_IN]->(s)
MERGE (s)-[:LOCATED_IN]->(k)
MERGE (o)-[i:OCCURED_IN {update:e.`Last Update`}]->(c)
SET i.confirmed = toInteger(e.Confirmed),
    i.deaths = toInteger(e.Deaths),
    i.recovered = toInteger(e.Recovered)
"""
graph.run(load_city + query)

<py2neo.database.Cursor at 0x111cdc760>

#### Run some test queries

In [7]:
# Sanity check: list each city with its location and the state and country
# it is linked to through LOCATED_IN.
query = """
MATCH (c:City)-[l1:LOCATED_IN]->(s:State)-[l2:LOCATED_IN]->(k:Country)
RETURN c.name as City, c.location as Location, s.name as State, k.name as Country
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,City,Location,State,Country
0,Lackland,"(-98.6134, 29.3829)","Lackland, TX (From Diamond Princess)",USA
1,Santa Rosa County,"(-86.9824, 30.769000000000002)",Florida,USA
2,Hillsborough,"(-82.3018, 27.9904)",Florida,USA
3,Sarasota,"(-82.5307, 27.3364)",Florida,USA
4,Westchester County,"(-73.7949, 41.122)",New York,USA
5,Queens County,"(-73.7949, 40.7282)",New York,USA
6,New York City,"(-74.006, 40.7128)",New York,USA
7,King County,"(-122.3321, 47.6062)",Washington,USA
8,Snohomish County,"(-121.8339, 48.033)",Washington,USA
9,Grant County,"(-119.3732, 47.1981)",Washington,USA


In [8]:
# Sanity check: case counts stored on the OCCURED_IN relationship between an
# outbreak and a city, shown together with the city's state.
query = """
MATCH (o:InfectiousDiseaseOutbreak)-[l:OCCURED_IN]->(c:City)-[:LOCATED_IN]->(s:State)
RETURN  c.name as City, s.name as State, o.name as Outbreak, l.confirmed as Confirmed, l.deaths as Deaths, l.recovered as Recovered
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,City,State,Outbreak,Confirmed,Deaths,Recovered
0,Williamson County,Tennessee,COVID-19,1,0,0
1,Bergen County,New Jersey,COVID-19,2,0,0
2,Santa Clara,California,COVID-19,20,0,1
3,Los Angeles,California,COVID-19,11,0,0
4,Wake County,North Carolina,COVID-19,1,0,0
5,Washington County,Oregon,COVID-19,2,0,0
6,Toronto,Ontario,COVID-19,21,0,2
7,Contra Costa County,California,COVID-19,1,0,0
8,London,Ontario,COVID-19,1,0,1
9,Fulton County,Georgia,COVID-19,2,0,0


## Create State-level nodes and relationships

In [9]:
# Load state-level case counts from state_COVID-19.csv.
#
# For every CSV row the query:
#   * merges the State (and sets its point location), Country and
#     InfectiousDiseaseOutbreak nodes by name,
#   * links State -> Country with LOCATED_IN,
#   * merges an OCCURED_IN relationship from the outbreak to the state, keyed
#     by the row's `Last Update` value, and stores the counts on it.
#
# toInteger() is used for the counts: toInt() is deprecated and no longer
# exists in Neo4j 4.0+, where it makes the whole statement fail.
load_state = f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/state_COVID-19.csv' AS e"
query = """
MERGE (s:State {name:e.State})
SET s.location = point({longitude: toFloat(e.Long), latitude: toFloat(e.Lat)})
MERGE (k:Country {name:e.Country})
MERGE (o:InfectiousDiseaseOutbreak {name:e.Outbreak})

MERGE (s)-[:LOCATED_IN]->(k)
MERGE (o)-[i:OCCURED_IN {update:e.`Last Update`}]->(s)
SET i.confirmed = toInteger(e.Confirmed),
    i.deaths = toInteger(e.Deaths),
    i.recovered = toInteger(e.Recovered)
"""
graph.run(load_state + query)

<py2neo.database.Cursor at 0x121200250>

#### Run a test query

In [10]:
# Sanity check: per-state case counts from the OCCURED_IN relationship, with
# the state's location and the country it is located in.
query = """
MATCH (o:InfectiousDiseaseOutbreak)-[l:OCCURED_IN]->(s:State)-[:LOCATED_IN]->(k:Country)
RETURN  s.name as State, s.location as Location, k.name as Country, o.name as Outbreak, l.confirmed as Confirmed, l.deaths as Deaths, l.recovered as Recovered
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,State,Location,Country,Outbreak,Confirmed,Deaths,Recovered
0,Hong Kong,"(114.2, 22.3)",Hong Kong,COVID-19,105,2,43
1,Macau,"(113.55, 22.1667)",Macau,COVID-19,10,0,9
2,Taiwan,"(121.0, 23.7)",Taiwan,COVID-19,44,1,12
3,Unassigned Location (From Diamond Princess),"(139.638, 35.4437)",USA,COVID-19,45,0,0
4,"Lackland, TX (From Diamond Princess)","(-98.6134, 29.3829)",USA,COVID-19,0,0,0
...,...,...,...,...,...,...,...
62,Victoria,"(144.9631, -37.8136)",Australia,COVID-19,10,0,7
63,Western Australia,"(115.8605, -31.9505)",Australia,COVID-19,3,1,0
64,Queensland,"(153.4, -28.0167)",Australia,COVID-19,13,0,8
65,South Australia,"(138.6007, -34.9285)",Australia,COVID-19,5,0,2


## Create Country-level nodes and relationships

In [11]:
# Load country-level case counts from country_COVID-19.csv.
#
# For every CSV row the query:
#   * merges the Country node by name and sets its point location,
#   * merges the InfectiousDiseaseOutbreak node by name,
#   * merges an OCCURED_IN relationship from the outbreak to the country,
#     keyed by the row's `Last Update` value, and stores the counts on it.
#
# toInteger() is used for the counts: toInt() is deprecated and no longer
# exists in Neo4j 4.0+, where it makes the whole statement fail.
load_country = f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/country_COVID-19.csv' AS e"
query = """
MERGE (k:Country {name:e.Country})
SET k.location = point({longitude: toFloat(e.Long), latitude: toFloat(e.Lat)})
MERGE (o:InfectiousDiseaseOutbreak {name:e.Outbreak})

MERGE (o)-[i:OCCURED_IN {update:e.`Last Update`}]->(k)
SET i.confirmed = toInteger(e.Confirmed),
    i.deaths = toInteger(e.Deaths),
    i.recovered = toInteger(e.Recovered)
"""
graph.run(load_country + query)

<py2neo.database.Cursor at 0x121408b20>

#### Run a test query

In [12]:
# Sanity check: per-country case counts from the OCCURED_IN relationship,
# together with the country's stored location.
query = """
MATCH (o:InfectiousDiseaseOutbreak)-[l:OCCURED_IN]->(k:Country)
RETURN k.name as Country,k.location as Location, o.name as Outbreak, l.confirmed as Confirmed, l.deaths as Deaths, l.recovered as Recovered
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,Country,Location,Outbreak,Confirmed,Deaths,Recovered
0,Iraq,"(44.0, 33.0)",COVID-19,35,2,0
1,Brazil,"(-51.9253, -14.235)",COVID-19,4,0,0
2,Philippines,"(122.0, 13.0)",COVID-19,3,1,1
3,China,"(111.54290322580647, 33.40693612903225)",COVID-19,80422,3013,52240
4,Switzerland,"(8.2275, 46.8182)",COVID-19,114,1,3
...,...,...,...,...,...,...
85,Azerbaijan,"(47.5769, 40.1431)",COVID-19,6,0,0
86,Lithuania,"(23.8813, 55.1694)",COVID-19,1,0,0
87,Bahrain,"(50.55, 26.0275)",COVID-19,55,0,0
88,Iran,"(53.0, 32.0)",COVID-19,3513,107,739


### Create nodes for the SARS and MERS outbreaks
Only the outbreak nodes are created here; data for these outbreaks is to be added in the future.

In [13]:
# Add placeholder outbreak nodes for SARS and MERS.
#
# MERGE is used instead of CREATE: InfectiousDiseaseOutbreak.name carries a
# uniqueness constraint, so a plain CREATE raises a constraint violation when
# this cell is executed a second time without wiping the database first.
# MERGE creates each node only if it does not exist yet.
query = """
MERGE (s:InfectiousDiseaseOutbreak {name: 'SARS'})
MERGE (m:InfectiousDiseaseOutbreak {name: 'MERS'})
"""
graph.run(query)

<py2neo.database.Cursor at 0x121408cd0>

In [14]:
# List the names of all outbreak nodes currently in the graph.
query = """
MATCH (o:InfectiousDiseaseOutbreak)
RETURN o.name as Outbreak
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,Outbreak
0,SARS
1,MERS
2,COVID-19


### Add taxonomy data

In [15]:
# Register the pathogen and link it to the COVID-19 outbreak with CAUSES.
#
# taxonomyId is the NCBI Taxonomy identifier of the virus, 2697049. It is
# stored as a string, as before.
query = """
MERGE (p: Pathogen{name: 'Wuhan seafood market pneumonia virus'})
SET p.taxonomyId = '2697049'
MERGE (o:InfectiousDiseaseOutbreak {name: 'COVID-19'})

MERGE (p)-[:CAUSES]->(o)
"""
graph.run(query)

<py2neo.database.Cursor at 0x111cefd60>

### Add strain data at City level

In [16]:
# Load city-level strain data from strains_city_COVID-19.csv.
#
# For every CSV row the query:
#   * merges the City and Strain nodes by name and records the strain's
#     GenBank accession as `genbankId`,
#   * merges the Pathogen node and the COVID-19 outbreak node,
#   * links Strain -> City with FOUND_IN and Pathogen -> Strain with HAS_STRAIN.
#
# The outbreak is merged under the name 'COVID-19', the same name used by the
# taxonomy cell and the state/country strain cells, so no second outbreak node
# under a different name is created.
load_strains_city = f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/strains_city_COVID-19.csv' AS e"
query = """
MERGE (c:City {name:e.City})
MERGE (s:Strain {name:e.strain})
SET s.genbankId = e.genbank_accession
MERGE (p: Pathogen{name: 'Wuhan seafood market pneumonia virus'})
MERGE (o:InfectiousDiseaseOutbreak {name: 'COVID-19'})

MERGE (s)-[:FOUND_IN]->(c)
MERGE (p)-[:HAS_STRAIN]->(s)
"""
graph.run(load_strains_city + query)

<py2neo.database.Cursor at 0x1214089a0>

In [17]:
# Sanity check: strains linked to a city through FOUND_IN.
query = """
MATCH (s: Strain)-[:FOUND_IN]->(c:City)
RETURN c.name as City, s.name as Strain
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,City,Strain
0,King County,USA/WA-S2/2020
1,Snohomish County,USA/WA1/2020
2,Snohomish County,USA/WA2/2020
3,Los Angeles,USA/CA1/2020
4,Orange County,USA/CA2/2020
5,Boston,USA/MA1/2020


### Add strain data at State level

In [18]:
# Load state-level strain data from strains_state_COVID-19.csv.
#
# For every CSV row the query:
#   * merges the State and Strain nodes by name,
#   * merges the Pathogen node and the COVID-19 outbreak node,
#   * links Strain -> State with FOUND_IN and Pathogen -> Strain with HAS_STRAIN.
#
# The LOAD CSV prefix is named after the file it reads (state-level strains).
load_strains_state = f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/strains_state_COVID-19.csv' AS e"
query = """
MERGE (st:State {name:e.State})
MERGE (s:Strain {name:e.strain})
MERGE (p: Pathogen{name: 'Wuhan seafood market pneumonia virus'})
MERGE (o:InfectiousDiseaseOutbreak {name: 'COVID-19'})

MERGE (s)-[:FOUND_IN]->(st)
MERGE (p)-[:HAS_STRAIN]->(s)
"""
graph.run(load_strains_state + query)

<py2neo.database.Cursor at 0x121467040>

In [19]:
# Sanity check: strains linked to a state through FOUND_IN.
query = """
MATCH (s: Strain)-[:FOUND_IN]->(st:State)
RETURN st.name as State, s.name as Strain
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,State,Strain
0,Fujian,Fujian/8/2020
1,Fujian,Fujian/13/2020
2,Guangdong,pangolin/Guangdong/P2S/2019
3,Guangdong,Guangzhou/20SF206/2020
4,Guangdong,Shenzhen/SZTH-004/2020
...,...,...
132,Beijing,Beijing/IVDC-BJ-005/2020
133,British Columbia,Canada/BC_37_0-2/2020
134,Chongqing,Chongqing/IVDC-CQ-001/2020
135,Chongqing,Chongqing/YC01/2020


### Add strain data at Country level

In [20]:
# Load country-level strain data from strains_country_COVID-19.csv: merge the
# Country, Strain, Pathogen and COVID-19 outbreak nodes, then link
# Strain -> Country (FOUND_IN) and Pathogen -> Strain (HAS_STRAIN).
load_strains_country = (
    f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/strains_country_COVID-19.csv' AS e"
)
query = """  
MERGE (k:Country {name:e.Country})
MERGE (s:Strain {name:e.strain})
MERGE (p: Pathogen{name: 'Wuhan seafood market pneumonia virus'})
MERGE (o:InfectiousDiseaseOutbreak {name: 'COVID-19'})
        
MERGE (s)-[:FOUND_IN]->(k)
MERGE (p)-[:HAS_STRAIN]->(s)
"""
# The LOAD CSV prefix and the per-row Cypher are sent as one statement.
graph.run(f"{load_strains_country}{query}")

<py2neo.database.Cursor at 0x121467eb0>

In [21]:
# Sanity check: strains linked to a country through FOUND_IN, with the
# strain's genbankId property (null where it was never set).
query = """
MATCH (s: Strain)-[:FOUND_IN]->(k:Country)
RETURN k.name as Country, s.name as Strain, s.genbankId as GenbankId
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,Country,Strain,GenbankId
0,Hong Kong,HongKong/VB20026565/2020,
1,Hong Kong,HongKong/VM20001061/2020,
2,Hong Kong,HongKong/VM20001988/2020,
3,Taiwan,Taiwan/3/2020,
4,Taiwan,Taiwan/CGMH-CGU-01/2020,
...,...,...,...
204,Australia,Australia/NSW06/2020,
205,Australia,Australia/QLD02/2020,
206,Australia,Australia/QLD04/2020,
207,Australia,Australia/NSW01/2020,


In [22]:
# Sanity check: every strain connected to a pathogen by HAS_STRAIN (the
# pattern is matched without a direction).
query = """
MATCH (p: Pathogen)-[:HAS_STRAIN]-(s:Strain)
RETURN s.name as Strain
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,Strain
0,Korea/KCDC12/2020
1,Japan/TY-WK-012/2020
2,Switzerland/1000477377/2020
3,Brazil/SPBR-01/2020
4,Germany/Baden-Wuerttemberg-1/2020
...,...
204,Wuhan/IPBCAMS-WH-05/2020
205,Australia/VIC01/2020
206,pangolin/Guangdong/P2S/2019
207,China/WH-09/2020
