# Create Knowledge Graph for Novel Coronavirus (COVID-19) Outbreak

In [1]:
import os
from py2neo import Graph

### Open graph database

In [2]:
# Connect to Neo4j using py2neo's default connection settings.
# The password is read from the NEO4J_PASSWORD environment variable so that
# credentials do not have to be committed with the notebook; the previous
# hardcoded value is kept as the fallback so existing setups keep working.
graph = Graph(password=os.environ.get("NEO4J_PASSWORD", "neo4jbinder"))

Remove any existing nodes and relationships

In [3]:
# Start from an empty database: DETACH DELETE removes every node together
# with the relationships attached to it.
graph.run(
    "MATCH (n) DETACH DELETE n"
)

<py2neo.database.Cursor at 0x10dec0670>

Make sure that the core entities are unique

In [4]:
# Every core entity is identified by its `name` property, so enforce that
# `name` is unique per label. Each (alias, label) pair below is expanded into
# one CREATE CONSTRAINT statement, replacing six copy-pasted calls.
unique_name_labels = [
    ("c", "City"),
    ("s", "State"),
    ("k", "Country"),
    ("r", "Strain"),
    ("i", "InfectiousDiseaseOutbreak"),
    ("p", "Pathogen"),
]
for alias, label in unique_name_labels:
    graph.run(f"CREATE CONSTRAINT ON ({alias}:{label}) ASSERT {alias}.name IS UNIQUE")

<py2neo.database.Cursor at 0x10df056d0>

Get absolute path for data directory (LOAD CSV requires absolute path)

In [5]:
# Resolve the sibling "data" directory (one level above the working directory)
# to an absolute path; the LOAD CSV file URLs below are built from it.
data_dir = os.path.abspath(os.path.join(os.pardir, "data"))

### Create City-level nodes and relationships

In [6]:
# Load city-level case counts from city_COVID-19.csv.
# For every CSV row this:
#   * merges the City (with its geographic point), State, Country and
#     InfectiousDiseaseOutbreak nodes by name,
#   * links City -> State -> Country with LOCATED_IN,
#   * records the counts on an OCCURED_IN relationship keyed by `Last Update`.
# The relationship type spelling "OCCURED_IN" is kept as-is because the
# queries further down match on it.
# toInteger() is used instead of the deprecated toInt(), which was removed
# in Neo4j 4.0.
load_city = f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/city_COVID-19.csv' AS e"
query = """
MERGE (c:City {name:e.City})
SET c.location = point({longitude: toFloat(e.Long), latitude: toFloat(e.Lat)})
MERGE (s:State {name:e.State})
MERGE (k:Country {name:e.Country})
MERGE (o:InfectiousDiseaseOutbreak {name:e.Outbreak})

MERGE (c)-[:LOCATED_IN]->(s)
MERGE (s)-[:LOCATED_IN]->(k)
MERGE (o)-[i:OCCURED_IN {update:e.`Last Update`}]->(c)
SET i.confirmed = toInteger(e.Confirmed),
    i.deaths = toInteger(e.Deaths),
    i.recovered = toInteger(e.Recovered)
"""
graph.run(load_city + query)

<py2neo.database.Cursor at 0x10def2850>

#### Run some test queries

In [7]:
# Sanity check: follow LOCATED_IN from each City through its State to its
# Country and tabulate the result.
query = """
MATCH (c:City)-[l1:LOCATED_IN]->(s:State)-[l2:LOCATED_IN]->(k:Country)
RETURN c.name as City, c.location as Location, s.name as State, k.name as Country
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,City,Location,State,Country
0,Seattle,"(-120.74, 47.7511)",Washington,USA
1,Tempe,"(-111.094, 34.0489)",Arizona,USA
2,San Antonio,"(-98.4936, 29.4241)",Texas,USA
3,Madison,"(-89.4012, 43.0731)",Wisconsin,USA
4,Boston,"(-71.0589, 42.3601)",Massachusetts,USA
5,Orange,"(-117.8531, 33.7879)",California,USA
6,Los Angeles,"(-118.2437, 34.0522)",California,USA
7,Santa Clara,"(-121.9552, 37.3541)",California,USA
8,San Diego County,"(-117.1611, 32.7157)",California,USA
9,San Benito,"(-120.9876, 36.5761)",California,USA


In [8]:
# Sanity check: list the case counts stored on each outbreak -> city
# relationship, together with the state the city is located in.
query = """
MATCH (o:InfectiousDiseaseOutbreak)-[l:OCCURED_IN]->(c:City)-[:LOCATED_IN]->(s:State)
RETURN  c.name as City, s.name as State, o.name as Outbreak, l.confirmed as Confirmed, l.deaths as Deaths, l.recovered as Recovered
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,City,State,Outbreak,Confirmed,Deaths,Recovered
0,Seattle,Washington,COVID-19,1,0,1
1,Tempe,Arizona,COVID-19,1,0,0
2,San Antonio,Texas,COVID-19,1,0,0
3,Orange,California,COVID-19,1,0,0
4,Madison,Wisconsin,COVID-19,1,0,0
5,Los Angeles,California,COVID-19,1,0,0
6,Boston,Massachusetts,COVID-19,1,0,0
7,London,Ontario,COVID-19,1,0,1
8,Santa Clara,California,COVID-19,2,0,0
9,San Diego County,California,COVID-19,2,0,0


## Create State-level nodes and relationships

In [9]:
# Load state-level case counts from state_COVID-19.csv.
# For every CSV row this merges the State (with its geographic point),
# Country and InfectiousDiseaseOutbreak nodes by name, links State -> Country
# with LOCATED_IN, and records the counts on an OCCURED_IN relationship keyed
# by `Last Update`.
# toInteger() is used instead of the deprecated toInt(), which was removed
# in Neo4j 4.0.
load_state = f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/state_COVID-19.csv' AS e"
query = """
MERGE (s:State {name:e.State})
SET s.location = point({longitude: toFloat(e.Long), latitude: toFloat(e.Lat)})
MERGE (k:Country {name:e.Country})
MERGE (o:InfectiousDiseaseOutbreak {name:e.Outbreak})

MERGE (s)-[:LOCATED_IN]->(k)
MERGE (o)-[i:OCCURED_IN {update:e.`Last Update`}]->(s)
SET i.confirmed = toInteger(e.Confirmed),
    i.deaths = toInteger(e.Deaths),
    i.recovered = toInteger(e.Recovered)
"""
graph.run(load_state + query)

<py2neo.database.Cursor at 0x10df05fa0>

#### Run a test query

In [10]:
# Sanity check: list the case counts stored on each outbreak -> state
# relationship, with the state's location and country.
query = """
MATCH (o:InfectiousDiseaseOutbreak)-[l:OCCURED_IN]->(s:State)-[:LOCATED_IN]->(k:Country)
RETURN  s.name as State, s.location as Location, k.name as Country, o.name as Outbreak, l.confirmed as Confirmed, l.deaths as Deaths, l.recovered as Recovered
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,State,Location,Country,Outbreak,Confirmed,Deaths,Recovered
0,Washington,"(-120.74, 47.7511)",USA,COVID-19,1,0,1
1,Arizona,"(-111.094, 34.0489)",USA,COVID-19,1,0,0
2,Texas,"(-98.4936, 29.4241)",USA,COVID-19,1,0,0
3,Wisconsin,"(-89.4012, 43.0731)",USA,COVID-19,1,0,0
4,Massachusetts,"(-71.0589, 42.3601)",USA,COVID-19,1,0,0
5,California,"(-119.24014, 34.897200000000005)",USA,COVID-19,8,0,0
6,Illinois,"(-89.3985, 40.6331)",USA,COVID-19,2,0,2
7,Nebraska,"(-95.9758, 41.2545)",USA,COVID-19,14,0,0
8,British Columbia,"(-123.12100000000001, 49.2827)",Canada,COVID-19,5,0,0
9,Ontario,"(-80.31425, 43.319050000000004)",Canada,COVID-19,3,0,1


## Create Country-level nodes and relationships

In [11]:
# Load country-level case counts from country_COVID-19.csv.
# For every CSV row this merges the Country (with its geographic point) and
# InfectiousDiseaseOutbreak nodes by name and records the counts on an
# OCCURED_IN relationship keyed by `Last Update`.
# toInteger() is used instead of the deprecated toInt(), which was removed
# in Neo4j 4.0.
load_country = f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/country_COVID-19.csv' AS e"
query = """
MERGE (k:Country {name:e.Country})
SET k.location = point({longitude: toFloat(e.Long), latitude: toFloat(e.Lat)})
MERGE (o:InfectiousDiseaseOutbreak {name:e.Outbreak})

MERGE (o)-[i:OCCURED_IN {update:e.`Last Update`}]->(k)
SET i.confirmed = toInteger(e.Confirmed),
    i.deaths = toInteger(e.Deaths),
    i.recovered = toInteger(e.Recovered)
"""
graph.run(load_country + query)

<py2neo.database.Cursor at 0x11d61ec40>

#### Run a test query

In [12]:
# Sanity check: list the case counts stored on each outbreak -> country
# relationship, with the country's location.
query = """
MATCH (o:InfectiousDiseaseOutbreak)-[l:OCCURED_IN]->(k:Country)
RETURN k.name as Country,k.location as Location, o.name as Outbreak, l.confirmed as Confirmed, l.deaths as Deaths, l.recovered as Recovered
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,Country,Location,Outbreak,Confirmed,Deaths,Recovered
0,USA,"(-106.03022500000002, 37.752575)",COVID-19,29,0,3
1,Canada,"(-94.58316666666667, 45.30693333333334)",COVID-19,8,0,1
2,China,"(111.56541483870966, 33.35420870967741)",COVID-19,74139,2002,14199
3,Others,"(129.638, 35.4437)",COVID-19,542,0,0
4,Hong Kong,"(114.1694, 22.3193)",COVID-19,62,1,2
5,Macau,"(113.5439, 22.1987)",COVID-19,10,0,5
6,Australia,"(146.94955000000002, -33.520174999999995)",COVID-19,15,0,10
7,Taiwan,"(120.9605, 23.6978)",COVID-19,22,1,2
8,Belgium,"(4.4699, 50.5039)",COVID-19,1,0,1
9,Cambodia,"(104.991, 12.5657)",COVID-19,1,0,1


### Create nodes for SARS and MERS
Data for these outbreaks will be added in the future.

In [13]:
# Add placeholder outbreak nodes for SARS and MERS.
# MERGE (rather than CREATE) keeps this cell idempotent: with the uniqueness
# constraint on InfectiousDiseaseOutbreak.name, a second CREATE of the same
# name would fail, whereas MERGE reuses the existing node.
query = """
MERGE (s:InfectiousDiseaseOutbreak {name: 'SARS'})
MERGE (m:InfectiousDiseaseOutbreak {name: 'MERS'})
"""
graph.run(query)

<py2neo.database.Cursor at 0x11d67b0a0>

In [14]:
# List the names of all outbreak nodes currently in the graph.
query = """
MATCH (o:InfectiousDiseaseOutbreak)
RETURN o.name as Outbreak
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,Outbreak
0,COVID-19
1,SARS
2,MERS


### Add taxonomy data

In [15]:
# Add the pathogen node and link it to the COVID-19 outbreak via CAUSES.
# The taxonomy id is the NCBI Taxonomy identifier of the virus (2697049);
# the previous value '269749' was missing a digit.
query = """
MERGE (p: Pathogen{name: 'Wuhan seafood market pneumonia virus'})
SET p.taxonomyId = '2697049'
MERGE (o:InfectiousDiseaseOutbreak {name: 'COVID-19'})

MERGE (p)-[:CAUSES]->(o)
"""
graph.run(query)

<py2neo.database.Cursor at 0x11d67b820>

### Add strain data at City level

In [16]:
# Load strains reported at city level from strains_city_COVID-19.csv.
# For every CSV row this merges the City and Strain nodes by name, stores the
# GenBank accession on the strain, and links Strain -> City (FOUND_IN) and
# Pathogen -> Strain (HAS_STRAIN).
# The outbreak is merged under the name 'COVID-19', matching the rest of the
# notebook; the earlier '2019-nCoV' name created a stray, unconnected
# outbreak node.
load_strains_city = f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/strains_city_COVID-19.csv' AS e"
query = """
MERGE (c:City {name:e.City})
MERGE (s:Strain {name:e.strain})
SET s.genbankId = e.genbank_accession
MERGE (p: Pathogen{name: 'Wuhan seafood market pneumonia virus'})
MERGE (o:InfectiousDiseaseOutbreak {name: 'COVID-19'})

MERGE (s)-[:FOUND_IN]->(c)
MERGE (p)-[:HAS_STRAIN]->(s)
"""
graph.run(load_strains_city + query)

<py2neo.database.Cursor at 0x11d67bcd0>

In [17]:
# Sanity check: list every strain together with the city it was found in.
query = """
MATCH (s: Strain)-[:FOUND_IN]->(c:City)
RETURN c.name as City, s.name as Strain
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,City,Strain
0,Chicago,USA/IL1/2020
1,Boston,USA/MA1/2020
2,Los Angeles,USA/CA1/2020
3,Seattle,USA/WA1/2020


### Add strain data at State level

In [18]:
# Load strains reported at state level from strains_state_COVID-19.csv.
# For every CSV row this merges the State and Strain nodes by name and links
# Strain -> State (FOUND_IN) and Pathogen -> Strain (HAS_STRAIN).
# The LOAD CSV prefix is named `load_strains_state` because it points at the
# state-level file; it was previously mislabelled `load_strains_country`.
# NOTE(review): unlike the city-level load, no genbankId is set here -
# confirm whether the state CSV carries a genbank_accession column.
load_strains_state = f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/strains_state_COVID-19.csv' AS e"
query = """
MERGE (st:State {name:e.State})
MERGE (s:Strain {name:e.strain})
MERGE (p: Pathogen{name: 'Wuhan seafood market pneumonia virus'})
MERGE (o:InfectiousDiseaseOutbreak {name: 'COVID-19'})

MERGE (s)-[:FOUND_IN]->(st)
MERGE (p)-[:HAS_STRAIN]->(s)
"""
graph.run(load_strains_state + query)

<py2neo.database.Cursor at 0x11d67b460>

In [19]:
# Sanity check: list every strain together with the state it was found in.
query = """
MATCH (s: Strain)-[:FOUND_IN]->(st:State)
RETURN st.name as State, s.name as Strain
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,State,Strain
0,Illinois,USA/IL1/2020
1,Illinois,USA/IL2/2020
2,California,USA/CA3/2020
3,California,USA/CA4/2020
4,California,USA/CA5/2020
...,...,...
88,Yunnan,Yunnan/IVDC-YN-003/2020
89,Zhejiang,Zhejiang/WZ-02/2020
90,Zhejiang,Hangzhou/HZ-1/2020
91,Zhejiang,Hangzhou/HZCDC0001/2020


### Add strain data at Country level

In [20]:
# Load strains reported at country level from strains_country_COVID-19.csv.
# For every CSV row: merge the Country and Strain nodes by name, then link
# Strain -> Country (FOUND_IN) and Pathogen -> Strain (HAS_STRAIN).
load_strains_country = f"LOAD CSV WITH HEADERS FROM 'file:///{data_dir}/strains_country_COVID-19.csv' AS e"
query = """  
MERGE (k:Country {name:e.Country})
MERGE (s:Strain {name:e.strain})
MERGE (p: Pathogen{name: 'Wuhan seafood market pneumonia virus'})
MERGE (o:InfectiousDiseaseOutbreak {name: 'COVID-19'})
        
MERGE (s)-[:FOUND_IN]->(k)
MERGE (p)-[:HAS_STRAIN]->(s)
"""
# The LOAD CSV prefix and the per-row statements form one Cypher query.
graph.run(f"{load_strains_country}{query}")

<py2neo.database.Cursor at 0x11d689ee0>

In [21]:
# Sanity check: list every strain together with the country it was found in
# and its GenBank id (empty where none was loaded).
query = """
MATCH (s: Strain)-[:FOUND_IN]->(k:Country)
RETURN k.name as Country, s.name as Strain, s.genbankId as GenbankId
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,Country,Strain,GenbankId
0,USA,USA/CA2/2020,
1,USA,USA/CA6/2020,
2,USA,USA/IL2/2020,
3,USA,USA/WA1-A12/2020,
4,USA,USA/CA1/2020,MN994467
...,...,...,...
149,South Korea,Korea/KCDC03/2020,
150,Thailand,Nonthaburi/74/2020,
151,Thailand,Nonthaburi/61/2020,
152,United Kingdom,England/02/2020,


In [22]:
# Sanity check: list every strain connected to a pathogen through a
# HAS_STRAIN relationship.
query = """
MATCH (p: Pathogen)-[:HAS_STRAIN]-(s:Strain)
RETURN s.name as Strain
"""
cursor = graph.run(query)
cursor.to_data_frame()

Unnamed: 0,Strain
0,Singapore/2/2020
1,Germany/BavPat1/2020
2,Kanagawa/1/2020
3,HongKong/VM20001988/2020
4,England/02/2020
...,...
149,Foshan/20SF211/2020
150,Wuhan/IPBCAMS-WH-05/2020
151,Guangzhou/20SF206/2020
152,Chongqing/YC01/2020
