# PrepareNeo4jBulkImport
This notebook uses metadata to prepare CSV Files for Neo4j Bulk Data Import

Author: Peter W. Rose (pwrose@ucsd.edu)

In [1]:
import os
from os import walk
from os.path import join
import pandas as pd
import numpy as np
from utils import create_node_headers, create_relationship_headers, get_node_data_headers, get_relationship_data_headers, create_meta_node, create_meta_relationship

In [2]:
# Reload imported modules before executing user code, so that edits to
# imported modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [3]:
# configure pandas dataframe
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columns
pd.set_option("display.max_colwidth", None)  # do not truncate long cell values

## Specify working directories

If NEO4J_HOME directory is not set, this notebook creates the example_results/import directory as a proxy for a Neo4j import directory.

In [4]:
# Use the NEO4J_HOME environment variable when set; otherwise fall back to the example results directory
NEO4J_HOME = os.getenv("NEO4J_HOME", default="../example_results")

Location of Neo4j import directory

In [5]:
# All files produced by this notebook are written to <NEO4J_HOME>/import
NEO4J_IMPORT = os.path.join(NEO4J_HOME, "import")

Create the example_results/import directory for testing the example provided in this repo.

In [6]:
# Only the bundled example location is created automatically;
# a user-supplied NEO4J_HOME is left untouched.
if NEO4J_HOME == "../example_results":
    os.makedirs(NEO4J_IMPORT, exist_ok=True)

Locations of metadata files (Defaults are files for testing this notebook)

In [7]:
# Root metadata directory; override it with the NEO4J_METADATA environment variable
METADATA = os.getenv("NEO4J_METADATA", default="../example_metadata/")
# Node and relationship metadata are read from separate subdirectories
NODE_METADATA = os.path.join(METADATA, "nodes")
RELATIONSHIP_METADATA = os.path.join(METADATA, "relationships")

Locations of data files (Defaults are files for testing this notebook)

In [8]:
# Root data directory; override it with the NEO4J_DATA environment variable
DATA = os.getenv("NEO4J_DATA", default="../example_data/")
# Node and relationship data files are read from separate subdirectories
NODE_DATA = os.path.join(DATA, "nodes")
RELATIONSHIP_DATA = os.path.join(DATA, "relationships")

## Create headers from metadata files

Create the node name, the expected node file header, and the Neo4j header for bulk import from the metadata files.

In [9]:
# next(walk(...)) yields only the top level of the metadata directory.
dirpath, _, filenames = next(walk(NODE_METADATA))
# Restrict to CSV files, as the data-file cells do, so that stray files
# (e.g. editor backups or .DS_Store) are never parsed as metadata.
node_headers = [
    create_node_headers(dirpath, filename)
    for filename in filenames
    if filename.endswith(".csv")
]

In [10]:
# Tabulate the results of create_node_headers (one entry per metadata file)
node_metadata = pd.DataFrame(node_headers)

In [11]:
node_metadata

Unnamed: 0,node,metadataHeader,importHeader,metadataPath
0,Patient,"id,firstName,lastName,age,sex,smoker","id:ID(Patient-ID),firstName:string,lastName:string,age:int,sex:string,smoker:boolean",../example_metadata/nodes/Patient.csv
1,State,"id,name,synonyms,population,location","id:ID(State-ID),name:string,synonyms:string[],population:int,location:point{crs:WGS-84}",../example_metadata/nodes/State.csv
2,City,"id,name,synonyms,population,location","id:ID(City-ID),name:string,synonyms:string[],population:int,location:point{crs:WGS-84}",../example_metadata/nodes/City.csv
3,Symptom,"id,name","id:ID(Symptom-ID),name:string",../example_metadata/nodes/Symptom.csv
4,Disease,"id,name","id:ID(Disease-ID),name:string",../example_metadata/nodes/Disease.csv


Create the relationship name, the expected relationship file header, and the Neo4j header for bulk import from the metadata files.

In [12]:
# next(walk(...)) yields only the top level of the metadata directory.
dirpath, _, filenames = next(walk(RELATIONSHIP_METADATA))
# Restrict to CSV files, as the data-file cells do, so that stray files
# (e.g. editor backups or .DS_Store) are never parsed as metadata.
relationship_headers = [
    create_relationship_headers(dirpath, filename)
    for filename in filenames
    if filename.endswith(".csv")
]

In [13]:
# Tabulate the results of create_relationship_headers (one entry per metadata file)
relationship_metadata = pd.DataFrame(relationship_headers)

In [14]:
relationship_metadata

Unnamed: 0,relationship,metadataHeader,importHeader,source,target,metadataPath
0,LOCATED_IN,"from,to",":START_ID(City-ID),:END_ID(State-ID)",City,State,../example_metadata/relationships/City-LOCATED_IN-State.csv
1,DIAGNOSED_WITH,"from,to,diagnosisDate",":START_ID(Patient-ID),:END_ID(Disease-ID),diagnosisDate:date",Patient,Disease,../example_metadata/relationships/Patient-DIAGNOSED_WITH-Disease.csv
2,LIVES_IN,"from,to",":START_ID(Patient-ID),:END_ID(City-ID)",Patient,City,../example_metadata/relationships/Patient-LIVES_IN-City.csv
3,SHOWS,"from,to,startDate",":START_ID(Patient-ID),:END_ID(Symptom-ID),startDate:date",Patient,Symptom,../example_metadata/relationships/Patient-SHOWS-Symptom.csv
4,PRESENTS,"from,to",":START_ID(Disease-ID),:END_ID(Symptom-ID)",Disease,Symptom,../example_metadata/relationships/Disease-PRESENTS-Symptom.csv


## Add constraints and indices for Nodes

In [15]:
def get_string_properties(row):
    """Return the names of all properties declared as plain strings.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide "importHeader": a comma-separated Neo4j import header
        such as "id:ID(Patient-ID),firstName:string,age:int".

    Returns
    -------
    list of str
        Property names whose field ends with ":string", in header order.
        Array-typed fields (":string[]") and the ID field are not included.
    """
    return [
        # the property name is the text before the first colon
        field.split(":")[0]
        for field in row["importHeader"].split(",")
        if field.endswith(":string")
    ]

In [16]:
# Row-wise: collect the names of the ":string" properties of each node
node_metadata["stringProperties"] = node_metadata.apply(get_string_properties, axis=1)

In [17]:
def add_index(row):
    """Build the Cypher statements that constrain and index one node type.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide "node" (the node label) and "stringProperties"
        (an iterable of property names to index).

    Returns
    -------
    str
        A uniqueness constraint on ``n.id`` followed by one CREATE INDEX
        statement per string property, concatenated without separators
        (every statement ends with ";").
    """
    label = row["node"]
    string_props = row["stringProperties"]

    statements = [f"CREATE CONSTRAINT {label} FOR (n:{label}) REQUIRE n.id IS UNIQUE;"]
    statements.extend(
        f"CREATE INDEX {label}_{name} FOR (n:{label}) ON (n.{name});"
        for name in string_props
    )
    return "".join(statements)

In [18]:
# Row-wise: build the constraint and index statements of each node with add_index
node_metadata["index"] = node_metadata.apply(add_index, axis=1)

In [19]:
node_metadata.head()

Unnamed: 0,node,metadataHeader,importHeader,metadataPath,stringProperties,index
0,Patient,"id,firstName,lastName,age,sex,smoker","id:ID(Patient-ID),firstName:string,lastName:string,age:int,sex:string,smoker:boolean",../example_metadata/nodes/Patient.csv,"[firstName, lastName, sex]",CREATE CONSTRAINT Patient FOR (n:Patient) REQUIRE n.id IS UNIQUE;CREATE INDEX Patient_firstName FOR (n:Patient) ON (n.firstName);CREATE INDEX Patient_lastName FOR (n:Patient) ON (n.lastName);CREATE INDEX Patient_sex FOR (n:Patient) ON (n.sex);
1,State,"id,name,synonyms,population,location","id:ID(State-ID),name:string,synonyms:string[],population:int,location:point{crs:WGS-84}",../example_metadata/nodes/State.csv,[name],CREATE CONSTRAINT State FOR (n:State) REQUIRE n.id IS UNIQUE;CREATE INDEX State_name FOR (n:State) ON (n.name);
2,City,"id,name,synonyms,population,location","id:ID(City-ID),name:string,synonyms:string[],population:int,location:point{crs:WGS-84}",../example_metadata/nodes/City.csv,[name],CREATE CONSTRAINT City FOR (n:City) REQUIRE n.id IS UNIQUE;CREATE INDEX City_name FOR (n:City) ON (n.name);
3,Symptom,"id,name","id:ID(Symptom-ID),name:string",../example_metadata/nodes/Symptom.csv,[name],CREATE CONSTRAINT Symptom FOR (n:Symptom) REQUIRE n.id IS UNIQUE;CREATE INDEX Symptom_name FOR (n:Symptom) ON (n.name);
4,Disease,"id,name","id:ID(Disease-ID),name:string",../example_metadata/nodes/Disease.csv,[name],CREATE CONSTRAINT Disease FOR (n:Disease) REQUIRE n.id IS UNIQUE;CREATE INDEX Disease_name FOR (n:Disease) ON (n.name);


## Get headers from data files

Node data files

In [20]:
# Look only at the top level of the node data directory
dirpath, _, filenames = next(walk(NODE_DATA))
# Keep the CSV files and collect the header information of each one
csv_files = [name for name in filenames if name.endswith(".csv")]
data_headers = [get_node_data_headers(dirpath, csv_name) for csv_name in csv_files]

Node Data ../example_data/nodes State.csv
Node Data ../example_data/nodes City.csv
Node Data ../example_data/nodes Symptom.csv
Node Data ../example_data/nodes Patient_2021.csv
Node Data ../example_data/nodes Disease.csv
Node Data ../example_data/nodes Patient_2020.csv


In [21]:
# Tabulate the results of get_node_data_headers (one entry per node data file)
node_data = pd.DataFrame(data_headers)

In [22]:
node_data

Unnamed: 0,node,dataHeader,dataPath
0,State,"id,name,synonyms,population,location",../example_data/nodes/State.csv
1,City,"id,name,synonyms,population,location",../example_data/nodes/City.csv
2,Symptom,"id,name",../example_data/nodes/Symptom.csv
3,Patient,"id,firstName,lastName,age,sex,smoker",../example_data/nodes/Patient_2021.csv
4,Disease,"id,name",../example_data/nodes/Disease.csv
5,Patient,"id,firstName,lastName,age,sex,smoker",../example_data/nodes/Patient_2020.csv


Relationship data files

In [23]:
# Look only at the top level of the relationship data directory
dirpath, _, filenames = next(walk(RELATIONSHIP_DATA))
# Keep the CSV files and collect the header information of each one
csv_files = [name for name in filenames if name.endswith(".csv")]
data_headers = [get_relationship_data_headers(dirpath, csv_name) for csv_name in csv_files]

Relationship Data ../example_data/relationships City-LOCATED_IN-State.csv
Relationship Data ../example_data/relationships Patient-DIAGNOSED_WITH-Disease_2021.csv
Relationship Data ../example_data/relationships Patient-DIAGNOSED_WITH-Disease_2020.csv
Relationship Data ../example_data/relationships Patient-LIVES_IN-City.csv
Relationship Data ../example_data/relationships Patient-SHOWS-Symptom.csv
Relationship Data ../example_data/relationships Disease-PRESENTS-Symptom.csv


In [24]:
# Tabulate the results of get_relationship_data_headers (one entry per relationship data file)
relationship_data = pd.DataFrame(data_headers)

In [25]:
relationship_data

Unnamed: 0,relationship,dataHeader,dataPath
0,LOCATED_IN,"from,to",../example_data/relationships/City-LOCATED_IN-State.csv
1,DIAGNOSED_WITH,"from,to,diagnosisDate",../example_data/relationships/Patient-DIAGNOSED_WITH-Disease_2021.csv
2,DIAGNOSED_WITH,"from,to,diagnosisDate",../example_data/relationships/Patient-DIAGNOSED_WITH-Disease_2020.csv
3,LIVES_IN,"from,to",../example_data/relationships/Patient-LIVES_IN-City.csv
4,SHOWS,"from,to,startDate",../example_data/relationships/Patient-SHOWS-Symptom.csv
5,PRESENTS,"from,to",../example_data/relationships/Disease-PRESENTS-Symptom.csv


## Merge metadata with data

In [26]:
# Outer join keeps data files without metadata and metadata without data files.
# A row "matches" when the data file header equals the metadata header exactly;
# rows missing either side compare unequal and are therefore flagged False.
matched_nodes = (
    node_data
    .merge(node_metadata, on="node", how="outer")
    .assign(match=lambda df: df["dataHeader"] == df["metadataHeader"])
    .fillna("")
)

In [27]:
matched_nodes

Unnamed: 0,node,dataHeader,dataPath,metadataHeader,importHeader,metadataPath,stringProperties,index,match
0,State,"id,name,synonyms,population,location",../example_data/nodes/State.csv,"id,name,synonyms,population,location","id:ID(State-ID),name:string,synonyms:string[],population:int,location:point{crs:WGS-84}",../example_metadata/nodes/State.csv,[name],CREATE CONSTRAINT State FOR (n:State) REQUIRE n.id IS UNIQUE;CREATE INDEX State_name FOR (n:State) ON (n.name);,True
1,City,"id,name,synonyms,population,location",../example_data/nodes/City.csv,"id,name,synonyms,population,location","id:ID(City-ID),name:string,synonyms:string[],population:int,location:point{crs:WGS-84}",../example_metadata/nodes/City.csv,[name],CREATE CONSTRAINT City FOR (n:City) REQUIRE n.id IS UNIQUE;CREATE INDEX City_name FOR (n:City) ON (n.name);,True
2,Symptom,"id,name",../example_data/nodes/Symptom.csv,"id,name","id:ID(Symptom-ID),name:string",../example_metadata/nodes/Symptom.csv,[name],CREATE CONSTRAINT Symptom FOR (n:Symptom) REQUIRE n.id IS UNIQUE;CREATE INDEX Symptom_name FOR (n:Symptom) ON (n.name);,True
3,Patient,"id,firstName,lastName,age,sex,smoker",../example_data/nodes/Patient_2021.csv,"id,firstName,lastName,age,sex,smoker","id:ID(Patient-ID),firstName:string,lastName:string,age:int,sex:string,smoker:boolean",../example_metadata/nodes/Patient.csv,"[firstName, lastName, sex]",CREATE CONSTRAINT Patient FOR (n:Patient) REQUIRE n.id IS UNIQUE;CREATE INDEX Patient_firstName FOR (n:Patient) ON (n.firstName);CREATE INDEX Patient_lastName FOR (n:Patient) ON (n.lastName);CREATE INDEX Patient_sex FOR (n:Patient) ON (n.sex);,True
4,Patient,"id,firstName,lastName,age,sex,smoker",../example_data/nodes/Patient_2020.csv,"id,firstName,lastName,age,sex,smoker","id:ID(Patient-ID),firstName:string,lastName:string,age:int,sex:string,smoker:boolean",../example_metadata/nodes/Patient.csv,"[firstName, lastName, sex]",CREATE CONSTRAINT Patient FOR (n:Patient) REQUIRE n.id IS UNIQUE;CREATE INDEX Patient_firstName FOR (n:Patient) ON (n.firstName);CREATE INDEX Patient_lastName FOR (n:Patient) ON (n.lastName);CREATE INDEX Patient_sex FOR (n:Patient) ON (n.sex);,True
5,Disease,"id,name",../example_data/nodes/Disease.csv,"id,name","id:ID(Disease-ID),name:string",../example_metadata/nodes/Disease.csv,[name],CREATE CONSTRAINT Disease FOR (n:Disease) REQUIRE n.id IS UNIQUE;CREATE INDEX Disease_name FOR (n:Disease) ON (n.name);,True


In [28]:
# A mismatch is a row that has a data file whose header differs from the metadata
header_differs = matched_nodes["match"].eq(False)
has_data_file = matched_nodes["dataPath"].ne("")
mismatched_nodes = matched_nodes[header_differs & has_data_file]
mismatched_nodes

Unnamed: 0,node,dataHeader,dataPath,metadataHeader,importHeader,metadataPath,stringProperties,index,match


In [29]:
# Report mismatching node files and save them for inspection
if len(mismatched_nodes) > 0:
    print("The following node data files do not match the metadata specification:")
    node_mismatch_path = os.path.join(NEO4J_IMPORT, "mismatches_n.csv")
    mismatched_nodes.to_csv(node_mismatch_path, index=False)

mismatched_nodes

Unnamed: 0,node,dataHeader,dataPath,metadataHeader,importHeader,metadataPath,stringProperties,index,match


In [30]:
# Outer join keeps data files without metadata and metadata without data files.
# NOTE(review): the join key is the relationship name alone; if two metadata
# files share a name (different source/target) their rows would be paired
# across each other - confirm that relationship names are unique.
matched_relationships = (
    relationship_data
    .merge(relationship_metadata, on="relationship", how="outer")
    .assign(match=lambda df: df["dataHeader"] == df["metadataHeader"])
    .fillna("")
    # <source>-<relationship>-<target>, used later to name the header files
    .assign(fullRelationship=lambda df: df["source"] + "-" + df["relationship"] + "-" + df["target"])
)
matched_relationships

Unnamed: 0,relationship,dataHeader,dataPath,metadataHeader,importHeader,source,target,metadataPath,match,fullRelationship
0,LOCATED_IN,"from,to",../example_data/relationships/City-LOCATED_IN-State.csv,"from,to",":START_ID(City-ID),:END_ID(State-ID)",City,State,../example_metadata/relationships/City-LOCATED_IN-State.csv,True,City-LOCATED_IN-State
1,DIAGNOSED_WITH,"from,to,diagnosisDate",../example_data/relationships/Patient-DIAGNOSED_WITH-Disease_2021.csv,"from,to,diagnosisDate",":START_ID(Patient-ID),:END_ID(Disease-ID),diagnosisDate:date",Patient,Disease,../example_metadata/relationships/Patient-DIAGNOSED_WITH-Disease.csv,True,Patient-DIAGNOSED_WITH-Disease
2,DIAGNOSED_WITH,"from,to,diagnosisDate",../example_data/relationships/Patient-DIAGNOSED_WITH-Disease_2020.csv,"from,to,diagnosisDate",":START_ID(Patient-ID),:END_ID(Disease-ID),diagnosisDate:date",Patient,Disease,../example_metadata/relationships/Patient-DIAGNOSED_WITH-Disease.csv,True,Patient-DIAGNOSED_WITH-Disease
3,LIVES_IN,"from,to",../example_data/relationships/Patient-LIVES_IN-City.csv,"from,to",":START_ID(Patient-ID),:END_ID(City-ID)",Patient,City,../example_metadata/relationships/Patient-LIVES_IN-City.csv,True,Patient-LIVES_IN-City
4,SHOWS,"from,to,startDate",../example_data/relationships/Patient-SHOWS-Symptom.csv,"from,to,startDate",":START_ID(Patient-ID),:END_ID(Symptom-ID),startDate:date",Patient,Symptom,../example_metadata/relationships/Patient-SHOWS-Symptom.csv,True,Patient-SHOWS-Symptom
5,PRESENTS,"from,to",../example_data/relationships/Disease-PRESENTS-Symptom.csv,"from,to",":START_ID(Disease-ID),:END_ID(Symptom-ID)",Disease,Symptom,../example_metadata/relationships/Disease-PRESENTS-Symptom.csv,True,Disease-PRESENTS-Symptom


In [31]:
# A mismatch is a row that has a data file whose header differs from the metadata
header_differs = matched_relationships["match"].eq(False)
has_data_file = matched_relationships["dataPath"].ne("")
mismatched_relationships = matched_relationships[header_differs & has_data_file]

# Report mismatching relationship files and save them for inspection
if len(mismatched_relationships) > 0:
    print("The following relationship data files do not match the metadata specification:")
    relationship_mismatch_path = os.path.join(NEO4J_IMPORT, "mismatches_r.csv")
    mismatched_relationships.to_csv(relationship_mismatch_path, index=False)

mismatched_relationships

Unnamed: 0,relationship,dataHeader,dataPath,metadataHeader,importHeader,source,target,metadataPath,match,fullRelationship


## Write Neo4j header files for bulk import

In [32]:
def save_node_header(name, import_header, NEO4J_IMPORT):
    """Write the bulk-import header file for one node type.

    Parameters
    ----------
    name : str
        Node name; the file is called ``header_<name>_n.csv``.
    import_header : str
        Comma-separated Neo4j import header; each field becomes one column.
    NEO4J_IMPORT : str
        Directory the header file is written to.
    """
    file_name = "header_" + name + "_n.csv"
    column_names = import_header.split(",")
    # An empty frame serialises to a CSV that contains only the header row
    header_only = pd.DataFrame(columns=column_names)
    header_only.to_csv(os.path.join(NEO4J_IMPORT, file_name), index=False)

Write node header files

In [33]:
# Keep only the nodes whose data header agrees with the metadata
matched_nodes = matched_nodes.query("match == True")

In [34]:
# apply() is used only for its side effect: one header file is written per row
# (a node with several data files has its header file written again with the
# same content). save_node_header returns nothing, so `out` carries no data.
out = matched_nodes.apply(lambda row: save_node_header(row["node"], row["importHeader"], NEO4J_IMPORT), axis=1)

Write relationship header files

In [35]:
def save_relationship_header(name, import_header, NEO4J_IMPORT):
    """Write the bulk-import header file for one relationship type.

    Parameters
    ----------
    name : str
        Relationship name; the file is called ``header_<name>_r.csv``.
    import_header : str
        Comma-separated Neo4j import header; each field becomes one column.
    NEO4J_IMPORT : str
        Directory the header file is written to.
    """
    file_name = "header_" + name + "_r.csv"
    column_names = import_header.split(",")
    # An empty frame serialises to a CSV that contains only the header row
    header_only = pd.DataFrame(columns=column_names)
    header_only.to_csv(os.path.join(NEO4J_IMPORT, file_name), index=False)

In [36]:
# Keep only the relationships whose data header agrees with the metadata
matched_relationships = matched_relationships.query("match == True")

In [37]:
matched_relationships

Unnamed: 0,relationship,dataHeader,dataPath,metadataHeader,importHeader,source,target,metadataPath,match,fullRelationship
0,LOCATED_IN,"from,to",../example_data/relationships/City-LOCATED_IN-State.csv,"from,to",":START_ID(City-ID),:END_ID(State-ID)",City,State,../example_metadata/relationships/City-LOCATED_IN-State.csv,True,City-LOCATED_IN-State
1,DIAGNOSED_WITH,"from,to,diagnosisDate",../example_data/relationships/Patient-DIAGNOSED_WITH-Disease_2021.csv,"from,to,diagnosisDate",":START_ID(Patient-ID),:END_ID(Disease-ID),diagnosisDate:date",Patient,Disease,../example_metadata/relationships/Patient-DIAGNOSED_WITH-Disease.csv,True,Patient-DIAGNOSED_WITH-Disease
2,DIAGNOSED_WITH,"from,to,diagnosisDate",../example_data/relationships/Patient-DIAGNOSED_WITH-Disease_2020.csv,"from,to,diagnosisDate",":START_ID(Patient-ID),:END_ID(Disease-ID),diagnosisDate:date",Patient,Disease,../example_metadata/relationships/Patient-DIAGNOSED_WITH-Disease.csv,True,Patient-DIAGNOSED_WITH-Disease
3,LIVES_IN,"from,to",../example_data/relationships/Patient-LIVES_IN-City.csv,"from,to",":START_ID(Patient-ID),:END_ID(City-ID)",Patient,City,../example_metadata/relationships/Patient-LIVES_IN-City.csv,True,Patient-LIVES_IN-City
4,SHOWS,"from,to,startDate",../example_data/relationships/Patient-SHOWS-Symptom.csv,"from,to,startDate",":START_ID(Patient-ID),:END_ID(Symptom-ID),startDate:date",Patient,Symptom,../example_metadata/relationships/Patient-SHOWS-Symptom.csv,True,Patient-SHOWS-Symptom
5,PRESENTS,"from,to",../example_data/relationships/Disease-PRESENTS-Symptom.csv,"from,to",":START_ID(Disease-ID),:END_ID(Symptom-ID)",Disease,Symptom,../example_metadata/relationships/Disease-PRESENTS-Symptom.csv,True,Disease-PRESENTS-Symptom


In [38]:
# apply() is used only for its side effect: one header file, named after the
# full <source>-<relationship>-<target> name, is written per row.
# save_relationship_header returns nothing, so `out` carries no data.
out = matched_relationships.apply(lambda row: save_relationship_header(row["fullRelationship"], row["importHeader"], NEO4J_IMPORT), axis=1)

## Create MetaNode and MetaRelationship files

Compile a dictionary of all node properties

In [39]:
# Map every property name that occurs in any node header to an empty string,
# keeping the order in which the names are first encountered
property_dir = dict.fromkeys(
    (
        name
        for header in matched_nodes["metadataHeader"]
        for name in header.split(",")
    ),
    "",
)

Create dataframe with MetaNode data

In [40]:
# One MetaNode record per matched row
node_list = [
    create_meta_node(node_name, property_dir, metadata_path)
    for node_name, metadata_path in matched_nodes[["node", "metadataPath"]].itertuples(index=False)
]

# Nodes with several data files produce identical records; keep each one once
meta_nodes = pd.DataFrame(node_list).drop_duplicates()
meta_nodes.to_csv(os.path.join(NEO4J_IMPORT, "MetaNode_n.csv"), index=False)

In [41]:
meta_nodes

Unnamed: 0,id,name,synonyms,population,location,firstName,lastName,age,sex,smoker,nodeName:ID(MetaNode-ID)
0,Geonames.org id for location (string),Name of state (string),Alternate names of state (string[]),Population (int),Latitude and longitude in WGS-84 format (point{crs:WGS-84}),,,,,,State
1,Geonames.org id for location (string),Name of city (string),Alternate names of city (string[]),Population (int),Latitude and longitude in WGS-84 format (point{crs:WGS-84}),,,,,,City
2,Symptom id from Symptom Ontology (string),Name of symptom (string),,,,,,,,,Symptom
3,Unique patient id (string),,,,,First name (string),Last name (string),Age (int),Biological sex (string),Patient is a smoker (boolean),Patient
5,Disease id from Human Disease Ontology (string),Name of disease from Human Disease Ontology (string),,,,,,,,,Disease


Compile a dictionary of all relationship properties

In [42]:
# Map every property name that occurs in any relationship header to an empty string.
# The loop variable is called `prop` (not `property`) so that the Python builtin
# `property` is not shadowed for the rest of the kernel session.
property_dir = {}
for header in matched_relationships["metadataHeader"]:
    for prop in header.split(","):
        property_dir[prop] = ""

Create dataframe with MetaRelationship data

In [43]:
# One MetaRelationship record per matched row
relationship_list = [
    create_meta_relationship(rel_name, source_node, target_node, property_dir, metadata_path)
    for rel_name, source_node, target_node, metadata_path
    in matched_relationships[["relationship", "source", "target", "metadataPath"]].itertuples(index=False)
]

# Relationships with several data files produce identical records; keep each one once
meta_relationships = pd.DataFrame(relationship_list).drop_duplicates()
meta_relationships.to_csv(os.path.join(NEO4J_IMPORT, "MetaRelationship_r.csv"), index=False)

In [44]:
meta_relationships

Unnamed: 0,from,to,diagnosisDate,startDate,relationshipName,source:START_ID(MetaNode-ID),target:END_ID(MetaNode-ID)
0,Id of source node (string),Id of target node (string),,,LOCATED_IN,City,State
1,Id of source node (string),Id of target node (string),Date of diagnosis (date),,DIAGNOSED_WITH,Patient,Disease
3,Id of source node (string),Id of target node (string),,,LIVES_IN,Patient,City
4,Id of source node (string),Id of target node (string),,Date when symptom started (date),SHOWS,Patient,Symptom
5,Id of source node (string),Id of target node (string),,,PRESENTS,Disease,Symptom


## Create Neo4j bulk upload command line arguments
See: https://neo4j.com/docs/operations-manual/current/tools/neo4j-admin/neo4j-admin-import/

In [45]:
# One --nodes argument per node type: the header file followed by a
# pattern for the data files of that node
args = "".join(
    f" --nodes={node}=header_{node}_n.csv,{node}.*_n.csv"
    for node in matched_nodes["node"].unique()
)

In [46]:
# MetaNode_n.csv is passed on its own, without a separate header file
args += " --nodes=MetaNode=MetaNode_n.csv"

In [47]:
# Each (relationship, full name) pair contributes exactly one argument,
# even when several data files exist for it
rel_data = matched_relationships[["relationship", "fullRelationship"]].drop_duplicates()
args += "".join(
    f" --relationships={relationship}=header_{fullRelationship}_r.csv,{fullRelationship}.*_r.csv"
    for relationship, fullRelationship in rel_data.itertuples(index=False)
)

In [48]:
# MetaRelationship_r.csv is passed on its own, without a separate header file
args += " --relationships=MetaRelationship=MetaRelationship_r.csv"

In [49]:
# Write the command line arguments to args.txt in the import directory.
# The context manager closes the file even if the write fails.
with open(os.path.join(NEO4J_IMPORT, "args.txt"), "w") as f:
    f.write(args)

In [50]:
print(args)

 --nodes=State=header_State_n.csv,State.*_n.csv --nodes=City=header_City_n.csv,City.*_n.csv --nodes=Symptom=header_Symptom_n.csv,Symptom.*_n.csv --nodes=Patient=header_Patient_n.csv,Patient.*_n.csv --nodes=Disease=header_Disease_n.csv,Disease.*_n.csv --nodes=MetaNode=MetaNode_n.csv --relationships=LOCATED_IN=header_City-LOCATED_IN-State_r.csv,City-LOCATED_IN-State.*_r.csv --relationships=DIAGNOSED_WITH=header_Patient-DIAGNOSED_WITH-Disease_r.csv,Patient-DIAGNOSED_WITH-Disease.*_r.csv --relationships=LIVES_IN=header_Patient-LIVES_IN-City_r.csv,Patient-LIVES_IN-City.*_r.csv --relationships=SHOWS=header_Patient-SHOWS-Symptom_r.csv,Patient-SHOWS-Symptom.*_r.csv --relationships=PRESENTS=header_Disease-PRESENTS-Symptom_r.csv,Disease-PRESENTS-Symptom.*_r.csv --relationships=MetaRelationship=MetaRelationship_r.csv


## Create a Cypher script with default Constraints, indices, and fulltext indices

Create constraints for all node ids and indices and fulltext indices for all string properties

In [51]:
# Independent copy holding only the columns needed to build the Cypher script
indexed_nodes = matched_nodes[["node", "index", "stringProperties"]].copy()

In [52]:
# A node with several data files occurs in several rows; unique() keeps its statements only once
indices = "".join(indexed_nodes["index"].unique())

Create fulltext index

In [53]:
# Join the distinct node labels with "|" for the fulltext index.
# unique() keeps the order of first appearance, so the generated script is
# identical from run to run (the iteration order of a set of strings is not).
node_names = "|".join(str(name) for name in indexed_nodes["node"].unique())

In [54]:
# Sorted, de-duplicated names of all string properties across all nodes.
# A set comprehension (instead of np.concatenate) also copes with an empty
# table and with nodes that have no string properties at all.
property_names = sorted(
    {
        prop
        for props in indexed_nodes["stringProperties"]
        for prop in props
    }
)
# Joined so that the result can be spliced in after a leading "n."
property_names = ",n.".join(property_names)

In [55]:
# One fulltext index over all node labels and all string properties.
# Without labels or string properties the statement would be invalid
# Cypher (e.g. "ON EACH [n.]"), so it is left out in that case.
if node_names and property_names:
    fulltext_index = f"CREATE FULLTEXT INDEX fulltext FOR (n:{node_names}) ON EACH [n.{property_names}];"
else:
    fulltext_index = ""

In [56]:
# Append the fulltext index, then put every statement on its own line
indices = (indices + fulltext_index).replace(";", ";\n")

In [57]:
print(indices)

CREATE CONSTRAINT State FOR (n:State) REQUIRE n.id IS UNIQUE;
CREATE INDEX State_name FOR (n:State) ON (n.name);
CREATE CONSTRAINT City FOR (n:City) REQUIRE n.id IS UNIQUE;
CREATE INDEX City_name FOR (n:City) ON (n.name);
CREATE CONSTRAINT Symptom FOR (n:Symptom) REQUIRE n.id IS UNIQUE;
CREATE INDEX Symptom_name FOR (n:Symptom) ON (n.name);
CREATE CONSTRAINT Patient FOR (n:Patient) REQUIRE n.id IS UNIQUE;
CREATE INDEX Patient_firstName FOR (n:Patient) ON (n.firstName);
CREATE INDEX Patient_lastName FOR (n:Patient) ON (n.lastName);
CREATE INDEX Patient_sex FOR (n:Patient) ON (n.sex);
CREATE CONSTRAINT Disease FOR (n:Disease) REQUIRE n.id IS UNIQUE;
CREATE INDEX Disease_name FOR (n:Disease) ON (n.name);
CREATE FULLTEXT INDEX fulltext FOR (n:City|Symptom|Disease|Patient|State) ON EACH [n.firstName,n.lastName,n.name,n.sex];



In [58]:
# Save the Cypher script to the import directory
indices_path = os.path.join(NEO4J_IMPORT, "indices.cypher")
with open(indices_path, "w") as cypher_file:
    cypher_file.write(indices)