# PrepareNeo4jBulkImport
This notebook uses metadata to prepare CSV Files for Neo4j Bulk Data Import

Author: Peter W. Rose (pwrose@ucsd.edu)

In [1]:
import os
from os import walk
from os.path import join
import pandas as pd
import numpy as np
from utils import create_node_headers, create_relationship_headers, get_node_data_headers, get_relationship_data_headers, create_meta_node, create_meta_relationship

In [2]:
# reload modules before executing user code
%load_ext autoreload
%autoreload 2

In [3]:
# configure pandas dataframe
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns
pd.set_option("display.max_colwidth", None)

## Specify working directories

If NEO4J_HOME directory is not set, this notebook creates the example_results/import directory as a proxy for a Neo4j import directory.

In [4]:
NEO4J_HOME = os.getenv("NEO4J_HOME", default="../example_results")

Location of Neo4j import directory

In [5]:
NEO4J_IMPORT = os.path.join(NEO4J_HOME, "import")

Create the example_results/import directory for testing the example provided in this repo.

In [6]:
if NEO4J_HOME == "../example_results":
    os.makedirs(os.path.join(NEO4J_IMPORT), exist_ok=True)

Locations of metadata files (Defaults are files for testing this notebook)

In [7]:
METADATA = os.getenv("NEO4J_METADATA", default="../example_metadata/")
NODE_METADATA = os.path.join(METADATA, "nodes")
RELATIONSHIP_METADATA = os.path.join(METADATA, "relationships")

Locations of data files (Defaults are files for testing this notebook)

In [8]:
DATA = os.getenv("NEO4J_DATA", default="../example_data/")
NODE_DATA = os.path.join(DATA, "nodes")
RELATIONSHIP_DATA = os.path.join(DATA, "relationships")

## Create headers from metadata files

Create the node name, the expected node file header, and the Neo4j header for bulk import from the metadata files.

In [9]:
dirpath, _, filenames = next(walk(NODE_METADATA))
node_headers = [create_node_headers(dirpath, filename) for filename in filenames]

In [10]:
node_metadata = pd.DataFrame(node_headers)

In [11]:
node_metadata

Unnamed: 0,node,metadataHeader,importHeader,metadataPath
0,Software,"id,name,description,url","id:ID(Software-ID),name:string,description:string,url:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Software.csv
1,Organization,"id,name,city,state,country,ror,uei,duns,geonames,url","id:ID(Organization-ID),name:string,city:string,state:string,country:string,ror:string,uei:string,duns:string,geonames:string,url:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Organization.csv
2,Patent,"id,name,inventors,filingDate,grantedDate,status,applicants,url","id:ID(Patent-ID),name:string,inventors:string[],filingDate:date,grantedDate:date,status:string,applicants:string[],url:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Patent.csv
3,ResearchInitiative,"id,name,longName,description,researchInitiativeUrl,url,websites","id:ID(ResearchInitiative-ID),name:string,longName:string,description:string,researchInitiativeUrl:string,url:string,websites:string[]",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/ResearchInitiative.csv
4,Presentation,"id,name,presenters,presentationUrl,videoUrl","id:ID(Presentation-ID),name:string,presenters:string,presentationUrl:string,videoUrl:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Presentation.csv
5,Publication,"id,name,abstract,journal,year,type,doi,pmId,pmcId,url","id:ID(Publication-ID),name:string,abstract:string,journal:string,year:int,type:string,doi:string,pmId:string,pmcId:string,url:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Publication.csv
6,Researcher,"id,name,fullName,firstName,middleName,lastName,orcid,profileId","id:ID(Researcher-ID),name:string,fullName:string,firstName:string,middleName:string,lastName:string,orcid:string,profileId:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Researcher.csv
7,Event,"id,name,eventType,eventUrl,startDate,endDate,city,state,country","id:ID(Event-ID),name:string,eventType:string,eventUrl:string,startDate:date,endDate:date,city:string,state:string,country:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Event.csv
8,Dataset,"id,name,url","id:ID(Dataset-ID),name:string,url:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Dataset.csv
9,FundingOpportunity,"id,name,url","id:ID(FundingOpportunity-ID),name:string,url:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/FundingOpportunity.csv


Create the relationship name, the expected relatinonship file header, and the Neo4j header for bulk import from the metadata files.

In [12]:
dirpath, _, filenames = next(walk(RELATIONSHIP_METADATA))
relationship_headers = [create_relationship_headers(dirpath, filename) for filename in filenames]

In [13]:
relationship_metadata = pd.DataFrame(relationship_headers)

In [14]:
relationship_metadata

Unnamed: 0,relationship,metadataHeader,importHeader,source,target,metadataPath
0,SUPPORTED,"from,to",":START_ID(ResearchInitiative-ID),:END_ID(Grant-ID)",ResearchInitiative,Grant,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/ResearchInitiative-SUPPORTED-Grant.csv
1,DEVELOPED,"from,to",":START_ID(Researcher-ID),:END_ID(Software-ID)",Researcher,Software,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Researcher-DEVELOPED-Software.csv
2,CITES,"from,to",":START_ID(Publication-ID),:END_ID(Publication-ID)",Publication,Publication,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Publication-CITES-Publication.csv
3,FUNDED,"from,to",":START_ID(Grant-ID),:END_ID(ResearchInitiative-ID)",Grant,ResearchInitiative,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Grant-FUNDED-ResearchInitiative.csv
4,PRESENTED_AT,"from,to",":START_ID(Presentation-ID),:END_ID(Event-ID)",Presentation,Event,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Presentation-PRESENTED_AT-Event.csv
5,CREATED,"from,to",":START_ID(Researcher-ID),:END_ID(Dataset-ID)",Researcher,Dataset,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Researcher-CREATED-Dataset.csv
6,PROVIDES,"from,to",":START_ID(FundingOpportunity-ID),:END_ID(Grant-ID)",FundingOpportunity,Grant,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/FundingOpportunity-PROVIDES-Grant.csv
7,EMPLOYED_AT,"from,to,lastUpdated",":START_ID(Researcher-ID),:END_ID(Organization-ID),lastUpdated:int",Researcher,Organization,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Researcher-EMPLOYED_AT-Organization.csv
8,USED,"from,to",":START_ID(Researcher-ID),:END_ID(Dataset-ID)",Researcher,Dataset,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Researcher-USED-Dataset.csv
9,IS_INVENTOR,"from,to",":START_ID(Researcher-ID),:END_ID(Patent-ID)",Researcher,Patent,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Researcher-IS_INVENTOR-Patent.csv


## Add constraints and indices for Nodes

In [15]:
def get_string_properties(row):
    node = row["node"]
    header = row["importHeader"]
    fields = header.split(",")
    fields = filter(lambda field: field.endswith(":string"), fields)
    string_properties = []
    
    for field in fields:
        field_name = field.split(":")[0]
        string_properties.append(field_name)

    return string_properties

In [16]:
node_metadata["stringProperties"] = node_metadata.apply(get_string_properties, axis=1)

In [17]:
def add_index(row):
    node = row["node"]
    properties = row["stringProperties"]
    indices = f"CREATE CONSTRAINT {node} FOR (n:{node}) REQUIRE n.id IS UNIQUE;"
    
    for prop in properties:
        indices += f"CREATE INDEX {node}_{prop} FOR (n:{node}) ON (n.{prop});"
        #indices += f"CREATE FULLTEXT INDEX FOR (n:{node}) ON EACH [n.{prop}];"

    return indices

In [18]:
node_metadata["index"] = node_metadata.apply(add_index, axis=1)

In [19]:
node_metadata.head()

Unnamed: 0,node,metadataHeader,importHeader,metadataPath,stringProperties,index
0,Software,"id,name,description,url","id:ID(Software-ID),name:string,description:string,url:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Software.csv,"[name, description, url]",CREATE CONSTRAINT Software FOR (n:Software) REQUIRE n.id IS UNIQUE;CREATE INDEX Software_name FOR (n:Software) ON (n.name);CREATE INDEX Software_description FOR (n:Software) ON (n.description);CREATE INDEX Software_url FOR (n:Software) ON (n.url);
1,Organization,"id,name,city,state,country,ror,uei,duns,geonames,url","id:ID(Organization-ID),name:string,city:string,state:string,country:string,ror:string,uei:string,duns:string,geonames:string,url:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Organization.csv,"[name, city, state, country, ror, uei, duns, geonames, url]",CREATE CONSTRAINT Organization FOR (n:Organization) REQUIRE n.id IS UNIQUE;CREATE INDEX Organization_name FOR (n:Organization) ON (n.name);CREATE INDEX Organization_city FOR (n:Organization) ON (n.city);CREATE INDEX Organization_state FOR (n:Organization) ON (n.state);CREATE INDEX Organization_country FOR (n:Organization) ON (n.country);CREATE INDEX Organization_ror FOR (n:Organization) ON (n.ror);CREATE INDEX Organization_uei FOR (n:Organization) ON (n.uei);CREATE INDEX Organization_duns FOR (n:Organization) ON (n.duns);CREATE INDEX Organization_geonames FOR (n:Organization) ON (n.geonames);CREATE INDEX Organization_url FOR (n:Organization) ON (n.url);
2,Patent,"id,name,inventors,filingDate,grantedDate,status,applicants,url","id:ID(Patent-ID),name:string,inventors:string[],filingDate:date,grantedDate:date,status:string,applicants:string[],url:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Patent.csv,"[name, status, url]",CREATE CONSTRAINT Patent FOR (n:Patent) REQUIRE n.id IS UNIQUE;CREATE INDEX Patent_name FOR (n:Patent) ON (n.name);CREATE INDEX Patent_status FOR (n:Patent) ON (n.status);CREATE INDEX Patent_url FOR (n:Patent) ON (n.url);
3,ResearchInitiative,"id,name,longName,description,researchInitiativeUrl,url,websites","id:ID(ResearchInitiative-ID),name:string,longName:string,description:string,researchInitiativeUrl:string,url:string,websites:string[]",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/ResearchInitiative.csv,"[name, longName, description, researchInitiativeUrl, url]",CREATE CONSTRAINT ResearchInitiative FOR (n:ResearchInitiative) REQUIRE n.id IS UNIQUE;CREATE INDEX ResearchInitiative_name FOR (n:ResearchInitiative) ON (n.name);CREATE INDEX ResearchInitiative_longName FOR (n:ResearchInitiative) ON (n.longName);CREATE INDEX ResearchInitiative_description FOR (n:ResearchInitiative) ON (n.description);CREATE INDEX ResearchInitiative_researchInitiativeUrl FOR (n:ResearchInitiative) ON (n.researchInitiativeUrl);CREATE INDEX ResearchInitiative_url FOR (n:ResearchInitiative) ON (n.url);
4,Presentation,"id,name,presenters,presentationUrl,videoUrl","id:ID(Presentation-ID),name:string,presenters:string,presentationUrl:string,videoUrl:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Presentation.csv,"[name, presenters, presentationUrl, videoUrl]",CREATE CONSTRAINT Presentation FOR (n:Presentation) REQUIRE n.id IS UNIQUE;CREATE INDEX Presentation_name FOR (n:Presentation) ON (n.name);CREATE INDEX Presentation_presenters FOR (n:Presentation) ON (n.presenters);CREATE INDEX Presentation_presentationUrl FOR (n:Presentation) ON (n.presentationUrl);CREATE INDEX Presentation_videoUrl FOR (n:Presentation) ON (n.videoUrl);


## Get headers from data files

Node data files

In [20]:
dirpath, _, filenames = next(walk(NODE_DATA))
csv_files = filter(lambda name: name.endswith(".csv"), filenames)
data_headers = [get_node_data_headers(dirpath, filename) for filename in csv_files]

Node Data /Users/Peter/GitRepositories/radx-kg/kg/data/nodes Organization.csv
Node Data /Users/Peter/GitRepositories/radx-kg/kg/data/nodes Publication_primary.csv
Node Data /Users/Peter/GitRepositories/radx-kg/kg/data/nodes Researcher_investigators.csv
Node Data /Users/Peter/GitRepositories/radx-kg/kg/data/nodes Patent.csv
Node Data /Users/Peter/GitRepositories/radx-kg/kg/data/nodes Researcher_manually.csv
Node Data /Users/Peter/GitRepositories/radx-kg/kg/data/nodes Dataset_manually.csv
Node Data /Users/Peter/GitRepositories/radx-kg/kg/data/nodes ResearchInitiative.csv
Node Data /Users/Peter/GitRepositories/radx-kg/kg/data/nodes Presentation.csv
Node Data /Users/Peter/GitRepositories/radx-kg/kg/data/nodes Software_manually.csv
Node Data /Users/Peter/GitRepositories/radx-kg/kg/data/nodes Publication_secondary.csv
Node Data /Users/Peter/GitRepositories/radx-kg/kg/data/nodes Event.csv
Node Data /Users/Peter/GitRepositories/radx-kg/kg/data/nodes Dataset.csv
Node Data /Users/Peter/GitReposi

In [21]:
node_data = pd.DataFrame(data_headers)

In [22]:
node_data

Unnamed: 0,node,dataHeader,dataPath
0,Organization,"id,name,city,state,country,ror,uei,duns,geonames,url",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Organization.csv
1,Publication,"id,name,abstract,journal,year,type,doi,pmId,pmcId,url",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Publication_primary.csv
2,Researcher,"id,name,fullName,firstName,middleName,lastName,orcid,profileId",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Researcher_investigators.csv
3,Patent,"id,name,inventors,filingDate,grantedDate,status,applicants,url",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Patent.csv
4,Researcher,"id,name,fullName,firstName,middleName,lastName,orcid,profileId",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Researcher_manually.csv
5,Dataset,"id,name,url",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Dataset_manually.csv
6,ResearchInitiative,"id,name,longName,description,researchInitiativeUrl,url,websites",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/ResearchInitiative.csv
7,Presentation,"id,name,presenters,presentationUrl,videoUrl",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Presentation.csv
8,Software,"id,name,description,url",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Software_manually.csv
9,Publication,"id,name,abstract,journal,year,type,doi,pmId,pmcId,url",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Publication_secondary.csv


Relationship data files

In [23]:
dirpath, _, filenames = next(walk(RELATIONSHIP_DATA))
csv_files = filter(lambda name: name.endswith(".csv"), filenames)
data_headers = [get_relationship_data_headers(dirpath, filename) for filename in csv_files]

Relationship Data /Users/Peter/GitRepositories/radx-kg/kg/data/relationships ResearchInitiative-SUPPORTED-Grant.csv
Relationship Data /Users/Peter/GitRepositories/radx-kg/kg/data/relationships Researcher-DEVELOPED-Software.csv
Relationship Data /Users/Peter/GitRepositories/radx-kg/kg/data/relationships Publication-CITES-Publication.csv
Relationship Data /Users/Peter/GitRepositories/radx-kg/kg/data/relationships Grant-FUNDED-ResearchInitiative.csv
Relationship Data /Users/Peter/GitRepositories/radx-kg/kg/data/relationships Researcher-CREATED-Dataset_manually.csv
Relationship Data /Users/Peter/GitRepositories/radx-kg/kg/data/relationships Researcher-AUTHORED-Publication_investigators.csv
Relationship Data /Users/Peter/GitRepositories/radx-kg/kg/data/relationships Presentation-PRESENTED_AT-Event.csv
Relationship Data /Users/Peter/GitRepositories/radx-kg/kg/data/relationships Researcher-CREATED-Dataset.csv
Relationship Data /Users/Peter/GitRepositories/radx-kg/kg/data/relationships Funding

In [24]:
relationship_data = pd.DataFrame(data_headers)

In [25]:
relationship_data

Unnamed: 0,relationship,dataHeader,dataPath
0,SUPPORTED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/ResearchInitiative-SUPPORTED-Grant.csv
1,DEVELOPED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Researcher-DEVELOPED-Software.csv
2,CITES,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Publication-CITES-Publication.csv
3,FUNDED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Grant-FUNDED-ResearchInitiative.csv
4,CREATED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Researcher-CREATED-Dataset_manually.csv
5,AUTHORED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Researcher-AUTHORED-Publication_investigators.csv
6,PRESENTED_AT,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Presentation-PRESENTED_AT-Event.csv
7,CREATED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Researcher-CREATED-Dataset.csv
8,PROVIDES,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/FundingOpportunity-PROVIDES-Grant.csv
9,EMPLOYED_AT,"from,to,lastUpdated",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Researcher-EMPLOYED_AT-Organization.csv


## Merge metadata with data

In [26]:
matched_nodes = node_data.merge(node_metadata, on="node", how="outer")
matched_nodes["match"] = matched_nodes["dataHeader"] == matched_nodes["metadataHeader"]
matched_nodes.fillna("", inplace=True)

In [27]:
matched_nodes

Unnamed: 0,node,dataHeader,dataPath,metadataHeader,importHeader,metadataPath,stringProperties,index,match
0,Organization,"id,name,city,state,country,ror,uei,duns,geonames,url",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Organization.csv,"id,name,city,state,country,ror,uei,duns,geonames,url","id:ID(Organization-ID),name:string,city:string,state:string,country:string,ror:string,uei:string,duns:string,geonames:string,url:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Organization.csv,"[name, city, state, country, ror, uei, duns, geonames, url]",CREATE CONSTRAINT Organization FOR (n:Organization) REQUIRE n.id IS UNIQUE;CREATE INDEX Organization_name FOR (n:Organization) ON (n.name);CREATE INDEX Organization_city FOR (n:Organization) ON (n.city);CREATE INDEX Organization_state FOR (n:Organization) ON (n.state);CREATE INDEX Organization_country FOR (n:Organization) ON (n.country);CREATE INDEX Organization_ror FOR (n:Organization) ON (n.ror);CREATE INDEX Organization_uei FOR (n:Organization) ON (n.uei);CREATE INDEX Organization_duns FOR (n:Organization) ON (n.duns);CREATE INDEX Organization_geonames FOR (n:Organization) ON (n.geonames);CREATE INDEX Organization_url FOR (n:Organization) ON (n.url);,True
1,Publication,"id,name,abstract,journal,year,type,doi,pmId,pmcId,url",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Publication_primary.csv,"id,name,abstract,journal,year,type,doi,pmId,pmcId,url","id:ID(Publication-ID),name:string,abstract:string,journal:string,year:int,type:string,doi:string,pmId:string,pmcId:string,url:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Publication.csv,"[name, abstract, journal, type, doi, pmId, pmcId, url]",CREATE CONSTRAINT Publication FOR (n:Publication) REQUIRE n.id IS UNIQUE;CREATE INDEX Publication_name FOR (n:Publication) ON (n.name);CREATE INDEX Publication_abstract FOR (n:Publication) ON (n.abstract);CREATE INDEX Publication_journal FOR (n:Publication) ON (n.journal);CREATE INDEX Publication_type FOR (n:Publication) ON (n.type);CREATE INDEX Publication_doi FOR (n:Publication) ON (n.doi);CREATE INDEX Publication_pmId FOR (n:Publication) ON (n.pmId);CREATE INDEX Publication_pmcId FOR (n:Publication) ON (n.pmcId);CREATE INDEX Publication_url FOR (n:Publication) ON (n.url);,True
2,Publication,"id,name,abstract,journal,year,type,doi,pmId,pmcId,url",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Publication_secondary.csv,"id,name,abstract,journal,year,type,doi,pmId,pmcId,url","id:ID(Publication-ID),name:string,abstract:string,journal:string,year:int,type:string,doi:string,pmId:string,pmcId:string,url:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Publication.csv,"[name, abstract, journal, type, doi, pmId, pmcId, url]",CREATE CONSTRAINT Publication FOR (n:Publication) REQUIRE n.id IS UNIQUE;CREATE INDEX Publication_name FOR (n:Publication) ON (n.name);CREATE INDEX Publication_abstract FOR (n:Publication) ON (n.abstract);CREATE INDEX Publication_journal FOR (n:Publication) ON (n.journal);CREATE INDEX Publication_type FOR (n:Publication) ON (n.type);CREATE INDEX Publication_doi FOR (n:Publication) ON (n.doi);CREATE INDEX Publication_pmId FOR (n:Publication) ON (n.pmId);CREATE INDEX Publication_pmcId FOR (n:Publication) ON (n.pmcId);CREATE INDEX Publication_url FOR (n:Publication) ON (n.url);,True
3,Researcher,"id,name,fullName,firstName,middleName,lastName,orcid,profileId",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Researcher_investigators.csv,"id,name,fullName,firstName,middleName,lastName,orcid,profileId","id:ID(Researcher-ID),name:string,fullName:string,firstName:string,middleName:string,lastName:string,orcid:string,profileId:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Researcher.csv,"[name, fullName, firstName, middleName, lastName, orcid, profileId]",CREATE CONSTRAINT Researcher FOR (n:Researcher) REQUIRE n.id IS UNIQUE;CREATE INDEX Researcher_name FOR (n:Researcher) ON (n.name);CREATE INDEX Researcher_fullName FOR (n:Researcher) ON (n.fullName);CREATE INDEX Researcher_firstName FOR (n:Researcher) ON (n.firstName);CREATE INDEX Researcher_middleName FOR (n:Researcher) ON (n.middleName);CREATE INDEX Researcher_lastName FOR (n:Researcher) ON (n.lastName);CREATE INDEX Researcher_orcid FOR (n:Researcher) ON (n.orcid);CREATE INDEX Researcher_profileId FOR (n:Researcher) ON (n.profileId);,True
4,Researcher,"id,name,fullName,firstName,middleName,lastName,orcid,profileId",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Researcher_manually.csv,"id,name,fullName,firstName,middleName,lastName,orcid,profileId","id:ID(Researcher-ID),name:string,fullName:string,firstName:string,middleName:string,lastName:string,orcid:string,profileId:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Researcher.csv,"[name, fullName, firstName, middleName, lastName, orcid, profileId]",CREATE CONSTRAINT Researcher FOR (n:Researcher) REQUIRE n.id IS UNIQUE;CREATE INDEX Researcher_name FOR (n:Researcher) ON (n.name);CREATE INDEX Researcher_fullName FOR (n:Researcher) ON (n.fullName);CREATE INDEX Researcher_firstName FOR (n:Researcher) ON (n.firstName);CREATE INDEX Researcher_middleName FOR (n:Researcher) ON (n.middleName);CREATE INDEX Researcher_lastName FOR (n:Researcher) ON (n.lastName);CREATE INDEX Researcher_orcid FOR (n:Researcher) ON (n.orcid);CREATE INDEX Researcher_profileId FOR (n:Researcher) ON (n.profileId);,True
5,Researcher,"id,name,fullName,firstName,middleName,lastName,orcid,profileId",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Researcher_primary_coauthors.csv,"id,name,fullName,firstName,middleName,lastName,orcid,profileId","id:ID(Researcher-ID),name:string,fullName:string,firstName:string,middleName:string,lastName:string,orcid:string,profileId:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Researcher.csv,"[name, fullName, firstName, middleName, lastName, orcid, profileId]",CREATE CONSTRAINT Researcher FOR (n:Researcher) REQUIRE n.id IS UNIQUE;CREATE INDEX Researcher_name FOR (n:Researcher) ON (n.name);CREATE INDEX Researcher_fullName FOR (n:Researcher) ON (n.fullName);CREATE INDEX Researcher_firstName FOR (n:Researcher) ON (n.firstName);CREATE INDEX Researcher_middleName FOR (n:Researcher) ON (n.middleName);CREATE INDEX Researcher_lastName FOR (n:Researcher) ON (n.lastName);CREATE INDEX Researcher_orcid FOR (n:Researcher) ON (n.orcid);CREATE INDEX Researcher_profileId FOR (n:Researcher) ON (n.profileId);,True
6,Patent,"id,name,inventors,filingDate,grantedDate,status,applicants,url",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Patent.csv,"id,name,inventors,filingDate,grantedDate,status,applicants,url","id:ID(Patent-ID),name:string,inventors:string[],filingDate:date,grantedDate:date,status:string,applicants:string[],url:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Patent.csv,"[name, status, url]",CREATE CONSTRAINT Patent FOR (n:Patent) REQUIRE n.id IS UNIQUE;CREATE INDEX Patent_name FOR (n:Patent) ON (n.name);CREATE INDEX Patent_status FOR (n:Patent) ON (n.status);CREATE INDEX Patent_url FOR (n:Patent) ON (n.url);,True
7,Dataset,"id,name,url",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Dataset_manually.csv,"id,name,url","id:ID(Dataset-ID),name:string,url:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Dataset.csv,"[name, url]",CREATE CONSTRAINT Dataset FOR (n:Dataset) REQUIRE n.id IS UNIQUE;CREATE INDEX Dataset_name FOR (n:Dataset) ON (n.name);CREATE INDEX Dataset_url FOR (n:Dataset) ON (n.url);,True
8,Dataset,"id,name,url",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/Dataset.csv,"id,name,url","id:ID(Dataset-ID),name:string,url:string",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/Dataset.csv,"[name, url]",CREATE CONSTRAINT Dataset FOR (n:Dataset) REQUIRE n.id IS UNIQUE;CREATE INDEX Dataset_name FOR (n:Dataset) ON (n.name);CREATE INDEX Dataset_url FOR (n:Dataset) ON (n.url);,True
9,ResearchInitiative,"id,name,longName,description,researchInitiativeUrl,url,websites",/Users/Peter/GitRepositories/radx-kg/kg/data/nodes/ResearchInitiative.csv,"id,name,longName,description,researchInitiativeUrl,url,websites","id:ID(ResearchInitiative-ID),name:string,longName:string,description:string,researchInitiativeUrl:string,url:string,websites:string[]",/Users/Peter/GitRepositories/radx-kg/kg/metadata/nodes/ResearchInitiative.csv,"[name, longName, description, researchInitiativeUrl, url]",CREATE CONSTRAINT ResearchInitiative FOR (n:ResearchInitiative) REQUIRE n.id IS UNIQUE;CREATE INDEX ResearchInitiative_name FOR (n:ResearchInitiative) ON (n.name);CREATE INDEX ResearchInitiative_longName FOR (n:ResearchInitiative) ON (n.longName);CREATE INDEX ResearchInitiative_description FOR (n:ResearchInitiative) ON (n.description);CREATE INDEX ResearchInitiative_researchInitiativeUrl FOR (n:ResearchInitiative) ON (n.researchInitiativeUrl);CREATE INDEX ResearchInitiative_url FOR (n:ResearchInitiative) ON (n.url);,True


In [28]:
mismatched_nodes = matched_nodes[(matched_nodes["match"] == False) & (matched_nodes["dataPath"] != "")]
mismatched_nodes

Unnamed: 0,node,dataHeader,dataPath,metadataHeader,importHeader,metadataPath,stringProperties,index,match


In [29]:
if mismatched_nodes.shape[0] > 0:
    print("The following node data files do not match the metadata specification:")
    mismatched_nodes.to_csv(os.path.join(NEO4J_IMPORT, "mismatches_n.csv"), index=False)
    
mismatched_nodes

Unnamed: 0,node,dataHeader,dataPath,metadataHeader,importHeader,metadataPath,stringProperties,index,match


In [30]:
matched_relationships = relationship_data.merge(relationship_metadata, on="relationship", how="outer")
matched_relationships["match"] = matched_relationships["dataHeader"] == matched_relationships["metadataHeader"]
matched_relationships.fillna("", inplace=True)
matched_relationships["fullRelationship"] = matched_relationships["source"] + "-" + matched_relationships["relationship"] + "-" + matched_relationships["target"]
matched_relationships

Unnamed: 0,relationship,dataHeader,dataPath,metadataHeader,importHeader,source,target,metadataPath,match,fullRelationship
0,SUPPORTED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/ResearchInitiative-SUPPORTED-Grant.csv,"from,to",":START_ID(ResearchInitiative-ID),:END_ID(Grant-ID)",ResearchInitiative,Grant,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/ResearchInitiative-SUPPORTED-Grant.csv,True,ResearchInitiative-SUPPORTED-Grant
1,DEVELOPED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Researcher-DEVELOPED-Software.csv,"from,to",":START_ID(Researcher-ID),:END_ID(Software-ID)",Researcher,Software,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Researcher-DEVELOPED-Software.csv,True,Researcher-DEVELOPED-Software
2,CITES,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Publication-CITES-Publication.csv,"from,to",":START_ID(Publication-ID),:END_ID(Publication-ID)",Publication,Publication,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Publication-CITES-Publication.csv,True,Publication-CITES-Publication
3,FUNDED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Grant-FUNDED-ResearchInitiative.csv,"from,to",":START_ID(Grant-ID),:END_ID(ResearchInitiative-ID)",Grant,ResearchInitiative,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Grant-FUNDED-ResearchInitiative.csv,True,Grant-FUNDED-ResearchInitiative
4,CREATED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Researcher-CREATED-Dataset_manually.csv,"from,to",":START_ID(Researcher-ID),:END_ID(Dataset-ID)",Researcher,Dataset,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Researcher-CREATED-Dataset.csv,True,Researcher-CREATED-Dataset
5,CREATED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Researcher-CREATED-Dataset.csv,"from,to",":START_ID(Researcher-ID),:END_ID(Dataset-ID)",Researcher,Dataset,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Researcher-CREATED-Dataset.csv,True,Researcher-CREATED-Dataset
6,AUTHORED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Researcher-AUTHORED-Publication_investigators.csv,"from,to",":START_ID(Researcher-ID),:END_ID(Publication-ID)",Researcher,Publication,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Researcher-AUTHORED-Publication.csv,True,Researcher-AUTHORED-Publication
7,AUTHORED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Researcher-AUTHORED-Publication_other.csv,"from,to",":START_ID(Researcher-ID),:END_ID(Publication-ID)",Researcher,Publication,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Researcher-AUTHORED-Publication.csv,True,Researcher-AUTHORED-Publication
8,PRESENTED_AT,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Presentation-PRESENTED_AT-Event.csv,"from,to",":START_ID(Presentation-ID),:END_ID(Event-ID)",Presentation,Event,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Presentation-PRESENTED_AT-Event.csv,True,Presentation-PRESENTED_AT-Event
9,PROVIDES,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/FundingOpportunity-PROVIDES-Grant.csv,"from,to",":START_ID(FundingOpportunity-ID),:END_ID(Grant-ID)",FundingOpportunity,Grant,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/FundingOpportunity-PROVIDES-Grant.csv,True,FundingOpportunity-PROVIDES-Grant


In [31]:
mismatched_relationships = matched_relationships[(matched_relationships["match"] == False) & (matched_relationships["dataPath"] != "")]
if mismatched_relationships.shape[0] > 0:
    print("The following relationship data files do not match the metadata specification:")
    mismatched_relationships.to_csv(os.path.join(NEO4J_IMPORT, "mismatches_r.csv"), index=False)
    
mismatched_relationships

Unnamed: 0,relationship,dataHeader,dataPath,metadataHeader,importHeader,source,target,metadataPath,match,fullRelationship


## Write Neo4j header files for bulk import

In [32]:
def save_node_header(name, import_header, NEO4J_IMPORT):
    df = pd.DataFrame([], columns=import_header.split(","))
    df.to_csv(os.path.join(NEO4J_IMPORT, "header_" + name + "_n.csv"), index=False)

Write node header files

In [33]:
matched_nodes.query("match == True", inplace=True)

In [34]:
out = matched_nodes.apply(lambda row: save_node_header(row["node"], row["importHeader"], NEO4J_IMPORT), axis=1)

Write relationship header files

In [35]:
def save_relationship_header(name, import_header, NEO4J_IMPORT):
    df = pd.DataFrame([], columns=import_header.split(","))
    df.to_csv(os.path.join(NEO4J_IMPORT, "header_" + name + "_r.csv"), index=False)

In [36]:
matched_relationships.query("match == True", inplace=True)

In [37]:
matched_relationships

Unnamed: 0,relationship,dataHeader,dataPath,metadataHeader,importHeader,source,target,metadataPath,match,fullRelationship
0,SUPPORTED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/ResearchInitiative-SUPPORTED-Grant.csv,"from,to",":START_ID(ResearchInitiative-ID),:END_ID(Grant-ID)",ResearchInitiative,Grant,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/ResearchInitiative-SUPPORTED-Grant.csv,True,ResearchInitiative-SUPPORTED-Grant
1,DEVELOPED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Researcher-DEVELOPED-Software.csv,"from,to",":START_ID(Researcher-ID),:END_ID(Software-ID)",Researcher,Software,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Researcher-DEVELOPED-Software.csv,True,Researcher-DEVELOPED-Software
2,CITES,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Publication-CITES-Publication.csv,"from,to",":START_ID(Publication-ID),:END_ID(Publication-ID)",Publication,Publication,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Publication-CITES-Publication.csv,True,Publication-CITES-Publication
3,FUNDED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Grant-FUNDED-ResearchInitiative.csv,"from,to",":START_ID(Grant-ID),:END_ID(ResearchInitiative-ID)",Grant,ResearchInitiative,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Grant-FUNDED-ResearchInitiative.csv,True,Grant-FUNDED-ResearchInitiative
4,CREATED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Researcher-CREATED-Dataset_manually.csv,"from,to",":START_ID(Researcher-ID),:END_ID(Dataset-ID)",Researcher,Dataset,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Researcher-CREATED-Dataset.csv,True,Researcher-CREATED-Dataset
5,CREATED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Researcher-CREATED-Dataset.csv,"from,to",":START_ID(Researcher-ID),:END_ID(Dataset-ID)",Researcher,Dataset,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Researcher-CREATED-Dataset.csv,True,Researcher-CREATED-Dataset
6,AUTHORED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Researcher-AUTHORED-Publication_investigators.csv,"from,to",":START_ID(Researcher-ID),:END_ID(Publication-ID)",Researcher,Publication,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Researcher-AUTHORED-Publication.csv,True,Researcher-AUTHORED-Publication
7,AUTHORED,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Researcher-AUTHORED-Publication_other.csv,"from,to",":START_ID(Researcher-ID),:END_ID(Publication-ID)",Researcher,Publication,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Researcher-AUTHORED-Publication.csv,True,Researcher-AUTHORED-Publication
8,PRESENTED_AT,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/Presentation-PRESENTED_AT-Event.csv,"from,to",":START_ID(Presentation-ID),:END_ID(Event-ID)",Presentation,Event,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/Presentation-PRESENTED_AT-Event.csv,True,Presentation-PRESENTED_AT-Event
9,PROVIDES,"from,to",/Users/Peter/GitRepositories/radx-kg/kg/data/relationships/FundingOpportunity-PROVIDES-Grant.csv,"from,to",":START_ID(FundingOpportunity-ID),:END_ID(Grant-ID)",FundingOpportunity,Grant,/Users/Peter/GitRepositories/radx-kg/kg/metadata/relationships/FundingOpportunity-PROVIDES-Grant.csv,True,FundingOpportunity-PROVIDES-Grant


In [38]:
out = matched_relationships.apply(lambda row: save_relationship_header(row["fullRelationship"], row["importHeader"], NEO4J_IMPORT), axis=1)

## Create MetaNode and MetaRelationship files

Compile a dictionary of all node properties

In [39]:
property_dir = {}
for header in matched_nodes["metadataHeader"]:
    for prop in header.split(","):
        property_dir[prop] = ""

Create dataframe with MetaNode data

In [40]:
node_list = []
for node, filepath in matched_nodes[["node","metadataPath"]].itertuples(index=False):
    node_list.append(create_meta_node(node, property_dir, filepath))
    
meta_nodes = pd.DataFrame(node_list)
meta_nodes.drop_duplicates(inplace=True)
meta_nodes.to_csv(os.path.join(NEO4J_IMPORT, "MetaNode_n.csv"), index=False)

In [41]:
meta_nodes

Unnamed: 0,id,name,city,state,country,ror,uei,duns,geonames,url,abstract,journal,year,type,doi,pmId,pmcId,fullName,firstName,middleName,lastName,orcid,profileId,inventors,filingDate,grantedDate,status,applicants,longName,description,researchInitiativeUrl,websites,presenters,presentationUrl,videoUrl,eventType,eventUrl,startDate,endDate,narrative,fundingMechanism,awardCode,researchInitiative,subProject,nodeName:ID(MetaNode-ID)
0,"Unique entity identifier: ror id if available, otherwise uei id (string)",Name of the organization (string),City of the organization (string),State of the organization (string),Country of the organization (string),Research Organization Registry id of the organization from https://ror.org/ (string),US government-owned Unique Entity Identifier (UEI) for the organization (https://grants.nih.gov/grants/guide/notice-files/NOT-OD-21-170.html) (string),Data Universal Numbering System (DUNS) by Dun & Bradstreet (string),Unique identifier for geolocation from https://www.geonames.org/ (string),Research Organization Registry landing page of the organization (string),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Organization
1,Unique identifier for the publication (string),Title of the publication (string),,,,,,,,URL of publication (string),Abstract of the publication (string),Name of the journal (string),Year of publication (int),primary or secondary citation (string),Digital Object Identifier of the publication (string),PubMed identifier of the publication (string),PubMedCentral identifier of the publication full text entry (string),,,,,,,,,,,,,,,,,,,,,,,,,,,,Publication
3,Unique researcher identifier (orcid preferred or NIH profile id) (string),Name of the researcher (lastname plus initials) (string),,,,,,,,,,,,,,,,First middle and last name (string),First name (string),Middle name or middle initial (string),Last name (string),Unique reseacher identifier from ORICD (https://orcid.org) (string),Unique researcher identifier from NIH Reporter (https://api.reporter.nih.gov/) (string),,,,,,,,,,,,,,,,,,,,,,Researcher
6,Identifier of the patents (string),Title of the patent (string),,,,,,,,URL of patent webpage (string),,,,,,,,,,,,,,Name of the inventors (string[]),Date the patent application was filed (date),Date the patent was granted (date),active or pending (string),Name of the organization(s) that filed the patent application (string[]),,,,,,,,,,,,,,,,,Patent
7,Unique identifier for the dataset (string),Name of the dataset (string),,,,,,,,URL to dataset landing page (string),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Dataset
9,Unique id of the research initiative (string),Abbreviation of the research initiative (string),,,,,,,,URL of the research initiative website (string),,,,,,,,,,,,,,,,,,,Long name for research initiative (string),Description of the research initiative (string),Description of the research initiative that funded the research initiative (string),URLs of related websites (string[]),,,,,,,,,,,,,ResearchInitiative
10,Unique identifier of the presentation (hash of event id and name) (string),Title of the presentation (string),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"List of presenters separated by the vertical bar ""|"" (string)",URL of the presentation link (string),URL of the presentation recording (string),,,,,,,,,,Presentation
11,Identifier of the software application or software repository (string),Name of the software application or software repository (string),,,,,,,,URL of the software application or software repository (string),,,,,,,,,,,,,,,,,,,,Description of the software or software repository (string),,,,,,,,,,,,,,,Software
12,Unique identifier of the event (date and event number) (string),Name of the event (string),City of event location (string),State or province of the event location (string),Country of event location (string),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Type of event: webinar, virtual conference, in-person conference, hybrid conference, virtual workshop, in-person workshop, hybrid workshop (string)",URL of the event website (string),Start date of event (date),End data of event (date),,,,,,Event
13,Unique identifier for the funding opportunity (Thesaurus:C20021) (string),Name of the funding opportunity (string),,,,,,,,URL to funding opportunity description (string),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,FundingOpportunity


Compile a dictionary of all relationship properties

In [42]:
property_dir = {}
for header in matched_relationships["metadataHeader"]:
    for property in header.split(","):
        property_dir[property] = ""

Create dataframe with MetaRelationship data

In [43]:
relationship_list = []
for relationship, source, target, filepath in matched_relationships[["relationship", "source", "target", "metadataPath"]].itertuples(index=False):
    relationship_list.append(create_meta_relationship(relationship, source, target, property_dir, filepath))
    
meta_relationships = pd.DataFrame(relationship_list)
meta_relationships.drop_duplicates(inplace=True)
meta_relationships.to_csv(os.path.join(NEO4J_IMPORT, "MetaRelationship_r.csv"), index=False)

In [44]:
meta_relationships

Unnamed: 0,from,to,lastUpdated,isPi,isContactPi,isStudyInvestigator,relationshipName,source:START_ID(MetaNode-ID),target:END_ID(MetaNode-ID)
0,Unique identifier for the research initiative (string),Unique identifier for the grant (Thesaurus:C16644) (string),,,,,SUPPORTED,ResearchInitiative,Grant
1,Unique identifier for the researcher (string),Unique identifier for the software application or software repository (string),,,,,DEVELOPED,Researcher,Software
2,DOI of citing publication (string),DOI of cited publication (string),,,,,CITES,Publication,Publication
3,Unique identifier for the grant (Thesaurus:C16644) (string),Unique identifier for the research initiative (string),,,,,FUNDED,Grant,ResearchInitiative
4,Unique identifier for the researcher (string),Unique identifier (CRUIE preferred) for the dataset (string),,,,,CREATED,Researcher,Dataset
6,Unique identifier for the researcher (string),Unique identifier for publication (string),,,,,AUTHORED,Researcher,Publication
8,Unique identifier for the presentation (string),Unique identifier for the event (string),,,,,PRESENTED_AT,Presentation,Event
9,Unique identifier for the funding opportunity (Thesaurus:C20021) (string),Unique identifier for the grant (Thesaurus:C16644) (string),,,,,PROVIDES,FundingOpportunity,Grant
10,Unique identifier for the researcher (string),"Unique entity identifier: ror id if available, otherwise uei id (string)",NIH fiscal year this information was last updated (int),,,,EMPLOYED_AT,Researcher,Organization
11,Unique identifier for the researcher (string),Unique identifier (CRUIE preferred) for the dataset (string),,,,,USED,Researcher,Dataset


## Create Neo4j bulk upload command line arguments
See: https://neo4j.com/docs/operations-manual/current/tools/neo4j-admin/neo4j-admin-import/

In [45]:
args = ""
for node in matched_nodes["node"].unique():
    #args += f" --nodes={node}=header_{node}_n.csv,{node}*_n.csv"
    args += f" --nodes={node}=header_{node}_n.csv,{node}.*_n.csv"

In [46]:
args += " --nodes=MetaNode=MetaNode_n.csv"

In [47]:
rel_data = matched_relationships[["relationship", "fullRelationship"]].copy()
rel_data.drop_duplicates(inplace=True)
for relationship, fullRelationship in rel_data.itertuples(index=False):
    #args += f" --relationships={relationship}=header_{fullRelationship}_r.csv,{fullRelationship}*_r.csv"
    args += f" --relationships={relationship}=header_{fullRelationship}_r.csv,{fullRelationship}.*_r.csv"

In [48]:
args += " --relationships=MetaRelationship=MetaRelationship_r.csv"

In [49]:
file = open(os.path.join(NEO4J_IMPORT, "args.txt"), "w")
file.write(args)
file.close() 

In [50]:
print(args)

 --nodes=Organization=header_Organization_n.csv,Organization.*_n.csv --nodes=Publication=header_Publication_n.csv,Publication.*_n.csv --nodes=Researcher=header_Researcher_n.csv,Researcher.*_n.csv --nodes=Patent=header_Patent_n.csv,Patent.*_n.csv --nodes=Dataset=header_Dataset_n.csv,Dataset.*_n.csv --nodes=ResearchInitiative=header_ResearchInitiative_n.csv,ResearchInitiative.*_n.csv --nodes=Presentation=header_Presentation_n.csv,Presentation.*_n.csv --nodes=Software=header_Software_n.csv,Software.*_n.csv --nodes=Event=header_Event_n.csv,Event.*_n.csv --nodes=FundingOpportunity=header_FundingOpportunity_n.csv,FundingOpportunity.*_n.csv --nodes=Grant=header_Grant_n.csv,Grant.*_n.csv --nodes=MetaNode=MetaNode_n.csv --relationships=SUPPORTED=header_ResearchInitiative-SUPPORTED-Grant_r.csv,ResearchInitiative-SUPPORTED-Grant.*_r.csv --relationships=DEVELOPED=header_Researcher-DEVELOPED-Software_r.csv,Researcher-DEVELOPED-Software.*_r.csv --relationships=CITES=header_Publication-CITES-Publicat

## Create a Cypher script with default Constraints, indices, and fulltext indices

Create constraints for all node ids and indices and fulltext indices for all string properties

In [51]:
indexed_nodes = matched_nodes[["node", "index", "stringProperties"]].copy()

In [52]:
indices = "".join(indexed_nodes["index"].unique())

Create fulltext index

In [53]:
node_names = indexed_nodes["node"].values
node_names = list(set(node_names))
node_names = "|".join(f"{w}" for w in node_names)

In [54]:
property_names = indexed_nodes["stringProperties"].values
property_names = list(np.unique(np.concatenate(property_names).flat))
property_names = ",n.".join(f"{w}" for w in property_names)

In [55]:
fulltext_index = f"CREATE FULLTEXT INDEX fulltext FOR (n:{node_names}) ON EACH [n.{property_names}];"

In [56]:
indices = indices + fulltext_index
indices = indices.replace(";", ";\n")

In [57]:
print(indices)

CREATE CONSTRAINT Organization FOR (n:Organization) REQUIRE n.id IS UNIQUE;
CREATE INDEX Organization_name FOR (n:Organization) ON (n.name);
CREATE INDEX Organization_city FOR (n:Organization) ON (n.city);
CREATE INDEX Organization_state FOR (n:Organization) ON (n.state);
CREATE INDEX Organization_country FOR (n:Organization) ON (n.country);
CREATE INDEX Organization_ror FOR (n:Organization) ON (n.ror);
CREATE INDEX Organization_uei FOR (n:Organization) ON (n.uei);
CREATE INDEX Organization_duns FOR (n:Organization) ON (n.duns);
CREATE INDEX Organization_geonames FOR (n:Organization) ON (n.geonames);
CREATE INDEX Organization_url FOR (n:Organization) ON (n.url);
CREATE CONSTRAINT Publication FOR (n:Publication) REQUIRE n.id IS UNIQUE;
CREATE INDEX Publication_name FOR (n:Publication) ON (n.name);
CREATE INDEX Publication_abstract FOR (n:Publication) ON (n.abstract);
CREATE INDEX Publication_journal FOR (n:Publication) ON (n.journal);
CREATE INDEX Publication_type FOR (n:Publication) ON

In [58]:
with open(os.path.join(NEO4J_IMPORT, "indices.cypher"), "w") as f:
    f.write(indices)