# Create Node and Relationship files from a BioPortal Ontology
This notebook converts ontology files from [BioPortal](https://bioportal.bioontology.org/) to Node and Relationship files that represent the ontology trees. 

The Node and Relationship files can be uploaded into a Neo4j Graph Database using [kg-import](https://github.com/sbl-sdsc/kg-import).

In [1]:
import os
import re
import glob
from pathlib import Path
import pandas as pd
from utils import parse_bioportal_csv

In [2]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules
# before each cell is executed, so edits to local modules (such as utils)
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Configure pandas dataframe display: lift every truncation limit
pd.set_option('display.max_rows', None)  # display all rows
pd.set_option('display.max_columns', None)  # display all columns
pd.options.display.max_colwidth = None  # never truncate cell contents

In [4]:
# Directory with the ontology mapping files; the MAPPINGS environment
# variable overrides the default location.
MAPPINGS = os.environ.get('MAPPINGS', '../mappings/')

In [5]:
# Output directories for the metadata files; each can be overridden through
# the environment variable of the same name.
NODE_METADATA = os.environ.get('NODE_METADATA', '../metadata/nodes/')
RELATIONSHIP_METADATA = os.environ.get('RELATIONSHIP_METADATA', '../metadata/relationships/')

In [6]:
# Only the default location is created automatically; a custom NODE_METADATA
# directory is left untouched.
if NODE_METADATA == '../metadata/nodes/':
    Path(NODE_METADATA).mkdir(parents=True, exist_ok=True)

In [7]:
# Only the default location is created automatically; a custom
# RELATIONSHIP_METADATA directory is left untouched.
if RELATIONSHIP_METADATA == '../metadata/relationships/':
    Path(RELATIONSHIP_METADATA).mkdir(parents=True, exist_ok=True)

In [8]:
# Output directories for the data files; each can be overridden through the
# environment variable of the same name.
NODE_DATA = os.environ.get('NODE_DATA', '../data/nodes/')
RELATIONSHIP_DATA = os.environ.get('RELATIONSHIP_DATA', '../data/relationships/')

In [9]:
# Only the default location is created automatically; a custom NODE_DATA
# directory is left untouched.
if NODE_DATA == '../data/nodes/':
    Path(NODE_DATA).mkdir(parents=True, exist_ok=True)

In [10]:
# Only the default location is created automatically; a custom
# RELATIONSHIP_DATA directory is left untouched.
if RELATIONSHIP_DATA == '../data/relationships/':
    Path(RELATIONSHIP_DATA).mkdir(parents=True, exist_ok=True)

In [11]:
# Select the ontology mapping file to process. Mappings used so far:
#   chebi.csv, mesh.csv, doid.csv, ncbi_taxonomy.csv, uberon.csv
mapping_file = 'chebi.csv'

# Resolve the file against MAPPINGS so that the MAPPINGS environment variable
# is honored (with the default setting this is '../mappings/chebi.csv').
# NOTE(review): the mapping file's downloadUrl embeds a BioPortal API key that
# shows up in the cell output below; consider reading the key from an
# environment variable instead of storing it in the mapping file.
config = pd.read_csv(os.path.join(MAPPINGS, mapping_file))

In [12]:
# Display the key/value configuration table loaded from the mapping file.
config

Unnamed: 0,key,value
0,name,Chemical Entities of Biological Interest Ontology
1,downloadUrl,https://data.bioontology.org/ontologies/CHEBI/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=csv
2,curie,chebi
3,nodeFilename,Compound_chebi_209-2022-03-10.csv
4,relationshipFilename,Compound-IS_A-Compound_chebi_209-2022-03-10.csv
5,http://purl.obolibrary.org/obo/chebi/inchi,inchi
6,http://purl.obolibrary.org/obo/chebi/inchikey,inchiKey
7,http://purl.obolibrary.org/obo/chebi/smiles,smiles
8,http://purl.obolibrary.org/obo/chebi/formula,formula
9,http://purl.obolibrary.org/obo/chebi/mass,mass


In [13]:
# Look up the output file names in the config table: filter on the 'key'
# column and take the second column (the value) of the first matching row.
node_filename = config[config['key'] == 'nodeFilename'].iloc[0, 1]
relationship_filename = config[config['key'] == 'relationshipFilename'].iloc[0, 1]

## Parse ontology file and create node and relationship dataframes

In [14]:
# parse_bioportal_csv (imported from the local utils module) takes the config
# table and returns two dataframes: the ontology nodes and their relationships.
nodes, relationships = parse_bioportal_csv(config)

In [15]:
# Report how many nodes were parsed, then preview the first rows.
print('Number of nodes:', len(nodes))
nodes.head()

Number of nodes: 178080


Unnamed: 0,id,name,synonyms,definition,url,inchi,inchiKey,smiles,formula,mass,xref
0,chebi:CHEBI_101465,"(2S,3S,4R)-4-(hydroxymethyl)-1-(2-methoxy-1-oxoethyl)-3-[4-(3-pyridinyl)phenyl]-2-azetidinecarbonitrile",,,http://purl.obolibrary.org/obo/CHEBI_101465,"InChI=1S/C19H19N3O3/c1-25-12-18(24)22-16(9-20)19(17(22)11-23)14-6-4-13(5-7-14)15-3-2-8-21-10-15/h2-8,10,16-17,19,23H,11-12H2,1H3/t16-,17+,19+/m1/s1",NONDGOMIDWLUNU-AOIWGVFYSA-N,COCC(=O)N1[C@H]([C@H]([C@H]1C#N)C2=CC=C(C=C2)C3=CN=CC=C3)CO,C19H19N3O3,337.373,LINCS:LSM-12828
1,chebi:CHEBI_159237,Leu-His-Glu,(2S)-2-[[(2S)-2-[[(2S)-2-amino-4-methylpentanoyl]amino]-3-(1H-imidazol-5-yl)propanoyl]amino]pentanedioic acid,,http://purl.obolibrary.org/obo/CHEBI_159237,"InChI=1S/C17H27N5O6/c1-9(2)5-11(18)15(25)22-13(6-10-7-19-8-20-10)16(26)21-12(17(27)28)3-4-14(23)24/h7-9,11-13H,3-6,18H2,1-2H3,(H,19,20)(H,21,26)(H,22,25)(H,23,24)(H,27,28)/t11-,12-,13-/m0/s1",KXODZBLFVFSLAI-AVGNSLFASA-N,O=C(N[C@@H](CC=1NC=NC1)C(=O)N[C@@H](CCC(O)=O)C(O)=O)[C@@H](N)CC(C)C,C17H27N5O6,397.432,Chemspider:58808728
2,chebi:CHEBI_101448,"2-fluoro-N-[(4S,7R,8S)-8-methoxy-4,7,10-trimethyl-11-oxo-2-oxa-5,10-diazabicyclo[10.4.0]hexadeca-1(12),13,15-trien-14-yl]benzamide",,,http://purl.obolibrary.org/obo/CHEBI_101448,"InChI=1S/C24H30FN3O4/c1-15-12-26-16(2)14-32-21-10-9-17(27-23(29)18-7-5-6-8-20(18)25)11-19(21)24(30)28(3)13-22(15)31-4/h5-11,15-16,22,26H,12-14H2,1-4H3,(H,27,29)/t15-,16+,22-/m1/s1",ULFQVHHILZHREN-ZMPRRUGASA-N,C[C@@H]1CN[C@H](COC2=C(C=C(C=C2)NC(=O)C3=CC=CC=C3F)C(=O)N(C[C@H]1OC)C)C,C24H30FN3O4,443.512,LINCS:LSM-12811
3,chebi:CHEBI_85476,O-hydroxyvaleroyl-L-carnitine,O-hydroxyvaleroyl-L-carnitines|O-hydroxyvaleroyl-(R)-carnitine|O-hydroxyvaleroyl-(R)-carnitines,An O-acyl-L-carnitine in which the acyl group specified is hydroxyvaleroyl in which the position of the hydroxy group is unspecified.,http://purl.obolibrary.org/obo/CHEBI_85476,,,C[N+](C)(C)C[C@@H](CC([O-])=O)OC([*])=O,C12H23NO5,261.31472,
4,chebi:CHEBI_179157,Furfuryl propyl disulfide,2-[(propyldisulanyl)methyl]uran,,http://purl.obolibrary.org/obo/CHEBI_179157,"InChI=1S/C8H12OS2/c1-2-6-10-11-7-8-4-3-5-9-8/h3-5H,2,6-7H2,1H3",YCXWJNAAXGVFED-UHFFFAOYSA-N,S(SCC=1OC=CC1)CCC,C8H12OS2,188.3,CAS:252736-36-0|HMDB:HMDB0032289|Chemspider:19571925


In [16]:
# Report how many relationships were parsed, then preview the first rows.
print('Number of relationships:', len(relationships))
relationships.head()

Number of relationships: 230979


Unnamed: 0,from,to
0,chebi:CHEBI_101465,chebi:CHEBI_38193
1,chebi:CHEBI_159237,chebi:CHEBI_25676
2,chebi:CHEBI_101448,chebi:CHEBI_52898
3,chebi:CHEBI_101448,chebi:CHEBI_24995
4,chebi:CHEBI_85476,chebi:CHEBI_133449


## Save data files

In [17]:
# Write the node table (without the dataframe index) into the node data directory.
node_path = os.path.join(NODE_DATA, node_filename)
nodes.to_csv(node_path, index=False)

In [18]:
# Write the relationship table (without the dataframe index) into the
# relationship data directory.
relationship_path = os.path.join(RELATIONSHIP_DATA, relationship_filename)
relationships.to_csv(relationship_path, index=False)

## Create Metadata files

#### Pick a concise and representative example for the metadata file

In [19]:
def example_score(row, synonyms_index=2):
    """Score a node row by how well it serves as a metadata example.

    Higher scores mean a better example:
      * +1 for every non-empty property (more complete rows score higher),
      * +0.5 if the synonyms property holds several '|'-separated entries,
      * + (number of properties / total characters), so shorter rows score higher.

    Parameters
    ----------
    row : pandas.Series or sequence
        Property values of one node, in column order.
    synonyms_index : int, default 2
        Position of the '|'-separated synonyms property within the row.

    Returns
    -------
    float
        The example score of the row.
    """
    # Materialize the values so they can be indexed by position regardless of
    # the row's labels (integer-position lookup through Series[...] on a
    # label-indexed Series is deprecated in pandas).
    values = list(row)

    characters = 0
    score = 0.0

    for item in values:
        # missing values (None/NaN/NA) count as empty, not as the text 'nan'
        if pd.api.types.is_scalar(item) and pd.isna(item):
            continue
        text = str(item)
        characters += len(text)
        # rows with the most complete data score higher
        if text != '':
            score += 1

    # rows with multiple synonyms score higher
    if len(values) > synonyms_index:
        synonyms = values[synonyms_index]
        if isinstance(synonyms, str) and '|' in synonyms:
            score += 0.5

    # shorter rows score higher; an all-empty row gets no length bonus
    # (this also avoids a division by zero)
    if characters > 0:
        score += len(values) / characters
    return score

In [20]:
# Score every node row (row-wise apply) and store the result in a 'score' column.
nodes = nodes.assign(score=nodes.apply(example_score, axis=1))

In [21]:
# Rank the nodes from best to worst example candidate.
nodes = nodes.sort_values('score', ascending=False)

In [22]:
# Pick the example from place 50, because the top scoring example is often
# atypical. For ontologies with 50 or fewer nodes, fall back to the last
# (lowest ranked) row so the pick never runs past the end of the table.
if nodes.empty:
    raise ValueError('Cannot pick an example: the ontology contains no nodes')
pick = min(50, nodes.shape[0] - 1)
# keep a one-row dataframe so .values[0] yields the property values of that row
example_row = nodes.iloc[pick:pick + 1]
examples = list(example_row.values[0])

#### Infer data types

In [23]:
# Let pandas infer the best-fitting dtype for every column; these dtypes are
# used as the 'type' entries of the node metadata created below.
nodes = nodes.convert_dtypes()

#### Create metadata dataframe

Node metadata

In [24]:
# One metadata record per node column: its name, inferred dtype and example
# value. The column name doubles as the description.
metadata = [
    {'property': column, 'type': column_type, 'description': column, 'example': value}
    for column, column_type, value in zip(nodes.columns, nodes.dtypes, examples)
]

# The last record describes the helper column 'score', which is not a node
# property: drop it.
node_metadata = pd.DataFrame(metadata).iloc[:-1]
node_metadata

Unnamed: 0,property,type,description,example
0,id,string,id,chebi:CHEBI_75227
1,name,string,name,sodium metaborate
2,synonyms,string,synonyms,"Sodium borate|Boric acid, monosodium salt|NaBO2"
3,definition,string,definition,An inorganic sodium salt having metaborate as the counterion.
4,url,string,url,http://purl.obolibrary.org/obo/CHEBI_75227
5,inchi,string,inchi,InChI=1S/BO2.Na/c2-1-3;/q-1;+1
6,inchiKey,string,inchiKey,NVIFVTYDZMXWGX-UHFFFAOYSA-N
7,smiles,string,smiles,[Na+].[O-]B=O
8,formula,string,formula,BNaO2
9,mass,string,mass,65.80000


Relationship metadata

In [25]:
# The example node's id (first metadata record) serves as the relationship source.
source = node_metadata.loc[0, 'example']
# Collect all relationships that start at this node, renumbered from 0.
is_from_source = relationships['from'] == source
example_rel = relationships.loc[is_from_source].copy().reset_index(drop=True)

In [26]:
# Describe both relationship columns, taking the first relationship of the
# example node as the example values.
metadata = [
    {
        'property': 'from',
        'type': 'string',
        'description': 'Id of source node',
        'example': example_rel.loc[0, 'from'],
    },
    {
        'property': 'to',
        'type': 'string',
        'description': 'Id of target node',
        'example': example_rel.loc[0, 'to'],
    },
]

relationship_metadata = pd.DataFrame(metadata)
relationship_metadata

Unnamed: 0,property,type,description,example
0,from,string,Id of source node,chebi:CHEBI_75227
1,to,string,Id of target node,chebi:CHEBI_38702


## Save metadata files

In [27]:
# Metadata files are named after the node/relationship type only, so any
# extra tag or file extension following the first '.' or '_' is stripped.
# Raw string: a plain '\.' is an invalid escape sequence and triggers a
# warning in recent Python versions.
TAG_SEPARATOR = r'\.|_'

node_metadata_filename = re.split(TAG_SEPARATOR, node_filename)[0] + '.csv'

# split nodes from relationship: <source>-<relationship>-<target>[_tag].csv
parts = relationship_filename.split('-', 2)
if len(parts) < 3:
    raise ValueError(
        'Relationship filename must look like <source>-<relationship>-<target>: '
        + relationship_filename
    )
source_node = parts[0]
relationship = parts[1]
# Remove any extra tag or file extension from the target node. Splitting on
# '.' as well as '_' (as done for the node filename) keeps an untagged name
# such as 'A-REL-B.csv' from turning into 'A-REL-B.csv.csv'.
target_node = re.split(TAG_SEPARATOR, parts[2])[0]
relationship_metadata_filename = source_node + '-' + relationship + '-' + target_node + '.csv'

node_metadata.to_csv(os.path.join(NODE_METADATA, node_metadata_filename), index=False)
relationship_metadata.to_csv(os.path.join(RELATIONSHIP_METADATA, relationship_metadata_filename), index=False)