# Create Node and Relationship files from a BioPortal Ontology
This notebook converts ontology files from [BioPortal](https://bioportal.bioontology.org/) to Node and Relationship files that represent the ontology trees. 

The Node and Relationship files can be uploaded into a Neo4j Graph Database using [kg-import](https://github.com/sbl-sdsc/kg-import).

In [1]:
import os
import re
import glob
from pathlib import Path
import pandas as pd
from utils import parse_bioportal_csv

In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to local modules (e.g. utils) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Configure pandas display: never truncate rows, columns, or cell contents
for option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(option, None)

In [4]:
# Directory holding the ontology mapping files; override with the MAPPINGS environment variable
MAPPINGS = os.environ.get('MAPPINGS', '../mappings/')

In [5]:
# Output directories for the node and relationship metadata files;
# each can be overridden with the environment variable of the same name
NODE_METADATA = os.environ.get('NODE_METADATA', '../metadata/nodes/')
RELATIONSHIP_METADATA = os.environ.get('RELATIONSHIP_METADATA', '../metadata/relationships/')

In [6]:
# Make sure the node metadata directory exists, whether it is the default
# location or one supplied through the NODE_METADATA environment variable;
# otherwise saving the metadata file later fails for custom locations.
os.makedirs(NODE_METADATA, exist_ok=True)

In [7]:
# Make sure the relationship metadata directory exists, whether it is the default
# location or one supplied through the RELATIONSHIP_METADATA environment variable;
# otherwise saving the metadata file later fails for custom locations.
os.makedirs(RELATIONSHIP_METADATA, exist_ok=True)

In [8]:
# Output directories for the node and relationship data files;
# each can be overridden with the environment variable of the same name
NODE_DATA = os.environ.get('NODE_DATA', '../data/nodes/')
RELATIONSHIP_DATA = os.environ.get('RELATIONSHIP_DATA', '../data/relationships/')

In [9]:
# Make sure the node data directory exists, whether it is the default
# location or one supplied through the NODE_DATA environment variable;
# otherwise saving the node file later fails for custom locations.
os.makedirs(NODE_DATA, exist_ok=True)

In [10]:
# Make sure the relationship data directory exists, whether it is the default
# location or one supplied through the RELATIONSHIP_DATA environment variable;
# otherwise saving the relationship file later fails for custom locations.
os.makedirs(RELATIONSHIP_DATA, exist_ok=True)

In [11]:
# Choose the ontology to process by setting `mapping_file` to one of the
# mapping files referenced by this notebook:
#   chebi.csv, mesh.csv, ncit.csv, doid.csv, mondo.csv, ncbi_taxonomy.csv, uberon.csv
mapping_file = 'mondo.csv'

# Read the mapping from the MAPPINGS directory so that the MAPPINGS setting
# is honored instead of a hard-coded '../mappings/' path.
config = pd.read_csv(os.path.join(MAPPINGS, mapping_file))

In [12]:
# Display the key/value configuration of the selected ontology.
# NOTE(review): the downloadUrl value shown in the saved output contains an
# `apikey` query parameter — consider clearing or redacting this output before sharing.
config

Unnamed: 0,key,value
0,name,Mondo Disease Ontology (Mondo)
1,downloadUrl,https://data.bioontology.org/ontologies/MONDO/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=csv
2,curie,mondo
3,nodeFilename,Disease_mondo_2021-01-29.csv
4,relationshipFilename,Disease-IS_A-Disease_mondo_2021-01-29.csv


In [13]:
# Look up the output file names in the key/value configuration table
# (first row whose key matches, second column of that row)
node_filename = config[config['key'] == 'nodeFilename'].iloc[0, 1]
relationship_filename = config[config['key'] == 'relationshipFilename'].iloc[0, 1]

## Parse ontology file and create node and relationship dataframes

In [14]:
# Build the node and relationship dataframes for the selected ontology.
# NOTE(review): parse_bioportal_csv comes from utils; presumably it fetches and
# parses the CSV referenced by the config's downloadUrl — confirm against utils.
nodes, relationships = parse_bioportal_csv(config)

In [15]:
# Report how many nodes were parsed, then preview the first rows
print(f'Number of nodes: {len(nodes)}')
nodes.head()

Number of nodes: 24409


Unnamed: 0,id,name,synonyms,definition,url
0,mondo:MONDO_0001552,dyscalculia (disease),dyscalculia|disorder of arithmetical skills|mathematics disorder,A wide group of related learning disorders characterized by difficulties with mathematics and manipulating numbers; the difficulty with math may be caused or exacerbated by visuo-spatial or language processing difficulties.,http://purl.obolibrary.org/obo/MONDO_0001552
1,mondo:MONDO_0001722,central pterygium,,,http://purl.obolibrary.org/obo/MONDO_0001722
2,mondo:MONDO_0006787,hidrocystoma,eccrine hidrocystoma of skin|Hydrocystoma|hidrocystoma|hidrocystoma (morphologic abnormality)|cystadenoma|apocrine/eccrine hidrocystoma|apocrine cystadenoma|eccrine cystadenoma,"A benign cystic proliferation of the sweat glands with apocrine or eccrine differentiation. It usually presents as a dome-shaped, cystic papular or nodular lesion usually in the face and neck. It is a unilocular or mutlilocular lesion lined by an inner and an outer layer of epithelium. Complete excision is usually curative.",http://purl.obolibrary.org/obo/MONDO_0006787
3,mondo:MONDO_0007523,"Ehlers-Danlos syndrome, hypermobility type","Ehlers-Danlos syndrome, hypermobile type|BJHS|EDS III|Ehlers-Danlos syndrome, hypermobility type|benign joint hypermobility syndrome|HT-EDS|Ehlers-Danlos syndrome, hypermobility type; EDSHMB|Ehlers-Danlos syndrome type 3|Ehlers-Danlos syndrome type 3 (formerly)|Ehlers-Danlos syndrome, type III|Ehlers-Danlos syndrome, type 3|EDS 3|EDS3 (formerly)|hEDS|hypermobile EDS|EDSHMB|hypermobile Ehlers-Danlos syndrome|benign hypermobility syndrome","Ehlers-Danlos syndrome, hypermobility type (HT-EDS) is the most frequent form of EDS, a group of hereditary connective tissue diseases, and is characterized by joint hyperlaxity, mild skin hyperextensibility, tissue fragility and extra-musculoskeletal manifestations.",http://purl.obolibrary.org/obo/MONDO_0007523
4,mondo:MONDO_0007887,leiomyoma of vulva and esophagus,"leiomyoma of vulva and esophagus|leiomyomatosis, esophagogastric and vulvar|esophagogastric and vulvar leiomyomatosis",,http://purl.obolibrary.org/obo/MONDO_0007887


In [16]:
# Report how many relationships were parsed, then preview the first rows
print(f'Number of relationships: {len(relationships)}')
relationships.head()

Number of relationships: 38232


Unnamed: 0,from,to
0,mondo:MONDO_0001552,mondo:MONDO_0004681
1,mondo:MONDO_0001722,mondo:MONDO_0001723
2,mondo:MONDO_0006787,mondo:MONDO_0021489
3,mondo:MONDO_0007523,mondo:MONDO_0020066
4,mondo:MONDO_0007887,mondo:MONDO_0010641


## Save data files

In [17]:
# Write the node table (without the dataframe index) into the node data directory
node_path = os.path.join(NODE_DATA, node_filename)
nodes.to_csv(node_path, index=False)

In [18]:
# Write the relationship table (without the dataframe index) into the relationship data directory
relationship_path = os.path.join(RELATIONSHIP_DATA, relationship_filename)
relationships.to_csv(relationship_path, index=False)

## Create Metadata files

#### Pick a concise and representative example for the metadata file

In [19]:
def example_score(row):
    """Score a node row for use as a concise, representative metadata example.

    The score is the sum of:
      * 1 point for every non-empty field (more complete rows score higher),
      * 0.5 if the field at position 2 (the synonyms column of the node
        dataframe) contains '|', i.e. lists multiple synonyms,
      * (number of fields) / (total number of characters), so that shorter
        rows score higher.

    Missing values (None, NaN, pd.NA) are treated as empty fields.

    Parameters
    ----------
    row : pandas.Series or sequence
        The field values of one node, in column order.

    Returns
    -------
    int or float
        The example score; higher is better.
    """
    synonyms_position = 2

    # Iterate over the values (not the labels) so that fields are addressed by
    # position; integer keys on a label-indexed Series are deprecated in pandas.
    # Missing values become '' so they do not count as data.
    fields = [
        '' if pd.api.types.is_scalar(value) and pd.isna(value) else str(value)
        for value in row
    ]

    # rows with the most complete data score higher
    score = sum(1 for field in fields if field != '')

    # rows with multiple synonyms score higher
    if len(fields) > synonyms_position and '|' in fields[synonyms_position]:
        score += 0.5

    # shorter rows score higher (skipped for all-empty rows to avoid dividing by zero)
    characters = sum(len(field) for field in fields)
    if characters > 0:
        score += len(fields) / characters
    return score

In [20]:
# Score every node row. An already existing 'score' column is excluded from the
# scoring input so that re-running this cell gives the same result as the first run.
nodes['score'] = nodes.drop(columns='score', errors='ignore').apply(example_score, axis=1)

In [21]:
# Order the nodes from the best to the worst example score
nodes = nodes.sort_values('score', ascending=False)

In [22]:
# Pick the example at rank 50 (0-based) because the top scoring example is often
# atypical. With 50 or fewer nodes, fall back to the last (lowest ranked) node
# instead of indexing past the end of the dataframe.
if nodes.empty:
    raise ValueError('Cannot pick an example: the nodes dataframe is empty')
pick = min(50, nodes.shape[0] - 1)
example_row = nodes.iloc[pick:pick + 1]
examples = list(example_row.values[0])

#### Infer data types

In [23]:
# Let pandas infer the best-fitting dtype for every column; these dtypes are
# reported as the property types in the node metadata below.
nodes = nodes.convert_dtypes()

#### Create metadata dataframe

Node metadata

In [24]:
# One metadata record per node column: the column name doubles as the
# description and the example value comes from the picked example row.
metadata = [
    {'property': column, 'type': column_type, 'description': column, 'example': value}
    for column, column_type, value in zip(nodes.columns, nodes.dtypes, examples)
]

# The trailing record describes the helper 'score' column, so leave it out
node_metadata = pd.DataFrame(metadata).iloc[:-1]
node_metadata

Unnamed: 0,property,type,description,example
0,id,string,id,mondo:MONDO_0024673
1,name,string,name,skin lymphangioma
2,synonyms,string,synonyms,skin lymphangioma|cutaneous lymphangioma
3,definition,string,definition,A lymphangioma arising from the skin.
4,url,string,url,http://purl.obolibrary.org/obo/MONDO_0024673


Relationship metadata

In [25]:
# Use a relationship that starts at the node chosen as the node metadata example
source = node_metadata['example'].iloc[0]
example_rel = relationships[relationships['from'] == source].copy()
if example_rel.empty:
    # No relationship starts at the example node: fall back to the first
    # relationship so that the relationship metadata still gets an example.
    example_rel = relationships.head(1).copy()
example_rel = example_rel.reset_index(drop=True)

In [26]:
# Describe the two relationship columns, taking the example values from the
# first example relationship (index label 0)
descriptions = {'from': 'Id of source node', 'to': 'Id of target node'}
metadata = [
    {'property': column, 'type': 'string', 'description': text, 'example': example_rel.loc[0, column]}
    for column, text in descriptions.items()
]

relationship_metadata = pd.DataFrame(metadata)
relationship_metadata

Unnamed: 0,property,type,description,example
0,from,string,Id of source node,mondo:MONDO_0024673
1,to,string,Id of target node,mondo:MONDO_0002013


## Save metadata files

In [27]:
# Metadata files are named after the node label / relationship triple only:
# everything from the first '_' or '.' onwards is dropped from the data file
# name (e.g. 'Disease_mondo_2021-01-29.csv' -> 'Disease.csv').
# A raw string is used for the pattern so that '\.' is not an invalid escape sequence.
node_metadata_filename = re.split(r'\.|_', node_filename)[0] + '.csv'

# Split '<source>-<relationship>-<target>...' on the first two dashes only,
# because the relationship name and the trailing tag may contain '_' or '-'
source_node, relationship, target_part = relationship_filename.split('-', 2)
# remove any extra tag separated by underscore and/or the file extension, so a
# name without a tag does not end up as '<target>.csv.csv'
target_node = re.split(r'\.|_', target_part)[0]
relationship_metadata_filename = f'{source_node}-{relationship}-{target_node}.csv'

node_metadata.to_csv(os.path.join(NODE_METADATA, node_metadata_filename), index=False)
relationship_metadata.to_csv(os.path.join(RELATIONSHIP_METADATA, relationship_metadata_filename), index=False)