# Biomedical Knowledge Graph Example Notebook

# [1] Use UMLS to Search for Concept CUIs

In [None]:
"""UMLS API."""
# Add API key

import lxml.html as lh
from lxml.html import fromstring
import requests
import json

# Host that Authentication.gettgt sends the API key to.
uri="https://utslogin.nlm.nih.gov"
# Path appended to `uri` to form the URL of that request.
auth_endpoint = "/cas/v1/api-key"

class Authentication:
    """Two-step ticket client for the UTS login service.

    ``gettgt`` exchanges the API key for a ticket URL; ``getst`` then
    POSTs to that URL to obtain a service ticket.
    """

    def __init__(self, apikey):
        """Store the API key and the service name sent when requesting tickets."""
        self.service = "http://umlsks.nlm.nih.gov"
        self.apikey = apikey

    @staticmethod
    def _form_headers():
        """Return a fresh copy of the headers sent with both POST requests."""
        return {
            "Content-type": "application/x-www-form-urlencoded",
            "Accept": "text/plain",
            "User-Agent": "python",
        }

    def gettgt(self):
        """POST the API key and return the ticket URL found in the reply.

        The reply is parsed as HTML; the value returned is the ``action``
        attribute of the first form in it. An IndexError is raised when the
        reply contains no form.
        """
        reply = requests.post(
            uri + auth_endpoint,
            data={'apikey': self.apikey},
            headers=self._form_headers(),
        )
        document = fromstring(reply.text)
        # The action attribute is the complete URL that getst posts to, similar to
        # https://utslogin.nlm.nih.gov/cas/v1/tickets/TGT-36471-aYqNLN2rFIJPXKzxwdTNC5ZT7z3B3cTAKfSc5ndHQcUxeaDOLN-cas
        return document.xpath('//form/@action')[0]

    def getst(self, tgt):
        """POST the service name to the ``tgt`` URL and return the response text."""
        reply = requests.post(
            tgt,
            data={'service': self.service},
            headers=self._form_headers(),
        )
        return reply.text
    

def get_cuis_for_concept(concept_name, page_size=1000, return_names=False):
    '''
    Search UMLS for a concept and return the identifiers of the hits.

    concept_name: text sent as the ``string`` search parameter.
    page_size: value sent as the ``pageSize`` parameter.
    return_names: when true each entry is ``[ui, name]``; otherwise each
        entry is just the ``ui`` value.

    Returns a list built from ``result.results`` of the JSON response.
    '''
    API_KEY = '' # insert API key here
    VERSION = 'current'

    auth_client = Authentication(API_KEY)
    ticket_url = auth_client.gettgt()

    search_url = "https://uts-ws.nlm.nih.gov/rest/" + f'search/{VERSION}'
    search_params = {
        'ticket': auth_client.getst(ticket_url),
        'string': concept_name,
        'pageSize': page_size,
    }

    response = requests.get(url=search_url, params=search_params)
    response.encoding = 'utf-8'
    hits = json.loads(response.text)['result']['results']

    if return_names:
        return [[hit['ui'], hit['name']] for hit in hits]
    return [hit['ui'] for hit in hits]


def get_concept_for_cui(concept_cui, page_size=1):
    '''
    Get the concept name for a CUI.

    concept_cui: identifier placed in the request path (and also sent as the
        ``CUI`` query parameter).
    page_size: value sent as the ``pageSize`` query parameter.

    Returns ``result.name`` from the JSON response, or the string
    "NAME_NOT_FOUND" when the response has no such field.
    '''

    API_KEY = '' # insert API key here
    VERSION = 'current'
    AuthClient = Authentication(API_KEY)

    tgt = AuthClient.gettgt()
    url = "https://uts-ws.nlm.nih.gov/rest/"

    # No leading slash here: `url` already ends with one, and a leading slash
    # would produce ".../rest//content/...".
    content_endpoint = f"content/{VERSION}/CUI/{concept_cui}"

    query = {'ticket':AuthClient.getst(tgt), 'CUI':concept_cui, 'pageSize':page_size}

    r = requests.get(url=url+content_endpoint, params=query)
    r.encoding = 'utf-8'
    items = json.loads(r.text)

    try:
        name = items["result"]["name"]
    except (KeyError, TypeError):
        # Missing field or an unexpected payload shape; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        name = "NAME_NOT_FOUND"

    return name

In [None]:
"""Search for CUIs relating to concept."""
# Edit saved variables and run cell

# Free-text query handed to get_cuis_for_concept.
search_term = "cardiovascular disease" # Edit
# Passed as page_size, i.e. the pageSize value sent with the search request.
num_results = 5 # Edit

# return_names=True makes each entry a [ui, name] pair instead of a bare ui.
get_cuis_for_concept(search_term, page_size=num_results, return_names=True)

In [None]:
"""Save search parameters."""
# Edit every line of this cell

# Target node identifiers; the later cells loop over this list.
targets = ['C0002395', 'C0020676']

# Node-type filter applied to the fan-in results when collecting source nodes.
sn_types = ['DSYN', 'AAPP'] # leave empty for all sn types
# Passed as `depth` to semnet_graph._fan_in.
sn_search_depth = 1
# Passed as the last positional argument to mean_hetesim_scores.
hetesim_metapath_length = 2

# Controls whether the per-target result tables are written to CSV.
output_toggle = True
# Controls CSV output in the combined-ranking cell.
joint_output_toggle = True
# When non-empty, appended to the output file names.
output_identifier = ''
# NOTE(review): not read by any cell visible in this notebook — confirm it is still needed.
destination = '' # enter email here

# epsilon and r are passed, in that order, to approximate_mean_hetesim_scores.
epsilon = 0.05 # error tolerance
r = 0.95 # probability of achieving error tolerance

# [2] Run SemNet

In [None]:
"""Import semnet and other necessary libraries."""
# Run cell

from semnet import offline, offline_hetesim, randomized_hetesim
import pandas as pd

In [None]:
"""Load SemNet data in pandas dataframe."""
# Run cell (may return warning, this is okay)

# Load in semnet database (should take no more than 20 seconds)
# NOTE: the path is absolute and machine-specific; index_col=0 uses the first CSV column as the row index.
semnet_df = pd.read_csv("/mitchell/semnet_related_data/edges_updated.csv", index_col=0)

In [None]:
"""Convert dataframe to dictionary."""
# Run cell. Should take ~100s.

# orient='records' yields one dict per row (column name -> value); this list is later passed to offline.HetGraph.
edgelist = semnet_df.to_dict(orient='records')

In [None]:
"""Define SemNet relationships."""
# Run cell

# Maps each relation name to the name of its inverse relation; symmetric
# relations map to themselves. Passed to offline.HetGraph together with the
# edge list (presumably to label reverse edges — confirm against HetGraph).
#
# Two values were corrected to match the naming used by every other entry:
#   'STIMULATED BY'      -> 'STIMULATED_BY'        (space instead of underscore)
#   'NEG_HAS_MEASURMENT' -> 'NEG_HAS_MEASUREMENT'  (misspelling)
rel2inv = {
    'PHYSICALLY_RELATED_TO': 'PHYSICALLY_RELATED_TO',
    'PART_OF': 'HAS_PART',
    'CONTAINS': 'CONTAINED_IN',
    'LOCATION_OF': 'HAS_LOCATION',
    'TEMPORALLY_RELATED_TO': 'TEMPORALLY_RELATED_TO',
    'CO-OCCURS_WITH': 'CO-OCCURS_WITH',
    'PRECEDES': 'FOLLOWS',
    'FUNCTIONALLY_RELATED_TO': 'FUNCTIONALLY_RELATED_TO',
    'PROCESS_OF': 'HAS_PROCESS',
    'CARRIES_OUT': 'CARRIED_OUT_BY',
    'INTERACTS_WITH': 'INTERACTS_WITH',
    'PRACTICES': 'PRACTICED_BY',
    'PRODUCES': 'PRODUCED_BY',
    'EXHIBITS': 'EXHIBITED_BY',
    'DISRUPTS': 'DISRUPTED_BY',
    'CAUSES': 'CAUSED_BY',
    'PREVENTS': 'PREVENTED_BY',
    'COMPLICATES': 'COMPLICATED_BY',
    'MANIFESTATION_OF': 'HAS_MANIFESTATION',
    'AFFECTS': 'AFFECTED_BY',
    'OCCURS_IN': 'HAS_OCCURRENCE',
    'MANAGES': 'MANAGED_BY',
    'TREATS': 'TREATED_BY',
    'USES': 'USED_BY',
    'INDICATES': 'INDICATED_BY',
    'RESULT_OF': 'HAS_RESULT',
    'CONCEPTUALLY_RELATED_TO': 'CONCEPTUALLY_RELATED_TO',
    'PROPERTY_OF': 'HAS_PROPERTY',
    'CONCEPTUAL_PART_OF': 'HAS_CONCEPTUAL_PART',
    'EVALUATION_OF': 'HAS_EVALUATION',
    'MEASURES': 'MEASURED_BY',
    'DIAGNOSES': 'DIAGNOSED_BY',
    'ASSESSES_EFFECT_OF': 'ASSESSED_FOR_EFFECT_BY',
    'ISSUE_IN': 'HAS_ISSUE',
    'ASSOCIATED_WITH': 'ASSOCIATED_WITH',
    'CONSISTS_OF': 'CONSTITUTES',
    'ADJACENT_TO': 'ADJACENT_TO',
    'CONNECTED_TO': 'CONNECTED_TO',
    'INTERCONNECTS': 'INTERCONNECTED_BY',
    'SURROUNDS': 'SURROUNDED_BY',
    'TRAVERSES': 'TRAVERSED_BY',
    'DERIVATIVE_OF': 'HAS_DERIVATIVE',
    'DEVELOPMENTAL_FORM_OF': 'HAS_DEVELOPMENTAL_FORM',
    'DEGREE_OF': 'HAS_DEGREE',
    'MEASUREMENT_OF': 'HAS_MEASUREMENT',
    'METHOD_OF': 'HAS_METHOD',
    'ISA': 'INVERSE_ISA',
    'BRINGS_ABOUT': 'BROUGHT_ABOUT_BY',
    'PERFORMS': 'PERFORMED_BY',
    'SPATIALLY_RELATED_TO': 'SPATIALLY_RELATED_TO',
    'ANALYZES': 'ANALYZED_BY',
    'BRANCH_OF': 'HAS_BRANCH',
    'TRIBUTARY_OF': 'HAS_TRIBUTARY',
    'INGREDIENT_OF': 'HAS_INGREDIENT',
    'COMPARED_WITH': 'COMPARED_WITH',
    'INHIBITS': 'INHIBITED_BY',
    'STIMULATES': 'STIMULATED_BY',
    'CONVERTS_TO': 'CONVERTS_FROM',
    'NEG_ASSOCIATED_WITH': 'NEG_ASSOCIATED_WITH',
    'COEXISTS_WITH': 'COEXISTS_WITH',
    'NEG_CAUSES': 'NEG_CAUSED_BY',
    'PREDISPOSES': 'PREDISPOSED_BY',
    'HIGHER_THAN': 'LOWER_THAN',
    'LOWER_THAN': 'HIGHER_THAN',
    'NEG_TREATS': 'NEG_TREATED_BY',
    'AUGMENTS': 'AUGMENTED_BY',
    'ADMINISTERED_TO': 'ADMINISTERED_BY',
    'NEG_PROCESS_OF': 'NEG_HAS_PROCESS',
    'NEG_STIMULATES': 'NEG_STIMULATED_BY',
    'NEG_PART_OF': 'NEG_HAS_PART',
    'NEG_AFFECTS': 'NEG_AFFECTED_BY',
    'NEG_ADMINISTERED_TO': 'NEG_ADMINISTERED_BY',
    'NEG_PRODUCES': 'NEG_PRODUCED_BY',
    'NEG_COEXISTS_WITH': 'NEG_COEXISTS_WITH',
    'NEG_INTERACTS_WITH': 'NEG_INTERACTS_WITH',
    'NEG_AUGMENTS': 'NEG_AUGMENTED_BY',
    'NEG_LOCATION_OF': 'NEG_HAS_LOCATION',
    # NOTE(review): 'ISA' maps to 'INVERSE_ISA' but 'NEG_ISA' maps to itself;
    # left unchanged — confirm whether 'NEG_INVERSE_ISA' was intended.
    'NEG_ISA': 'NEG_ISA',
    'SAME_AS': 'SAME_AS',
    'NEG_INHIBITS': 'NEG_INHIBITED_BY',
    'NEG_DISRUPTS': 'NEG_DISRUPTED_BY',
    'NEG_USES': 'NEG_USED_BY',
    'NEG_MEASURES': 'NEG_MEASURED_BY',
    'NEG_PREDISPOSES': 'NEG_PREDISPOSED_BY',
    'NEG_PREVENTS': 'NEG_PREVENTED_BY',
    'NEG_OCCURS_IN': 'NEG_HAS_OCCURRENCE',
    'NEG_DIAGNOSES': 'NEG_DIAGNOSED_BY',
    'NEG_METHOD_OF': 'NEG_HAS_METHOD',
    'NEG_HIGHER_THAN': 'NEG_LOWER_THAN',
    'NEG_SAME_AS': 'NEG_SAME_AS',
    'NEG_PRECEDES': 'NEG_PRECEDED_BY',
    'NEG_CONVERTS_TO': 'NEG_CONVERTS_FROM',
    'NEG_MANIFESTATION_OF': 'NEG_HAS_MANIFESTATION',
    'NEG_COMPLICATES': 'NEG_COMPLICATED_BY',
    'NEG_LOWER_THAN': 'NEG_HIGHER_THAN',
    'NEG_MEASUREMENT_OF': 'NEG_HAS_MEASUREMENT',
}

In [None]:
"""Load SemNet dictionary into HetGraph object."""
# Run cell. Should take ~4min.

# Build the graph object from the per-row edge dicts and the relation -> inverse-relation map.
semnet_graph = offline.HetGraph(edgelist, rel2inv)

In [None]:
"""Find source nodes related to target node(s)."""
# Run cell

# With no configured type filter every node type is accepted.
accept_all_types = len(sn_types) == 0

# One candidate set per target, in the same order as `targets`.
sn_set_list = []

for target in targets:
    candidates = set()

    for nodes_by_type, _path in semnet_graph._fan_in(target, depth=sn_search_depth):
        for node_type in nodes_by_type:
            if not (accept_all_types or node_type in sn_types):
                continue
            candidates.update(nodes_by_type[node_type])

    sn_set_list.append(candidates)

# Keep only the nodes that appear in every target's candidate set.
sn_list = list(set.intersection(*sn_set_list))

In [None]:
"""Check number of source nodes."""
# Run cell

len(sn_list)

In [None]:
%%time

"""Calculate HeteSim scores."""
# Run cell. Will take a while.

# One result table per target, in the same order as `targets`.
results_df_list = []

for target in targets:
    result_dict = offline_hetesim.mean_hetesim_scores(semnet_graph, sn_list, target, hetesim_metapath_length)
    # Highest score first, so the default 0..n-1 row index follows score order
    # (the combined-ranking cell uses that index as the rank).
    ranked_items = sorted(result_dict.items(), key=lambda item: item[1], reverse=True)

    df = pd.DataFrame(ranked_items, columns=['source_node', 'hetesim_score'])
    df['target_node'] = target

    # map() creates the column even when there are no rows; the previous
    # row-by-row assignment left it missing for an empty result, which made
    # the column selection below raise KeyError.
    df['source_name'] = df['source_node'].map(get_concept_for_cui)
    df['target_name'] = get_concept_for_cui(target)
    df = df[['source_node', 'source_name', 'target_node', 'target_name', 'hetesim_score']]

    results_df_list.append(df)

    if output_toggle:
        # The identifier is appended to the file name only when one was given.
        suffix = f"_{output_identifier}" if output_identifier != '' else ''
        output_fn = f"SemNet_results_target={target}{suffix}.csv"
        df.to_csv(output_fn)

In [None]:
results_df_list[0]

In [None]:
%%time

"""Calculate approximate mean HeteSim scores."""
# Run cell. Will take a while.

# One result table per target, in the same order as `targets`.
# Rebinding the name replaces the tables produced by the exact HeteSim cell.
results_df_list = []

for target in targets:
    result_dict = offline_hetesim.approximate_mean_hetesim_scores(semnet_graph, sn_list, target, hetesim_metapath_length, epsilon, r)
    # Highest score first, so the default 0..n-1 row index follows score order
    # (the combined-ranking cell uses that index as the rank).
    ranked_items = sorted(result_dict.items(), key=lambda item: item[1], reverse=True)

    df = pd.DataFrame(ranked_items, columns=['source_node', 'approximate_mean_hetesim_score'])
    df['target_node'] = target

    # map() creates the column even when there are no rows; the previous
    # row-by-row assignment left it missing for an empty result, which made
    # the column selection below raise KeyError.
    df['source_name'] = df['source_node'].map(get_concept_for_cui)
    df['target_name'] = get_concept_for_cui(target)
    df = df[['source_node', 'source_name', 'target_node', 'target_name', 'approximate_mean_hetesim_score']]

    results_df_list.append(df)

    if output_toggle:
        # The identifier is appended to the file name only when one was given.
        suffix = f"_{output_identifier}" if output_identifier != '' else ''
        output_fn = f"approximate_mean_SemNet_results_target={target}{suffix}.csv"
        df.to_csv(output_fn)

In [None]:
results_df_list[0]

In [None]:
"""Create combined ranking dataframe."""
# Run cell.

# Each result table is already sorted best-first with a 0..n-1 index, so the
# index is used as that table's rank for every source node.
ranking_frames = []
for idx, result_df in enumerate(results_df_list):
    result_df['ranking'] = result_df.index
    # Give every table its own rank column name so that repeated merges never
    # collide on 'ranking' (the default _x/_y suffixes clash beyond three tables).
    ranking_frames.append(
        result_df[['source_node', 'ranking']].rename(columns={'ranking': f'ranking_{idx}'})
    )

# Inner-join on source_node: only nodes ranked for every target survive.
joint_df = ranking_frames[0].copy()
for frame in ranking_frames[1:]:
    joint_df = pd.merge(joint_df, frame, on="source_node")

# Average the rank columns only; including the string column 'source_node'
# makes DataFrame.mean raise on current pandas versions.
ranking_columns = [col for col in joint_df.columns if col != 'source_node']
joint_df['mean_ranking'] = joint_df[ranking_columns].mean(axis=1)
joint_df = joint_df.sort_values(by=['mean_ranking'])
joint_df = joint_df[['source_node']]

targets_string = '_'.join(targets)

if joint_output_toggle:
    # Write the combined ranking itself. The previous code wrote the last
    # per-target table (`df`) under the joint file name, labelled with only the
    # last target, and then wrote it a second time under `output_toggle`.
    suffix = f"_{output_identifier}" if output_identifier != '' else ''
    output_fn = f"SemNet_joint_results_target={targets_string}{suffix}.csv"
    joint_df.to_csv(output_fn)

In [None]:
joint_df