## Create literature-based graph for green tea from INDRA/REACH triples

### Steps for processing:
1. Map subjects, objects, and relations to OBO ontologies.
2. Add prefixes for all CURIEs.
3. Save triples as NetworkX graph.
4. Save triples as ntriples graph.

In [None]:
import pandas as pd
from rdflib.namespace import OWL, RDF, RDFS
import numpy as np

In [None]:
# Load the raw green tea predications (tab-separated) and print a summary of the columns.
dfgt = pd.read_csv('../resources/predication_files/greentea_all_predicates_INDRA.tsv', sep='\t')
dfgt.info()

In [None]:
# Preview the first rows of the raw table.
dfgt.head()

In [None]:
#filter columns
# .copy() makes df an independent frame. Later cells add columns to df, and
# assigning into a plain column selection of dfgt triggers pandas'
# SettingWithCopyWarning (the write may land on a temporary copy).
df = dfgt[['subject_cui', 'subject_name', 'subject_source', 'predicate', 'object_source',
           'object_cui', 'object_name', 'subj_reach_grounding', 'obj_reach_grounding', 'pmid',
           'pub_year', 'belief', 'sentence']].copy()

In [None]:
#add columns for OBO mappings
# Created empty (None) here, in this order; the mapping cells fill them in.
for obo_column in ('predicate_obo', 'subject_obo', 'object_obo'):
    df[obo_column] = None

In [None]:
# Preview the filtered table with the new, still empty, *_obo columns.
df.head()

In [None]:
## Mapping file for UMLS to GO and HPO
# Read with the default comma separator; the next cell uses its 'cui' and 'code' columns.
# NOTE(review): this path is relative to the working directory, unlike the
# '../resources/...' path used for the predications — confirm it resolves.
go_hpo_map = pd.read_csv('cui_to_ontology_maps/go_hpo_map.csv')

In [None]:
# Group the codes of the mapping file by CUI: {cui: [code, ...]},
# with every ':' in a code replaced by '_'.
go_hpo_mapping_dict = {}
for cui, code in zip(go_hpo_map['cui'], go_hpo_map['code']):
    go_hpo_mapping_dict.setdefault(cui, []).append(code.replace(':', '_'))
# Number of distinct CUIs that have at least one code.
print(len(go_hpo_mapping_dict))

In [None]:
# Replace missing values with '' so the substring tests on the grounding
# columns in the next cell operate on strings rather than NaN.
df = df.fillna('')

In [None]:
def _reach_grounding_to_obo(grounding):
    """Convert one REACH grounding string into an OBO-style identifier.

    The identifier is the last single-quoted token of the string. For
    groundings mentioning 'CHEBI' or 'GO' every ':' becomes '_'; for groundings
    mentioning 'UP' (presumably UniProt — confirm) the token gets a 'PR_' prefix.

    NOTE(review): the source is detected by substring and the *last* quoted
    token is used, so a grounding listing several namespaces may pick an
    identifier from a different namespace — verify against the data.

    Returns:
        The identifier string, or None when the grounding is empty, not a
        string, has no quoted token, or names none of the sources above.
    """
    if not isinstance(grounding, str) or not grounding:
        return None
    tokens = grounding.split("'")
    if len(tokens) < 2:
        # No quoted value to extract; the unguarded [-2] would raise IndexError.
        return None
    identifier = tokens[-2]
    if 'CHEBI' in grounding or 'GO' in grounding:
        return identifier.replace(':', '_')
    if 'UP' in grounding:
        return 'PR_' + identifier
    return None


# Collect, per CUI, the OBO identifiers derived from the REACH groundings:
#   {cui: [identifier, ...]} (every CUI gets an entry, possibly an empty list).
reach_grounding_dict = {}
grounding_rows = zip(df['subject_cui'], df['subj_reach_grounding'],
                     df['object_cui'], df['obj_reach_grounding'])
for sub_cui, sub_ground, obj_cui, obj_ground in grounding_rows:
    for cui, grounding in ((sub_cui, sub_ground), (obj_cui, obj_ground)):
        # setdefault keeps identifiers gathered from earlier rows; resetting the
        # subject list on every row discarded them.
        identifiers = reach_grounding_dict.setdefault(cui, [])
        identifier = _reach_grounding_to_obo(grounding)
        if identifier is not None and identifier not in identifiers:
            identifiers.append(identifier)
print(len(reach_grounding_dict))

In [None]:
# Lookup from lower-cased predicate name to an OBO identifier (RO_* or GO_*),
# used by relation_mapping. Predicates missing from this table map to ''.
# Several modification types share the same identifier, RO_0002436.
predMapD = {
    'regulateactivity':'RO_0011002',
    'regulateamount':'RO_0011003',
    'phosphorylation':'RO_0002447',
    'dephosphorylation':'GO_0006470',
    'ubiquitination':'RO_0002480',
    'deubiquitination':'GO_0016579',
    'sumoylation':'RO_0002436',
    'desumoylation':'RO_0002436',
    'hydroxylation':'GO_0018126',
    'dehydroxylation':'RO_0002436',
    'acetylation':'GO_0006473',
    'deacetylation':'GO_0006476',
    'glycosylation':'GO_0006486',
    'deglycosylation':'GO_0006517',
    'farnesylation':'RO_0002436',
    'defarnesylation':'RO_0002436',
    'geranylgeranylation':'RO_0002436',
    'degeranylgeranylation':'RO_0002436',
    'palmitoylation':'RO_0002436',
    'depalmitoylation':'RO_0002436',
    'myristoylation':'RO_0002436',
    'demyristoylation':'RO_0002436',
    'ribosylation':'RO_0002436',
    'deribosylation':'RO_0002436',
    'methylation':'GO_0006479',
    'demethylation':'GO_0006482',
    'activation':'RO_0002448',
    'inhibition':'RO_0002449',
    'increaseamount':'RO_0011009',
    'decreaseamount':'RO_0011010'
}

In [None]:
#if reach_grounding has multiple values, currently only first one is taken
def get_obo_mapping(row, col):
    """Look up an OBO identifier for the subject or object CUI of a row.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with 'subject_cui' and
            'object_cui' entries.
        col: 'subject' or 'object', selecting which CUI to map.

    Returns:
        The first code listed for the CUI in go_hpo_mapping_dict; failing that,
        the first code listed in reach_grounding_dict; otherwise ''.

    Raises:
        ValueError: If col is neither 'subject' nor 'object'.
    """
    if col not in ('subject', 'object'):
        # Raise rather than exit(0): exiting would stop the whole session and
        # report success for what is a caller error.
        raise ValueError(
            "col must be 'subject' or 'object', got {!r}".format(col))
    cui = row[col + '_cui']
    # GO/HPO map takes precedence over the REACH groundings.
    for mapping_dict in (go_hpo_mapping_dict, reach_grounding_dict):
        # Test the CUI's own list (not the whole dict) so an empty list falls
        # through to the next source instead of raising IndexError.
        codes = mapping_dict.get(cui)
        if codes:
            return codes[0]
    return ''

In [None]:
def relation_mapping(row):
    """Return the OBO identifier for the row's predicate, or '' if unmapped.

    The value of row['predicate'] is lower-cased before the lookup in predMapD.
    """
    predicate_name = row['predicate'].lower()
    return predMapD.get(predicate_name, '')

In [None]:
# Map each row's predicate to its OBO identifier ('' when the predicate is not in predMapD).
df['predicate_obo'] = df.apply(relation_mapping, axis=1)

In [None]:
# First-pass subject mapping from the GO/HPO map and the REACH groundings ('' when none is found).
df['subject_obo'] = df.apply(get_obo_mapping, axis=1, col='subject')

In [None]:
# First-pass object mapping from the GO/HPO map and the REACH groundings ('' when none is found).
df['object_obo'] = df.apply(get_obo_mapping, axis=1, col='object')

In [None]:
# Print the column summary after the first mapping pass.
df.info()

In [None]:
#load ontorunner mappings and map CUIs

In [None]:
import pickle
# Load the pickled CUI-to-OBO lookup.
# NOTE: unpickling can execute arbitrary code — only load this file from a trusted source.
with open('cui_to_ontology_maps/CUItoOBO_20220505.pickle', 'rb') as filep:
    onto_dict = pickle.load(filep)
# Number of entries in the lookup.
len(onto_dict)

In [None]:
#get ontorunner mappings and add to df
def get_obo_onto_mapping(row, col):
    """Fill a still-empty OBO mapping from onto_dict, keyed by CUI.

    Args:
        row: Mapping-like row with '<col>_cui' and '<col>_obo' entries.
        col: 'subject' or 'object', selecting which pair of entries to use.

    Returns:
        The existing mapping when it is not ''. Otherwise the onto_dict entry
        for the CUI, or None when the CUI is not in onto_dict.

    Raises:
        ValueError: If col is neither 'subject' nor 'object'.
    """
    if col not in ('subject', 'object'):
        # Raise rather than exit(0): exiting would stop the whole session and
        # report success for what is a caller error.
        raise ValueError(
            "col must be 'subject' or 'object', got {!r}".format(col))
    cui = row[col + '_cui']
    mapping = row[col + '_obo']
    if mapping != '':
        return mapping
    # .get() returns None for an unknown CUI, which is what the implicit
    # fall-through returned before; add_prefix turns None into ''.
    return onto_dict.get(cui)

In [None]:
# Fill subject mappings that are still '' from onto_dict; CUIs without an entry yield None.
df['subject_obo'] = df.apply(get_obo_onto_mapping, axis=1, col='subject')
df.head()

In [None]:
# Fill object mappings that are still '' from onto_dict; CUIs without an entry yield None.
df['object_obo'] = df.apply(get_obo_onto_mapping, axis=1, col='object')
df.head()

In [None]:
def add_prefix(row, col):
    """Turn the OBO mapping stored in a row into a full URI.

    Args:
        row: Mapping-like row with 'predicate_obo', 'subject_obo' and
            'object_obo' entries.
        col: 'predicate', 'subject' or 'object', selecting the entry to expand.

    Returns:
        For 'predicate': the OBO base URI followed by the stored identifier.
        For 'subject'/'object': '' when the mapping is None or an empty list;
        otherwise the mapping (its first element if it is a list) with the
        characters '[', ']', '(' and ')' removed, prefixed with the OBO base
        URI unless it already contains 'http'.

    Raises:
        ValueError: If col is not one of the three accepted values.
    """
    obo_prefix = 'http://purl.obolibrary.org/obo/'
    if col not in ('predicate', 'subject', 'object'):
        # Fail loudly; silently returning None would fill the column with None.
        raise ValueError(
            "col must be 'predicate', 'subject' or 'object', got {!r}".format(col))
    value = row[col + '_obo']
    if col == 'predicate':
        return obo_prefix + value
    if isinstance(value, list):
        # Only the first candidate is used. An empty list means "no mapping";
        # letting it through would fail on the string operations below.
        value = value[0] if value else None
    if value is None:
        return ''
    # Strip list/tuple punctuation left over in the stored identifier.
    value = value.translate(str.maketrans('', '', '[]()'))
    if 'http' in value:
        return value
    return obo_prefix + value

In [None]:
#add the OBO URI prefix to the OBO mappings (where not already a full URI) - see df
#no rows are dropped in this cell; it only rewrites the three *_obo columns
# NOTE(review): a predicate that mapped to '' becomes the bare OBO prefix here, and the
# filter cells further down only test the subject/object columns — confirm that is intended.
df['subject_obo'] = df.apply(add_prefix, axis=1, col='subject')
df['object_obo'] = df.apply(add_prefix, axis=1, col='object')
df['predicate_obo'] = df.apply(add_prefix, axis=1, col='predicate')

In [None]:
##fix napdi identifiers
# Any subject/object URI containing 'napdi' has the OBO base URI swapped for
# 'http://napdi.org/'; every other value is left as it is.
for uri_column in ('subject_obo', 'object_obo'):
    df[uri_column] = [
        uri.replace('http://purl.obolibrary.org/obo/', 'http://napdi.org/')
        if 'napdi' in uri else uri
        for uri in df[uri_column]
    ]

In [None]:
# Base URI of the OBO PURLs; the filter cells compare against it to detect
# values that consist of the prefix alone.
obo_prefix = 'http://purl.obolibrary.org/obo/'

In [None]:
#remove all rows with blank obo mappings and only prefix rows
# Subject side: drop rows whose subject URI is '' or just the bare prefix.
unmapped_subject = df['subject_obo'].isin(['', obo_prefix])
df_new = df[~unmapped_subject]
df_new.info()

In [None]:
# Object side: drop rows whose object URI is '' or just the bare prefix.
unmapped_object = df_new['object_obo'].isin(['', obo_prefix])
df_new = df_new[~unmapped_object]
df_new.info()

In [None]:
# Drop rows that are identical across all columns.
df_new = df_new.drop_duplicates()
df_new.head()

In [None]:
# Renumber the rows 0..n-1 (old index discarded) so the df_new.at[i, ...]
# lookups over range(len(df_new.index)) in later cells match the index labels.
df_new = df_new.reset_index(drop=True)
df_new.head()

In [None]:
# Save the processed predications (tab-separated, index column omitted).
df_new.to_csv('../resources/predication_files/greentea_all_predicates_INDRA_processed.tsv', sep='\t', index=False)

In [None]:
#Create networkx graph from triples
import glob
import hashlib
import json
import networkx as nx  # type: ignore
import os
import os.path

from collections import Counter  # type: ignore
from more_itertools import unique_everseen  # type: ignore
from rdflib import BNode, Graph, Literal, Namespace, URIRef  # type: ignore
from rdflib.namespace import OWL, RDF, RDFS  # type: ignore
from rdflib.plugins.serializers.nt import _quoteLiteral  # type: ignore
import subprocess

from tqdm import tqdm  # type: ignore
from typing import Dict, List, Optional, Set, Tuple, Union

# rdflib Namespace objects for the URI bases used in this project
# NOTE(review): none of these four names is referenced in the cells visible here —
# check the rest of the notebook before removing them.
obo = Namespace('http://purl.obolibrary.org/obo/')
oboinowl = Namespace('http://www.geneontology.org/formats/oboInOwl#')
schema = Namespace('http://www.w3.org/2001/XMLSchema#')
napdi = Namespace('http://napdi.org/napdi_srs_imports:')

In [None]:
# Keep only the three URI columns that make up each triple.
dfres = df_new[['subject_obo', 'predicate_obo', 'object_obo']]
dfres.head()

In [None]:
# Save the bare triples (tab-separated, index column omitted).
dfres.to_csv('../resources/predication_files/greentea_all_predicates_INDRA_processed_triples.tsv', sep='\t', index=False)

In [None]:
#create rdflib graph from dataframe triples and serialize as ntriples file
# Every row contributes its (subject, predicate, object) triple plus one
# rdfs:label triple each for the subject and the object.
graph = Graph()
pred_label = URIRef("http://www.w3.org/2000/01/rdf-schema#label")
triple_rows = zip(df_new['subject_obo'], df_new['predicate_obo'], df_new['object_obo'],
                  df_new['subject_name'], df_new['object_name'])
for subj, pred, obj, subj_name, obj_name in triple_rows:
    subj_node, predicate, obj_node = URIRef(subj), URIRef(pred), URIRef(obj)
    graph.add((subj_node, predicate, obj_node))
    graph.add((subj_node, pred_label, Literal(subj_name)))
    graph.add((obj_node, pred_label, Literal(obj_name)))

In [None]:
# Write the graph in N-Triples format.
# NOTE(review): assumes the output_graphs/ directory already exists — confirm.
graph.serialize('output_graphs/machineread_greentea_INDRA.nt', format='nt')

In [None]:
def n3(node: Union[URIRef, BNode, Literal]) -> str:
    """Serialize an RDFLib node as a string for use in N-Triples output.

    Literals are passed through rdflib's ``_quoteLiteral``; every other node
    uses its own ``n3()`` method.
    Src: https://github.com/RDFLib/rdflib/blob/c11f7b503b50b7c3cdeec0f36261fa09b0615380/rdflib/plugins/serializers/nt.py

    Args:
        node: An RDFLib URIRef, BNode or Literal.

    Returns:
        The serialized node as a string.
    """
    if isinstance(node, Literal):
        return str(_quoteLiteral(node))
    return str(node.n3())

In [None]:
# Peek at one triple: keep the string forms of the first (s, p, o) the graph yields, then stop.
for s,p,o in tqdm(graph):
    x = str(s)
    y = str(p)
    z = str(o)
    break  # only the first triple is inspected

In [None]:
#convert rdflib graph to multidigraph - code borrowed from PheKnowLator: kg_utils.py
#use the pred key to also create a dictionary with metadata about the edge -
#pub_year, pmid, source graph, belief
import pickle

# Index the edge metadata once by (subject, predicate, object) URI strings.
# setdefault keeps the FIRST row for a repeated triple, which is the row the
# per-triple `.values[0]` lookup selected; building the index up front avoids
# re-scanning the whole dataframe three times for every triple in the graph.
edge_metadata = {}
metadata_rows = zip(df_new['subject_obo'].values, df_new['predicate_obo'].values,
                    df_new['object_obo'].values, df_new['pmid'].values,
                    df_new['pub_year'].values, df_new['belief'].values)
for subj, pred, obj, pmid, pub_year, belief in metadata_rows:
    edge_metadata.setdefault((subj, pred, obj), (pmid, pub_year, belief))

nx_mdg = nx.MultiDiGraph()
for s, p, o in tqdm(graph):
    pred = str(p)
    #do not save label predicate to gpickle
    if pred == 'http://www.w3.org/2000/01/rdf-schema#label':
        continue
    # Hash of the serialized triple, stored on the edge as 'predicate_key'.
    pred_key = hashlib.md5('{}{}{}'.format(n3(s), n3(p), n3(o)).encode()).hexdigest()
    pmid, pub_year, belief_score = edge_metadata[(str(s), pred, str(o))]
    nx_mdg.add_node(s, key=n3(s))
    nx_mdg.add_node(o, key=n3(o))
    nx_mdg.add_edge(s, o, **{'key': p, 'predicate_key': pred_key, 'weight': 0.0,
                             'pmid': str(pmid), 'timestamp': str(pub_year),
                             'source_graph': 'machine_read', 'belief': belief_score})

# nx.write_gpickle was removed in NetworkX 3.0. It was a thin wrapper around
# pickle.dump with the highest protocol, so this writes the same kind of file
# and works on both old and new NetworkX versions.
with open("output_graphs/machineread_greentea_INDRA.gpickle", 'wb') as gpickle_file:
    pickle.dump(nx_mdg, gpickle_file, pickle.HIGHEST_PROTOCOL)

In [None]:
#save node labels as dictionary
#key: URI, value is label
# Each URI is recorded the first time it is seen (subject, then object, then
# predicate within a row). Nodes carry entity_type/label/cui; relations carry
# entity_type/label only.
label_dict = {}
label_rows = zip(
    df_new['subject_obo'], df_new['subject_name'], df_new['subject_cui'],
    df_new['object_obo'], df_new['object_name'], df_new['object_cui'],
    df_new['predicate_obo'], df_new['predicate'],
)
for subj, subj_name, subj_cui, obj, obj_name, obj_cui, pred, pred_name in label_rows:
    subj, obj, pred = str(subj), str(obj), str(pred)
    if subj not in label_dict:
        label_dict[subj] = {'entity_type': 'NODES', 'label': subj_name, 'cui': subj_cui}
    if obj not in label_dict:
        label_dict[obj] = {'entity_type': 'NODES', 'label': obj_name, 'cui': obj_cui}
    if pred not in label_dict:
        label_dict[pred] = {'entity_type': 'RELATIONS', 'label': pred_name}
len(label_dict)

In [None]:
import pickle
# Persist the URI -> label/metadata dictionary as a pickle file.
with open('output_graphs/machineread_greentea_INDRA_NodeLabels.pickle', 'wb') as file_p:
    pickle.dump(label_dict, file_p)

In [None]:
# One row per URI: the dictionary keys become the index, the inner-dict keys become columns.
dfmap = pd.DataFrame.from_dict(label_dict, orient='index')
dfmap.head()

In [None]:
# Move the URI index into a regular column (named 'index' by default).
dfmap = dfmap.reset_index()
dfmap.head()

In [None]:
# Give the URI column a descriptive name.
dfmap = dfmap.rename(columns={"index":"entity_uri"})
dfmap.head()

In [None]:
# Save the label table (tab-separated, index column omitted).
dfmap.to_csv('output_graphs/machineread_greentea_INDRA_NodeLabels.tsv', index=False, sep='\t')