## Create literature-based graph for green tea and kratom from SemRep output

### Steps for processing:
1. Combine all SemRep output files.
2. Fix source sentences with errors.
3. Extract publication year from date of triple.
4. Map subjects, objects and relations to OBO ontologies.
5. Save triples as NetworkX graph.
6. Save triples as an N-Triples (`.nt`) file.

In [None]:
import pandas as pd
import numpy as np
import pickle
from rdflib.namespace import OWL, RDF, RDFS
import re

In [None]:
import os

# Directory holding the SemRep predication exports (one TSV per file).
semrep_dir = '../resources/predication_files/semrep/'
# Columns the pipeline relies on; they are kept first in the combined frame,
# followed by any additional columns the TSV files provide.
expected_cols = ['index', 'pmid', 'subject_cui', 'subject_type', 'relation',
                 'object_cui', 'object_type', 'year', 'sentence']

# os.listdir returns names in arbitrary order; sort them so the row order
# (which drop_duplicates and the sentence repair below depend on) is reproducible.
files = sorted(os.listdir(semrep_dir))

# Read every TSV first and concatenate once, instead of growing `df` inside
# the loop (which re-copies all previously loaded rows on each iteration).
frames = []
for file in files:
    if file.endswith('tsv'):
        print('Loading file: ', file)
        df_temp = pd.read_csv(os.path.join(semrep_dir, file), sep='\t')
        df_temp.info()
        frames.append(df_temp)

if frames:
    df = pd.concat(frames, ignore_index=True)
    extra_cols = [c for c in df.columns if c not in expected_cols]
    df = df.reindex(columns=expected_cols + extra_cols)
else:
    # No TSV files found: fall back to an empty frame with the expected schema.
    df = pd.DataFrame(columns=expected_cols)
df.info()

In [None]:
df.head()

In [None]:
df = df.drop_duplicates(ignore_index=True)
df.info()

In [None]:
df = df.fillna('')

In [None]:
# The publication date format is not consistent, so keep only the first run of
# digits found in each 'year' cell. Cells are cast to str first so numeric
# values cannot break the regex; cells without any digits are left unchanged.
# Working on the whole column also removes the dependence on a 0..n-1 index.
year_digits = df['year'].astype(str).str.extract(r'(\d+)', expand=False)
df['year'] = year_digits.where(year_digits.notna(), df['year'])
df.head()

In [None]:
source_sent = df['sentence'].tolist()

In [None]:
len(source_sent)

In [None]:
##fix source sentence
# A sentence is unusable when it is empty or contains the '|||' marker.
# Each unusable entry is replaced by the closest preceding usable sentence;
# when no usable sentence precedes it, the stripped original text is kept.
# A single forward pass remembers the last usable sentence, which gives the
# same result as scanning backwards from every unusable entry but in O(n).
sentences = []
last_good = None
for sent in source_sent:
    sent = sent.strip()
    if sent == '' or '|||' in sent:
        sentences.append(sent if last_good is None else last_good)
    else:
        last_good = sent
        sentences.append(sent)

len(sentences)

In [None]:
sentences = pd.Series(sentences)

In [None]:
df['source_text'] = sentences
df.head()

In [None]:
df = df.drop(['sentence', 'index'], axis=1)

In [None]:
df.head()

In [None]:
## Predicates/relations to keep; all other SemRep output is filtered out.
preds = [
    'affects',
    'associated_with',
    'augments',
    'causes',
    'coexists_with',
    'complicates',
    'disrupts',
    'inhibits',
    'interacts_with',
    'part_of',
    'precedes',
    'predisposes',
    'prevents',
    'produces',
    'stimulates',
    'treats',
]

In [None]:
df['predicate'] = df['relation'].str.lower()
df.head()

In [None]:
df = df.drop(['relation'], axis=1)

In [None]:
# Keep only predications whose predicate is in `preds`. Take an explicit copy:
# new columns are assigned to `dfn` afterwards, and assigning to a boolean-mask
# slice of `df` triggers pandas' SettingWithCopyWarning.
dfn = df[df['predicate'].isin(preds)].copy()
dfn.info()

In [None]:
dfn['predicate'].value_counts()

In [None]:
dfn['subject_map'] = None
dfn['object_map'] = None
dfn['predicate_obo'] = None
dfn['subject_obo'] = None
dfn['object_obo'] = None

In [None]:
## Relation ontology mapping: lower-case SemRep predicate -> OBO relation identifier.
predMapSemRep = {
    'affects': 'RO_0002596',
    'associated_with': 'RO_0002610',
    'augments': 'RO_0002598',
    'causes': 'RO_0002566',
    'coexists_with': 'RO_0002490',
    'complicates': 'RO_0003309',
    'disrupts': 'RO_0002212',
    'inhibits': 'RO_0002449',
    'interacts_with': 'RO_0002434',
    'part_of': 'BFO_0000050',
    'precedes': 'BFO_0000063',
    'predisposes': 'RO_0003302',
    'prevents': 'RO_0002599',
    'produces': 'RO_0003000',
    'stimulates': 'RO_0002213',
    'treats': 'RO_0002606',
}

In [None]:
def relation_mapping(row):
    """Return the OBO relation identifier for the row's predicate.

    Args:
        row: Mapping-like row providing a string under 'predicate'.

    Returns:
        The identifier stored in ``predMapSemRep`` for the lower-cased
        predicate, or '' when the predicate has no entry.
    """
    return predMapSemRep.get(row['predicate'].lower(), '')

In [None]:
dfn['predicate_obo'] = dfn.apply(relation_mapping, axis=1)
dfn.head()

In [None]:
dfn = dfn.reset_index(drop=True)
dfn.info()

In [None]:
# Semantic types to filter out before mapping.
excluded_semtype = [
    'acty', 'bhvr', 'evnt', 'gora', 'mcha',
    'ocac',  # Occupational Activity
    'clas', 'cnce', 'ftcn', 'grpa', 'idcn', 'inpr', 'lang',
    'qlco', 'qnco', 'rnlw', 'spco', 'tmco',
    'enty', 'mnob', 'phob',
    'bmod', 'ocdi',
    'hcro', 'orgt', 'pros', 'shro',
    'eehu',
    'hcpp',
]

In [None]:
dfn1 = dfn[~dfn['subject_type'].isin(excluded_semtype)]
dfn1.info()

In [None]:
dfn2 = dfn1[~dfn1['object_type'].isin(excluded_semtype)]
dfn2.info()

In [None]:
##exclude all concepts that occur in SemMedDB GENERIC.CONCEPT table
#Get CSV file - https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR/SemMedDB_download.html -- doesn't work
#download from https://github.com/kilicogluh/lbd-covid/tree/master/preprocessing/conf
semmed = pd.read_csv('cui_to_ontology_maps/semmedVER43_2020_R_GENERIC_CONCEPT.csv', header=None)
semmed.info()

In [None]:
semmed = semmed.rename(columns={1: 'CUI', 2: 'concept_name'})
semmed.head()

In [None]:
semmed = semmed.drop([0], axis=1)
semmed.head()

In [None]:
#drop rows where subject and object matches generic concepts
generic_cui = semmed.CUI.tolist()
len(generic_cui)

In [None]:
dfn2.info()

In [None]:
dfn3 = dfn2[~dfn2['subject_cui'].isin(generic_cui)]
dfn3.info()

In [None]:
# Drop rows whose object is a generic concept. Take an explicit copy: the
# 'subject_obo' / 'object_obo' columns are assigned on `dfn4` afterwards, and
# assigning to a boolean-mask slice triggers pandas' SettingWithCopyWarning.
dfn4 = dfn3[~dfn3['object_cui'].isin(generic_cui)].copy()
dfn4.info()

In [None]:
#map from UMLS to GO, HPO
# Load the pre-built mapping used by the GO/HPO lookup (keyed by CUI; each
# entry is indexed with [0] when used).
# NOTE: unpickling can execute arbitrary code -- only load this file from a trusted source.
with open('cui_to_ontology_maps/go_hpo_map_dict.pickle', 'rb') as filep:
    go_hpo_mapping_dict = pickle.load(filep)
# Number of entries in the mapping.
len(go_hpo_mapping_dict)

In [None]:
def umls_go_hpo_map(row, col):
    """Look up the GO/HPO identifier for the subject or object CUI of a row.

    Args:
        row: Mapping-like row providing 'subject_cui' and 'object_cui'.
        col: 'subject' or 'object', selecting which CUI to map.

    Returns:
        The first element of the ``go_hpo_mapping_dict`` entry for the CUI,
        or None when the CUI has no entry or its entry is empty.

    Raises:
        ValueError: If ``col`` is neither 'subject' nor 'object'.
    """
    if col == 'subject':
        cui = row['subject_cui']
    elif col == 'object':
        cui = row['object_cui']
    else:
        # An invalid argument is a programming error; raise instead of
        # terminating the interpreter with a success exit code.
        raise ValueError('specify if subject or object mapping required')
    # Check the entry for this CUI (not the whole dictionary) before indexing,
    # so an empty entry yields None rather than an IndexError.
    mapped = go_hpo_mapping_dict.get(cui)
    if mapped:
        return mapped[0]
    return None

In [None]:
dfn4['subject_obo'] = dfn4.apply(umls_go_hpo_map, axis=1, col='subject')
dfn4.info()

In [None]:
dfn4['object_obo'] = dfn4.apply(umls_go_hpo_map, axis=1, col='object')
dfn4.info()

In [None]:
dfn4 = dfn4.reset_index(drop=True)

In [None]:
dfn4.head()

In [None]:
dfn_subset = dfn4

In [None]:
dfn_subset.head()

In [None]:
# Collect, in first-seen order, every CUI that still has no ontology mapping
# together with its name (subject checked before object within each row).
unmapped_cui = []
unmapped_string_umls = []
# Set used for O(1) "already collected" checks; the lists preserve the order.
seen_cui = set()
row_values = zip(
    dfn_subset['subject_cui'], dfn_subset['subject_name'], dfn_subset['subject_obo'],
    dfn_subset['object_cui'], dfn_subset['object_name'], dfn_subset['object_obo'],
)
for subj_cui, subj_name, subj_obo, obj_cui, obj_name, obj_obo in row_values:
    for cui, name, obo_id in ((subj_cui, subj_name, subj_obo), (obj_cui, obj_name, obj_obo)):
        # None, NaN and '' all mean "no mapping".
        if (pd.isna(obo_id) or not obo_id) and cui not in seen_cui:
            seen_cui.add(cui)
            unmapped_cui.append(cui)
            unmapped_string_umls.append(name)
print(len(unmapped_cui), len(unmapped_string_umls))

In [None]:
# Write the unmapped concept names, one per line.
with open('../resources/predication_files/semrep/unmapped_semrep_subset.txt', 'w') as fileo:
    fileo.writelines(item + '\n' for item in unmapped_string_umls)

In [None]:
# Write the unmapped CUIs, one per line.
with open('../resources/predication_files/semrep/unmapped_semrep_cui_subset.txt', 'w') as fileco:
    fileco.writelines(item + '\n' for item in unmapped_cui)

In [None]:
obomap = pd.read_csv('../resources/predication_files/semrep/unmapped_semrep_subset.csv')
obomap.info()

In [None]:
obomap.head()

In [None]:
# Strip whitespace from each unmapped CUI and discard entries that end up empty.
stripped_cuis = (item.strip() for item in unmapped_cui)
unmapped_cui_new = [cui for cui in stripped_cuis if cui != '']
len(unmapped_cui_new)

In [None]:
obomap['CUI'] = unmapped_cui_new
obomap.head()

In [None]:
obomap = obomap.fillna('')
obomap.info()

In [None]:
def process_CURIE(row):
    """Normalise the 'CURIE' value of a row into an identifier fragment.

    Args:
        row: Mapping-like row providing a string under 'CURIE'.

    Returns:
        The value unchanged when it is empty or contains 'napdi'; otherwise
        the value with ':' replaced by '_', after first cutting it at the
        first '_' when it contains 'SYNONYM'.
    """
    curie = row['CURIE']
    # Empty and napdi values pass through untouched.
    if curie == '' or 'napdi' in curie:
        return curie
    # Keep only the part before the first underscore for SYNONYM-tagged values.
    if 'SYNONYM' in curie:
        curie = curie.split('_')[0]
    return curie.replace(':', '_')

In [None]:
obomap['CURIE_new'] = obomap.apply(process_CURIE, axis=1)
obomap.head()

In [None]:
# NOTE(review): `obosub` is not assigned in any earlier cell of this notebook, so
# running the notebook top to bottom raises NameError here. Presumably it is a
# filtered subset of `obomap` (e.g. rows with a non-empty 'CURIE_new') -- TODO
# confirm and restore the missing cell.
# Renumber the rows 0..n-1; the mapping-dictionary loop indexes rows by label.
obosub = obosub.reset_index(drop=True)
obosub.info()

In [None]:
obosub.to_csv('cui_to_ontology_maps/mapped_semrep_subset.csv', index=False)

In [None]:
##create mapping dictionary
# CUI -> processed CURIE, paired row by row; if a CUI appears in several rows
# the last row's value is the one kept.
obomap_dict = dict(zip(obosub['CUI'], obosub['CURIE_new']))
len(obomap_dict)

In [None]:
with open('cui_to_ontology_maps/CUItoOBO_20220505.pickle', 'wb') as filep:
    pickle.dump(obomap_dict, filep)

In [None]:
def get_obo_mapping(row, col):
    """Look up the OBO identifier for the subject or object CUI of a row.

    Args:
        row: Mapping-like row providing 'subject_cui' and 'object_cui'.
        col: 'subject' or 'object', selecting which CUI to map.

    Returns:
        The ``obomap_dict`` value for the CUI, or None when the CUI has no entry.

    Raises:
        ValueError: If ``col`` is neither 'subject' nor 'object'.
    """
    if col == 'subject':
        cui = row['subject_cui']
    elif col == 'object':
        cui = row['object_cui']
    else:
        # An invalid argument is a programming error; raise instead of
        # terminating the interpreter with a success exit code.
        raise ValueError('specify if subject or object mapping required')
    return obomap_dict.get(cui)

In [None]:
dfn_subset.head()

In [None]:
#MAPPING
# Second mapping pass for subjects. Only rows that are still unmapped are
# filled: assigning the lookup result directly would replace identifiers found
# by the earlier GO/HPO pass with None for every CUI missing from `obomap_dict`
# (which appears to cover only the CUIs collected in `unmapped_cui` -- confirm).
dfn_subset['subject_obo'] = dfn_subset['subject_obo'].combine_first(
    dfn_subset.apply(get_obo_mapping, axis=1, col='subject'))
dfn_subset.head()

In [None]:
# Second mapping pass for objects. Only rows that are still unmapped are
# filled: assigning the lookup result directly would replace identifiers found
# by the earlier GO/HPO pass with None for every CUI missing from `obomap_dict`
# (which appears to cover only the CUIs collected in `unmapped_cui` -- confirm).
dfn_subset['object_obo'] = dfn_subset['object_obo'].combine_first(
    dfn_subset.apply(get_obo_mapping, axis=1, col='object'))
dfn_subset.head()

In [None]:
dfn_subset.info()

In [None]:
##add labels for all subjects and objects

In [None]:
dfn_subset.to_csv('../resources/predication_files/semrep/semrep_all_predications_mapped.csv', index=False)

In [None]:
##read file after mapping
dfn_subset = pd.read_csv('../resources/predication_files/semrep/semrep_all_predications_mapped.csv')
dfn_subset.head()

In [None]:
def add_prefix(row, col):
    """Turn the identifier in ``row[col + '_obo']`` into a full URI.

    Args:
        row: Mapping-like row providing 'predicate_obo', 'subject_obo' and
            'object_obo'.
        col: 'predicate', 'subject' or 'object', selecting the identifier.

    Returns:
        The identifier with the OBO namespace prepended, or with the NaPDI
        namespace prepended when a subject/object identifier contains 'napdi'.
        '' when the identifier is missing (None, NaN or any other non-string)
        or blank. None when ``col`` is not one of the three supported values.
    """
    obo_prefix = 'http://purl.obolibrary.org/obo/'
    napdi_prefix = 'http://napdi.org/'
    if col not in ('predicate', 'subject', 'object'):
        return None
    identifier = row[col + '_obo']
    # Unmapped cells are None/NaN (or '' when the frame was not round-tripped
    # through CSV); a blank identifier must not become a bare namespace URI,
    # because the later "!= ''" filters would then keep the row.
    if not isinstance(identifier, str) or not identifier.strip():
        return ''
    # Only subjects and objects can be NaPDI terms; predicates are always OBO.
    if col != 'predicate' and 'napdi' in identifier:
        return napdi_prefix + identifier
    return obo_prefix + identifier

In [None]:
#add OBO identifiers to the OBO mappings (where not present) - see df
#drop rows with no mappings
dfn_subset['subject_obo'] = dfn_subset.apply(add_prefix, axis=1, col='subject')
dfn_subset['object_obo'] = dfn_subset.apply(add_prefix, axis=1, col='object')
dfn_subset['predicate_obo'] = dfn_subset.apply(add_prefix, axis=1, col='predicate')

In [None]:
dfn_subset.to_csv('../resources/predication_files/semrep/semrep_all_predications_mapped_with_prefix.csv', index=False)

In [None]:
df_new = dfn_subset[dfn_subset['subject_obo'] != '']
df_new.info()

In [None]:
df_new = df_new[df_new['object_obo'] != '']
df_new.info()

In [None]:
df_new = df_new[df_new['predicate_obo'] != '']
df_new.info()

In [None]:
df_new = df_new.drop_duplicates()
df_new.info()

In [None]:
df_new = df_new.reset_index(drop=True)
df_new.info()

In [None]:
df_new = df_new.drop(['subject_map', 'object_map'], axis=1)
df_new.info()

In [None]:
df_new.to_csv('../resources/predication_files/semrep/semrep_predications_mapped_only.csv', index=False)

In [None]:
#Create networkx graph from triples
import glob
import hashlib
import json
import networkx as nx  # type: ignore
import os
import os.path

from collections import Counter  # type: ignore
from more_itertools import unique_everseen  # type: ignore
from rdflib import BNode, Graph, Literal, Namespace, URIRef  # type: ignore
from rdflib.namespace import OWL, RDF, RDFS  # type: ignore
from rdflib.plugins.serializers.nt import _quoteLiteral  # type: ignore
import subprocess

from tqdm import tqdm  # type: ignore
from typing import Dict, List, Optional, Set, Tuple, Union

# set-up environment variables
obo = Namespace('http://purl.obolibrary.org/obo/')
oboinowl = Namespace('http://www.geneontology.org/formats/oboInOwl#')
schema = Namespace('http://www.w3.org/2001/XMLSchema#')
napdi = Namespace('http://napdi.org/napdi_srs_imports:')

In [None]:
dfres = df_new[['subject_obo', 'predicate_obo', 'object_obo']]
dfres.head()

In [None]:
dfres = dfres.drop_duplicates()
dfres.info()

In [None]:
# Build an rdflib graph from the dataframe triples: one relation triple per
# row plus an rdfs:label triple for the row's subject and for its object.
graph = Graph()
pred_label = URIRef("http://www.w3.org/2000/01/rdf-schema#label")
triple_columns = ['subject_obo', 'predicate_obo', 'object_obo', 'subject_name', 'object_name']
for subj, pred, obj, subj_name, obj_name in zip(*(df_new[c] for c in triple_columns)):
    subj_node, predicate, obj_node = URIRef(subj), URIRef(pred), URIRef(obj)
    graph.add((subj_node, predicate, obj_node))
    graph.add((subj_node, pred_label, Literal(subj_name)))
    graph.add((obj_node, pred_label, Literal(obj_name)))

In [None]:
graph.serialize('output_graphs/machineread_semrep.nt', format='nt')

In [None]:
def n3(node: Union[URIRef, BNode, Literal]) -> str:
    """Serialize an RDFLib node as a string.

    Src: https://github.com/RDFLib/rdflib/blob/c11f7b503b50b7c3cdeec0f36261fa09b0615380/rdflib/plugins/serializers/nt.py

    Args:
        node: An RDFLib BNode, URIRef or Literal.

    Returns:
        The result of ``_quoteLiteral(node)`` for a Literal, otherwise the
        result of ``node.n3()``, formatted as a string.
    """
    rendered = _quoteLiteral(node) if isinstance(node, Literal) else node.n3()
    return "%s" % rendered

In [None]:
#convert rdflib graph to multidigraph - code borrowed from PheKnowLator: kg_utils.py
#use the pred key to also create a dictionary with metadata about the edge -
#pub_year, pmid, source graph, belief
nx_mdg = nx.MultiDiGraph()

# Index the evidence once: (subject, predicate, object) -> (pmid, year) of the
# first dataframe row carrying that triple. This replaces two full boolean-mask
# scans of df_new per graph triple with a dictionary lookup.
first_evidence = {}
evidence_rows = zip(df_new['subject_obo'], df_new['predicate_obo'], df_new['object_obo'],
                    df_new['pmid'], df_new['year'])
for s_uri, p_uri, o_uri, pmid_val, year_val in evidence_rows:
    first_evidence.setdefault((s_uri, p_uri, o_uri), (str(pmid_val), str(year_val)))

rdfs_label = 'http://www.w3.org/2000/01/rdf-schema#label'
for s, p, o in tqdm(graph):
    subj = str(s)
    obj = str(o)
    pred = str(p)
    #do not save label predicate to gpickle
    if pred == rdfs_label:
        continue
    # Stable edge identifier: MD5 of the serialized subject, predicate and object.
    pred_key = hashlib.md5('{}{}{}'.format(n3(s), n3(p), n3(o)).encode()).hexdigest()
    pmid, timestamp = first_evidence[(subj, pred, obj)]
    belief_score = 0.8
    nx_mdg.add_node(s, key=n3(s))
    nx_mdg.add_node(o, key=n3(o))
    nx_mdg.add_edge(s, o, **{'key': p, 'predicate_key': pred_key, 'weight': 0.0,
                             'pmid': pmid, 'timestamp': timestamp, 'source_graph': 'machine_read',
                             'belief': belief_score})
nx.write_gpickle(nx_mdg, "output_graphs/machineread_semrep.gpickle")

In [None]:
# Size of the rdflib graph: triple count, distinct subject/object nodes, distinct predicates.
triples = len(graph)
nodes = len(set(graph.subjects()) | set(graph.objects()))
rels = len(set(graph.predicates()))
print(triples, nodes, rels)

In [None]:
#this should have less edges than rdflib graph after removing 'labels'
nodes = nx.number_of_nodes(nx_mdg)
edges = nx.number_of_edges(nx_mdg)
density = nx.density(nx_mdg)
# Edges per node; report 0.0 for an empty graph instead of dividing by zero.
avg_deg = float(edges) / nodes if nodes else 0.0
print(nodes, edges, density, avg_deg)

In [None]:
# Build the label lookup: URI -> {'entity_type', 'label'[, 'cui']}.
# The first row in which a URI appears supplies its entry; nodes carry a CUI,
# relations do not.
label_dict = {}
label_columns = ['subject_obo', 'object_obo', 'predicate_obo',
                 'subject_name', 'subject_cui', 'object_name', 'object_cui', 'predicate']
for (subj_uri, obj_uri, pred_uri,
     subj_name, subj_cui, obj_name, obj_cui, pred_name) in zip(*(df_new[c] for c in label_columns)):
    label_dict.setdefault(
        str(subj_uri), {'entity_type': 'NODES', 'label': subj_name, 'cui': subj_cui})
    label_dict.setdefault(
        str(obj_uri), {'entity_type': 'NODES', 'label': obj_name, 'cui': obj_cui})
    label_dict.setdefault(
        str(pred_uri), {'entity_type': 'RELATIONS', 'label': pred_name})
len(label_dict)

In [None]:
import pickle
with open('output_graphs/machineread_semrep_NodeLabels.pickle', 'wb') as file_p:
    pickle.dump(label_dict, file_p)

In [None]:
dfmap = pd.DataFrame.from_dict(label_dict, orient='index')
dfmap.head()

In [None]:
dfmap = dfmap.reset_index()
dfmap.head()

In [None]:
dfmap = dfmap.rename(columns={"index":"entity_uri"})
dfmap.head()

In [None]:
dfmap.to_csv('output_graphs/machineread_semrep_NodeLabels.tsv', index=False, sep='\t')

In [None]:
##run closure in separate notebook - see machine_read_closure.ipynb