### Notebook to merge machine read triples after extraction and processing from INDRA/REACH and SemRep

In [1]:
import pandas as pd
from rdflib.namespace import OWL, RDF, RDFS
import networkx as nx

In [None]:
##Previous run v1.0.1 - 2022-06-16
##Previous run v2.0.0 - 2023-04-23
##Latest run v3.0.0 - 2024-06-01

In [4]:
##read in machine reading graphs
g1 = '/home/sanya/PheKnowLator/literature_graphs/output_graphs/machineread_semrep_version2.nt'
g2 = '/home/sanya/PheKnowLator/literature_graphs/output_graphs/machineread_reach_version2.nt'

In [5]:
#if merging ntriples - concatenate files
#if merging gpickle - use nx.compose function. Also merge nodelabels and fix labels

In [7]:
with open(g1,'r') as file1:
    g = file1.read()
graph1 = g.split('\n')
len(graph1)

67804

In [8]:
graph1[0]

'<http://purl.obolibrary.org/obo/CHEBI_3429> <http://purl.obolibrary.org/obo/RO_0002434> <http://napdi.org/napdi_srs_imports:rosmarinus_officinalis> .'

In [9]:
with open(g2,'r') as file2:
    g = file2.read()
graph2 = g.split('\n')
len(graph2)

49569

In [10]:
print(len(graph1), len(graph2))

67804 49569


In [11]:
output_file = '/home/sanya/PheKnowLator/literature_graphs/output_graphs/machine_read_merged_20240601.nt'

In [12]:
with open(output_file, 'w') as fileo:
    for gt1 in graph1:
        fileo.write(gt1)
        fileo.write('\n')
    for gt2 in graph2:
        fileo.write(gt2)
        fileo.write('\n')
file1.close()
file2.close()
fileo.close()

In [13]:
with open(output_file,'r') as filei:
    g = filei.read()
mr = g.split('\n')
len(mr)

117374

In [15]:
inferred_file = '/home/sanya/PheKnowLator/literature_graphs/closure_output/inferred-transitive-and-symmetric.ntriples'
with open(inferred_file,'r') as file2:
    g2 = file2.read()
mrinf = g2.split('\n')
len(mrinf)

38373

In [16]:
mr[0]

'<http://purl.obolibrary.org/obo/CHEBI_3429> <http://purl.obolibrary.org/obo/RO_0002434> <http://napdi.org/napdi_srs_imports:rosmarinus_officinalis> .'

In [17]:
mrinf_new = []
for item in mrinf:
    if mrinf != '':
        temp = item.replace('>', '> ')
        mrinf_new.append(temp)
len(mrinf_new)

38373

In [18]:
outmerged = '/home/sanya/PheKnowLator/literature_graphs/output_graphs/machine_read_merged_with_closure_20240601.nt'
with open(outmerged, 'w') as fileo:
    for gt1 in mr:
        if gt1 != '':
            fileo.write(gt1)
            fileo.write('\n')
    for gt2 in mrinf_new:
        if gt2 != '':
            fileo.write(gt2)
            fileo.write('\n')
fileo.close()
filei.close()
file2.close()

In [19]:
outmerged = '/home/sanya/PheKnowLator/literature_graphs/output_graphs/machine_read_merged_with_closure_20240601.nt'
with open(outmerged,'r') as filei:
    g = filei.read()
mr = g.split('\n')
len(mr)

155742

In [22]:
mr[-2]

'<http://purl.obolibrary.org/obo/CHEBI_17347> <http://purl.obolibrary.org/obo/RO_0002436> <http://purl.obolibrary.org/obo/GO_0006412'

#### Merge gpickle files

In [23]:
import networkx as nx
g1 = nx.read_gpickle('/home/sanya/PheKnowLator/literature_graphs/output_graphs/machineread_semrep_version2.gpickle')
g2 = nx.read_gpickle('/home/sanya/PheKnowLator/literature_graphs/output_graphs/machineread_reach_version2.gpickle')
ginf = nx.read_gpickle('/home/sanya/PheKnowLator/literature_graphs/closure_output/machineread_inferred_symmetric_transitive.gpickle')

In [24]:
print(len(g1), len(g2), len(ginf))

8399 6529 4123


In [25]:
def get_stats(g):
    nodes = nx.number_of_nodes(g)
    edges = nx.number_of_edges(g)
    density = nx.density(g)
    avg_deg = float(edges)/nodes
    print(nodes, edges, density, avg_deg)

In [26]:
##compose graphs
g3 = nx.compose(g1,g2)
g_all = nx.compose(g3,ginf)

Was version 1 -
SemRep graph
4288 33594 0.0018274834713764975 7.834421641791045
Reach graph
6217 40245 0.0010414059593728245 6.473379443461477
Inferred graph
2174 18450 0.003905504157192203 8.486660533578657
Combined graphs
8782 84569 0.0010966645002845241 9.629810976998407

In [27]:
print('SemRep graph')
get_stats(g1)
print('Reach graph')
get_stats(g2)
print('Inferred graph')
get_stats(ginf)
print('Combined graphs')
get_stats(g_all)

SemRep graph
8399 58754 0.0008329788747404438 6.995356590070246
Reach graph
6529 42178 0.0009895988185441124 6.460101087455966
Inferred graph
4123 33333 0.0019613408786086925 8.08464710162503
Combined graphs
12190 120371 0.0008101213650926268 9.874569319114029


In [28]:
##save graph
nx.write_gpickle(g_all, '/home/sanya/PheKnowLator/literature_graphs/output_graphs/machine_read_merged_20240601.gpickle')

### Merge nodelabels for reach and semrep

In [29]:
import json
with open('/home/sanya/ontologies/ro.json', 'r') as file:
    ro = json.load(file)
len(ro)

1

In [30]:
relation_labels = {}
for node in ro['graphs'][0]['nodes']:
    try:
        if 'lbl' in node.keys():
            relation_labels[node['id']] = node['lbl']
    except:
        print(node)
        break
len(relation_labels)

767

In [31]:
relation_labels['http://purl.obolibrary.org/obo/RO_0002434']

'interacts with'

In [32]:
import pickle

In [33]:
nodelabels_sem = 'output_graphs/machineread_semrep_NodeLabels.pickle'
nodelabels_r = 'output_graphs/reach_version2_NodeLabels.pickle'

In [34]:
with open(nodelabels_sem, 'rb') as filep1:
    sem_labels = pickle.load(filep1)
len(sem_labels)

8415

In [35]:
with open(nodelabels_r, 'rb') as filep2:
    r_labels = pickle.load(filep2)
len(r_labels)

6542

In [36]:
##combine labels
all_labels = {**sem_labels, **r_labels}
len(all_labels)

12201

In [37]:
##fix labels of relations using relation_labels
for key in all_labels:
    if key in relation_labels.keys():
        all_labels[key]['label'] = relation_labels[key]
len(all_labels)

12201

In [38]:
all_labels['http://purl.obolibrary.org/obo/RO_0002448']

{'entity_type': 'RELATIONS', 'label': 'directly regulates activity of'}

In [39]:
##save labels
with open('output_graphs/machine_read_merged_NodeLabels_20240601.pickle', 'wb') as filep:
    pickle.dump(all_labels, filep)