# Read the Llama 2-validated relationships and create JSON that can be displayed by a graph visualization

In [1]:
import os
import pickle
import json
from tqdm import tqdm
from owlready2 import get_ontology, default_world
import sys
import re
import math

sys.path.append("./loaders")


import urllib
from ChebiLoader import ChebiLoader, ChebiEntity

# Location of the ChEBI ontology file handed to the project-local ChebiLoader.
chebi_owl_path = "/local/sps-local/chebi/chebi.owl"
# Module-level loader instance; later cells call chebi.get_role() and
# chebi.get_chem() on it to resolve synonyms.
chebi = ChebiLoader(chebi_owl_path)

def load_relations():
    """Read the two pickled relation collections from disk.

    Returns:
        A ``(true_relations, false_relations)`` tuple holding whatever objects
        were pickled into ``true_relations.pkl`` and ``false_relations.pkl``.

    Note:
        ``pickle.load`` can execute arbitrary code, so these files must come
        from a trusted source.
    """
    loaded = []
    for pickle_path in (
        "/local/sps-local/ner-role-extraction/true_relations.pkl",
        "/local/sps-local/ner-role-extraction/false_relations.pkl",
    ):
        with open(pickle_path, "rb") as handle:
            loaded.append(pickle.load(handle))

    validated, rejected = loaded
    return validated, rejected


loading chebi from: /local/sps-local/chebi/chebi.owl

loading chemicals and their synonyms
loading roles and their synonyms

found 409625 chemicals and 14176 roles.
Memory usage of ChebiLoader: 1567.29296875 MB


# Create the role and chemical deduplication dicts from ChEBI

In [2]:
# Share of the true relations to process; further down the list is sliced to
# its first floor(len * fraction_of_data) entries, so 1 means "everything".
fraction_of_data = 1
# Minimum number of recorded text positions a (role, chem) pair needs before
# it is exported by the node/edge and Turtle writers.
min_mentioned_relation = 1
# Load both pickled relation lists; in the visible cells false_relations is
# only used for its length.
true_relations, false_relations = load_relations()

def contains_only_non_alpha(string):
    """Tell whether ``string`` consists solely of non-alphabetic characters.

    Args:
        string: Text to inspect.

    Returns:
        True when every character is a non-word character, a digit or an
        underscore; False when at least one letter is present or when the
        string is empty.
    """
    # \W covers everything that is not a word character, \d adds the digits.
    # The underscore counts as a word character, so it has to be listed
    # explicitly; otherwise strings such as "___" or "12_34" slip through.
    return re.match(r'^[\W\d_]+$', string) is not None

def deduplicate_and_order(relations):
    """Group relations by resolved role and chemical label and assign ids.

    Each synonym is looked up through the module-level ``chebi`` loader; when
    the lookup returns nothing, a ``ChebiEntity(None, synonym)`` placeholder is
    used instead. Relations whose chemical synonym contains no letters are
    skipped.

    Ids are handed out in first-seen order, so the same input always yields the
    same ids. Enumerating a ``set`` of strings would give an order that changes
    from one interpreter run to the next.

    Args:
        relations: Iterable of 5-tuples ``(chem_synonym, role_synonym,
            context_filepath, context_page, context_pos)``.

    Returns:
        A 6-tuple of dicts:
        ``roles_to_chems`` (role label -> chem label -> list of
        ``(filepath, page, pos)`` tuples), ``chems_to_id``, ``roles_to_id``,
        ``files_to_id`` (label or path -> consecutive int starting at 0),
        ``chems_to_chebi`` and ``roles_to_chebi`` (label -> ``chebi_id`` of the
        resolved entity, ``None`` for placeholders).
    """
    chems_to_chebi = {}
    roles_to_chebi = {}

    roles_to_chems = {}  # dict of <role, dict<chems, [positional information]>>

    chems_to_id = {}
    roles_to_id = {}
    files_to_id = {}

    for chem_synonym, role_synonym, context_filepath, context_page, context_pos in tqdm(relations):
        if contains_only_non_alpha(chem_synonym):
            continue

        role = chebi.get_role(role_synonym)
        if not role:
            role = ChebiEntity(None, role_synonym)
        roles_to_chebi[role.label] = role.chebi_id

        chem = chebi.get_chem(chem_synonym)
        if not chem:
            chem = ChebiEntity(None, chem_synonym)
        chems_to_chebi[chem.label] = chem.chebi_id

        # Record where this (role, chem) pair was mentioned.
        positions = roles_to_chems.setdefault(role.label, {}).setdefault(chem.label, [])
        positions.append((context_filepath, context_page, context_pos))

        # len(...) is evaluated before insertion, so a new key receives the
        # next free consecutive id and an existing key keeps its id.
        chems_to_id.setdefault(chem.label, len(chems_to_id))
        roles_to_id.setdefault(role.label, len(roles_to_id))
        files_to_id.setdefault(context_filepath, len(files_to_id))

    return roles_to_chems, chems_to_id, roles_to_id, files_to_id, chems_to_chebi, roles_to_chebi

# Keep the leading share of the true relations: the first
# floor(len(true_relations) * fraction_of_data) entries.
fraction = true_relations[: math.floor(len(true_relations) * fraction_of_data)]
print(f"creating a know KG from {len(fraction)} sentences")

# Build the role -> chem -> positions mapping together with the id and
# ChEBI-id lookup tables used by the export cells below.
roles_to_chems, chems_to_id, roles_to_id, files_to_id, chems_to_chebi, roles_to_chebi = deduplicate_and_order(fraction)


creating a know KG from 58511 sentences


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 58511/58511 [00:00<00:00, 776662.49it/s]


In [3]:
def create_nodes_and_edges(roles_to_chems, chems_to_id, roles_to_id, files_to_id, chems_to_chebi, roles_to_chebi, min_count=None):
    """Turn the role/chem mapping into node and edge dicts for a graph viewer.

    Every exported (role, chem) pair yields one edge. Each role and each
    chemical yields exactly one node, however many edges it takes part in, so
    node ids are unique in the result.

    Args:
        roles_to_chems: role label -> chem label -> list of text positions.
        chems_to_id: chem label -> numeric id (node id becomes ``c<id>``).
        roles_to_id: role label -> numeric id (node id becomes ``r<id>``).
        files_to_id: Accepted for interface compatibility; not used here.
        chems_to_chebi: chem label -> ChEBI id (or ``None``).
        roles_to_chebi: role label -> ChEBI id (or ``None``).
        min_count: Minimum number of text positions a pair needs in order to
            be exported. Defaults to the notebook-level
            ``min_mentioned_relation``.

    Returns:
        ``(nodes, edges)``: two lists of ``{"data": {...}}`` dicts. The
        ``chebi_id`` field is always a string (``"None"`` when no id exists).
    """
    if min_count is None:
        # Backward-compatible default: the notebook-level threshold.
        min_count = min_mentioned_relation

    nodes = []
    edges = []
    emitted_node_ids = set()

    def add_node(node_id, node_type, label, chebi_id):
        # A role or chemical can appear in many pairs; emit its node only once.
        if node_id in emitted_node_ids:
            return
        emitted_node_ids.add(node_id)
        nodes.append({"data": {"id": node_id, "type": node_type, "label": label, "chebi_id": str(chebi_id)}})

    for role, chems in roles_to_chems.items():
        # Labels shorter than two characters are skipped.
        if len(role) < 2:
            continue
        for chem, pos in chems.items():
            if len(chem) < 2:
                continue
            count = len(pos)
            if count < min_count:
                continue

            role_node_id = f"r{roles_to_id[role]}"
            chem_node_id = f"c{chems_to_id[chem]}"

            add_node(role_node_id, "role", role, roles_to_chebi[role])
            add_node(chem_node_id, "chem", chem, chems_to_chebi[chem])
            edges.append({"data": {"source": role_node_id, "target": chem_node_id, "count": count}})

    return nodes, edges

# Build the graph elements from the lookup tables returned by
# deduplicate_and_order().
nodes, edges = create_nodes_and_edges(roles_to_chems, chems_to_id, roles_to_id, files_to_id, chems_to_chebi, roles_to_chebi)

# Persist nodes and edges as two separate, indented JSON files.
with open("/local/sps-local/ner-role-extraction/nodes.json", "w") as outfile:
    json.dump(nodes, outfile, indent=4)

with open("/local/sps-local/ner-role-extraction/edges.json", "w") as outfile:
    json.dump(edges, outfile, indent=4)

print("saved nodes and edges")

saved nodes and edges


In [4]:
def _turtle_escape(text):
    """Escape ``text`` for use inside a double-quoted Turtle string literal."""
    return (
        str(text)
        .replace("\\", "\\\\")  # must be first so later escapes are not doubled
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
        .replace("\t", "\\t")
    )


def create_turtle(roles_to_chems, chems_to_chebi, roles_to_chebi, min_count=None,
                  output_path="/local/sps-local/ner-role-extraction/cear.ttl"):
    """Write the exported (chem, role) relations as a Turtle file.

    For every exported pair, the chemical and the role are declared once
    (``rdf:type`` plus ``rdfs:label``) and linked with ``obo:RO_0000087``.
    Entities that have a ChEBI id are written as ``obo:<id>``; all others get
    a minted ``cear:chem_<n>`` or ``cear:role_<n>`` identifier, where ``n`` is
    the number of chemicals or roles registered so far plus one.

    Args:
        roles_to_chems: role label -> chem label -> list of text positions.
        chems_to_chebi: chem label -> ChEBI id (falsy when unknown).
        roles_to_chebi: role label -> ChEBI id (falsy when unknown).
        min_count: Minimum number of text positions a pair needs in order to
            be exported. Defaults to the notebook-level
            ``min_mentioned_relation``.
        output_path: Destination of the Turtle file.

    Returns:
        ``(stats, cear_chems, cear_roles)``: ``stats`` is a list of
        ``(count, chem, chem_kg_id, role, role_kg_id)`` tuples, one per
        exported pair; the two dicts map each label to the identifier used in
        the file.
    """
    if min_count is None:
        # Backward-compatible default: the notebook-level threshold.
        min_count = min_mentioned_relation

    stats = []

    lines = [
        "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .",
        "@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .",
        "@prefix obo: <http://purl.obolibrary.org/obo/> .",
        "@prefix cear: <https://wwwiti.cs.uni-magdeburg.de/iti_dke/cear/> .",
        ""
    ]

    cear_chems = {}
    cear_roles = {}
    for role, chems in roles_to_chems.items():
        # Labels shorter than two characters are skipped.
        if len(role) < 2:
            continue

        for chem, pos in chems.items():
            if len(chem) < 2:
                continue
            count = len(pos)
            if count < min_count:
                continue

            role_chebi_id = roles_to_chebi[role]
            chem_chebi_id = chems_to_chebi[chem]

            if chem not in cear_chems:
                if chem_chebi_id:
                    # Drops the first four characters of the stored id
                    # (presumably an "obo." prefix; confirm against ChebiLoader).
                    chem_id_in_kg = f'obo:{str(chem_chebi_id)[4:]}'
                else:
                    chem_id_in_kg = f'cear:chem_{len(cear_chems)+1}'
                lines.append(f'{chem_id_in_kg} rdf:type obo:CHEBI_24431 .')
                # Labels come from free text and may contain quotes or
                # backslashes, which would otherwise break the literal.
                lines.append(f'{chem_id_in_kg} rdfs:label "{_turtle_escape(chem)}" .')
                cear_chems[chem] = chem_id_in_kg

            if role not in cear_roles:
                if role_chebi_id:
                    role_id_in_kg = f'obo:{str(role_chebi_id)[4:]}'
                else:
                    role_id_in_kg = f'cear:role_{len(cear_roles)+1}'
                lines.append(f'{role_id_in_kg} rdf:type obo:CHEBI_50906 .')
                lines.append(f'{role_id_in_kg} rdfs:label "{_turtle_escape(role)}" .')
                cear_roles[role] = role_id_in_kg

            lines.append(f'{cear_chems[chem]} obo:RO_0000087 {cear_roles[role]} .')
            lines.append("")

            stats.append((count, chem, cear_chems[chem], role, cear_roles[role]))

    # Turtle documents are UTF-8; labels can contain non-ASCII characters, so
    # do not rely on the platform default encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in lines)

    return stats, cear_chems, cear_roles

# Write cear.ttl and keep the per-relation stats plus the label -> identifier
# maps for the summary and inspection cells below.
stats, cear_chems, cear_roles = create_turtle(roles_to_chems, chems_to_chebi, roles_to_chebi)  
    

In [5]:
def entity_stats(data):
    """Split the values of ``data`` into CEAR-minted ids and all others.

    Args:
        data: Mapping of label -> identifier; each value is compared via its
            string form.

    Returns:
        ``(cear, chebi)``: the number of values starting with ``"cear:"`` and
        the number of remaining values, in that order.
    """
    identifiers = [str(data[key]) for key in data]
    cear_total = sum(1 for identifier in identifiers if identifier.startswith("cear:"))
    other_total = len(identifiers) - cear_total
    return cear_total, other_total

# Sizes of the raw input lists, before any filtering or deduplication.
print("count of true relations:", len(true_relations))
print("count of false relations:", len(false_relations))
print()
print()

# Each stats tuple starts with the number of text positions for that pair, so
# the sum is the total number of positions behind the exported relations.
textpos_count = sum(x[0] for x in stats)
print ("number of relations:", len(stats))
print("relevant text_positions:", textpos_count)
print()
# entity_stats returns (cear, chebi): minted "cear:" ids first, the rest second.
cear_chems_count, chebi_chems_count = entity_stats(cear_chems)
print("distinct chebi_chems_count:", chebi_chems_count)
print("distinct cear_chems_count:", cear_chems_count)
print()
cear_roles_count, chebi_roles_count = entity_stats(cear_roles)
print("distinct chebi_roles_count:", chebi_roles_count)
print("distinct cear_roles_count:", cear_roles_count)


count of true relations: 58511
count of false relations: 272053


number of relations: 28038
relevant text_positions: 57846

distinct chebi_chems_count: 3680
distinct cear_chems_count: 13818

distinct chebi_roles_count: 214
distinct cear_roles_count: 455


In [6]:
import random

# Shuffle a copy with a fixed-seed generator before sorting: sorted() is
# stable, so the shuffle decides the order of relations with equal counts, and
# the seed makes that order the same on every Restart & Run All. The original
# list is not mutated in place.
stats = sorted(random.Random(0).sample(stats, len(stats)), key=lambda x: x[0], reverse=True)
# The 20 most frequently mentioned (chem, role) relations.
stats[:20]

[(1085, 'water', 'obo:CHEBI_15377', 'solvent', 'obo:CHEBI_46787'),
 (551, 'methanol', 'obo:CHEBI_17790', 'solvent', 'obo:CHEBI_46787'),
 (438, 'dimethyl sulfoxide', 'obo:CHEBI_28262', 'solvent', 'obo:CHEBI_46787'),
 (402,
  'N,N-dimethylformamide',
  'obo:CHEBI_17741',
  'solvent',
  'obo:CHEBI_46787'),
 (398, 'oxolane', 'obo:CHEBI_26911', 'solvent', 'obo:CHEBI_46787'),
 (388, 'acetonitrile', 'obo:CHEBI_38472', 'solvent', 'obo:CHEBI_46787'),
 (375,
  '2-[4-(2-hydroxyethyl)piperazin-1-yl]ethanesulfonic acid',
  'obo:CHEBI_42334',
  'buffer',
  'obo:CHEBI_35225'),
 (271, 'tris', 'obo:CHEBI_9754', 'buffer', 'obo:CHEBI_35225'),
 (268, 'ethanol', 'obo:CHEBI_16236', 'solvent', 'obo:CHEBI_46787'),
 (268, 'toluene', 'obo:CHEBI_17578', 'solvent', 'obo:CHEBI_46787'),
 (249, 'PBS', 'cear:chem_6660', 'buffer', 'obo:CHEBI_35225'),
 (217, 'dichloromethane', 'obo:CHEBI_15767', 'solvent', 'obo:CHEBI_46787'),
 (199, 'ethyl acetate', 'obo:CHEBI_27750', 'solvent', 'obo:CHEBI_46787'),
 (175, 'chloroform',

In [7]:
import random

# Leave stats sorted ascending by mention count for the cells that follow.
stats = sorted(stats, key=lambda x: x[0], reverse=False)
# Relations mentioned exactly min_mentioned_relation times.
filtered = [tup for tup in stats if tup[0] == min_mentioned_relation]
# A fixed-seed generator keeps the inspected sample identical across runs;
# change the seed to look at a different sample.
random.Random(0).shuffle(filtered)
filtered[:100]



[(1, 'zirconium oxide', 'cear:chem_11722', 'catalyst', 'obo:CHEBI_35223'),
 (1,
  '3-(4,5-dimethylthiazol-2-yl)-2,5-diphenyltetrazolium bromide',
  'obo:CHEBI_53233',
  'indicator',
  'obo:CHEBI_47867'),
 (1,
  '2,4-ditert-butylphenyl) phosphite',
  'cear:chem_16908',
  'food stabiliser',
  'obo:CHEBI_77966'),
 (1, 'macromolecule', 'obo:CHEBI_33839', 'nonviral', 'cear:role_659'),
 (1, 'polychlorinated-Biphenyls', 'cear:chem_5441', 'drug', 'obo:CHEBI_23888'),
 (1, 'MJ995OF5', 'cear:chem_11324', 'inhibitor', 'obo:CHEBI_35222'),
 (1, 'manganese(0)', 'obo:CHEBI_35154', 'cofactor', 'obo:CHEBI_23357'),
 (1, 'phytocannabinoid', 'obo:CHEBI_67196', 'metabolite', 'obo:CHEBI_25212'),
 (1, 'NiBr2∙glyme', 'cear:chem_8622', 'reagent', 'obo:CHEBI_33893'),
 (1, 'dialkylphosphites145', 'cear:chem_8316', 'reagent', 'obo:CHEBI_33893'),
 (1, 'YZJ-1139', 'cear:chem_15184', 'pharmaceutical', 'obo:CHEBI_52217'),
 (1, 'KGluc', 'cear:chem_17426', 'cationophore', 'cear:role_516'),
 (1, 'cyclopropenone', 'obo:CH

In [9]:
# Print every exported relation whose role label (tuple position 3) is exactly
# "greenhouse gas", in the current order of stats.
greenhouse_rows = [row for row in stats if row[3] == "greenhouse gas"]
for row in greenhouse_rows:
    print(row)

(1, 'GHG', 'cear:chem_17378', 'greenhouse gas', 'obo:CHEBI_76413')
(1, 'dinitrogen oxide', 'obo:CHEBI_17045', 'greenhouse gas', 'obo:CHEBI_76413')
(2, 'carbon dioxide', 'obo:CHEBI_16526', 'greenhouse gas', 'obo:CHEBI_76413')
(4, 'methane', 'obo:CHEBI_16183', 'greenhouse gas', 'obo:CHEBI_76413')


In [16]:
def print_top_relations_with_cear_id(relations, id_index, limit=10):
    """Print the first ``limit`` relations whose id at ``id_index`` is CEAR-minted.

    Args:
        relations: Sequence of stats tuples
            ``(count, chem, chem_kg_id, role, role_kg_id)``.
        id_index: Tuple position of the identifier to test (2 for the
            chemical id, 4 for the role id).
        limit: Number of matching relations to print before stopping
            (expected to be at least 1).
    """
    shown = 0
    for relation in relations:
        if relation[id_index].startswith("cear:"):
            shown += 1
            print(relation)
            if shown >= limit:
                break


# Most frequently mentioned relations first.
stats = sorted(stats, key=lambda x: x[0], reverse=True)

# Top relations whose chemical has no ChEBI id ...
print_top_relations_with_cear_id(stats, 2)

print("---")
# ... and top relations whose role has no ChEBI id.
print_top_relations_with_cear_id(stats, 4)

(249, 'PBS', 'cear:chem_6660', 'buffer', 'obo:CHEBI_35225')
(117, 'CH2Cl2', 'cear:chem_6484', 'solvent', 'obo:CHEBI_46787')
(76, 'metal', 'cear:chem_135', 'catalyst', 'obo:CHEBI_35223')
(62, 'ACN', 'cear:chem_883', 'solvent', 'obo:CHEBI_46787')
(45, 'Tris-HCl', 'cear:chem_9013', 'buffer', 'obo:CHEBI_35225')
(32, 'organolithium', 'cear:chem_6909', 'reagent', 'obo:CHEBI_33893')
(31, 'terpyridine', 'cear:chem_262', 'ligand', 'obo:CHEBI_52214')
(31, 'Et2O', 'cear:chem_3373', 'solvent', 'obo:CHEBI_46787')
(28, 'CH2Cl2', 'cear:chem_6484', 'reagent', 'obo:CHEBI_33893')
(26, 'metal', 'cear:chem_135', 'reagent', 'obo:CHEBI_33893')
---
(33, 'hydrogen atom', 'obo:CHEBI_49637', 'fuel', 'cear:role_50')
(24, 'ammonia', 'obo:CHEBI_16134', 'fuel', 'cear:role_50')
(19, 'carbon dioxide', 'obo:CHEBI_16526', 'feedstock', 'cear:role_12')
(16, 'methanol', 'obo:CHEBI_17790', 'fuel', 'cear:role_50')
(15, 'hydrocarbon', 'obo:CHEBI_24632', 'fuels', 'cear:role_42')
(15, 'ethanol', 'obo:CHEBI_16236', 'fuel', 'cea