In [39]:
import requests
import json

import os
import time
import pandas as pd

from tqdm.auto import tqdm

from termcolor import colored

In [31]:
# Base URL of the UMLS Terminology Services REST API.
API_URL = "https://uts-ws.nlm.nih.gov/rest"
# The API key is a personal credential and must not be stored in the notebook.
# Export it before starting Jupyter, e.g. `export UMLS_API_KEY=...`.
# NOTE(review): the key that used to be hardcoded here has been committed in
# clear text and should be revoked/regenerated in the UTS profile.
KEY = os.environ.get("UMLS_API_KEY", "")

In [32]:
# Project root: the parent of the directory the notebook is started from.
ROOT_PATH = os.path.dirname(os.getcwd())

# Input folders: cleaned article text and the per-article entity tables.
DATA_CLEAN_PATH = os.path.join(ROOT_PATH, "data", "data_clean")
ENTITIES_PATH = os.path.join(ROOT_PATH, "data", "entities")

# Number of search results requested per UMLS query.
PAGE_SIZE = 50
# Maximum number of candidate semantic types reported for a word.
NUM_MATCHES = 5

# Article processed by this notebook; the matching entity table shares its stem.
filename = "21616724.txt"

In [33]:
# Resolve both input paths up front: the article text and the entity table
# that shares the article's file stem.
text_path = os.path.join(DATA_CLEAN_PATH, filename)
entities_path = os.path.join(ENTITIES_PATH, f"{filename.split('.')[0]}.csv")

# Raw article text; the entity table's character offsets index into it.
with open(text_path, "r") as text_file:
    text = text_file.read()

# One row per detected entity.
with open(entities_path, "r") as entities_file:
    entities_df = pd.read_csv(entities_file)

In [34]:
def get_semanticType(word, timeout=30):
    """Return the most likely UMLS semantic type name for ``word``.

    The word is stripped and lower-cased, then searched through the UMLS
    REST API. For each search result the concept record is fetched and its
    semantic types are read.

    * If a result's name equals the normalized word and the concept lists at
      least one semantic type, the first listed type name is returned
      immediately.
    * Otherwise the semantic types of the non-matching results are tallied
      and the most frequent one is returned.
    * If nothing was tallied, ``"Unknown"`` is returned.

    Args:
        word: Term to look up.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A semantic type name, or ``"Unknown"``.

    Raises:
        requests.RequestException: On network failure, timeout or a
            non-success HTTP status.
        KeyError: If a response does not have the expected JSON layout.
    """
    word = word.strip().lower()
    auth = {"apiKey": KEY}

    def fetch_semantic_types(concept_uri):
        """Return the semantic type names listed for one concept URI."""
        with requests.get(concept_uri, params=auth, timeout=timeout) as r_word:
            r_word.raise_for_status()
            return [s_t["name"] for s_t in r_word.json()["result"]["semanticTypes"]]

    # Pass the query through `params` so requests URL-encodes the term; building
    # the query string by hand breaks for terms containing '&', '#', '+' or '%'.
    with requests.get(
        API_URL + "/search/current/",
        params={"apiKey": KEY, "string": word, "pageSize": PAGE_SIZE},
        timeout=timeout,
    ) as r:
        r.raise_for_status()
        results = r.json()["result"]["results"]

    all_types = {}
    for cui in results:
        names = fetch_semantic_types(cui["uri"])
        if cui["name"].strip().lower() == word:
            # Exact name match: trust its first semantic type. A match without
            # any semantic type is skipped and the scan continues.
            if names:
                return names[0]
        else:
            for name in names:
                all_types[name] = all_types.get(name, 0) + 1

    if all_types:
        return max(all_types, key=all_types.get)
    return "Unknown"

In [309]:
def get_color_map(entities_df):
    """Assign a terminal color name to each distinct value of the "Type" column.

    Types are ordered as returned by ``value_counts()`` on the "Type" column and
    paired with the palette in that order. Types beyond the end of the palette
    are mapped to ``None``.

    Args:
        entities_df: DataFrame with a "Type" column.

    Returns:
        dict mapping each type value to a color name or ``None``.
    """
    palette = ["red", "blue", "green", "yellow", "magenta", "cyan", "grey", "white"]
    ordered_types = list(entities_df["Type"].value_counts().index)

    return {
        type_ent: (palette[position] if position < len(palette) else None)
        for position, type_ent in enumerate(ordered_types)
    }

In [310]:
# NOTE: this cell reads `entity_types`, which is created further down by
# `entity_types = display_likely_types(entities_df)`. On a fresh kernel that
# cell has to be executed before this one.
color_map = get_color_map(entities_df)


def _entity_annotation(word):
    """Return the text shown after an ENTITY span for a normalized word.

    A lookup is either a single semantic type name or a list of
    ``(type name, count)`` candidates; candidates are rendered as their names
    joined by " / ". Words without a usable lookup are rendered as "Error!".
    """
    looked_up = entity_types.get(word)
    if looked_up is None:
        return "Error!"
    if isinstance(looked_up, str):
        return looked_up
    return " / ".join(str(type_name) for type_name, _count in looked_up)


# Collect the pieces in a list and join once instead of growing a string.
pieces = []
last_end = 0

for start, end, e_type, raw_word in zip(
    entities_df["StartChar"],
    entities_df["EndChar"],
    entities_df["Type"],
    entities_df["Word"],
):
    if last_end <= start:
        # Plain text up to the span, then the colored span itself.
        pieces.append(text[last_end:start])
        pieces.append(colored(text[start:end], color_map[e_type]))
        if e_type == "ENTITY":
            pieces.append("(: " + _entity_annotation(raw_word.strip().lower()) + ")")
        last_end = end
    elif last_end < end:
        # Span overlaps the previous one: color only the part not yet emitted.
        pieces.append(colored(text[last_end:end], color_map[e_type]))
        last_end = end
    # Spans fully contained in already-emitted text are skipped.

# Remainder of the document after the last span (empty when nothing is left).
pieces.append(text[last_end:])
colored_text = "".join(pieces)

In [42]:
def display_likely_types(entities, max_retries=30, retry_delay=1):
    """Look up and print likely UMLS semantic types for every ENTITY word.

    Each distinct word (stripped, lower-cased) among the rows whose "Type" is
    "ENTITY" is searched once through the UMLS REST API; repeated occurrences
    reuse the first lookup. One line is printed per occurrence.

    Args:
        entities: DataFrame with "Type" and "Word" columns.
        max_retries: Maximum number of lookup attempts per distinct word.
        retry_delay: Seconds to wait after a failed attempt before retrying.

    Returns:
        dict mapping each normalized word to either a semantic type name
        (a ``str``; ``"Unknown"`` when the search produced no semantic types)
        or a list of up to ``NUM_MATCHES`` ``(type name, count)`` tuples sorted
        by decreasing count. Words whose lookup failed on every attempt are
        left out of the dict.
    """
    auth = {"apiKey": KEY}

    def fetch_semantic_types(concept_uri):
        """Return the semantic type names listed for one concept URI."""
        with requests.get(concept_uri, params=auth) as r_word:
            r_word.raise_for_status()
            return [s_t["name"] for s_t in r_word.json()["result"]["semanticTypes"]]

    def get_type(word):
        """Return a type name for an exact match, else ranked candidates."""
        word = word.strip().lower()

        # `params` lets requests URL-encode the term; a hand-built query string
        # breaks for terms containing '&', '#', '+' or '%'.
        with requests.get(
            API_URL + "/search/current/",
            params={"apiKey": KEY, "string": word, "pageSize": PAGE_SIZE},
        ) as r:
            r.raise_for_status()
            results = r.json()["result"]["results"]

        all_types = {}
        for cui in results:
            names = fetch_semantic_types(cui["uri"])
            if cui["name"].strip().lower() == word:
                # Direct match: return its first semantic type. A match that
                # lists no semantic type is skipped and the scan continues.
                if names:
                    return names[0]
            else:
                # No direct match: tally the semantic types of the candidates.
                for name in names:
                    all_types[name] = all_types.get(name, 0) + 1

        if all_types:
            ranked = sorted(all_types, key=all_types.get, reverse=True)[:NUM_MATCHES]
            return [(k, all_types[k]) for k in ranked]
        return "Unknown"

    entity_types = {}
    failed_words = set()  # words whose lookup exhausted every retry

    # Use the DataFrame passed in, not the notebook-level `entities_df`.
    for word in tqdm(entities.loc[entities["Type"] == "ENTITY", "Word"]):

        word_s = word.strip().lower()

        if word_s in failed_words:
            print(f"{word}: lookup failed")
            continue

        if word_s not in entity_types:
            e_type = None
            for count in range(max_retries):
                try:
                    e_type = get_type(word_s)
                    break
                except Exception as exc:
                    # Best-effort retry. `Exception` (not a bare except) keeps
                    # Ctrl-C / kernel interrupt able to stop the loop.
                    print(f"Connection error, retrying... ({count}): {exc!r}")
                    time.sleep(retry_delay)

            if e_type is None:
                # Every attempt failed: report it and leave the word out of
                # the result instead of crashing on a None lookup below.
                failed_words.add(word_s)
                print(f"{word}: lookup failed after {max_retries} attempts")
                continue

            entity_types[word_s] = e_type
        else:
            e_type = entity_types[word_s]

        if isinstance(e_type, str):
            print(f"{word}: {e_type}")
        else:
            print(f"{word}: " + " / ".join(str(elem) for elem in e_type))

    return entity_types

In [43]:
# Query UMLS for the ENTITY rows of the loaded table (network-bound; prints one
# line per row). The cell that builds `colored_text` reads this `entity_types`
# dict, so this call has to run before it.
entity_types = display_likely_types(entities_df)

  0%|          | 0/283 [00:00<?, ?it/s]

Laparoscopic sleeve gastrectomy: Therapeutic or Preventive Procedure
Connection error, retrying (0)...
procedure: ('Therapeutic or Preventive Procedure', 34) / ('Diagnostic Procedure', 11) / ('Laboratory Procedure', 4) / ('Health Care Activity', 1)
surgical management: Health Care Activity
morbid obesity: Disease or Syndrome
postoperative gastric leak: Unknown
procedure: ('Therapeutic or Preventive Procedure', 34) / ('Diagnostic Procedure', 11) / ('Laboratory Procedure', 4) / ('Health Care Activity', 1)
complication: Pathologic Function
incidence: Quantitative Concept
leak: ('Finding', 29) / ('Disease or Syndrome', 12) / ('Pathologic Function', 7) / ('Injury or Poisoning', 2)
leak: ('Finding', 29) / ('Disease or Syndrome', 12) / ('Pathologic Function', 7) / ('Injury or Poisoning', 2)
upper part: ('Body Location or Region', 19) / ('Body Part, Organ, or Organ Component', 14) / ('Congenital Abnormality', 4) / ('Injury or Poisoning', 4) / ('Sign or Symptom', 4)
staple line: ('Therapeutic o

stomach tube: ('Therapeutic or Preventive Procedure', 29) / ('Medical Device', 4) / ('Diagnostic Procedure', 3) / ('Health Care Activity', 3) / ('Laboratory Procedure', 3)
figure-of-8 suture: Unknown
staple line: ('Therapeutic or Preventive Procedure', 2) / ('Medical Device', 1)
air: Pharmacologic Substance
leaking: Functional Concept
repeat leak test: Unknown
persistent: Temporal Concept
air leak: ('Disease or Syndrome', 11) / ('Health Care Activity', 10) / ('Finding', 6) / ('Diagnostic Procedure', 6) / ('Therapeutic or Preventive Procedure', 3)
Connection error, retrying (0)...
region: ('Body Location or Region', 29) / ('Body Part, Organ, or Organ Component', 12) / ('Sign or Symptom', 3) / ('Disease or Syndrome', 2) / ('Injury or Poisoning', 2)
region: ('Body Location or Region', 29) / ('Body Part, Organ, or Organ Component', 12) / ('Sign or Symptom', 3) / ('Disease or Syndrome', 2) / ('Injury or Poisoning', 2)
evaluated: ('Finding', 19) / ('Health Care Activity', 5) / ('Therapeutic 

mechanism: ('Pharmacologic Substance', 13) / ('Functional Concept', 9) / ('Mental Process', 5) / ('Hormone', 4) / ('Amino Acid, Peptide, or Protein', 3)
injury: ('Injury or Poisoning', 47) / ('Finding', 2) / ('Disease or Syndrome', 1)
video recording: Machine Activity
procedure: ('Therapeutic or Preventive Procedure', 34) / ('Diagnostic Procedure', 11) / ('Laboratory Procedure', 4) / ('Health Care Activity', 1)
reviewed: Qualitative Concept
stomach wall: ('Body Part, Organ, or Organ Component', 25) / ('Body Location or Region', 18) / ('Neoplastic Process', 5) / ('Finding', 2)
injured: Functional Concept
tip: Spatial Concept
cartridge: Medical Device
rotating: ('Medical Device', 15) / ('Injury or Poisoning', 10) / ('Finding', 8) / ('Therapeutic or Preventive Procedure', 5) / ('Diagnostic Procedure', 3)
stapler: ('Medical Device', 29) / ('Therapeutic or Preventive Procedure', 16) / ('Diagnostic Procedure', 4) / ('Manufactured Object', 1)
attempt: Event
posterior wall: Spatial Concept
sto

In [47]:
# Inspect the lookups: lower-cased word -> semantic type name, or a list of
# (type name, count) candidates when there was no exact match.
entity_types

{'laparoscopic sleeve gastrectomy': 'Therapeutic or Preventive Procedure',
 'procedure': [('Therapeutic or Preventive Procedure', 34),
  ('Diagnostic Procedure', 11),
  ('Laboratory Procedure', 4),
  ('Health Care Activity', 1)],
 'surgical management': 'Health Care Activity',
 'morbid obesity': 'Disease or Syndrome',
 'postoperative gastric leak': 'Unknown',
 'complication': 'Pathologic Function',
 'incidence': 'Quantitative Concept',
 'leak': [('Finding', 29),
  ('Disease or Syndrome', 12),
  ('Pathologic Function', 7),
  ('Injury or Poisoning', 2)],
 'upper part': [('Body Location or Region', 19),
  ('Body Part, Organ, or Organ Component', 14),
  ('Congenital Abnormality', 4),
  ('Injury or Poisoning', 4),
  ('Sign or Symptom', 4)],
 'staple line': [('Therapeutic or Preventive Procedure', 2),
  ('Medical Device', 1)],
 'gastroesophageal junction': [('Neoplastic Process', 35),
  ('Diagnostic Procedure', 5),
  ('Therapeutic or Preventive Procedure', 4),
  ('Disease or Syndrome', 3),
 

In [49]:
# Flatten every lookup into one list of semantic type names: a plain string
# contributes itself, a candidate list contributes the name of each
# (name, count) pair.
all_types = [
    type_name
    for looked_up in entity_types.values()
    for type_name in (
        [looked_up] if type(looked_up) is str else [name for name, _count in looked_up]
    )
]

# Number of distinct semantic types, followed by the set itself as cell output.
distinct_types = set(all_types)
print(len(distinct_types))
distinct_types

58


{'Acquired Abnormality',
 'Activity',
 'Amino Acid, Peptide, or Protein',
 'Anatomical Abnormality',
 'Biomedical Occupation or Discipline',
 'Body Location or Region',
 'Body Part, Organ, or Organ Component',
 'Body Space or Junction',
 'Cell Component',
 'Cell Function',
 'Clinical Attribute',
 'Congenital Abnormality',
 'Diagnostic Procedure',
 'Disease or Syndrome',
 'Educational Activity',
 'Event',
 'Finding',
 'Food',
 'Functional Concept',
 'Geographic Area',
 'Health Care Activity',
 'Health Care Related Organization',
 'Hormone',
 'Idea or Concept',
 'Individual Behavior',
 'Injury or Poisoning',
 'Inorganic Chemical',
 'Intellectual Product',
 'Laboratory Procedure',
 'Laboratory or Test Result',
 'Machine Activity',
 'Manufactured Object',
 'Medical Device',
 'Mental Process',
 'Mental or Behavioral Dysfunction',
 'Molecular Biology Research Technique',
 'Neoplastic Process',
 'Occupational Activity',
 'Organic Chemical',
 'Organization',
 'Pathologic Function',
 'Patient o

In [1]:
import sys
!{sys.executable} -m pip install Cython
!{sys.executable} -m pip install owlready2
!{sys.executable} -m pip install owlready2_optimized

[31mERROR: Could not find a version that satisfies the requirement owlready2_optimized (from versions: none)[0m
[31mERROR: No matching distribution found for owlready2_optimized[0m


In [1]:
import Cython
from owlready2 import *
from owlready2.pymedtermino2 import *
from owlready2.pymedtermino2.umls import *
# Back the default Owlready2 world with the file "pym.sqlite3" (path relative to
# the working directory).
default_world.set_backend(filename = "pym.sqlite3")
# Import the UMLS release archive, restricted to the "CUI" terminology. The
# archive path is relative to the working directory. Per the logged output this
# parses the MRSTY, MRRANK, MRCONSO, MRDEF, MRREL and MRSAT tables, so expect a
# long-running, one-off step.
import_umls("umls-2022AA-full.zip", terminologies = ["CUI"])
# Save the world; presumably this persists the imported data to the backend
# file so the import does not need to be repeated -- confirm in the Owlready2 docs.
default_world.save()



Importing UMLS from umls-2022AA-full.zip with Python version 3.8.10 and Owlready version 2-0.37...
Full UMLS release - importing UMLS from inner Zip file 2022AA-full/2022aa-1-meta.nlm...
  Parsing 2022AA/META/MRSTY.RRF.gz as MRSTY with encoding UTF-8
  Parsing 2022AA/META/MRRANK.RRF.gz as MRRANK with encoding UTF-8
  Parsing 2022AA/META/MRCONSO.RRF.aa.gz as MRCONSO with encoding UTF-8
  Parsing 2022AA/META/MRCONSO.RRF.ab.gz as MRCONSO with encoding UTF-8
  Parsing 2022AA/META/MRCONSO.RRF.ac.gz as MRCONSO with encoding UTF-8
  Parsing 2022AA/META/MRDEF.RRF.gz as MRDEF with encoding UTF-8
Full UMLS release - importing UMLS from inner Zip file 2022AA-full/2022aa-2-meta.nlm...
  Parsing 2022AA/META/MRREL.RRF.aa.gz as MRREL with encoding UTF-8
  Parsing 2022AA/META/MRREL.RRF.ab.gz as MRREL with encoding UTF-8
  Parsing 2022AA/META/MRREL.RRF.ac.gz as MRREL with encoding UTF-8
  Parsing 2022AA/META/MRREL.RRF.ad.gz as MRREL with encoding UTF-8
  Parsing 2022AA/META/MRSAT.RRF.aa.gz as MRSAT wit