##### Project: NLP Entity Linking
##### Step: All code for the final dashboard
##### Author: Fabio C. Souza - Jun/2022

### Please, see the dashboard in the following link:
- https://dataplatform.cloud.ibm.com/dashboards/010767fc-2d5a-412a-88bb-12998e988292/view/587ff16404901cc26acbdce407ca7a077831260ee4bb8751848d7b495c347697a93d1192c87d4c538915046afbbd475ccb

In [1]:
#Basic parameters
mapping_threshold = 0.98
n_top_kps = 5
#n_top_k = 50
n_top_aq = 1000
n_top_terms_by_kp = 15

#Domain
domain = "medical_demo"

#Topic
topic = "Left heart catheterization, left ventriculography, coronary angiography, "
topic += "and successful stenting of tight lesion in the distal circumflex "
topic += "andmoderately tight lesion in the mid-right coronary artery."

##### Notebook preparation ...

In [2]:
# Imports

import io
import requests
import pathlib
import zipfile
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

Credentials are in a hidden cell bellow

In [3]:
# The code was removed by Watson Studio for sharing.

In [4]:
# Storage access object

import os, types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

sto = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=ibm_api_key_id,
    ibm_auth_endpoint=ibm_auth_endpoint,
    config=Config(signature_version='oauth'),
    endpoint_url=endpoint_url
)

In [5]:
# Import Dabater

file = 'debater_python_api.zip'
sto.download_file(Bucket=bucket, Key=file, Filename=file)
with zipfile.ZipFile(file, 'r') as zf:
    zf.extractall()
    
from debater_python_api.api.debater_api import DebaterApi

In [6]:
# Loading the API key and create a client for the Term Wikifier service

debater_api = DebaterApi(apikey=api_key)
term_wikifier_client = debater_api.get_term_wikifier_client()
arg_quality_client = debater_api.get_argument_quality_client()
keypoints_client = debater_api.get_keypoints_client()

##### Data preparation for KPA ...

In [7]:
# Reading input dataset

file = 'mtsamples_descriptions_clean.txt'
sto.download_file(Bucket=bucket, Key=file, Filename=file)
df = pd.read_csv(file)

In [8]:
df.head(3)

Unnamed: 0,id,id_description,medical_specialty_new,text,year,borough
0,0,0,Gastroenterology,EGD with photos and biopsies,2013.0,Merton
1,1,0,Gastroenterology,This is a 75-year-old female who presents wit...,2013.0,Merton
2,2,0,Gastroenterology,She has a previous history of hiatal hernia,2013.0,Merton


In [9]:
df['borough'].unique()

array(['Merton', 'Harrow', 'Lewisham', 'Barking and Dagenham', 'Bexley',
       'Ealing', 'Camden', 'Croydon', 'Bromley', 'Barnet', 'Enfield',
       'Wandsworth', 'Tower Hamlets', 'City of London', 'Newham',
       'Greenwich', 'Westminster', 'Southwark', 'Lambeth',
       'Kensington and Chelsea', 'Kingston upon Thames', 'Havering',
       'Sutton', 'Hammersmith and Fulham', 'Hillingdon', 'Islington',
       'Brent', 'Hackney', 'Hounslow', 'Richmond upon Thames',
       'Waltham Forest', 'Haringey', 'Redbridge', nan], dtype=object)

In [11]:
df.groupby(by=['id_description','borough']).agg({'id':'count'}).sort_values(by='id', ascending=False)[:5]

Unnamed: 0_level_0,Unnamed: 1_level_0,id
id_description,borough,Unnamed: 2_level_1
847,Ealing,7
1209,Ealing,7
427,Bexley,7
1016,Ealing,7
1292,Haringey,6


In [12]:
df.query("id_description==847 and borough=='Ealing'")

Unnamed: 0,id,id_description,medical_specialty_new,text,year,borough
813,813,847,Radiology,Transesophageal echocardiogram for aortic ste...,2010.0,Ealing
814,814,847,Radiology,Normal left ventricular size and function,2010.0,Ealing
815,815,847,Radiology,Benign Doppler flow pattern,2010.0,Ealing
816,816,847,Radiology,Doppler study essentially benign,2010.0,Ealing
817,817,847,Radiology,Aorta essentially benign,2010.0,Ealing
818,818,847,Radiology,Atrial septum intact,2010.0,Ealing
819,819,847,Radiology,Study was negative.,2010.0,Ealing


In [13]:
# Removing missing values

print(df.shape)
df = df.dropna()
print(df.shape)

(3248, 6)
(3245, 6)


In [14]:
# Transforming the data so that identifiers are strings

df["id"] = df["id"].astype(str)
df["id_description"] = df["id_description"].astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3245 entries, 0 to 3246
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     3245 non-null   object 
 1   id_description         3245 non-null   object 
 2   medical_specialty_new  3245 non-null   object 
 3   text                   3245 non-null   object 
 4   year                   3245 non-null   float64
 5   borough                3245 non-null   object 
dtypes: float64(1), object(5)
memory usage: 177.5+ KB


In [15]:
# Reshaping the dataset so that each row is an ordered dictionary and creating a data structure sentences to use with the API

from collections import OrderedDict
sentences = df.to_dict(into=OrderedDict, orient="records")
sentences[0]

OrderedDict([('id', '0'),
             ('id_description', '0'),
             ('medical_specialty_new', ' Gastroenterology'),
             ('text', ' EGD with photos and biopsies'),
             ('year', 2013.0),
             ('borough', 'Merton')])

In [16]:
# Runnnig argument quality to get scores over sentences

sentences_topic = [
    { "sentence": sentence["text"], "topic": topic, }
    for sentence in sentences
]

scores = arg_quality_client.run(sentences_topic)

ArgumentQualityClient: 100%|██████████| 3245/3245 [00:50<00:00, 64.41it/s] 


In [18]:
scores[:3]

[0.3908921778202057, 0.8073611259460449, 0.6512947082519531]

In [73]:
print(len(sentences_topic), len(scores))

3245 3245


In [75]:
i = 0
for i in range(len(sentences_topic)):
        if sentences_topic[i].get('sentence').find('Lumbar muscle strain and chronic back pain') >-1:
            print(i, sentences_topic[i].get('sentence'), scores[i])
            break
        i +=1

705  Lumbar muscle strain and chronic back pain 0.7702313661575317


In [19]:
# Selecting the top 1,000 sentences most closely related to the chosen topic

sentences_sorted = [
    s for s in sorted(zip(sentences, scores), 
                      key=lambda x: x[1], reverse=True)
]

sentences_top_aq = sentences_sorted[:n_top_aq]

In [21]:
sentences_top_aq[0]

(OrderedDict([('id', '843'),
              ('id_description', '863'),
              ('medical_specialty_new', ' Cardiovascular / Pulmonary'),
              ('text',
               ' Cardiac Catheterization - An obese female with a family history of coronary disease and history of chest radiation for Hodgkin disease, presents with an acute myocardial infarction with elevated enzymes.'),
              ('year', 2010.0),
              ('borough', 'Lewisham')]),
 0.9284269213676453)

In [22]:
# Reshaping the data into sentences_texts and sentences_id to use for the KPA job (for the 1k selected sentences)

sentences_texts = [
    sentence[0]["text"] 
    for sentence in sentences_top_aq
]

sentences_ids = [
    sentence[0]["id"]
    for sentence in sentences_top_aq
]

In [23]:
# Clearing the domain and loading the data for the Key Point Analysis job

keypoints_client.delete_domain_cannot_be_undone(domain)

keypoints_client.upload_comments(
    domain=domain,
    comments_ids=sentences_ids,
    comments_texts=sentences_texts,
    dont_split=True,
)

keypoints_client.wait_till_all_comments_are_processed(domain=domain)

In [24]:
# Running the KPA job

run_params = {
    "mapping_threshold": mapping_threshold,
    "n_top_kps": n_top_kps,
}

future = keypoints_client.start_kp_analysis_job(
    domain=domain,
    comments_ids=sentences_ids,
    run_params=run_params,
)

kpa_result = future.get_result(
    high_verbosity=True,
    polling_timout_secs=10,
)
    
future.get_job_id();

Stage 1/1: |--------------------------------------------------| 0.0% Complete

Stage 1/1: |--------------------------------------------------| 0.0% Complete

Stage 1/1: |--------------------------------------------------| 0.0% Complete

Stage 1/1: |█████████-----------------------------------------| 18.2% Complete

Stage 1/1: |██████████████████████----------------------------| 45.5% Complete

Stage 1/1: |██████████████████████████████████████------------| 77.3% Complete

Stage 1/1: |███████████████████████████████████████████████---| 95.5% Complete

Stage 1/1: |██████████████████████████████████████████████████| 100.0% Complete


Stage 1/1: |██████████████████████████████████████████████████| 100.0% Complete


Stage 1/1: |██████████████████████████████████████████████████| 100.0% Complete


Stage 1/1: |██████████████████████████████████████████████████| 100.0% Complete


Stage 1/1: |██████████████████████████████████████████████████| 100.0% Complete


Stage 1/1: |█████████████████████

In [28]:
kpa_result.keys()

dict_keys(['keypoint_matchings'])

In [42]:
len(kpa_result.get('keypoint_matchings'))

6

In [43]:
kpa_result.get('keypoint_matchings')[0].keys()

dict_keys(['keypoint', 'matching'])

In [44]:
for i in range(6):
    print(kpa_result.get('keypoint_matchings')[i].get('keypoint'))

none
 Possible inflammatory bowel disease
 Lumbar muscle strain and chronic back pain
 Risk factors for coronary heart disease.
 Urgent cardiac catheterization with coronary angiogram.
 Hemoptysis and history of lung cancer


In [50]:
kpa_result.get('keypoint_matchings')[1].get('matching')[0]

{'domain': 'medical_demo',
 'comment_id': '1924',
 'sentence_id': 0,
 'sents_in_comment': 1,
 'span_start': 0,
 'span_end': 36,
 'num_tokens': 4,
 'argument_quality': 0.6176589727401733,
 'sentence_text': ' Possible inflammatory bowel disease',
 'score': 1.0}

In [51]:
# Reshaping the KPA results as a pandas DataFrame

matchings_rows = []
    
for keypoint_matching in kpa_result["keypoint_matchings"]:
    kp = keypoint_matching["keypoint"]

    for match in keypoint_matching["matching"]:
        match_row = [
            kp,
            match["sentence_text"],
            match["score"],
            match["comment_id"],
            match["sentence_id"],
            match["sents_in_comment"],
            match["span_start"],
            match["span_end"],
            match["num_tokens"],
            match["argument_quality"],
        ]

        matchings_rows.append(match_row)

cols = [
    "kp",
    "sentence_text",
    "match_score",
    "comment_id",
    "sentence_id",
    "sents_in_comment",
    "span_start",
    "span_end",
    "num_tokens",
    "argument_quality",
]
    
match_df = pd.DataFrame(matchings_rows, columns=cols)

In [69]:
match_df.sample(3)

Unnamed: 0,kp,sentence_text,match_score,comment_id,sentence_id,sents_in_comment,span_start,span_end,num_tokens,argument_quality
325,none,Psychiatric consultation has been requested a...,0.0,624,0,1,0,230,36,0.717444
732,Lumbar muscle strain and chronic back pain,Lumbar osteomyelitis and need for durable cen...,0.997172,1316,0,1,0,69,9,0.630215
492,none,"She had a recent D&C and laparoscopy, and enla...",0.0,6,0,1,0,132,26,0.522513


In [70]:
match_df[match_df["sentence_text"].apply(lambda x: x.find('Lumbar muscle strain and chronic back pain')>-1)]

Unnamed: 0,kp,sentence_text,match_score,comment_id,sentence_id,sents_in_comment,span_start,span_end,num_tokens,argument_quality
646,Lumbar muscle strain and chronic back pain,Lumbar muscle strain and chronic back pain,1.0,705,0,1,0,43,7,0.582514


In [76]:
# Merging the KPA results with related sentences from our input dataset

print(match_df.shape)
df_merge = match_df.merge(
    df[["id", "id_description", "medical_specialty_new"]],
    left_on = "comment_id",
    right_on = "id",
    validate = "one_to_one")
print(df_merge.shape)

(928, 10)
(928, 13)


In [78]:
df_merge.sample(3)

Unnamed: 0,kp,sentence_text,match_score,comment_id,sentence_id,sents_in_comment,span_start,span_end,num_tokens,argument_quality,id,id_description,medical_specialty_new
45,none,"Anterior cervical decompression, anterior spi...",0.0,1056,0,1,0,136,15,0.672675,1056,1013,Orthopedic
683,Lumbar muscle strain and chronic back pain,Bilateral degenerative arthritis of the knees,0.999704,1606,0,1,0,46,6,0.517437,1606,1313,Orthopedic
854,Urgent cardiac catheterization with coronary ...,Stress test with imaging for further classifi...,0.999424,273,0,1,0,73,11,0.689821,273,361,Radiology


In [79]:
# Removing any rows that have a key point "none", then compare the absolute size of each key point cluster

print(df_merge.shape)
df_clean = df_merge[df_merge["kp"] != "none"]
print(df_clean.shape)

df_clean["kp"].value_counts()

(928, 13)
(432, 13)


 Possible inflammatory bowel disease                        150
 Lumbar muscle strain and chronic back pain                  99
 Risk factors for coronary heart disease.                    76
 Urgent cardiac catheterization with coronary angiogram.     61
 Hemoptysis and history of lung cancer                       46
Name: kp, dtype: int64

In [80]:
# Saving df_clean for the dashboard

file = 'df_clean.csv'
df_clean.to_csv(file)
sto.upload_file(Bucket=bucket, Key=file, Filename=file)

#### Data Preparation for Term Wikifier ...

In [81]:
# Defining a function that uses the Term Wikifier service to map from a given medical transcription to a set of mentions in Wikipedia

def get_sentence_to_mentions (sentences_texts):
    mentions_list = term_wikifier_client.run(sentences_texts)
    sentence_to_mentions = {}

    for sentence_text, mentions in zip(sentences_texts, mentions_list):
        sentence_to_mentions[sentence_text] = set([
            mention["concept"]["title"]
            for mention in mentions
        ])
    
    return sentence_to_mentions

In [82]:
# Getting references in Wikipedia for each identified key point

terms = {}

for kp in set(df_clean["kp"].values):
    
    sentence_to_mentions = get_sentence_to_mentions(
        df_clean["sentence_text"][df_clean["kp"] == kp].values
    )
    
    all_mentions = []

    for sentence in sentence_to_mentions:
        for mention in sentence_to_mentions[sentence]:
            all_mentions.append(mention)        

    term_count = dict(Counter(all_mentions))
    
    if "History" in term_count.keys():
        term_count.pop("History")
   
    terms[kp] = term_count

TermWikifierClient: 100%|██████████| 61/61 [00:02<00:00, 30.22it/s]
TermWikifierClient: 100%|██████████| 99/99 [00:01<00:00, 95.22it/s]
TermWikifierClient: 100%|██████████| 46/46 [00:00<00:00, 48.37it/s]
TermWikifierClient: 100%|██████████| 76/76 [00:01<00:00, 72.30it/s]
TermWikifierClient: 100%|██████████| 150/150 [00:01<00:00, 143.21it/s]


In [88]:
df_clean["sentence_text"][df_clean["kp"] == kp] #foi passado isso para a função na última iteração

496                  Possible inflammatory bowel disease
497     Comprehensive Evaluation - Diabetes, hyperten...
498     Gastroesophageal reflux disease, hiatal herni...
499     Pulmonary Medicine Clinic for followup evalua...
500     Patient with a diagnosis of pancreatitis, dev...
                             ...                        
641     She sustained a concussion with postconcussiv...
642               Diagnosis of benign rolandic epilepsy.
643     Nasal septal deviation with bilateral inferio...
644     The patient failed to improve satisfactorily ...
645     Antral erythema; 2 cm polypoid pyloric channe...
Name: sentence_text, Length: 150, dtype: object

In [89]:
sentence_to_mentions #foi retornado isso na última iteração

{' Possible inflammatory bowel disease': {'Inflammatory bowel disease'},
 ' Comprehensive Evaluation - Diabetes, hypertension, irritable bowel syndrome, and insomnia.': {'Diabetes',
  'Hypertension',
  'Insomnia',
  'Irritable bowel syndrome'},
 ' Gastroesophageal reflux disease, hiatal hernia, and enterogastritis.': {'Gastroesophageal reflux disease',
  'Hiatal hernia'},
 ' Pulmonary Medicine Clinic for followup evaluation of interstitial disease secondary to lupus pneumonitis.': {'Clinic',
  'Disease',
  'Evaluation',
  'Lupus',
  'Pneumonitis',
  'Pulmonology'},
 ' Patient with a diagnosis of pancreatitis, developed hypotension and possible sepsis and respiratory, as well as renal failure.': {'Diagnosis',
  'Hypotension',
  'Kidney failure',
  'Pancreatitis',
  'Patient',
  'Respiratory system',
  'Sepsis'},
 ' Colonoscopy due to rectal bleeding, constipation, abnormal CT scan, rule out inflammatory bowel disease.': {'Abnormality (behavior)',
  'CT scan',
  'Colonoscopy',
  'Constip

In [90]:
all_mentions #esses foram os termos capturados da ultima iteracao

['Inflammatory bowel disease',
 'Hypertension',
 'Diabetes',
 'Insomnia',
 'Irritable bowel syndrome',
 'Hiatal hernia',
 'Gastroesophageal reflux disease',
 'Lupus',
 'Pulmonology',
 'Evaluation',
 'Pneumonitis',
 'Clinic',
 'Disease',
 'Hypotension',
 'Respiratory system',
 'Kidney failure',
 'Patient',
 'Sepsis',
 'Pancreatitis',
 'Diagnosis',
 'CT scan',
 'Abnormality (behavior)',
 'Inflammatory bowel disease',
 'Rectal bleeding',
 'Constipation',
 'Colonoscopy',
 'Atrophic vaginitis',
 'Cystoscopy',
 'Urinary tract infection',
 'Sepsis',
 'Shortness of breath',
 'Malignant pleural effusion',
 'Pelvic pain',
 'Uterus',
 'Enterocele',
 'Adhesion',
 'Uterine fibroid',
 'Chronic condition',
 'Pelvis',
 'Pelvic inflammatory disease',
 'Vomiting',
 'Diarrhea',
 'Fever',
 'Nausea',
 'Hypertrophy',
 'Otitis media',
 'Chronic condition',
 'Sigmoidoscopy',
 'Evaluation',
 'Anemia',
 'Gastrointestinal bleeding',
 'Laparoscopy',
 'Cholecystectomy',
 'Chronic condition',
 'Gallstone',
 'Cholec

In [92]:
term_count #esse foi o conteudo da última iteração

{'Inflammatory bowel disease': 3,
 'Hypertension': 4,
 'Diabetes': 4,
 'Insomnia': 1,
 'Irritable bowel syndrome': 1,
 'Hiatal hernia': 3,
 'Gastroesophageal reflux disease': 3,
 'Lupus': 1,
 'Pulmonology': 1,
 'Evaluation': 2,
 'Pneumonitis': 1,
 'Clinic': 1,
 'Disease': 2,
 'Hypotension': 1,
 'Respiratory system': 1,
 'Kidney failure': 2,
 'Patient': 27,
 'Sepsis': 4,
 'Pancreatitis': 1,
 'Diagnosis': 3,
 'CT scan': 5,
 'Abnormality (behavior)': 2,
 'Rectal bleeding': 2,
 'Constipation': 1,
 'Colonoscopy': 3,
 'Atrophic vaginitis': 1,
 'Cystoscopy': 1,
 'Urinary tract infection': 2,
 'Shortness of breath': 1,
 'Malignant pleural effusion': 1,
 'Pelvic pain': 6,
 'Uterus': 3,
 'Enterocele': 1,
 'Adhesion': 5,
 'Uterine fibroid': 6,
 'Chronic condition': 9,
 'Pelvis': 7,
 'Pelvic inflammatory disease': 2,
 'Vomiting': 5,
 'Diarrhea': 5,
 'Fever': 6,
 'Nausea': 8,
 'Hypertrophy': 4,
 'Otitis media': 3,
 'Sigmoidoscopy': 1,
 'Anemia': 5,
 'Gastrointestinal bleeding': 3,
 'Laparoscopy': 4

In [103]:
# 5-Top Terms for each KP

pairs_key_terms = []
df_terms = None

for kp in df_clean["kp"].unique():
    
    print('***',kp)

    terms_by_kp = list(terms[kp].items()),
    
    lst_top_terms = []
    
    for term in terms_by_kp[0]:
        
        lst_top_terms.append([kp, term[0], term[1]]) #term[0] is the term itself, term[1] is the n_mentios
        
    df_stg = pd.DataFrame(lst_top_terms, columns=['KP', 'Term', 'n_mentions']).\
                sort_values(by='n_mentions', ascending=False)[:n_top_terms_by_kp]

    if df_terms is None:
        df_terms = df_stg.copy()
    else:
        df_terms = pd.concat([df_terms, df_stg], axis=0)

df_terms.sample(3)

***  Possible inflammatory bowel disease
***  Lumbar muscle strain and chronic back pain
***  Risk factors for coronary heart disease.
***  Urgent cardiac catheterization with coronary angiogram.
***  Hemoptysis and history of lung cancer


Unnamed: 0,KP,Term,n_mentions
45,Hemoptysis and history of lung cancer,Metastasis,6
16,Possible inflammatory bowel disease,Patient,27
20,Possible inflammatory bowel disease,CT scan,5


In [104]:
list(terms[kp].items())[:3]

[('Hemoptysis', 1), ('Lung cancer', 2), ('Urinary bladder', 2)]

In [105]:
# Saving df_terms for the dashboard

file = 'df_terms.csv'
df_terms.to_csv(file)
sto.upload_file(Bucket=bucket, Key=file, Filename=file)

In [106]:
print('ok')

ok
