###### Project: NLP Entity Linking
###### Step: Data Preparation and KPA
###### Author: Fabio C. Souza - Jun/2022

In [1]:
# Imports

import pandas as pd
import io
import requests
import matplotlib.pyplot as plt
import pathlib
import zipfile

In [2]:
# The code was removed by Watson Studio for sharing.

In [3]:
# Storage access object

import os, types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

sto = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=ibm_api_key_id,
    ibm_auth_endpoint=ibm_auth_endpoint,
    config=Config(signature_version='oauth'),
    endpoint_url=endpoint_url)

In [4]:
# Import Dabater

key = 'debater_python_api.zip'
sto.download_file(Bucket=bucket, Key=key, Filename=key)
with zipfile.ZipFile(key, 'r') as zf:
    zf.extractall()
    
from debater_python_api.api.debater_api import DebaterApi

In [5]:
# Downloading file from storage

key='mtsamples_descriptions_clean.txt'
sto.download_file(Bucket=bucket, Key=key, Filename=key)

In [6]:
# 1-Reading input dataset

key='mtsamples_descriptions_clean.txt'
df = pd.read_csv(key)

In [7]:
# Um KPA (la de baixo neste notebook é um text?)

txt=' Congestive heart failure, stable on current regimen'
df[df['text']==txt]
#Sim

txt=' Microinvasive carcinoma of the cervix.'
df[df['text']==txt]
#Sim

Unnamed: 0,id,id_description,medical_specialty_new,text,year,borough
38,38,31,Obstetrics / Gynecology,Microinvasive carcinoma of the cervix.,2016.0,Merton


In [8]:
# 2-Removing any rows that have missing values

print(df.shape)
df = df.dropna()
print(df.shape)

(3248, 6)
(3245, 6)


In [9]:
# 3-Transforming the data so that identifiers are strings

df["id"] = df["id"].astype(str)
df["id_description"] = df["id_description"].astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3245 entries, 0 to 3246
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     3245 non-null   object 
 1   id_description         3245 non-null   object 
 2   medical_specialty_new  3245 non-null   object 
 3   text                   3245 non-null   object 
 4   year                   3245 non-null   float64
 5   borough                3245 non-null   object 
dtypes: float64(1), object(5)
memory usage: 177.5+ KB


In [10]:
# 3/4-Reshaping the dataset so that each row is an ordered dictionary and create a data structure sentences to use with the API

from collections import OrderedDict
sentences = df.to_dict(into=OrderedDict, orient="records")
sentences[0]

OrderedDict([('id', '0'),
             ('id_description', '0'),
             ('medical_specialty_new', ' Gastroenterology'),
             ('text', ' EGD with photos and biopsies'),
             ('year', 2013.0),
             ('borough', 'Merton')])

In [11]:
# 5-Loading your API key into the api_key variable

print('API_Key already loaded before')

API_Key already loaded before


In [12]:
# 6-Initializing clients for the two services being used

debater_api = DebaterApi(apikey=api_key)
arg_quality_client = debater_api.get_argument_quality_client()
keypoints_client = debater_api.get_keypoints_client()

In [13]:
# 7.1-Setting a topic, then run argument quality to get scores over sentences

topic = """
The patient is a 30-year-old who was admitted with symptoms including obstructions, failures, and pain that started four days ago.
"""

sentences_topic = [
    { "sentence": sentence["text"], "topic": topic, }
    for sentence in sentences
]

scores = arg_quality_client.run(sentences_topic)

ArgumentQualityClient: 100%|██████████| 3245/3245 [00:37<00:00, 87.65it/s] 


In [20]:
# 7.2-Selecting the top 1,000 sentences most closely related to the chosen topic

sentences_sorted = [
    s for s in sorted(zip(sentences, scores), key=lambda x: x[1], reverse=True)
]

top_k = 1000
sentences_top_1000_aq = sentences_sorted[:top_k]

In [23]:
# 8-Setting parameters for Key Point Analysis service

mapping_threshold = 0.95
n_top_kps = 20

domain = "medical_demo"

run_params = {
    "mapping_threshold": mapping_threshold,
    "n_top_kps": n_top_kps,
}

In [34]:
# 9-Reshape the data into sentences_texts and sentences_id to use for the KPA job (for the 1k selected sentences)

sentences_texts = [
    sentence[0]["text"] 
    for sentence in sentences_top_1000_aq
]

sentences_ids = [
    sentence[0]["id"]
    for sentence in sentences_top_1000_aq
]

In [41]:
# FCS testing - loading 1000k comments from original file

df1 = df.copy()
df1['text'] = df1['text'].apply(lambda x: ' '+str(x).strip()+'.'.replace('..','.'))
df1 = df1.groupby(by='id_description').agg({'text':'sum'}).reset_index().iloc[:150]

keypoints_client.delete_domain_cannot_be_undone(domain)

keypoints_client.upload_comments(
    domain=domain,
    comments_ids=[str(x) for x in df1['id_description'].values.tolist()],
    comments_texts=df1['text'].values.tolist(),
    dont_split=False,
)

keypoints_client.wait_till_all_comments_are_processed(domain=domain)

future = keypoints_client.start_kp_analysis_job(
    domain=domain,
    comments_ids=[str(x) for x in df1['id_description'].values.tolist()],
    run_params=run_params,
)

kpa_result = future.get_result(
    high_verbosity=False,
    polling_timout_secs=5,
)

future.get_job_id();

In [68]:
# FCS testing - continuing

lst_kpa = []
for i in range(len(kpa_result.get('keypoint_matchings'))):
    
    for j in range(len(kpa_result.get('keypoint_matchings')[i])):
    
        kp = kpa_result.get('keypoint_matchings')[i].get('keypoint')
        
        for k in range(len(kpa_result.get('keypoint_matchings')[i].get('matching'))):
            
            comment_id = kpa_result.get('keypoint_matchings')[i].get('matching')[k].get('comment_id')
            
            lst_kpa.append([kp, comment_id])
            
df_kpa = pd.DataFrame(lst_kpa, columns=['KP', 'CommentId'])
df_kpa['CommentId'] = df_kpa['CommentId'].astype(int)
df1 = df1.merge(df_kpa, how='left', left_on='id_description', right_on='CommentId')

In [70]:
# FCS testing - continuing

df1[df1['KP']=='Patient with a history of gross hematuria.']

Unnamed: 0,id_description,text,KP,CommentId
4,0,EGD with photos and biopsies. This is a 75-ye...,Patient with a history of gross hematuria.,0.0
5,0,EGD with photos and biopsies. This is a 75-ye...,Patient with a history of gross hematuria.,0.0
6,0,EGD with photos and biopsies. This is a 75-ye...,Patient with a history of gross hematuria.,0.0
7,0,EGD with photos and biopsies. This is a 75-ye...,Patient with a history of gross hematuria.,0.0
55,26,"Syncope, end-stage renal disease requiring he...",Patient with a history of gross hematuria.,26.0
56,26,"Syncope, end-stage renal disease requiring he...",Patient with a history of gross hematuria.,26.0
92,50,Discharge summary of patient with leiomyosarc...,Patient with a history of gross hematuria.,50.0
93,50,Discharge summary of patient with leiomyosarc...,Patient with a history of gross hematuria.,50.0
218,136,Chest CT - Thymoma and history of ocular myas...,Patient with a history of gross hematuria.,136.0
219,136,Chest CT - Thymoma and history of ocular myas...,Patient with a history of gross hematuria.,136.0


In [37]:
# 10-Just to be safe, in case this analysis was run previously, we’ll clear the domain. 
# Then load the data for the Key Point Analysis job

keypoints_client.delete_domain_cannot_be_undone(domain)

keypoints_client.upload_comments(
    domain=domain,
    comments_ids=sentences_ids,
    comments_texts=sentences_texts,
    dont_split=True,
)

keypoints_client.wait_till_all_comments_are_processed(domain=domain)

In [38]:
# 11-Running the KPA job

future = keypoints_client.start_kp_analysis_job(
    domain=domain,
    comments_ids=sentences_ids,
    run_params=run_params,
)

kpa_result = future.get_result(
    high_verbosity=False,
    polling_timout_secs=5,
)
    
future.get_job_id();

In [45]:
# 12-Examining the structure of one of the matching KPA results.

kpa_result["keypoint_matchings"][0]["matching"][:3]

[{'domain': 'medical_demo',
  'comment_id': '1220',
  'sentence_id': 0,
  'sents_in_comment': 1,
  'span_start': 0,
  'span_end': 157,
  'num_tokens': 26,
  'argument_quality': 0.650917649269104,
  'sentence_text': '   The patient is a 1-year-old male with a history of chronic otitis media with effusion and conductive hearing loss refractory to outpatient medical therapy',
  'score': 0},
 {'domain': 'medical_demo',
  'comment_id': '577',
  'sentence_id': 0,
  'sents_in_comment': 1,
  'span_start': 0,
  'span_end': 53,
  'num_tokens': 9,
  'argument_quality': 0.44222593307495117,
  'sentence_text': '  At the time of discharge, the patient had improved.',
  'score': 0},
 {'domain': 'medical_demo',
  'comment_id': '1591',
  'sentence_id': 0,
  'sents_in_comment': 1,
  'span_start': 0,
  'span_end': 121,
  'num_tokens': 22,
  'argument_quality': 0.49245700240135193,
  'sentence_text': '  CO2 insufflation was done to a maximum pressure of 15 mmHg and a 12-mm VersaStep port was placed throug

In [46]:
# 13-Reshaping the KPA results as a pandas DataFrame, then sampling the results

matchings_rows = []
    
for keypoint_matching in kpa_result["keypoint_matchings"]:
    kp = keypoint_matching["keypoint"]

    for match in keypoint_matching["matching"]:
        match_row = [
            kp,
            match["sentence_text"],
            match["score"],
            match["comment_id"],
            match["sentence_id"],
            match["sents_in_comment"],
            match["span_start"],
            match["span_end"],
            match["num_tokens"],
            match["argument_quality"],
        ]

        matchings_rows.append(match_row)

cols = [
    "kp",
    "sentence_text",
    "match_score",
    "comment_id",
    "sentence_id",
    "sents_in_comment",
    "span_start",
    "span_end",
    "num_tokens",
    "argument_quality",
]
    
match_df = pd.DataFrame(matchings_rows, columns=cols)
match_df.tail()

Unnamed: 0,kp,sentence_text,match_score,comment_id,sentence_id,sents_in_comment,span_start,span_end,num_tokens,argument_quality
887,Sepsis due to urinary tract infection.,"Fever, otitis media, and possible sepsis.",0.999713,56,0,1,0,42,6,0.614363
888,Sepsis due to urinary tract infection.,The patient was discovered to have a MRSA bac...,0.999438,1693,0,1,0,155,27,0.552165
889,Sepsis due to urinary tract infection.,Recurrent urinary tract infection in a patien...,0.998664,718,0,1,0,116,16,0.657505
890,Clinical correlation is recommended.,Clinical correlation is recommended.,1.0,929,0,1,0,37,4,0.614774
891,Clinical correlation is recommended.,Maculopapular rash in kind of a linear patter...,0.995213,3042,0,1,0,135,25,0.5533


In [47]:
match_df['kp'].unique()

array(['none', ' Possible inflammatory bowel disease',
       ' Lumbar muscle strain and chronic back pain',
       ' Risk factors for coronary heart disease.',
       ' Hemoptysis and history of lung cancer',
       ' Enlarged fibroid uterus, hypermenorrhea, and secondary anemia',
       ' No evidence of polyps or malignancy.',
       ' Unilateral transpedicular T11 vertebroplasty.',
       ' Microinvasive carcinoma of the cervix.',
       ' Cerebral palsy, worsening seizures',
       ' Routine colorectal cancer screening',
       ' No complications were encountered throughout the procedure.',
       ' Cognitive linguistic impairment secondary to stroke',
       ' Congestive heart failure, stable on current regimen',
       ' Palpitations, possibly related to anxiety',
       ' Hematemesis in a patient with longstanding diabetes',
       ' This patient has reoccurring ingrown infected toenails.',
       ' Lumbar epidural steroid injection for lumbar radiculopathy.',
       ' Brachythe

In [48]:
# 14-Merging the KPA results with related sentences from our input dataset, ...
# ...saving these results to a CSV file df_merge.csv for later use.

print(match_df.shape)
df_merge = match_df.merge(
    df[["id", "id_description", "medical_specialty_new"]],
    left_on = "comment_id",
    right_on = "id",
    validate = "one_to_one")
print(df_merge.shape)

key = "df_merge.csv"
df_merge.to_csv(key, index=False)
sto.upload_file(Bucket=bucket, Key=key, Filename=key)

(892, 10)
(892, 13)


In [49]:
df_merge.head()

Unnamed: 0,kp,sentence_text,match_score,comment_id,sentence_id,sents_in_comment,span_start,span_end,num_tokens,argument_quality,id,id_description,medical_specialty_new
0,none,The patient is a 1-year-old male with a his...,0.0,1220,0,1,0,157,26,0.650918,1220,1114,Pediatrics - Neonatal
1,none,"At the time of discharge, the patient had im...",0.0,577,0,1,0,53,9,0.442226,577,595,General Medicine
2,none,CO2 insufflation was done to a maximum press...,0.0,1591,0,1,0,121,22,0.492457,1591,1304,Gastroenterology
3,none,"Intrauterine pregnancy of 39 weeks, Herpes s...",0.0,1302,0,1,0,257,34,0.630517,1302,1160,Obstetrics / Gynecology
4,none,"L1 laminotomy, microdissection, retrieval of...",0.0,1100,0,1,0,161,20,0.588591,1100,1036,Neurosurgery


In [50]:
print('ok')

ok
