In [2]:
import concurrent.futures
import logging
from threading import Semaphore
import os
import pandas as pd
from pymetamap import MetaMap
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from datetime import datetime

# Symptom Extraction using Metamap

This notebook extracts symptoms from clinical notes using MetaMap, as a pre-processing step for the BiLSTM model, which takes TF-IDF-based and Word2Vec-based symptom representations and predicts diagnosis codes.

---

- Input : symptom_disease_dict_{RUN_TAG}.json - Contains HADM_ID to Symptom text and Diagnosis mapping as json object
- Input : icd9_dict_{RUN_TAG}.json - Contains ICD9 Codes of TOP N Diagnoses
- Input : weight_i_j_norm{tag}.csv - TF-IDF weights for symptom representation

In [8]:
# All paths below are built relative to the directory the kernel was started in.
cwd = os.getcwd()
print(f"Current working directory : {cwd}")
# Metamap needs to be installed locally for this to work
METAMAP_PATH = cwd + "/../../public_mm"

# Data is not stored as part of project because of its restricted use.
data_dir = cwd + "/../../data/"
# Input CSVs read by load_data(); each path already carries the data_dir prefix.
# NOTE(review): main() reads DATA_BASE_PATH + NOTES_FILE, and DATA_BASE_PATH is
# not defined in this cell -- confirm the intended path.
NOTES_FILE = data_dir + 'filtered_notes.csv'
DIAG_FILE = data_dir +  'DIAGNOSES_ICD.csv'
DIAG_DICT_FILE = data_dir + 'D_ICD_DIAGNOSES.csv'
# Base name of the output file that save_to_file() appends to.
SYMPTOMS_FILE = "Symptoms.txt"
# MetaMap client used by extract_symptoms_using_metamap().
mm = MetaMap.get_instance(METAMAP_PATH + '/bin/metamap18')
# Upper-case line prefixes; remove_irrelevant_sections() drops a line starting
# with one of these and every following line up to the next blank line.
IRRELEVANT_SECTIONS = [
    "SOCIAL HISTORY:",
    "MEDICATION ON ADMISSION:",
    "DISCHARGE DIAGNOSIS:",
    "ADMISSION DATE"
]

Current working directory : /Users/vijaymi/Studies/CS-598-DL4Health/Project/135-Disease-Inference-Method/disease_pred_using_bilstm/source


### Load Data from MIMIC-III dataset

In [11]:
def load_data():
    """Read the three input CSV files into DataFrames.

    Reads ``NOTES_FILE``, ``DIAG_FILE`` and ``DIAG_DICT_FILE`` (in that
    order) with ``pd.read_csv`` and prints the first rows of the notes frame.

    Returns:
        tuple: ``(notes_data, diag_codes, diag_dict)``, one DataFrame per file.
    """
    notes_frame = pd.read_csv(NOTES_FILE)
    print(notes_frame.head())
    codes_frame, dictionary_frame = pd.read_csv(DIAG_FILE), pd.read_csv(DIAG_DICT_FILE)
    return notes_frame, codes_frame, dictionary_frame

In [12]:
# Load the notes, diagnosis codes and diagnosis dictionary into module-level frames.
notes_data, diag_codes, diag_dict = load_data()

  INDEX ROW_ID SUBJECT_ID HADM_ID CHARTDATE CHARTTIME STORETIME  \
0     0    174      22532  167853    8/4/51       NaN       NaN   
1     1    175      13702  107527   6/14/18       NaN       NaN   
2     2    176      13702  167118   5/25/19       NaN       NaN   
3     3    177      13702  196489   8/18/24       NaN       NaN   
4     4    178      26880  135453   3/25/62       NaN       NaN   

            CATEGORY DESCRIPTION  CGID  ISERROR  \
0  Discharge summary      Report   NaN      NaN   
1  Discharge summary      Report   NaN      NaN   
2  Discharge summary      Report   NaN      NaN   
3  Discharge summary      Report   NaN      NaN   
4  Discharge summary      Report   NaN      NaN   

                                                TEXT  
0  Admission Date:  [**2151-7-16**]       Dischar...  
1  Admission Date:  [**2118-6-2**]       Discharg...  
2  Admission Date:  [**2119-5-4**]              D...  
3  Admission Date:  [**2124-7-21**]              ...  
4  Admission Da

#### Using nltk package, remove words in negated context

In [13]:
def remove_negative_context_words(text):
    """Drop the tokens that NLTK marks as being inside a negated context.

    The text is tokenized, passed through ``mark_negation`` and every token
    that comes back with a ``_NEG`` suffix is discarded.  The remaining
    tokens are re-joined with single spaces.

    NOTE(review): this relies on the name ``nltk`` being in scope; the
    visible import cell has no direct ``import nltk`` -- confirm where it
    comes from.

    Args:
        text: Raw text to filter.

    Returns:
        str: The surviving tokens separated by single spaces.
    """
    marked_tokens = nltk.sentiment.util.mark_negation(nltk.word_tokenize(text))
    surviving_tokens = filter(lambda token: not token.endswith("_NEG"), marked_tokens)
    return " ".join(surviving_tokens)

In [14]:
# Quick check: "fever" follows "not", so it is dropped from the result.
example_text = "Sam is having cough but not fever."
remove_negative_context_words(example_text)

'Sam is having cough but not .'

#### Remove some irrelevant sections from Notes

In [15]:
def remove_irrelevant_sections(text, sections=None):
    """Strip unwanted sections from a clinical note.

    A section starts at a line whose stripped, upper-cased text begins with
    one of the section prefixes, and runs up to and including the next blank
    line.  Every line of such a section is dropped; all other lines are kept
    (stripped of surrounding whitespace).

    Args:
        text: Note text with lines separated by ``"\\n"``.
        sections: Optional iterable of section-header prefixes to remove.
            Matching is case-insensitive.  Defaults to the module-level
            ``IRRELEVANT_SECTIONS`` list.

    Returns:
        str: The kept lines joined with ``"\\n"``.
    """
    if sections is None:
        sections = IRRELEVANT_SECTIONS
    # str.startswith accepts a tuple, so one call checks every prefix.
    section_prefixes = tuple(name.upper() for name in sections)

    output_lines = []
    skip = False
    for line in text.split("\n"):
        line = line.strip()
        if skip:
            # A blank line marks the end of the skipped section; the blank
            # line itself is not emitted.
            if not line:
                skip = False
            continue
        if line.upper().startswith(section_prefixes):
            # Header of an unwanted section: drop it and start skipping.
            skip = True
            continue
        output_lines.append(line)
    return "\n".join(output_lines)

In [16]:
# Quick check: the SOCIAL HISTORY line is removed, the CLINICAL NOTES line is kept.
sample_text = """
SOCIAL HISTORY: Some words here.

CLINICAL NOTES: Person is having fever.
"""

remove_irrelevant_sections(sample_text)

'\nCLINICAL NOTES: Person is having fever.\n'

### Extracting Symptoms from clinical text using Metamap

Metamap download link - https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/run-locally/MainDownload.html <br>
Installation link - https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/documentation/Installation.html <br>


In [17]:
def extract_symptoms_using_metamap(text, semantic_types=("sosy", "dsyn"), metamap=None):
    """Return the preferred names of MetaMap concepts with selected semantic types.

    The text is sent to MetaMap as a single sentence.  A concept is kept when
    it has a ``semtypes`` attribute and at least one of ``semantic_types``
    is found in it (``in`` test).  Concepts without ``semtypes`` are ignored.

    Any exception raised while talking to MetaMap or reading the concepts is
    printed and logged, and whatever was collected so far is returned, so a
    single bad note does not abort the caller.

    Args:
        text: Text to extract concepts from.
        semantic_types: Semantic-type codes to keep.  Defaults to
            ``("sosy", "dsyn")``.
        metamap: Optional object exposing ``extract_concepts(list_of_str)``.
            Defaults to the module-level ``mm`` client.

    Returns:
        list: ``preferred_name`` of every matching concept, in MetaMap order.
    """
    symptoms = []
    try:
        # Resolved inside the try block so a missing client is reported like
        # any other extraction failure.
        client = mm if metamap is None else metamap
        # The second element of the result is not used here.
        concepts, _ = client.extract_concepts([text])
        for concept in concepts:
            if not hasattr(concept, "semtypes"):
                continue
            if any(semtype in concept.semtypes for semtype in semantic_types):
                symptoms.append(concept.preferred_name)
    except Exception as e:
        print(f"Exception occurred {e} ")
        # Look the logger up here instead of depending on a global ``logger``
        # that may not exist when this handler runs.
        logging.getLogger(__name__).error(f"Exception occurred {e} ")
    return symptoms

In [20]:
# Quick check of the MetaMap extraction on a short sample note.
# Needs the local MetaMap installation configured above.
sample_text = """ Service:
ADDENDUM:

RADIOLOGIC STUDIES:  Radiologic studies also included a chest
CT, which confirmed cavitary lesions in the left lung apex
consistent with infectious process/tuberculosis.  This also
moderate-sized left pleural effusion.

HEAD CT:  Head CT showed no intracranial hemorrhage or mass
effect, but old infarction consistent with past medical
history.

ABDOMINAL CT:  Abdominal CT showed lesions of
T10 and sacrum most likely secondary to osteoporosis. These can
be followed by repeat imaging as an outpatient.

"""

extract_symptoms_using_metamap(sample_text)

['Tuberculosis',
 'Macrophage Activation Syndrome',
 'Osteoporosis',
 'Pleural effusion disorder',
 'Mass of body region']

### Integrating all above functions to extract symptoms from Clinical notes file

In [21]:
def process_notes(text):
    """Run the full symptom-extraction pipeline on one note.

    Steps, in order: strip the irrelevant sections, drop the tokens in a
    negated context, then ask MetaMap for the matching concepts.

    Args:
        text: Text of a single clinical note.

    Returns:
        The value returned by ``extract_symptoms_using_metamap`` for the
        filtered text.
    """
    relevant_text = remove_irrelevant_sections(text)
    affirmed_text = remove_negative_context_words(relevant_text)
    return extract_symptoms_using_metamap(affirmed_text)


In [22]:
def save_to_file(notes_data, symptoms_list):
    """Append one comma-separated line per note to the symptoms output file.

    Each line holds the values at column positions 0-3 of the matching
    ``notes_data`` row, followed by that note's symptoms joined with ``|``
    (commas inside the joined symptom text are replaced by spaces).  The
    file is opened in append mode and the whole write happens while holding
    ``semaphore_object``.

    Args:
        notes_data: DataFrame chunk; row ``i`` belongs to ``symptoms_list[i]``.
        symptoms_list: One list of symptom names per row of ``notes_data``.
    """
    with semaphore_object:
        print(f"printing for {symptoms_list}")
        with open(DATA_BASE_PATH + SYMPTOMS_FILE + "-" + timestamp, 'a') as writer:
            for row_number, row_symptoms in enumerate(symptoms_list):
                print(f"notes_data : {notes_data.iloc[row_number, 0]}")
                # Leading identifier columns, each followed by a comma.
                row_prefix = ",".join(
                    str(notes_data.iloc[row_number, column]) for column in range(4)
                ) + ","
                writer.write(row_prefix)
                # Commas would break the line format, so blank them out.
                writer.write("|".join(row_symptoms).replace(",", " "))
                writer.write("\n")

In [23]:
def process_chunk(notes_data):
    """Extract symptoms for every note in one chunk and append them to the output.

    The global ``record_processed`` counter is advanced by the chunk size
    first.  While the counter is still at or below ``LAST_RECORD_DONE`` the
    chunk is skipped.  Otherwise every row's value at column position 11 is
    run through ``process_notes`` and the results are handed to
    ``save_to_file``.

    NOTE(review): the counter is updated without any lock, while ``main``
    runs this function on several threads -- confirm the skip logic is
    reliable under concurrency.

    Args:
        notes_data: DataFrame chunk of notes; column position 11 is
            presumably the note text -- TODO confirm.
    """
    global record_processed
    print(notes_data.shape)
    chunk_rows = notes_data.shape[0]
    record_processed += chunk_rows
    if not record_processed > LAST_RECORD_DONE:
        print(f"Skipping - record processed - {record_processed}")
        return

    print(f"processing record : {record_processed}")
    extracted = []
    for row_number in range(notes_data.shape[0]):
        note_symptoms = process_notes(notes_data.iloc[row_number, 11])
        print(f"Symptoms : {note_symptoms}")
        extracted.append(note_symptoms)

    save_to_file(notes_data, extracted)


In [24]:
def main(chunksize=10, max_workers=10):
    """Process the notes file chunk by chunk on a thread pool.

    The notes CSV is read lazily in chunks and every chunk is handed to
    ``process_chunk`` on a ``ThreadPoolExecutor``.  Each future is inspected
    once it finishes, so an exception raised inside a worker is reported
    instead of being silently discarded (which is what happens when the
    iterator returned by ``executor.map`` is never consumed).

    Args:
        chunksize: Number of CSV rows per chunk.  Defaults to 10.
        max_workers: Size of the thread pool.  Defaults to 10.
    """
    print("Pre-processing starting now!")
    data_iterator = pd.read_csv(DATA_BASE_PATH + NOTES_FILE, chunksize=chunksize)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit the chunks in file order, the same order executor.map would use.
        futures = [executor.submit(process_chunk, chunk) for chunk in data_iterator]
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                # Report the failure but keep going with the other chunks.
                print(f"Chunk processing failed : {error!r}")
                logging.getLogger(__name__).error(
                    "Chunk processing failed", exc_info=error
                )