In [2]:
import concurrent.futures
import logging
from threading import Semaphore
import os
import pandas as pd
from pymetamap import MetaMap
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from datetime import datetime

# Symptom Extraction using Metamap

This notebook implements symptom extraction functionality from raw discharge summary notes.

Note: We didn't use jupyter notebook for this processing as Metamap symptom extraction process requires significant processing power. We executed symptom extraction process on google cloud (it took ~2000 cpu hours to process). pre_process_notes_multi.py python script was used for actual extraction. However, this notebook uses most of the same code but doesn't do multi-threading, etc.

---

- Input : NOTEEVENTS.csv - MIMIC-III dataset file containing clinical notes
- Output : symptoms.csv - Contains symptoms extracted from discharge summary using Metamap (used in 02 - data preparation)
- Output : discharge_summary.csv - Contains filtered discharge summary records from NOTEEVENTS.csv (used in 04 - word2vev training)

In [63]:
cwd = os.getcwd()
print(f"Current working directory : {cwd}")
# Metamap needs to be installed locally for this to work
METAMAP_PATH = cwd + "/../../public_mm"

# Data is not stored as part of project because of its restricted use.
data_dir = cwd + "/../../data/"
RUN_TAG = "_v2.0"
NOTEEVENTS_FILE_PATH = data_dir + 'NOTEEVENTS.csv'
DISCHARGE_SUMMARY = data_dir + f"discharge_summary_{RUN_TAG}.csv"
NOTES_FILE = data_dir + 'filtered_notes.csv'
SYMPTOMS_FILE = data_dir + f"symptoms_{RUN_TAG}.csv"
mm = MetaMap.get_instance(METAMAP_PATH + '/bin/metamap18')
IRRELEVANT_SECTIONS = [
    "SOCIAL HISTORY:",
    "MEDICATION ON ADMISSION:",
    "DISCHARGE DIAGNOSIS:",
    "ADMISSION DATE"
]

Current working directory : /Users/vijaymi/Studies/CS-598-DL4Health/Project/135-Disease-Inference-Method/disease_pred_using_bilstm/source


### Load Data from MIMIC-III dataset

In [4]:
# Read the NOTEEVENTS table
notevents_df = pd.read_csv(NOTEEVENTS_FILE_PATH)
print('Number of notes: ', len(notevents_df.index))

# Drop any duplicates
notevents_df = notevents_df.drop_duplicates()
print('Number of notes after filtering duplicates: ', len(notevents_df.index))

# NOTEVENTS file contains various types of notes. We will filter the notes that contain discharge
# summaries.
discharge_summaries_df = notevents_df[(notevents_df['CATEGORY'] == 'Discharge summary')]
discharge_summaries_df.head(5)

  exec(code_obj, self.user_global_ns, self.user_ns)


Number of notes:  2083180
Number of notes after filtering duplicates:  2083180


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...


In [5]:
# Save the discharge summaries data to a csv file so that we don't have to load the NOTEEVENTS table
# for subsequent runs
print('Number of discharge summaries: ', len(discharge_summaries_df.index))
discharge_summaries_df.to_csv(DISCHARGE_SUMMARY, index=False)

Number of discharge summaries:  59652


In [6]:
def load_data():
    notes_data = pd.read_csv(DISCHARGE_SUMMARY)
    print(notes_data.head())
    return notes_data

In [7]:
notes_data = load_data()

   ROW_ID  SUBJECT_ID   HADM_ID   CHARTDATE  CHARTTIME  STORETIME  \
0     174       22532  167853.0  2151-08-04        NaN        NaN   
1     175       13702  107527.0  2118-06-14        NaN        NaN   
2     176       13702  167118.0  2119-05-25        NaN        NaN   
3     177       13702  196489.0  2124-08-18        NaN        NaN   
4     178       26880  135453.0  2162-03-25        NaN        NaN   

            CATEGORY DESCRIPTION  CGID  ISERROR  \
0  Discharge summary      Report   NaN      NaN   
1  Discharge summary      Report   NaN      NaN   
2  Discharge summary      Report   NaN      NaN   
3  Discharge summary      Report   NaN      NaN   
4  Discharge summary      Report   NaN      NaN   

                                                TEXT  
0  Admission Date:  [**2151-7-16**]       Dischar...  
1  Admission Date:  [**2118-6-2**]       Discharg...  
2  Admission Date:  [**2119-5-4**]              D...  
3  Admission Date:  [**2124-7-21**]              ...  
4  

#### Using nltk package, remove words in negated context

In [8]:
def remove_negative_context_words(text):
    tokens = nltk.word_tokenize(text)
    tokens_neg_marked = nltk.sentiment.util.mark_negation(tokens)
    tokens_without_negative_words = [word for word in tokens_neg_marked if not word.endswith("_NEG")]
    return " ".join(tokens_without_negative_words)

In [9]:
example_text = "Sam is having cough but not fever."
remove_negative_context_words(example_text)

'Sam is having cough but not .'

#### Remove some irrevalent section fron Notes 

In [10]:
def remove_irrelevant_sections(text):
    lines = text.split("\n")
    output_lines = []
    skip = False
    for line in lines:
        line = line.strip()
        # If Skipping lines, look for end of section indicator - new line for now.
        if skip:
            if not line:
                skip = False
                continue
        else:
            for section_name in IRRELEVANT_SECTIONS:
                if line.upper().startswith(section_name):
                    # print(f"Skipping section : {line}")
                    skip = True

            if not skip:
                output_lines.append(line)
    return "\n".join(output_lines)

In [11]:
sample_text = """
SOCIAL HISTORY: Some words here.

CLINICAL NOTES: Person is having fever.
"""

remove_irrelevant_sections(sample_text)

'\nCLINICAL NOTES: Person is having fever.\n'

### Extracting Symptoms from clinical text using Metamap

Metamap download link - https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/run-locally/MainDownload.html <br>
Installation link - https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/documentation/Installation.html <br>


In [12]:
def extract_symptoms_using_metamap(text):
    symptoms = []
    # print(text)
    try:
        concepts, error = mm.extract_concepts([text])
        # print(f"concepts : {concepts}")
        for concept in concepts:
            if hasattr(concept, "semtypes"):
                if "sosy" in concept.semtypes or "dsyn" in concept.semtypes:
                    # print(f"concept : {concept}")
                    symptoms.append(concept.preferred_name)
        return symptoms
    except Exception as e:
        print(f"Exception occurred {e} ")
        logger.error(f"Exception occurred {e} ")
        return symptoms

In [14]:
sample_text = """ Service:
ADDENDUM:

RADIOLOGIC STUDIES:  Radiologic studies also included a chest
CT, which confirmed cavitary lesions in the left lung apex
consistent with infectious process/tuberculosis.  This also
moderate-sized left pleural effusion.

HEAD CT:  Head CT showed no intracranial hemorrhage or mass
effect, but old infarction consistent with past medical
history.

ABDOMINAL CT:  Abdominal CT showed lesions of
T10 and sacrum most likely secondary to osteoporosis. These can
be followed by repeat imaging as an outpatient.

"""

extract_symptoms_using_metamap(sample_text)

['Tuberculosis',
 'Macrophage Activation Syndrome',
 'Osteoporosis',
 'Pleural effusion disorder',
 'Mass of body region']

### Integrating all above functions to extract symptoms from Clinical notes file

In [65]:
# Varaibles
timestamp = datetime.now().strftime('%d-%H-%M-%S')
# Keeping it low as we don't want to process all records here in notebook
NUM_OF_RECORDS_TO_PROCESS = 20
LAST_RECORD_DONE = 0
record_processed = 0
semaphore_object = Semaphore(1)

#### Takes discharge summary notes, removes irrelevant sections, negated contexts words and finally uses metamap to extract symptoms.

In [60]:
def process_notes(text):
    # print(f"Discharge Summary : {text}")
    # Remove non-relevant sections
    filtered_text = remove_irrelevant_sections(text)

    # Remove negative words
    filtered_neg_text = remove_negative_context_words(filtered_text)

    # Identify relevant concepts from text
    symptoms = extract_symptoms_using_metamap(filtered_neg_text)
    return symptoms


#### appends record in csv format to symptoms file

In [64]:
def save_to_file(notes_data, symptoms_list):
    with semaphore_object:
        print(f"printing for {symptoms_list}")
        with open(SYMPTOMS_FILE, 'a') as writer:
            for idx, symptoms in enumerate(symptoms_list):
                print(f"notes_data : {notes_data.iloc[idx, 0]}")
                writer.write(str(notes_data.iloc[idx, 0]) + "," + str(notes_data.iloc[idx, 1])
                             + "," + str(notes_data.iloc[idx, 2])
                             + "," + str(notes_data.iloc[idx, 3]) + ",")
                symptom_str = "|".join(symptoms)
                symptom_str = symptom_str.replace(",", " ")
                writer.write(symptom_str)
                writer.write("\n")

#### Processes batches of notes. Some logic here to control how many records we want to process. Symptom extraction is a very slow cpu intensive process. 

In [66]:
def process_chunk(notes_data):
    global record_processed
    print(notes_data.shape)
    record_processed += notes_data.shape[0]
    if record_processed > NUM_OF_RECORDS_TO_PROCESS:
        return False
    if record_processed <= LAST_RECORD_DONE:
        print(f"Skipping - record processed - {record_processed}")
        return True

    symptoms_list = []
    number_of_rec = notes_data.shape[0]
    for index in range(number_of_rec):
        symptoms = process_notes(notes_data.iloc[index, 10])
        print(f"Symptoms : {symptoms}")
        symptoms_list.append(symptoms)

    save_to_file(notes_data, symptoms_list)
    return True


#### Main program to call above defined functions

In [68]:
record_processed = 0
output_result = None
chunk_size = 10
# Add header to csv file
with open(SYMPTOMS_FILE, 'w') as writer:
    writer.write("INDEX,ROW_ID,SUBJECT_ID,HADM_ID,SYMPTOMS\n")
    
for chunk in pd.read_csv(DISCHARGE_SUMMARY, chunksize=chunk_size):
    ret = process_chunk(chunk)
    if not ret:
        break

(10, 11)
processing record : 10
Symptoms : ['Tuberculosis', 'Osteoporosis', 'Pleural effusion disorder']
Symptoms : ['Leukocytosis', 'Muscle Weakness', 'Paresis', 'Dyspnea', 'Angina Pectoris', 'Hematuria', 'Tracheomalacia', 'Hypothyroidism', 'Obesity', 'Hyperglycemia', 'Chronic Obstructive Airway Disease', 'Angina, Unstable', 'Hypertensive disease', 'Lipomucopolysaccharidosis', 'Respiratory Failure', 'Pharyngitis', 'Sore Throat', 'Empty Sella Syndrome', 'Headache', 'Cerebrovascular accident', 'Dizziness', 'Hiatal Hernia', 'Lightheadedness', 'Chest Pain', 'Fever', 'Nausea', 'Vomiting', 'Infantile Neuroaxonal Dystrophy', 'Wheezing', 'Chills', 'Cyanosis', 'Exanthema', 'Labored breathing', 'Disease', 'Pneumonia', 'Lymphadenopathy', 'Communicable Diseases', 'Weakness', 'Chronic obstructive pulmonary disease of horses', 'Angina decubitus', 'Clubbing', 'Left lower zone pneumonia', 'Night sweats', 'SHORT STATURE, ONYCHODYSPLASIA, FACIAL DYSMORPHISM, AND HYPOTRICHOSIS SYNDROME', 'Symptoms', 'St

Symptoms : ['Chest Pain', 'Aortic Valve Stenosis', 'Diabetes Mellitus', 'Dyspnea', 'Hematuria', 'Pulmonary Edema', 'Pain', 'Atrial Fibrillation', 'Peripheral Arterial Diseases', 'Hyperlipidemia', 'Left Ventricular Hypertrophy', 'Premature ventricular contractions', 'Mastodynia', 'Hypertensive disease', 'Heart failure', 'Aortic Valve Insufficiency', 'Degenerative polyarthritis', 'Ventricular Fibrillation, Paroxysmal Familial, 1', 'Vasovagal syncope', 'Kidney Failure, Acute', 'Peripheral Vascular Diseases', 'Angina Pectoris', 'Congestive heart failure', 'Atrial Premature Complexes', 'Cerebrovascular accident', 'Scoliosis, unspecified', 'Kidney Failure', 'Renal Insufficiency', 'Coronary Arteriosclerosis', 'Coronary Artery Disease', 'Non-ST Elevated Myocardial Infarction', 'Dyslipidemias', 'Xanthoma', 'Infantile Neuroaxonal Dystrophy', 'Nausea', 'SYNOVITIS, GRANULOMATOUS, WITH UVEITIS AND CRANIAL NEUROPATHIES (disorder)', 'Ventricular Fibrillation', 'Coronary heart disease', 'Bilateral ple

Symptoms : ['Kidney Failure, Chronic', 'Dyspnea', 'Pain', 'Kidney Failure', 'Atrial Fibrillation', 'Erythema', 'Chronic Obstructive Airway Disease', 'Edema', 'Chest Pain', 'Obesity', 'Coughing', 'Discharge, body substance', 'Facial Hemiatrophy', 'Pleural effusion disorder', 'Dizziness', 'Lightheadedness', 'Diverticular disease of colon', 'Coronary Arteriosclerosis', 'Coronary Artery Disease', 'Tachycardia, Ventricular', 'Allergic rhinitis (disorder)', 'Deep Vein Thrombosis', 'Gout', 'Cryopyrin-Associated Periodic Syndromes', 'Pneumocystis jiroveci pneumonia', 'Wheezing', 'Chronic kidney disease stage 5', 'Congestive heart failure', 'Chronic obstructive pulmonary disease of horses', 'SYNOVITIS, GRANULOMATOUS, WITH UVEITIS AND CRANIAL NEUROPATHIES (disorder)', 'Leg Ulcer', 'Coronary heart disease', 'Psoriasis', 'Anemia', 'Disease', 'Pneumothorax', 'Pulmonary Edema', 'Skin Diseases, Infectious', 'Heart failure', 'Hypertensive disease', 'Pitting edema', 'Chronic ulcer', 'SHORT STATURE, ONY

### Showing content of Output files

In [71]:
notes_data = pd.read_csv(DISCHARGE_SUMMARY)
notes_data.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...


In [70]:
symptoms_extracted = pd.read_csv(SYMPTOMS_FILE)
symptoms_extracted.head()

Unnamed: 0,INDEX,ROW_ID,SUBJECT_ID,HADM_ID,SYMPTOMS
0,174,22532,167853.0,2151-08-04,Tuberculosis|Osteoporosis|Pleural effusion dis...
1,175,13702,107527.0,2118-06-14,Leukocytosis|Muscle Weakness|Paresis|Dyspnea|A...
2,176,13702,167118.0,2119-05-25,Chronic Obstructive Airway Disease|Deep Vein T...
3,177,13702,196489.0,2124-08-18,Dyspnea|Chronic Obstructive Airway Disease|Hia...
4,178,26880,135453.0,2162-03-25,Encephalomalacia|Pain|Obesity|Aspiration Pneum...
