In [2]:
import concurrent.futures
import logging
from threading import Semaphore
import os
import pandas as pd
from pymetamap import MetaMap
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from datetime import datetime

# Symptom Extraction using MetaMap

This notebook implements the BiLSTM model, which takes TF-IDF-based and Word2Vec-based symptom representations and predicts diagnosis codes.

---

- Input : symptom_disease_dict_{RUN_TAG}.json - Maps each HADM_ID to its symptom text and diagnoses, stored as a JSON object
- Input : icd9_dict_{RUN_TAG}.json - Contains the ICD-9 codes of the top N diagnoses
- Input : weight_i_j_norm{tag}.csv - TF-IDF weights for symptom representation

In [8]:
# Every path below is built relative to the directory the kernel was started in.
cwd = os.getcwd()
print(f"Current working directory : {cwd}")

# A local MetaMap installation is required; it is looked up two levels above cwd.
METAMAP_PATH = f"{cwd}/../../public_mm"

# The dataset has restricted use, so it is kept outside the project tree.
data_dir = f"{cwd}/../../data/"
NOTES_FILE = f"{data_dir}filtered_notes.csv"
DIAG_FILE = f"{data_dir}DIAGNOSES_ICD.csv"
DIAG_DICT_FILE = f"{data_dir}D_ICD_DIAGNOSES.csv"
SYMPTOMS_FILE = "Symptoms.txt"

# Handle to the MetaMap 2018 binary inside the local installation.
mm = MetaMap.get_instance(f"{METAMAP_PATH}/bin/metamap18")

# Section markers treated as irrelevant.
# NOTE(review): the code that consumes this list is outside this view; "ADMISSION DATE"
# has no trailing colon unlike the other entries — confirm that is intentional.
IRRELEVANT_SECTIONS = [
    "SOCIAL HISTORY:",
    "MEDICATION ON ADMISSION:",
    "DISCHARGE DIAGNOSIS:",
    "ADMISSION DATE",
]

Current working directory : /Users/vijaymi/Studies/CS-598-DL4Health/Project/135-Disease-Inference-Method/disease_pred_using_bilstm/source


### Load Data from MIMIC-III dataset

In [11]:
def load_data(notes_file=None, diag_file=None, diag_dict_file=None):
    """Read the notes, diagnosis-code and diagnosis-dictionary CSV files.

    Each path argument is optional. When omitted (or ``None``) the matching
    module-level constant is used, so a bare ``load_data()`` behaves exactly
    as before.

    Parameters
    ----------
    notes_file : str or path-like, optional
        CSV with the notes. Defaults to ``NOTES_FILE``.
    diag_file : str or path-like, optional
        CSV with the diagnosis codes. Defaults to ``DIAG_FILE``.
    diag_dict_file : str or path-like, optional
        CSV with the diagnosis dictionary. Defaults to ``DIAG_DICT_FILE``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(notes_data, diag_codes, diag_dict)`` in that order.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when a file does not exist.
    """
    # Resolve defaults at call time so that later changes to the config
    # constants are picked up without redefining this function.
    if notes_file is None:
        notes_file = NOTES_FILE
    if diag_file is None:
        diag_file = DIAG_FILE
    if diag_dict_file is None:
        diag_dict_file = DIAG_DICT_FILE

    notes_data = pd.read_csv(notes_file)
    # Preview of the notes table, kept so the cell output stays the same.
    print(notes_data.head())
    diag_codes = pd.read_csv(diag_file)
    diag_dict = pd.read_csv(diag_dict_file)
    return notes_data, diag_codes, diag_dict

In [12]:
# Read the three CSV files once and keep the resulting frames as notebook-level
# names; load_data() also prints the first rows of the notes table.
notes_data, diag_codes, diag_dict = load_data()

  INDEX ROW_ID SUBJECT_ID HADM_ID CHARTDATE CHARTTIME STORETIME  \
0     0    174      22532  167853    8/4/51       NaN       NaN   
1     1    175      13702  107527   6/14/18       NaN       NaN   
2     2    176      13702  167118   5/25/19       NaN       NaN   
3     3    177      13702  196489   8/18/24       NaN       NaN   
4     4    178      26880  135453   3/25/62       NaN       NaN   

            CATEGORY DESCRIPTION  CGID  ISERROR  \
0  Discharge summary      Report   NaN      NaN   
1  Discharge summary      Report   NaN      NaN   
2  Discharge summary      Report   NaN      NaN   
3  Discharge summary      Report   NaN      NaN   
4  Discharge summary      Report   NaN      NaN   

                                                TEXT  
0  Admission Date:  [**2151-7-16**]       Dischar...  
1  Admission Date:  [**2118-6-2**]       Discharg...  
2  Admission Date:  [**2119-5-4**]              D...  
3  Admission Date:  [**2124-7-21**]              ...  
4  Admission Da