# Data Prep For Training Word2Vec NN Model

Here we will prepare the input dataset(X, Y) which will be used for training a NN model using Word2Vec representations.

In [80]:
import pandas as pd
import os
import numpy as np
import json

### Load Symptoms and Diagnoses Data

Here we will load symptoms extracted from the discharge summary notes using MetaMap. Also we will load the DIAGNOSES table which contains patient diagnosis for each hospital admission in the MIMIC III dataset.

In [81]:
cwd = os.getcwd()
data_dir = cwd + "/../data/"
print(f"Current working directory : {cwd}")
print(f"Data directory : {data_dir}")

# Load the symptoms data extracted by MetaMap
# TODO replace this file with the latest extracted file with all data from M
symptoms_df = pd.read_csv(data_dir + "symptoms.csv")
print('Total discharge notes with symptoms extracted: ', len(symptoms_df.index))
symptoms_df.head()

Current working directory : /Users/ratanbajpai/Education/UIUC/DLH/project/src
Data directory : /Users/ratanbajpai/Education/UIUC/DLH/project/src/../data/
Total discharge notes with symptoms extracted:  48459


Unnamed: 0,INDEX,ROW_ID,SUBJECT_ID,HADM_ID,SYMPTOMS
0,61,208,5239,125055.0,Autoimmune hemolytic anemia|Dyspnea|Congestive...
1,62,209,5239,125055.0,Hydrocephalus Normal Pressure|Congestive hear...
2,63,210,21449,139542.0,Acute Chest Syndrome|Hypertensive disease|Hype...
3,64,211,40273,124821.0,Obesity|Chronic Kidney Diseases|Hypertensive d...
4,65,212,76874,113329.0,Benign Rolandic Epilepsy|Sleeplessness|Familia...


In [82]:
# Load the table that has the dignosis codes for each hospital visit for a patient
diagnoses_df = pd.read_csv(data_dir + "DIAGNOSES_ICD.csv")
# We will just take the first three digits of the ICD code and not worry about sub-diseases
diagnoses_df['ICD9_3CHAR'] = diagnoses_df['ICD9_CODE'].str[:3]

# Use only needed columns, and drop others
diagnoses_df = diagnoses_df.drop(["ICD9_CODE", "SEQ_NUM", "ROW_ID"], axis=1)
diagnoses_df = diagnoses_df.drop_duplicates()
diagnoses_df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_3CHAR
0,109,172335,403
1,109,172335,486
2,109,172335,582
3,109,172335,585
4,109,172335,425


### Join Symptoms and Diagnosis

Here we will join the symptoms and dignoses tables on the hospital admission ID (HADM_ID). So now we will have information on what sysmptoms led to what disease diagnosis for each hospital admission for the patient.

In [83]:
# Join symptoms and disease data frames
symp_diag_df = symptoms_df.set_index('HADM_ID').join(diagnoses_df.set_index('HADM_ID'), lsuffix='_symp', rsuffix='_diag')
symp_diag_df = symp_diag_df.reset_index()

# Rename the subject ID column and take only needed columns
symp_diag_df.rename(columns={'SUBJECT_ID_symp': 'SUBJECT_ID'}, inplace=True)
symp_diag_df = symp_diag_df[['SUBJECT_ID', 'HADM_ID', 'SYMPTOMS', 'ICD9_3CHAR']]

# Drop duplicates and filter out NAN rows for HADM_ID
symp_diag_df = symp_diag_df.drop_duplicates()
symp_diag_df = symp_diag_df[symp_diag_df['HADM_ID'].notnull()]

# Convert the HADM_ID column to int
symp_diag_df.HADM_ID = symp_diag_df.HADM_ID.astype(int)
print(f"symp_diagnoses.shape : {symp_diag_df.shape}")

# Group by disease count and count the number of unique diseases
# Note: here we get count as 936, the paper has the total count as 931
disease_count = symp_diag_df.groupby(['ICD9_3CHAR'])['ICD9_3CHAR'].count()
print(f"disease_count.shape : {disease_count.shape}")
symp_diag_df.head()

symp_diagnoses.shape : (494219, 4)
disease_count.shape : (936,)


Unnamed: 0,SUBJECT_ID,HADM_ID,SYMPTOMS,ICD9_3CHAR
0,54610,100003,Cirrhosis|Liver Cirrhosis|Back Pain|Ascites|Er...,531
1,54610,100003,Cirrhosis|Liver Cirrhosis|Back Pain|Ascites|Er...,285
2,54610,100003,Cirrhosis|Liver Cirrhosis|Back Pain|Ascites|Er...,70
3,54610,100003,Cirrhosis|Liver Cirrhosis|Back Pain|Ascites|Er...,571
4,54610,100003,Cirrhosis|Liver Cirrhosis|Back Pain|Ascites|Er...,456


### Filter Top 50 Diseases and Related Symptoms in the Notes

In [84]:
n = 50
top_diag_df = symp_diag_df.groupby(['ICD9_3CHAR'])['ICD9_3CHAR'].count().sort_values(ascending=False)
# taking only top 50 diseases
top_n_diag_df = top_diag_df[:n]
print(f"top_50_diagnoses.shape : {top_n_diag_df.shape}")
top_n_diag_df.head()

top_50_diagnoses.shape : (50,)


ICD9_3CHAR
401    17933
427    14717
276    12570
414    12421
272    12091
Name: ICD9_3CHAR, dtype: int64

In [85]:
print(top_n_diag_df.index)

Index(['401', '427', '276', '414', '272', '250', '428', '518', '285', '584',
       'V45', '599', '530', 'V58', '585', 'E87', '038', 'V10', '403', '410',
       '424', '997', '995', '780', '998', '785', '244', '305', '458', '486',
       '996', '496', '041', 'V15', '287', '507', '765', '790', 'V12', 'E93',
       '511', '493', '774', '311', '412', '707', 'V29', '348', 'V30', '571'],
      dtype='object', name='ICD9_3CHAR')


In [87]:
# Filter symptom data which corresponds to top 50 diseases
print(f"symp_diag_df.shape : {symp_diag_df.shape}")
symp_diag_top_n_df = symp_diag_df[symp_diag_df.ICD9_3CHAR.isin(top_n_diag_df.index)]
print(f"symp_diag_top_n_df.shape : {symp_diag_top_n_df.shape}")
symp_diag_top_n_df.SYMPTOMS = symp_diag_top_n_df.SYMPTOMS.astype(str)
symp_diag_top_n_df = symp_diag_top_n_df.reset_index() 
symp_diag_top_n_df.head()

symp_diag_df.shape : (494219, 4)
symp_diag_top_n_df.shape : (287157, 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,index,SUBJECT_ID,HADM_ID,SYMPTOMS,ICD9_3CHAR
0,1,54610,100003,Cirrhosis|Liver Cirrhosis|Back Pain|Ascites|Er...,285
1,3,54610,100003,Cirrhosis|Liver Cirrhosis|Back Pain|Ascites|Er...,571
2,6,54610,100003,Cirrhosis|Liver Cirrhosis|Back Pain|Ascites|Er...,401
3,11,23018,100007,Pain|Communicable Diseases|Abdominal Pain|Pneu...,997
4,12,23018,100007,Pain|Communicable Diseases|Abdominal Pain|Pneu...,486


### Create Symptoms <-> Diseases Mapping for each HADM_ID

In [88]:
# Create a dictionary of HADM_ID -> {symptoms[], diseases[]}
# These will become the inputs and outputs when training the neural network

symptom_disease_dict = {}
# print(len(symp_diag_top_n_df))

# For each record in the dataframe, we have multiple rows with same HADM_ID and symptom list
# but the diseases for that HADM_ID are listed in separate rows. Here we will collect those diseases
# so that for each hospital admission, we have symptoms and diseases in one place, i.e. a tuple of
# <symptom_list, disease_list>.
for index, record in symp_diag_top_n_df.iterrows():
    hadm_id = record['HADM_ID']
    # Check if this id exists in the dictionary
    if hadm_id in symptom_disease_dict.keys():
        # Get symptoms, diseases tuple from the dictionary
        symp_disease_tuple = symptom_disease_dict.get(hadm_id)
        # Get the disease list
        disease_list = symp_disease_tuple[1]
        # Add this disease ICD code to the list of diseases for this HADM_ID
        # i.e. collect diseases for this HADM_ID
        disease_list.append(record['ICD9_3CHAR'])
    elif hadm_id not in symptom_disease_dict.keys():
        # Create the value entry for this hadm_id key
        symp_list = record['SYMPTOMS'].split("|")
        # For notes containing more than 1 symptom
        if len(symp_list) > 1:
            disease_list = []
            disease_list.append(record['ICD9_3CHAR'])
            symptom_disease_dict[hadm_id] = (symp_list, disease_list)
len(symptom_disease_dict)

43651

### Save the Disease Index and Symptom <-> Disease Mapping to Files

In [91]:
# Save the symptom disease dictionary to a file
with open(data_dir + "symptom_disease_dict.csv", 'w') as f:
    json.dump(symptom_disease_dict, f)

In [90]:
# Save ICD9 code to index mapping in a dictionary
icd9_dict = {}
for i, item in enumerate(top_n_diag_df.index):
    icd9_dict[item] = i
# print(icd9_dict)

# Save the diagnoses to index mapping dictionary to a file
with open(data_dir + "icd9_dict.csv", 'w') as f:
    json.dump(icd9_dict, f)