# Training Word2Vec Model

Using discharge notes from NOTEVEENTS table of the MIMIC III dataset to train a Word2Vec model.
This model will then be used to generate embeddings for patient's symptoms that are extracted from
the discharge notes using METAMAP.

In [48]:
import pandas as pd
import random
import re
import os

# Gensim (for Word2Vec) needs to be installed for the following two imports
import gensim
from gensim.models import Word2Vec
from gensim import utils
from time import time

### Load Notes Text Data

In [49]:
cwd = os.getcwd()
data_dir = cwd + "/../data/"
model_dir = cwd + "/model/"
print(f"Current working directory : {cwd}")
print(f"Data directory : {data_dir}")

# Read the NOTEEVENTS table
notevents_df = pd.read_csv(data_dir + 'NOTEEVENTS.csv')
print('Number of notes: ', len(notevents_df.index))

# Drop any duplicates
notevents_df = notevents_df.drop_duplicates()
print('Number of notes after filtering duplicates: ', len(notevents_df.index))

# NOTEVENTS file contains various types of notes. We will filter the notes that contain discharge
# summaries.
discharge_summaries_df = notevents_df[(notevents_df['CATEGORY'] == 'Discharge summary')]
discharge_summaries_df.head(5)

Current working directory : /Users/ratanbajpai/Education/UIUC/DLH/project/src
Data directory : /Users/ratanbajpai/Education/UIUC/DLH/project/src/../data/


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Number of notes:  2083180
Number of notes after filtering duplicates:  2083180


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...


### Save Discharge Summaries

In [50]:
# Save the discharge summaries data to a csv file so that we don't have to load the NOTEEVENTS table
# for subsequent runs
print('Number of discharge summaries: ', len(discharge_summaries_df.index))
discharge_summaries_df.to_csv(data_dir + 'discharge_summaries.csv', index=False)

Number of discharge summaries:  59652


In [51]:
# Read discharge summaries csv file
discharge_summaries_df = pd.read_csv(data_dir + 'discharge_summaries.csv')
discharge_summaries_df.head(5)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...


In [52]:
# Find the total number of rows in the discharge summaries data frame
len(discharge_summaries_df.index)

59652

### Split Data to Train and Test

In [53]:
# Set seed
seed = 1234
# Helper function to split data to train, and test
def build_data_buckets(num_records):    
    index = list(range(num_records))
    random.seed(seed)
    random.shuffle(index)
    index_train = index[0 : int(num_records * 0.80)]
    index_test = index[int(num_records * 0.80) : num_records]

    return index_train, index_test


index_train, index_test = build_data_buckets(discharge_summaries_df.shape[0])
len(index_train)

47721

In [54]:
# Copy training data into a separate dataframe
training_data_df = discharge_summaries_df.iloc[index_train].copy()
training_data_df.head(5)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
45907,47509,20849,195254.0,2137-10-11,,,Discharge summary,Report,,,Admission Date: [**2137-9-15**] ...
56307,56268,25466,144778.0,2194-12-30,,,Discharge summary,Addendum,,,"Name: [**Known lastname 6610**], [**Known fir..."
1132,1605,65003,116768.0,2167-06-02,,,Discharge summary,Report,,,Admission Date: [**2167-5-26**] ...
4766,4772,76853,119383.0,2113-05-25,,,Discharge summary,Report,,,Admission Date: [**2113-5-16**] ...
54224,48638,9602,163874.0,2177-08-21,,,Discharge summary,Report,,,Admission Date: [**2177-8-18**] ...


### Clean and Prep Discharge Summaries Notes Data

In [56]:
# Remove some special characters etc., and split each discharge summary note into a list of tokens
def clean_note_text(note_text):
    note_text = re.sub('<[^>]*>', '', note_text)
    note_text = re.sub('[\W]+', ' ', note_text.lower())
    note_tokens = note_text.split()
    return note_tokens

notes_tokens_list = list(training_data_df['TEXT'].apply(clean_note_text))
len(notes_tokens_list)
# notes_tokens_list[:5]

47721

### Train the Word2Vec Model

In [57]:
# Word2Vec model parameters from the paper:

# Window size (window): 5, model will use total (left + right) 5 words for context
# Min count (min_count): 5 (default) i.e. words occurring in less than 5 notes will be removed
# Size of output vector: 128, each word will be mapped to 128 dimension vector
# Skip gram: sg = 1 implies skip gram is used (paper uses skip gram instead of CBOW)
# Negative (negative): 5, negative sampling speeds up the training process
# Down sampling (sample): 1e-3, parameter for down sampling high frequency words

# Create model
word2vec_model = Word2Vec(window = 5, size = 128, sample = 1e-3, negative = 5, sg = 1)

# Build vocabulary using tokens created from the discharge summary notes
word2vec_model.build_vocab(notes_tokens_list)

index = list(range(len(notes_tokens_list)))
# print(len(index))
start_time = time()

# Do multiple runs, shuffling the data for each run for improved accuracy
for epoch in range(5):
    random.shuffle(index)
    note_tokens = [notes_tokens_list[i] for i in index]
    # print(len(note_tokens))
    word2vec_model.train(note_tokens, total_examples = word2vec_model.corpus_count, epochs = word2vec_model.epochs)
    print(epoch)
    
training_time = time() - start_time
print("Time taken to train the Word2Vec model: ", training_time, "seconds")

0
1
2
3
4
Time taken to train the Word2Vec model:  3283.499179840088 seconds


### Save Trained Model

In [58]:
# Save the trained Word2Vec model to be used later
word2vec_model.save(model_dir + 'word2vec_model_sg_128')

# Open a saved Word2Vec model 
# word2vec_model = gensim.models.Word2Vec.load(model_dir + 'word2vec_model')

# Store the input-hidden weight matrix
word2vec_model.wv.save_word2vec_format(model_dir + 'word2vec_model_sg_128.txt', binary = False)

### Test Model

In [65]:
word2vec_model.wv.most_similar('heart')
# word2vec_model.wv['heart']
# word2vec_model.wv['communicable']

[('rate', 0.606135904788971),
 ('congestive', 0.598778247833252),
 ('irregular', 0.5547537207603455),
 ('heartrate', 0.5539854764938354),
 ('rhythm', 0.5538603067398071),
 ('attack', 0.5464314818382263),
 ('lungs', 0.546188473701477),
 ('systolic', 0.5344693660736084),
 ('diastolic', 0.533440351486206),
 ('tachycardiac', 0.5053322315216064)]