# CS 271 CUI2VEC

In [1]:
# Define imports
import random
import pandas as pd
import numpy as np
import tensorflow as tf
from numpy import asarray
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.utils import to_categorical
import gensim.downloader as api
import matplotlib.pyplot as plt
import sys
import pickle
import os
import csv
from collections import defaultdict
import time
import re

### Step 1: Read in relevant Data Files

Make sure the files below are in the same directory as your jupyter file

In [2]:
# Define constants
PATIENT_DATA_FILE = 'B220_SAA_v1.csv'
CLEANED_LABELS_FILE = 'ICD_Label_Cleaned_Oct_25.csv'
CODE_DESC_FILE = 'BIODS220_ICD_Dx_10_9_v7 - icd_dx_10_9_v7.csv'
LANG_MODEL = 'CUI2VEC'
CUI2VEC_MODEL = 'CUI2Vec_embedding.pickle'

In [3]:
def read_csv_to_dict(file_path: str, key: int, value: int):
    ret_dict = {}
    with open(file_path, newline='') as csvfile:
        data = csv.reader(csvfile, delimiter=',')
        for row in data:
            ret_dict[row[key]] = row[value]
    print("Reading {} complete!".format(file_path))
    return ret_dict

In [4]:
def get_category_dict():
    category_dict = {
        'Circulatory': 0,
        'Dermatologic': 4,
        'Endocrine & Immune': 6,
        'Gastrointestinal': 1,
        'Genitourinary': 1, 
        'Hematologic': 4,
        'Infectious': 6,
        'Injury': 2,
        'Injury & Poisoning': 2,
        'Poisoning': 2,
        'Musculoskeletal': 2,
        'Neurologic': 3,
        'Other': 4,
        'Obstetric': 5,
        'Neoplastic': 4,
        'Psychiatric': 3,
        'Respiratory': 0,
        'Substance use': 2}
    #use to_categorical()
    return category_dict

In [5]:
NUM_CLASSES = 2

In [7]:
# Create labels dict i.e. code -> label, i.e. A840 -> 'Neurologic'
label_dict = read_csv_to_dict(CLEANED_LABELS_FILE, key=0, value=1)

# Create descriptions dict i.e. code -> description
codes_dict = read_csv_to_dict(CODE_DESC_FILE, key=0, value=2)

# Create dict for label to int
category_dict = get_category_dict()

Reading ICD_Label_Cleaned_Oct_25.csv complete!
Reading BIODS220_ICD_Dx_10_9_v7 - icd_dx_10_9_v7.csv complete!


### Step 2: Create one-hot feature vectors

For the sake of reducing feature space, we are only including three features in our embedding. Future iterations of our embeddings will include more features, such as the patient's county.

In [8]:
def create_one_hot(patient_data):
    # Creates one-hot vectors
    columns_to_one_hot = ['Sex','Race']
    one_hot = pd.get_dummies(patient_data[columns_to_one_hot])
    
    ordinal_columns = ['Age']
    one_hot = pd.concat([patient_data[ordinal_columns], one_hot], axis=1)
    
    # Normalize age
    x = one_hot.Age.values.reshape(-1,1)
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    one_hot.Age = x_scaled
    
    return one_hot.values

In [17]:
# Read in patient_data - takes a few mins
patient_read_start = time.time()
patient_data = pd.read_csv(PATIENT_DATA_FILE, dtype=str, usecols=['ID', 'Sex','Race','Age', 'Date','Dx10_prin'])
print("Reading in patient_data took {}".format(time.time() - patient_read_start))

# One-hot encode visit features
one_hot_start = time.time()
one_hot = create_one_hot(patient_data)
print("Creating one-hot encodings took {}".format(time.time() - one_hot_start))
print(one_hot.shape)

Reading in patient_data took 49.75160312652588
Creating one-hot encodings took 9.033980369567871
(27977932, 11)


In [18]:
patient_data.head()

Unnamed: 0,ID,Date,Age,Sex,Race,Dx10_prin
0,1,2016-06-05,35,F,White,S300XXA
1,1,2017-07-16,36,F,White,N938
2,1,2017-08-15,36,F,White,F10129
3,1,2018-07-12,37,F,White,R0789
4,2,2015-12-29,42,M,Hispanic,N390


In [11]:
# (num_patients, 311)

# Step 3: Embed ICD Codes + concat with one-hot vectors

Read the presaved cui2vec embeddings dictionary (ICD10_Code_first_3_chars : float_embedding_vector of size 300)

Given the embedding from the pretrained cui2vec embeddings dictionary (in a pickle) that stores the first three chars of the ICD10 code and its corresponding embeddings vector of floats. If there is no embedding for the sentence then return the embedding for the label.

In [12]:
pickle_in = open(CUI2VEC_MODEL, 'rb')
EMBEDDINGS = pickle.load(pickle_in)
pickle_in.close()
CONST_EMBEDDING_SIZE = 300

Given an ICD code, we retrieve the embedding (300x1). We use dynamic programming to save embeddings. Often, only the first three characters of an ICD code are enough to determine the general diagnosis. Longer codes will only have slightly different descriptions (if at all). Thus, to reduce computational complexity, we store the first three characters of ICD codes. Future ICD codes that share the same first three characters will automatically use the same embedding, regardless of any remaining characters.

In [13]:
def code_to_embedding(code):
    try:
        embedding = EMBEDDINGS[str(code)]
    except:
        try:
            embedding = EMBEDDINGS[str(code[:3])]
        except:
            embedding = np.zeros((CONST_EMBEDDING_SIZE,))
    return embedding

For each row of ICD Codes (i.e ['E839', 'SA920']), we retrieve the corresponding embeddings. For patient visits with n ICD Codes where n>1, we give 0.75 weight to the primary ICD code and 0.25/n-1 weight to the remaining (secondary) codes.

In [14]:
PRIMARY_WEIGHT = 0.5
SECONDARY_WEIGHT = 0.5

def row_to_embedding(input_row):
    """
    input_row: A list of ICD10 codes
    returns a 300x1 embedding
    """
    n = len(input_row)
    code = input_row[0]
    # Primary ICD code:
    primary_embedding = code_to_embedding(code)
    if n < 2: return primary_embedding
    
    # Subsequent ICD codes:
    secondary_embedding = None
    for i in range(1, n):
        code = input_row[i]
        curr_embed = code_to_embedding(code)
        if secondary_embedding is None:
            secondary_embedding = curr_embed
        else:
            secondary_embedding = np.sum([secondary_embedding, curr_embed], axis=0)
    
    final_embedding = np.sum([primary_embedding * PRIMARY_WEIGHT, secondary_embedding * (SECONDARY_WEIGHT/(n-1))], axis=0)
    
    return final_embedding


We read in the patient visit csv file again and create embeddings for each visit as we read the file. For each visit, we have already computed the one-hot vector encoding for categorical and ordinal variables. Here, we combine those encodings with ICD_10 code embeddings. To reduce model complexity, we limit the number of patients (not total visits) for which to create embeddings. Here, we are working under the assumption that all visits for a patient appear one after the other in the csv file, thus, we can stop reading from the file once `patient id > num_patients`. Takes a few mins to run

In [31]:
NUM_PATIENTS = .1e6

def get_visit_embedding(input_file_path: str):
    num_invalid_patient_codes = 0
    start_time = time.time()
    embeddings = []
    patient_visit = []
    max_len = 0
#     patient_visit = {
#         1 : [[1, embed],[4, embed],[2,embed]],
#         2 : [[embed],[embed],....],
#         .
#         .
#         .
#     }
    with open(input_file_path, newline='') as csvfile:
        data = csv.reader(csvfile, delimiter=',')
        count = 0
        for row in data: # <- top to bottom
            if count == 0: # Skip the first row
                count += 1
                continue
            
            if int(row[0]) > NUM_PATIENTS: # Used to limit num patients, reducing model space
                break
                
            ICD_code_embeddings = row_to_embedding([entry for entry in row[16:41] if entry is not ''])
            # check which patients had none of their ICD 10 codes in the pretrained embeddings model (CUI2VEC)
            if (not ICD_code_embeddings.any()): num_invalid_patient_codes+=1
            
            # Combine one-hot with w2v embeddings
            visit_embedding = np.concatenate((one_hot[count-1, :], ICD_code_embeddings), axis=0)
            embeddings.append(visit_embedding)
            patient_visit.append([row[0], row[1]]) # store patient and visit info
            
            # Tracking progress
            if count % 250000 == 0:
                print("Completed {} visit embeddings in {}".format(count, (time.time() - start_time)))
            count +=1
            
    print("Number of patients with none of their ICD10 codes in pretrained embeddings: ({},)".format(num_invalid_patient_codes))        
    print("Shape of embeddings: ({},{})".format(len(embeddings), len(embeddings[0])))
    print("Shape of patient_visit: ({},{})".format(len(patient_visit), len(patient_visit[0])))
    embedding_vecs = np.array(embeddings)
    result = np.hstack((np.array(patient_visit),embedding_vecs)) # patient id, visit id, embedding vector
    return result, count-1, num_invalid_patient_codes

embedding_dset, num_visits, num_invalid_patient_codes = get_visit_embedding(PATIENT_DATA_FILE)

Number of patients with none of their ICD10 codes in pretrained embeddings: (2,)
Shape of embeddings: (215240,311)
Shape of patient_visit: (215240,2)


## Step 4: Get Labels (Adam's version)

In [32]:
# Read in patient_data - takes about a minute
patient_read_start = time.time()
patient_data = pd.read_csv(PATIENT_DATA_FILE, dtype=str, usecols=['ID', 'Sex','Race','Age', 'Date'], nrows = num_visits)
print("Reading in patient_data took {}".format(time.time() - patient_read_start))

Reading in patient_data took 0.3386993408203125


In [33]:
def get_y_labels(patient_data):

    patient_data['Date'] = pd.to_datetime(patient_data['Date'])  #convert date column to date-time type
    y_diff = abs(patient_data.groupby(['ID'])['Date'].diff(periods=-1))
    y_diff = y_diff.dt.days
    y_label = []
    for i in y_diff:
        if pd.isnull(i):
            y_label.append(2)
        else:
            if i <= 30:
                y_label.append(1)
            else:
                y_label.append(0)
    return y_label, len(y_label)

In [34]:
# Get y_dset
label_start = time.time()
y_dset, num_labels = get_y_labels(patient_data)
print("Creating labels took {}".format(time.time() - label_start))

Creating labels took 6.306676149368286


In [35]:
embedding_dset.shape

(215240, 313)

In [60]:
len(np.unique(embedding_dset[:,0]))

34324

In [36]:
# Ensures the number of labels corresponds to the number of patient visits
assert(num_labels == num_visits)

In [37]:
new_dset = np.hstack([embedding_dset, np.array(y_dset).reshape(len(y_dset),1)])
print(new_dset.shape)

(215240, 314)


In [28]:
#del embedding_dset, y_dset

In [55]:
discards = np.not_equal(new_dset[:,-1].astype('float'),np.ones((len(new_dset),))*2.0)

In [56]:
new_dset = new_dset[discards]

In [57]:
print(new_dset.shape)

(180916, 314)


In [61]:
len(np.unique(new_dset[:,0]))

34324

### Step 5: Save embeddings and labels

In [62]:
filename = 'embeddings_{}_binary.pickle'.format(LANG_MODEL)

start_time = time.time()
pickle_out = open(filename, 'wb')
pickle.dump((embedding_dset, y_dset), pickle_out, protocol=4)
pickle_out.close()
print("Saving embeddings took: {}".format(time.time() - start_time))

Saving embeddings took: 71.72887325286865


In [20]:
# filename = 'embeddings_{}.pickle'.format(LANG_MODEL)
# start_time = time.time()
# pickle_in = open(filename, 'rb')
# embedding_dset, y_dset = pickle.load(pickle_in)
# pickle_in.close()
# print("Saving embeddings took: {}".format(time.time() - start_time))

In [66]:
## collated_data = embedding_dset
cols =  ['patient_id', 'visit_id'] + ['embed_vec'+ str(i) for i in range(new_dset.shape[1]-4)] + ['label']
df = pd.DataFrame(data = collated_data, columns = cols)

In [67]:
# Clear variables
collated_data = None
new_dset = None
embedding_dset = None
y_dset = None

Transform the data types 

In [22]:
df['patient_id'] = df['patient_id'].astype(str).astype('int64')
df['visit_id'] = df['visit_id'].astype(str).astype('int64')
for col in cols[2:-1]:
    df[col] = df[col].astype(str).astype(np.float32)

df['label'] = df['label'].astype(str).astype(np.int16)

In [23]:
df.dtypes

patient_id        int64
visit_id          int64
embed_vec0      float32
embed_vec1      float32
embed_vec2      float32
                 ...   
embed_vec307    float32
embed_vec308    float32
embed_vec309    float32
embed_vec310    float32
label             int16
Length: 314, dtype: object

Comment out the pickle dump/load sections below based on need

In [68]:
filename = 'patient_dataframe_{}_Binary.pkl'.format(LANG_MODEL)
pickle_out = open(filename, 'wb')
pickle.dump(df, pickle_out, protocol=4)
pickle_out.close()

In [25]:
# filename = 'patient_dataframe_{}.pkl'.format(LANG_MODEL)
# pickle_in = open(filename, 'rb')
# df = pickle.load(pickle_in)
# pickle_in.close()

In [69]:
df.head()

Unnamed: 0,patient_id,visit_id,embed_vec0,embed_vec1,embed_vec2,embed_vec3,embed_vec4,embed_vec5,embed_vec6,embed_vec7,...,embed_vec301,embed_vec302,embed_vec303,embed_vec304,embed_vec305,embed_vec306,embed_vec307,embed_vec308,embed_vec309,label
0,1,1,0.2966101694915254,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.20245825,0.046511,0.2628399999999999,-0.1723597499999999,0.04502025,0.050136,-0.0164269999999999,0.1393435,-0.1375205,-0.12840625
1,1,2,0.3050847457627119,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.132496,-0.009801,0.31007,-0.178832,-0.138072,0.256234,-0.132928,0.205226,0.038524,-0.138742
2,1,3,0.3050847457627119,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.018475,-0.141793,0.165538,0.077155,-0.2430875,-0.1152685,-0.0830385,-0.16332,0.1618245,0.044875
3,1,4,0.3135593220338983,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.0108409999999999,0.08353975,0.106417,-0.0402489999999999,0.064595,-0.011732,-0.19003875,0.03784,0.00579525,-0.0789639999999999
4,2,1,0.3559322033898305,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,-0.199834,-0.04717,0.157169,-0.075038,-0.148919,0.013595,-0.008292,-0.079288,-0.043639,0.127146


### Step 10: Create the binary classification model

In [None]:
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

In [None]:
properties = list(df.columns.values)
X = df[properties[2:-1]]
y = df['label']
print(X.shape, len(y))

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = keras.Sequential([
    keras.layers.Flatten(input_shape=(X.shape[1],)),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, batch_size=128)

test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
