# CS 271 Project Data Processing

In [1]:
# Define imports
import pandas as pd
import numpy as np
import tensorflow as tf
from numpy import asarray
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.utils import to_categorical
import gensim.downloader as api
import sys
import pickle
import os
import csv
import time
import re

ModuleNotFoundError: No module named 'tensorflow'

### Step 1: Read in relevant Data Files

Make sure the files below are in the same directory as your jupyter file

In [2]:
# Define constants
# Input CSV files; the paths are relative, so they resolve against the
# notebook's working directory.
# Patient visit records: read with pandas in Step 2 and with csv.reader in Step 3.
PATIENT_DATA_FILE = 'B220_SAA_v1.csv'
# ICD code (column 0) -> diagnosis category (column 1).
CLEANED_LABELS_FILE = 'ICD_Label_Cleaned_Oct_25.csv'
# ICD code (column 0) -> text description (column 2).
CODE_DESC_FILE = 'BIODS220_ICD_Dx_10_9_v7 - icd_dx_10_9_v7.csv'
# Name of the pre-trained model handed to gensim's api.load().
# May want to consider: glove-wiki-gigaword-100 in the future
# See more here: https://github.com/RaRe-Technologies/gensim-data
LANG_MODEL = 'word2vec-google-news-300'

In [3]:
def read_csv_to_dict(file_path: str, key: int, value: int):
    """Build a lookup table from two columns of a comma-delimited CSV file.

    Args:
        file_path: Path of the CSV file to read.
        key: Zero-based index of the column whose cells become dict keys.
        value: Zero-based index of the column whose cells become dict values.

    Returns:
        A dict mapping ``row[key]`` to ``row[value]`` for every row in the
        file. No row is skipped, so a header row (if any) ends up in the dict
        too. When a key occurs more than once, the last row wins.

    Raises:
        IndexError: If a row has fewer columns than ``key`` or ``value`` needs.
    """
    with open(file_path, newline='') as handle:
        mapping = {
            record[key]: record[value]
            for record in csv.reader(handle, delimiter=',')
        }
    print("Reading {} complete!".format(file_path))
    return mapping

In [4]:
def get_category_dict():
    """Return the mapping from diagnosis category name to its integer id.

    Ids are 1-based and follow the order of the tuple below. Note that the
    order is not strictly alphabetical ('Other' precedes 'Obstetric' and
    'Neoplastic'); it must stay as-is so the ids do not change.
    """
    ordered_categories = (
        'Circulatory',
        'Dermatologic',
        'Endocrine & Immune',
        'Gastrointestinal',
        'Genitourinary',
        'Hematologic',
        'Infectious',
        'Injury & Poisoning',
        'Musculoskeletal',
        'Neurologic',
        'Other',
        'Obstetric',
        'Neoplastic',
        'Psychiatric',
        'Respiratory',
        'Substance use',
    )
    #use to_categorical()
    return {name: number for number, name in enumerate(ordered_categories, start=1)}

In [5]:
# Import word to vec model
# `wv` is read as a global by embed_sentence(), so this cell must run before
# any embedding is computed.
wv = api.load(LANG_MODEL)
print("Reading {} complete!".format(LANG_MODEL))

# Create labels dict i.e. code -> label, i.e. A840 -> 'Neurologic'
# (column 0 -> column 1 of the cleaned labels file)
label_dict = read_csv_to_dict(CLEANED_LABELS_FILE, key=0, value=1)

# Create descriptions dict i.e. code -> description
# (column 0 -> column 2 of the code description file)
codes_dict = read_csv_to_dict(CODE_DESC_FILE, key=0, value=2)

# Create dict for label to int
category_dict = get_category_dict()

Reading word2vec-google-news-300 complete!
Reading ICD_Label_Cleaned_Oct_25.csv complete!
Reading BIODS220_ICD_Dx_10_9_v7 - icd_dx_10_9_v7.csv complete!


### Step 2: Create one-hot feature vectors

For the sake of reducing feature space, we are only including three features in our embedding. Future iterations of our embeddings will include more features, such as the patient's county.

In [6]:
def create_one_hot(patient_data):
    """Build the numeric per-visit feature matrix from demographic columns.

    Args:
        patient_data: DataFrame with at least the columns 'Sex', 'Race' and
            'Age'. The columns may hold strings (the notebook reads the CSV
            with ``dtype=str``).

    Returns:
        A 2-D float64 NumPy array with one row per row of ``patient_data``.
        Column 0 is Age min-max scaled to [0, 1]; the remaining columns are
        the one-hot indicators for 'Sex' and 'Race', in the order produced by
        ``pd.get_dummies``.

    Raises:
        ValueError: If an 'Age' value cannot be parsed as a number.
    """
    columns_to_one_hot = ['Sex', 'Race']
    # dtype=float is explicit on purpose: since pandas 2.0 get_dummies emits
    # bool columns by default, and mixing bool with the float Age column makes
    # `.values` fall back to an object array that cannot be concatenated
    # cleanly with the float word embeddings.
    indicators = pd.get_dummies(patient_data[columns_to_one_hot], dtype=float)

    # Age may arrive as strings; convert explicitly instead of relying on the
    # scaler's implicit object -> float coercion.
    age = pd.to_numeric(patient_data['Age']).to_numpy(dtype=float).reshape(-1, 1)

    # Normalize age
    min_max_scaler = preprocessing.MinMaxScaler()
    age_scaled = min_max_scaler.fit_transform(age)

    return np.column_stack([age_scaled, indicators.to_numpy(dtype=float)])

In [8]:
# Read in patient_data
# Only the four columns needed for features and labels are loaded, all as
# strings (dtype=str). The elapsed wall-clock time is printed for reference.
patient_read_start = time.time()
patient_data = pd.read_csv(PATIENT_DATA_FILE, dtype=str, usecols=['Sex','Race','Age','Dx10_prin'])
print("Reading in patient_data took {}".format(time.time() - patient_read_start))

# One-hot encode visit features
# `one_hot` is read as a global by get_visit_embedding(), which indexes it by
# CSV row position, so its row order must match the file's row order.
one_hot_start = time.time()
one_hot = create_one_hot(patient_data)
print("Creating one-hot encodings took {}".format(time.time() - one_hot_start))
print(one_hot.shape)

Reading in patient_data took 44.95810008049011
Creating one-hot encodings took 11.02394723892212
(27977932, 11)


In [None]:
# (num_patients, 311)

### Step 3: Embed ICD Codes + concat with one-hot vectors

Given a sentence, retrieve the pre-trained word2vec embedding for each word in the sentence and return the mean of the embeddings. If the sentence has no words that exist in the word2vec model, we return None.

In [9]:
def embed_sentence(sentence, split_tokens):
    """Average the word vectors of the tokens found in ``sentence``.

    Args:
        sentence: Text to embed.
        split_tokens: Regular expression passed to ``re.split`` to tokenize
            ``sentence``.

    Returns:
        The element-wise mean of the vectors of all tokens present in the
        global ``wv`` model (tokens are lower-cased before lookup), or
        ``None`` when no token is in the model's vocabulary.
    """
    # Revisit: since medical terms may not be present, although they are important to include
    vectors = []
    for token in re.split(split_tokens, sentence):
        try:
            vector = wv[token.lower()]
        except KeyError:
            # Out-of-vocabulary token (including empty strings from the split).
            pass
        else:
            vectors.append(vector)

    if not vectors:
        return None
    return np.mean(vectors, axis=0)

Given a sentence and label, return the embedding of a sentence. If there is no embedding for the sentence then return the embedding for the label.

In [10]:
def get_w2v_embedding(sentence, label):
    """Embed an ICD description, falling back to its category label.

    Args:
        sentence: A description of an ICD10 code, usually 1-5 words.
        label: The diagnosis category of the code (e.g. 'Neurologic'); used
            only when no word of ``sentence`` can be embedded.

    Returns:
        The embedding of ``sentence`` if at least one of its words is in the
        word2vec vocabulary, otherwise the embedding of ``label``. The result
        is ``None`` when neither can be embedded.
    """
    # Raw strings: '\s' and '\d' are not valid Python string escapes, so the
    # non-raw form triggers an invalid-escape-sequence warning at compile time.
    # Split on whitespace, and on punctuation that is not adjacent to a digit
    # (so tokens such as "1.5" or "10-19" stay intact).
    description_splitter = r'\s|(?<!\d)[:;,.\(\)\[\]-](?!\d)'
    embedding = embed_sentence(sentence, split_tokens=description_splitter)

    # If we can't create embedding with ICD10 codes, we use category
    if embedding is not None:
        return embedding
    return embed_sentence(label, split_tokens=r'\s')

Given an ICD code, return the corresponding description. Here, we attempt to handle mistyped data. Some codes are missing 1-2 characters. Sometimes, adding a 0 or removing the last character in a code will fix typos; however, this strategy is prone to error and not guaranteed to work.

In [11]:
def get_desc(code):
    """Look up the text description and category label of an ICD code.

    Some codes in the data are missing a character or carry an extra one, so
    the description lookup tries, in order: the code as given, the code with
    a trailing '0' appended, and the code with its last character removed.
    This repair is heuristic and can match the wrong code.

    Args:
        code: ICD code string.

    Returns:
        A ``(desc, label)`` tuple. ``desc`` is the first description found in
        ``codes_dict`` or 'Other' if none of the candidates match. ``label``
        is the category of the exact ``code`` in ``label_dict`` or 'Other'.
    """
    if not isinstance(code, str):
        # Nothing sensible can be looked up for a non-string code.
        return 'Other', 'Other'

    desc = 'Other'
    for candidate in (code, code + '0', code[:-1]):
        if candidate in codes_dict:
            desc = codes_dict[candidate]
            break

    # The label lookup must use `label_dict` (the name the loading cell
    # defines). The previous `labels_dict` did not exist, and the resulting
    # NameError was hidden by a bare except, so every label came back 'Other'.
    label = label_dict.get(code, 'Other')
    return desc, label

Given an ICD code, we retrieve the embedding (300x1). We memoize embeddings in a dictionary so that each one is computed only once. Often, only the first three characters of an ICD code are enough to determine the general diagnosis. Longer codes will only have slightly different descriptions (if at all). Thus, to reduce computational cost, we key the cache on the first three characters of the ICD code. Future ICD codes that share the same first three characters will automatically use the same embedding, regardless of any remaining characters.

In [12]:
# Cache of embeddings keyed by the first three characters of an ICD code.
descriptions_dict = {}

def code_to_embedding(code):
    """Return the embedding of an ICD code, memoized by its 3-character prefix.

    Codes sharing the same first three characters reuse whichever embedding
    was computed first for that prefix. Codes whose description could not be
    resolved (``desc == 'Other'``) are not cached, so a later code with the
    same prefix still gets a chance at a real description.

    Args:
        code: ICD code string.

    Returns:
        The embedding produced by ``get_w2v_embedding`` for the code's
        description/category (may be ``None`` if nothing could be embedded).
    """
    prefix = str(code[:3])
    if prefix in descriptions_dict:
        return descriptions_dict[prefix]

    desc, category = get_desc(code)
    embedding = get_w2v_embedding(desc, category)
    # Compare by value: `is not 'Other'` tests object identity and only
    # worked through CPython string interning.
    if desc != 'Other':
        descriptions_dict[prefix] = embedding
    return embedding

For each row of ICD Codes (i.e. ['E839', 'SA920']), we retrieve the corresponding embeddings. For patient visits with n ICD Codes where n>1, we give 0.75 weight to the primary ICD code and 0.25/(n-1) weight to each of the remaining (secondary) codes, so the secondary codes share a total weight of 0.25.

In [13]:
# Share of the visit embedding taken from the primary (first) ICD code.
PRIMARY_WEIGHT = 0.75
# Share split evenly across all secondary ICD codes.
SECONDARY_WEIGHT = 0.25

def row_to_embedding(input_row):
    """Combine the embeddings of a visit's ICD codes into one vector.

    Args:
        input_row: A non-empty list of ICD10 codes; the first entry is the
            primary diagnosis, the rest are secondary diagnoses.

    Returns:
        For a single code, that code's embedding unchanged. Otherwise
        ``PRIMARY_WEIGHT * primary + SECONDARY_WEIGHT / (n - 1) * sum(secondary)``
        where ``n = len(input_row)``, i.e. the secondary codes share
        ``SECONDARY_WEIGHT`` equally.

    Raises:
        IndexError: If ``input_row`` is empty.
    """
    n = len(input_row)
    # Primary ICD code:
    primary_embedding = code_to_embedding(input_row[0])
    if n < 2:
        return primary_embedding

    # Subsequent ICD codes:
    secondary_embedding = np.sum(
        [code_to_embedding(code) for code in input_row[1:]], axis=0)

    # Parentheses matter: `SECONDARY_WEIGHT / n - 1` evaluates to
    # (0.25 / n) - 1, a negative weight, rather than 0.25 / (n - 1).
    per_code_weight = SECONDARY_WEIGHT / (n - 1)
    return primary_embedding * PRIMARY_WEIGHT + secondary_embedding * per_code_weight


We read in the patient visit csv file again and create embeddings for each visit as we read the file. For each visit, we have already computed the one-hot vector encoding for categorical and ordinal variables. Here, we combine those encodings with ICD_10 code embeddings. To reduce model complexity, we limit the number of patients (not total visits) for which to create embeddings. Here, we are working under the assumption that all visits for a patient appear one after the other in the csv file, thus, we can stop reading from the file once `patient id > num_patients`.

In [14]:
# Rows are read until the patient id (first CSV column) exceeds this value.
NUM_PATIENTS = 1e6

def get_visit_embedding(input_file_path: str):
    """Create one embedding per visit by streaming the patient CSV.

    The first row of the file is treated as a header and skipped. For every
    following row, the ICD codes in columns 16..40 are embedded with
    ``row_to_embedding`` and appended to the matching row of the global
    ``one_hot`` matrix. Reading stops at the first row whose patient id
    (column 0) is greater than ``NUM_PATIENTS``; this relies on a patient's
    visits being contiguous in the file.

    Args:
        input_file_path: Path of the patient visit CSV.

    Returns:
        A ``(result, num_visits)`` tuple. ``result`` is a 2-D array with one
        row per processed visit: patient id, visit id, then the embedding
        vector. ``num_visits`` is the number of processed visits.

    Raises:
        ValueError: If no visit row was processed.
    """
    start_time = time.time()
    embeddings = []
    patient_visit = []

    with open(input_file_path, newline='') as csvfile:
        data = csv.reader(csvfile, delimiter=',')
        next(data, None)  # Skip the first (header) row
        for count, row in enumerate(data, start=1):  # <- top to bottom
            if int(row[0]) > NUM_PATIENTS:  # Used to limit num patients, reducing model space
                break

            # Compare by value; `is not ''` tests object identity.
            icd_codes = [entry for entry in row[16:41] if entry != '']
            ICD_code_embeddings = row_to_embedding(icd_codes)

            # Combine one-hot with w2v embeddings. `count` is the 1-based
            # index of the data row, so count-1 is its row in `one_hot`.
            visit_embedding = np.concatenate((one_hot[count-1, :], ICD_code_embeddings), axis=0)
            embeddings.append(visit_embedding)
            patient_visit.append([row[0], row[1]])  # store patient and visit info

            # Tracking progress
            if count % 250000 == 0:
                print("Completed {} visit embeddings in {}".format(count, (time.time() - start_time)))

    if not embeddings:
        raise ValueError("No visits were read from {}".format(input_file_path))

    print("Shape of embeddings: ({},{})".format(len(embeddings), len(embeddings[0])))
    print("Shape of patient_visit: ({},{})".format(len(patient_visit), len(patient_visit[0])))
    embedding_vecs = np.array(embeddings)
    # NOTE(review): the ids are strings straight from the CSV, so hstack
    # promotes the whole result (embeddings included) to a string dtype.
    # Confirm whether ids should be converted to numbers before stacking.
    result = np.hstack((np.array(patient_visit), embedding_vecs))  # patient id, visit id, embedding vector
    return result, len(embeddings)

# Build the visit embeddings. `num_visits` is read later as a global by
# get_y_labels() to trim the label vector to the same length.
embedding_dset, num_visits = get_visit_embedding(PATIENT_DATA_FILE)

Completed 250000 visit embeddings in 15.989832162857056
Completed 500000 visit embeddings in 31.793057918548584
Completed 750000 visit embeddings in 46.7544960975647
Completed 1000000 visit embeddings in 61.467241287231445
Completed 1250000 visit embeddings in 76.34133696556091
Completed 1500000 visit embeddings in 91.55047106742859
Completed 1750000 visit embeddings in 106.88795018196106
Completed 2000000 visit embeddings in 122.51593518257141
Shape of embeddings: (2118142,311)


In [9]:
# import numpy as np
# a = np.random.rand(7,4)
# b = np.ones((7,2))
# print(a.shape, b.shape)
# b[:,1]= 2
# res = np.hstack((b,a))
# print(res.shape)
# print(a)
# print(res)
# c = [[1,2],[3,4],[3,4]]
# print(np.array(c).shape)
# d = np.array(list([1,2,3,4,5]))
# print(d.reshape(d.shape[0],1).shape)
# cols =  ['patient_id', 'visit_id'] + ['embed_vec'+ str(i) for i in range(res.shape[1]-2)] + ['label']
# print(cols)

(7, 4) (7, 2)
(7, 6)
[[0.24997387 0.47208521 0.44250465 0.15573033]
 [0.97051585 0.91525352 0.67683293 0.94238081]
 [0.24690088 0.33114635 0.21995855 0.20040291]
 [0.11115457 0.25069068 0.46031599 0.03313348]
 [0.81267051 0.80007031 0.29333346 0.01576718]
 [0.68358734 0.35329325 0.82702192 0.64831648]
 [0.32509275 0.49822325 0.85422819 0.99266063]]
[[1.         2.         0.24997387 0.47208521 0.44250465 0.15573033]
 [1.         2.         0.97051585 0.91525352 0.67683293 0.94238081]
 [1.         2.         0.24690088 0.33114635 0.21995855 0.20040291]
 [1.         2.         0.11115457 0.25069068 0.46031599 0.03313348]
 [1.         2.         0.81267051 0.80007031 0.29333346 0.01576718]
 [1.         2.         0.68358734 0.35329325 0.82702192 0.64831648]
 [1.         2.         0.32509275 0.49822325 0.85422819 0.99266063]]
(3, 2)
(5, 1)
['patient_id', 'visit_id', 'embed_vec0', 'embed_vec1', 'embed_vec2', 'embed_vec3', 'label']


### Step 4: Get labels

To create labels, we copy the primary diagnosis codes (`Dx10_prin` column) into its own column named `Label`. For each code, we use the `label_dict` to retrieve the corresponding category of the ICD code. Then we use the `category_dict` to retrieve a number to represent the label. We limit the number of visits to the same size as the number of embeddings.

In [15]:
def convert_Dx10_prin_to_label(DX_10_code):
    """Translate a primary diagnosis code into its integer category id.

    Args:
        DX_10_code: ICD code to classify.

    Returns:
        The ``category_dict`` id of the code's category in ``label_dict``;
        codes missing from ``label_dict`` map to the id of 'Other'.

    Raises:
        KeyError: If ``label_dict`` yields a category name that is not a key
            of ``category_dict``.
    """
    # i.e. A065 -> Infectious, unknown code -> Other
    category_name = label_dict.get(DX_10_code, 'Other')
    # i.e. Infectious -> 7
    return category_dict[category_name]

def get_y_labels(patient_data):
    """Derive integer category labels from the primary diagnosis column.

    Side effect: adds (or overwrites) a 'Label' column on ``patient_data``
    holding the category id of every row's 'Dx10_prin' code.

    Args:
        patient_data: DataFrame with a 'Dx10_prin' column.

    Returns:
        A ``(labels, count)`` tuple: the labels of the first ``num_visits``
        rows (``num_visits`` is read from the notebook's globals) as a NumPy
        array, and how many labels that is.
    """
    # Start from a copy of the Dx10 codes, then map each code to its category id.
    patient_data['Label'] = patient_data['Dx10_prin']
    patient_data['Label'] = patient_data['Label'].apply(convert_Dx10_prin_to_label)

    # Keep only as many labels as there are embeddings.
    kept_labels = patient_data['Label'].iloc[:num_visits].to_list()
    return np.array(kept_labels), len(kept_labels)

In [16]:
# Get y_dset
# get_y_labels() also adds a 'Label' column to patient_data as a side effect.
# The elapsed wall-clock time is printed for reference.
label_start = time.time()
y_dset, num_labels = get_y_labels(patient_data)
print("Creating labels took {}".format(time.time() - label_start))

Creating labels took 12.619440793991089


In [17]:
# Ensures the number of labels corresponds to the number of patient visits
# (fails if patient_data has fewer rows than there are embeddings).
assert(num_labels == num_visits)

### Step 5: Save embeddings and labels

In [None]:
# filename = 'embeddings_{}.pickle'.format(LANG_MODEL)

# start_time = time.time()
# pickle_out = open(filename, 'wb')
# pickle.dump((embedding_dset, y_dset), pickle_out, protocol=4)
# pickle_out.close
# print("Saving embeddings took: {}".format(time.time() - start_time))

### Step 6: Store patient_id, visit_id, embedding_vec, label in a pandas dataframe

In [None]:
# Append the label as one extra trailing column. This must be a horizontal
# stack: the (N, 1) label column has a different width than the (N, D)
# embedding matrix, which vstack rejects, and the column list below names
# exactly D + 1 columns.
collated_data = np.hstack((embedding_dset, y_dset.reshape(y_dset.shape[0], 1)))
# Columns: two id columns, one column per embedding dimension, then the label.
cols = ['patient_id', 'visit_id'] + ['embed_vec' + str(i) for i in range(embedding_dset.shape[1] - 2)] + ['label']
df = pd.DataFrame(data=collated_data, columns=cols)

In [36]:
# import pandas as pd
# df = pd.DataFrame({

#     'patient_id': ['A', 'A', 'B', 'B', 'D', 'C'],

#     'visit_id': [2, 1, 9, 8, 7, 4],

#     'col3': [0, 1, 9, 4, 2, 3],

#     'col4': ['a', 'B', 'c', 'D', 'e', 'F']

# })
# df = df.sort_values(['patient_id', 'visit_id'])
# gf = df.groupby('patient_id')
# for patient in df['patient_id'].unique():
#     curr_patient = gf.get_group(patient)
#     num_visits  = curr_patient.shape[0]
# #     print(curr_patient)
#     print([list(curr_patient.iloc[0,1:-1].values)])
#     print(curr_patient.iloc[0,-1])

[[1, 1]]
B
[[8, 4]]
D
[[4, 3]]
F
[[7, 2]]
e


In [37]:
# Build sliding windows per patient: two consecutive visits are the input and
# the label of the visit right after them is the target.
df = df.sort_values(['patient_id', 'visit_id'])
gf = df.groupby('patient_id')
data_X = []
data_Y = []
for patient in df['patient_id'].unique():
    curr_patient = gf.get_group(patient)
    # Deliberately not called `num_visits`: that global holds the total visit
    # count used by get_y_labels() and the label/visit assertion, and
    # overwriting it here breaks those cells when they are re-run.
    patient_num_visits = curr_patient.shape[0]
    # Patients with fewer than 3 visits yield an empty range and are skipped;
    # exactly 3 visits yields the single window i = 0.
    for i in range(patient_num_visits - 2):
        # Columns 2..-2 are the embedding; the last column is the label.
        visit_0 = list(curr_patient.iloc[i, 2:-1].values)
        visit_1 = list(curr_patient.iloc[i + 1, 2:-1].values)
        data_X.append([visit_0, visit_1])
        data_Y.append(curr_patient.iloc[i + 2, -1])

data_X = np.array(data_X)
# data_X= data_X.reshape()
print(data_X.shape)

(0,)


In [None]:
# Seed NumPy's global RNG so the shuffle below is reproducible.
np.random.seed(42) 
# FIXME: np.random.shuffle() requires the array to shuffle as its argument;
# calling it with no arguments raises TypeError. NOTE(review): presumably
# data_X and data_Y should be shuffled together with one shared permutation
# so inputs and targets stay aligned - confirm the intended target.
np.random.shuffle() 