# CS 271 Project Data Processing

In [None]:
# Define imports
import pandas as pd
import numpy as np
import tensorflow as tf
from numpy import asarray
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.utils import to_categorical
import gensim.downloader as api
import sys
import pickle
import os
import csv
import time
import re

### Step 1: Read in relevant Data Files

Make sure the files below are in the same directory as this Jupyter notebook.

In [None]:
# Define constants
# Input CSV files, referenced by relative path (so they are resolved against
# the working directory of the notebook).
# Patient visit records: read with pandas for the one-hot features/labels and
# again with csv.reader for the ICD code embeddings.
PATIENT_DATA_FILE = 'B220_SAA_v1.csv'
# ICD code -> diagnosis category label (column 0 -> column 1)
CLEANED_LABELS_FILE = 'ICD_Label_Cleaned_Oct_25.csv'
# ICD code -> textual description (column 0 -> column 2)
CODE_DESC_FILE = 'BIODS220_ICD_Dx_10_9_v7 - icd_dx_10_9_v7.csv'
# Name of the pre-trained gensim model to load; it is also embedded in the
# name of the output pickle file.
# May want to consider: glove-wiki-gigaword-100 in the future
# See more here: https://github.com/RaRe-Technologies/gensim-data
LANG_MODEL = 'word2vec-google-news-300'

In [None]:
def read_csv_to_dict(file_path: str, key: int, value: int) -> dict:
    """Build a lookup dict from two columns of a CSV file.

    Every row contributes one entry mapping ``row[key]`` to ``row[value]``.
    No header handling is done, so a header row (if present) is stored like
    any other row. When the same key appears more than once, the last
    occurrence wins.

    Args:
        file_path: Path of the comma-delimited file to read.
        key: Column index whose cell becomes the dict key.
        value: Column index whose cell becomes the dict value.

    Returns:
        Dict mapping key-column strings to value-column strings.
    """
    ret_dict = {}
    with open(file_path, newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            try:
                ret_dict[row[key]] = row[value]
            except IndexError:
                # csv.reader yields [] for blank lines, and truncated lines
                # may lack the requested column; skip them instead of
                # aborting the whole load.
                continue
    print("Reading {} complete!".format(file_path))
    return ret_dict

In [None]:
def get_category_dict():
    """Return the mapping from diagnosis category name to integer class id.

    Ids are assigned consecutively, starting at 1, in the order the category
    names are listed below (18 categories in total).
    """
    category_names = (
        'Circulatory',
        'Dermatologic',
        'Endocrine & Immune',
        'Gastrointestinal',
        'Genitourinary',
        'Hematologic',
        'Infectious',
        'Injury',
        'Injury & Poisoning',
        'Musculoskeletal',
        'Neurologic',
        'Other',
        'Obstetric',
        'Neoplastic',
        'Poisoning',
        'Psychiatric',
        'Respiratory',
        'Substance use',
    )
    return {name: class_id for class_id, name in enumerate(category_names, start=1)}

In [None]:
# Import word to vec model
# api.load fetches the pre-trained vectors named by LANG_MODEL; `wv` is then
# indexed by lower-cased word when sentences are embedded.
wv = api.load(LANG_MODEL)
print("Reading {} complete!".format(LANG_MODEL))

# Create labels dict i.e. code -> label
# (column 0 of the labels file is the key, column 1 the value)
label_dict = read_csv_to_dict(CLEANED_LABELS_FILE, key=0, value=1)

# Create descriptions dict i.e. code -> description
# (column 0 of the descriptions file is the key, column 2 the value)
codes_dict = read_csv_to_dict(CODE_DESC_FILE, key=0, value=2)

# Create dict for label to int
category_dict = get_category_dict()

### Step 2: Create one-hot feature vectors

For the sake of reducing feature space, we are only including three features in our embedding. Future iterations of our embeddings will include more features, such as the patient's county.

In [None]:
def create_one_hot(patient_data):
    """Encode the visit-level features as a purely numeric matrix.

    'Sex' and 'Race' are one-hot encoded; 'Age' is min-max scaled to [0, 1]
    and placed in the first column. The input frame is not modified.

    Args:
        patient_data: DataFrame with at least 'Sex', 'Race' and 'Age' columns.

    Returns:
        2-D float numpy array with one row per row of ``patient_data``:
        the scaled age followed by the Sex/Race indicator columns.
    """
    # Creates one-hot vectors
    columns_to_one_hot = ['Sex', 'Race']
    # dtype=float keeps the indicators numeric. Recent pandas releases
    # default to bool indicators, and a mixed float/bool frame converts to an
    # object array, which cannot be fed to a model.
    one_hot = pd.get_dummies(patient_data[columns_to_one_hot], dtype=float)

    ordinal_columns = ['Age']
    one_hot = pd.concat([patient_data[ordinal_columns], one_hot], axis=1)

    # Normalize age (the scaler expects a 2-D column, hence the reshape)
    ages = one_hot['Age'].values.reshape(-1, 1)
    min_max_scaler = preprocessing.MinMaxScaler()
    # Flatten back to 1-D so the result is assigned as a plain column
    one_hot['Age'] = min_max_scaler.fit_transform(ages).ravel()

    return one_hot.values.astype(float)

In [None]:
# Read in patient_data
# Only the four columns needed for the features and labels are loaded, all
# as strings (dtype=str).
patient_read_start = time.time()
patient_data = pd.read_csv(PATIENT_DATA_FILE, dtype=str, usecols=['Sex','Race','Age','Dx10_prin'])
print("Reading in patient_data took {}".format(time.time() - patient_read_start))

# One-hot encode visit features
# Row i of `one_hot` corresponds to row i of patient_data.
one_hot_start = time.time()
one_hot = create_one_hot(patient_data)
print("Creating one-hot encodings took {}".format(time.time() - one_hot_start))

### Step 3: Embed ICD Codes + concat with one-hot vectors

Given a sentence, retrieve the pre-trained word2vec embedding for each word in the sentence and return the mean of the embeddings. If the sentence has no words that exist in the word2vec model, we return None.

In [None]:
def embed_sentence(sentence, split_tokens):
    """Return the mean word2vec vector of the words in a sentence.

    The sentence is split with the regex ``split_tokens``; each piece is
    lower-cased and looked up in the module-level ``wv`` model. Pieces the
    model does not know are ignored.

    Returns:
        The element-wise mean of the found word vectors, or None when no
        piece of the sentence is in the model.
    """
    vectors = []
    for token in re.split(split_tokens, sentence):
        try:
            vector = wv[token.lower()]
        except KeyError:
            # Token is not in the model's vocabulary
            continue
        vectors.append(vector)

    if not vectors:
        return None
    return np.mean(vectors, axis=0)

Given a sentence and label, return the embedding of a sentence. If there is no embedding for the sentence then return the embedding for the label.

In [None]:
def get_w2v_embedding(sentence, label):
    """
    Embed an ICD10 description, falling back to its category label.

    sentence: A description of an ICD10 code, usually 1-5 words
    label: The category (class) of the diagnosis, e.g. 'Infectious'

    Returns the mean word embedding of `sentence`. If none of its words can
    be embedded, returns the mean word embedding of `label` instead, which is
    None when the label cannot be embedded either.
    """
    # Split on whitespace, and on punctuation that has no digit on either
    # side. Raw strings keep the regex escapes (\s, \d, \(, ...) from being
    # treated as invalid string escapes.
    embedding = embed_sentence(sentence, split_tokens=r'\s|(?<!\d)[:;,.\(\)\[\]-](?!\d)')

    # If we can't create embedding with ICD10 codes, we use category
    if embedding is not None:
        return embedding
    return embed_sentence(label, split_tokens=r'\s')

Given an ICD code, return the corresponding description. Here, we attempt to handle mistyped data. Some codes are missing 1-2 characters. Sometimes, adding a 0 or removing the last character in a code will fix typos; however, this strategy is prone to error and not guaranteed to work.

In [None]:
def get_desc(code):
    """Look up the description and category label of an ICD code.

    The description is searched in ``codes_dict`` under the code as given,
    then with a trailing '0' appended, then with its last character removed
    (a best-effort repair of mistyped codes). The label is looked up in
    ``label_dict`` under the code exactly as given.

    Args:
        code: ICD code string.

    Returns:
        Tuple ``(desc, label)``; each element is 'Other' when its lookup
        fails.
    """
    desc = 'Other'
    for candidate in (code, code + '0', code[:-1]):
        if candidate in codes_dict:
            desc = codes_dict[candidate]
            break

    # The module-level mapping is named `label_dict`. Referring to an
    # undefined name inside a bare `except` silently made every label 'Other'.
    label = label_dict.get(code, 'Other')
    return desc, label

Given an ICD code, we retrieve the embedding (300x1). We cache (memoize) embeddings so that each one is computed only once. Often, only the first three characters of an ICD code are enough to determine the general diagnosis. Longer codes will only have slightly different descriptions (if at all). Thus, to reduce computational complexity, we cache each embedding under the first three characters of its ICD code. Future ICD codes that share the same first three characters will automatically use the same embedding, regardless of any remaining characters.

In [None]:
# Cache of embeddings, keyed by the first three characters of an ICD code
descriptions_dict = {}
def code_to_embedding(code):
    """Return the embedding of an ICD code, reusing cached results.

    Embeddings are cached under the first three characters of the code, so
    every code sharing that prefix gets the embedding computed for the first
    such code seen. Results whose description is the 'Other' fallback are not
    cached, so a later code with the same prefix can still be resolved.

    Args:
        code: ICD code string.

    Returns:
        The embedding produced by ``get_w2v_embedding`` (or the cached one).
    """
    prefix = str(code[:3])
    if prefix in descriptions_dict:
        return descriptions_dict[prefix]

    desc, category = get_desc(code)
    embedding = get_w2v_embedding(desc, category)
    # Compare by value: `is not` tests object identity, which is not a
    # reliable way to compare strings.
    if desc != 'Other':
        descriptions_dict[prefix] = embedding
    return embedding

For each row of ICD Codes (e.g. ['E839', 'SA920']), we retrieve the corresponding embeddings. For patient visits with n ICD Codes where n>1, we give 0.75 weight to the primary ICD code and 0.25/(n-1) weight to each of the remaining n-1 (secondary) codes.

In [None]:
# Share of a multi-code visit embedding given to the primary ICD code
PRIMARY_WEIGHT = 0.75
# Share split evenly among all secondary ICD codes
SECONDARY_WEIGHT = 0.25

def row_to_embedding(input_row):
    """
    Combine the embeddings of a visit's ICD10 codes into one vector.

    input_row: A non-empty list of ICD10 codes, primary code first
    returns the primary code's embedding when it is the only code; otherwise
    PRIMARY_WEIGHT * primary + (SECONDARY_WEIGHT / (n - 1)) * sum(secondary),
    where n is the number of codes.

    Raises IndexError if input_row is empty.
    """
    n = len(input_row)
    # Primary ICD code:
    primary_embedding = code_to_embedding(input_row[0])
    if n < 2:
        return primary_embedding

    # Subsequent ICD codes:
    secondary_sum = np.sum(
        [code_to_embedding(code) for code in input_row[1:]], axis=0)

    # Parenthesised on purpose: `SECONDARY_WEIGHT/n-1` parses as
    # `(SECONDARY_WEIGHT/n) - 1`, a negative weight.
    per_code_weight = SECONDARY_WEIGHT / (n - 1)
    return primary_embedding * PRIMARY_WEIGHT + secondary_sum * per_code_weight


We read in the patient visit csv file again and create embeddings for each visit as we read the file. For each visit, we have already computed the one-hot vector encoding for categorical and ordinal variables. Here, we combine those encodings with ICD_10 code embeddings. To reduce model complexity, we limit the number of patients (not total visits) for which to create embeddings. Here, we are working under the assumption that all visits for a patient appear one after the other in the csv file, thus, we can stop reading from the file once `patient id > num_patients`.

In [None]:
# Rows whose first column (patient id) exceeds this value stop the read
NUM_PATIENTS = 1e6

def get_visit_embedding(input_file_path: str):
    """Build one embedding per visit row of the patient CSV file.

    The first row is skipped as a header. For every following row, the
    non-empty cells of columns 16..40 are embedded as ICD codes with
    ``row_to_embedding`` and appended to the matching row of the
    module-level ``one_hot`` matrix. Reading stops at the first row whose
    first column, parsed as an int, exceeds NUM_PATIENTS.

    Args:
        input_file_path: Path of the comma-delimited patient visit file.

    Returns:
        Tuple ``(embeddings, num_visits)``: an array with one row per
        embedded visit, and the number of visits embedded.
    """
    start_time = time.time()
    embeddings = []
    with open(input_file_path, newline='') as csvfile:
        data = csv.reader(csvfile, delimiter=',')
        next(data, None)  # Skip the first (header) row
        for row in data:
            if int(row[0]) > NUM_PATIENTS: # Used to limit num patients, reducing model space
                break
            # Compare by value; `is not ''` relied on string identity.
            # NOTE(review): assumes columns 16..40 hold the visit's ICD codes
            # with the primary code first -- confirm against the CSV schema.
            icd_codes = [entry for entry in row[16:41] if entry != '']
            icd_code_embeddings = row_to_embedding(icd_codes)
            # len(embeddings) is the 0-based index of the current visit
            visit_embedding = np.concatenate(
                (one_hot[len(embeddings), :], icd_code_embeddings), axis=0)
            embeddings.append(visit_embedding)
            if len(embeddings) % 250000 == 0:
                print("Completed {} visit embeddings in {}".format(len(embeddings), (time.time() - start_time)))
    # Guard the width lookup so a file with no usable rows does not crash
    width = len(embeddings[0]) if embeddings else 0
    print("Shape of embeddings: ({},{})".format(len(embeddings), width))
    return np.array(embeddings), len(embeddings)

# Build the visit embeddings from the patient file; num_visits is the number
# of visits that were embedded.
embedding_dset, num_visits = get_visit_embedding(PATIENT_DATA_FILE)

### Step 4: Get labels

To create labels, we copy the primary diagnosis codes (`Dx10_prin` column) into their own column named `Label`. For each code, we use the `label_dict` to retrieve the corresponding category of the ICD code. Then we use the `category_dict` to retrieve a number to represent the label. We limit the number of visits to the same size as the number of embeddings.

In [None]:
def convert_Dx10_prin_to_label(DX_10_code):
    """Map a primary diagnosis code to its integer class id.

    The code is translated to a category name through ``label_dict`` (codes
    missing from it count as 'Other'), and the category name to its integer
    id through ``category_dict``.
    """
    # i.e. A065 -> Infectious; unknown codes -> Other
    category = label_dict.get(DX_10_code, 'Other')
    # i.e. Infectious -> 7
    return category_dict[category]

def get_y_labels(patient_data):
    """Create the integer label array for the embedded visits.

    Adds (or overwrites) a 'Label' column on ``patient_data`` holding the
    class id of each row's 'Dx10_prin' code, then returns the labels of the
    first ``num_visits`` rows.

    Returns:
        Tuple ``(labels, count)``: numpy array of class ids and its length.
    """
    # Start the Label column as a copy of the primary diagnosis codes
    patient_data['Label'] = patient_data['Dx10_prin']

    # Replace each code in the Label column by its category id
    patient_data['Label'] = patient_data['Label'].apply(convert_Dx10_prin_to_label)

    # Keep only as many labels as there are embeddings
    labels = patient_data['Label'].iloc[:num_visits].to_list()
    return np.array(labels), len(labels)

In [None]:
# Build the label array (y_dset) and report how long it took
label_start = time.time()
y_dset, num_labels = get_y_labels(patient_data)
print("Creating labels took {}".format(time.time() - label_start))

In [None]:
# Sanity check: there must be exactly one label per embedded patient visit
assert num_labels == num_visits

### Step 5: Save embeddings and labels

In [None]:
# Output file is named after the language model used for the embeddings
filename = 'embeddings_{}.pickle'.format(LANG_MODEL)

start_time = time.time()
# The context manager flushes and closes the file even if dump raises; the
# earlier `pickle_out.close` (no parentheses) never actually closed it.
with open(filename, 'wb') as pickle_out:
    pickle.dump((embedding_dset, y_dset), pickle_out, protocol=4)
print("Saving embeddings took: {}".format(time.time() - start_time))