# CS 271 Project Data Processing

In [5]:
# Define imports
import sys
import pickle
import os
import csv
import time
import re

### Step 1: Read in relevant Data Files

Make sure the files below are in the same directory as your jupyter file

In [17]:
# Define constants
PATIENT_DATA_FILE = 'B220_SAA_v1.csv'
CLEANED_LABELS_FILE = 'ICD_Label_Cleaned_Oct_25.csv'
CODE_DESC_FILE = 'BIODS220_ICD_Dx_10_9_v7 - icd_dx_10_9_v7.csv'
# May want to consider: glove-wiki-gigaword-100 in the future
# See more here: https://github.com/RaRe-Technologies/gensim-data
LANG_MODEL = 'word2vec-google-news-300'

### Step 2: Get Med2Vec List of Codes 

In [18]:
med2vec_dict = {}
def check_med2vec_dict(icdcode, lastint):
    """
    icdcode: code to check in dictionary
    lastint: value of the last code that was added to dictionary
    checks to see if a code is in the med2vec dictionary, if not add to it
    returns an integer of the dictionary value, and updated last int
    """
    
    if icdcode in med2vec_dict:
        return med2vec_dict[icdcode], lastint
    else:
        lastint+=1
        med2vec_dict[icdcode] = lastint
        return med2vec_dict[icdcode], lastint  
    

In [19]:
def row_to_list(input_row, lastint):
    """
    input_row: A list of ICD10 codes
    returns a list of integers for each ICD10. 
    """
    
    codes = []
    n = len(input_row) 
    for i in range(0, n):
        code = input_row[i]
        value, lastint = check_med2vec_dict(code, lastint)
        codes.append(value)
    return codes, lastint
    

In [20]:
NUM_PATIENTS = .05e6 #consider removing this for the final run, for final run we should be running this on our final dset, not just the first n visits
def get_visit_embedding(input_file_path: str):
    final_seq = []
    lastint=0
    lastpatient=1
    start_time = time.time()
    max_len = 0
    with open(input_file_path, newline='') as csvfile:
        data = csv.reader(csvfile, delimiter=',')
        count = 0
        for row in data: # <- top to bottom
            if count == 0: # Skip the first row
                count += 1
                continue
            
            if int(row[0]) > NUM_PATIENTS: # Used to limit num patients, reducing model space
                break
            
            currentpatient = row[0]
            if lastpatient != currentpatient:
                final_seq.append([-1]) #append the patient delimiter for med2vec
                lastpatient = currentpatient
                
            codes, lastint = row_to_list([entry for entry in row[16:41] if entry != ''], lastint)
            final_seq.append(codes)
            
            # Tracking progress
            if count % 100000 == 0:
                print("Completed {} visit embeddings in {}".format(count, (time.time() - start_time)))
            count +=1
            
    return final_seq

codes_list = get_visit_embedding(PATIENT_DATA_FILE)
codes_list.pop(0) #remove the first -1

Completed 100000 visit embeddings in 1.2096598148345947


[-1]

In [21]:
print(len(med2vec_dict+1)) #need this number to run med2vec, need to add 1 to it

12068


In [22]:
filename = 'visit_list.seqs'
pickle_out = open(filename, 'wb')
pickle.dump(codes_list, pickle_out, protocol=2)
pickle_out.close()

In [9]:
med2vec_dict['padding'] = 0

In [10]:
filename = 'dict.pkl'
pickle_out = open(filename, 'wb')
pickle.dump(med2vec_dict, pickle_out, protocol=2)
pickle_out.close()

### Step 4 - make training list with less codes for med2vec

In [31]:
med2vec_dict = {}
def check_med2vec_dict(icdcode, lastint):
    """
    icdcode: code to check in dictionary
    lastint: value of the last code that was added to dictionary
    checks to see if a code is in the med2vec dictionary, if not add to it
    returns an integer of the dictionary value, and updated last int
    """
    icdcode = icdcode[:3] #take only first 3 digits 
    
    if icdcode in med2vec_dict:
        return med2vec_dict[icdcode], lastint
    else:
        lastint+=1
        med2vec_dict[icdcode] = lastint
        return med2vec_dict[icdcode], lastint  
    

In [32]:
NUM_PATIENTS = .05e6 #consider removing this for the final run
def get_visit_embedding_training(input_file_path: str):
    final_seq = []
    lastint=0
    lastpatient=1
    start_time = time.time()
    max_len = 0
    with open(input_file_path, newline='') as csvfile:
        data = csv.reader(csvfile, delimiter=',')
        count = 0
        for row in data: # <- top to bottom
            if count == 0: # Skip the first row
                count += 1
                continue
            
            if int(row[0]) > NUM_PATIENTS: # Used to limit num patients, reducing model space
                break
            
            currentpatient = row[0]
            if lastpatient != currentpatient:
                final_seq.append([-1]) #append the patient delimiter for med2vec
                lastpatient = currentpatient
                
            codes, lastint = row_to_list([entry for entry in row[16:41] if entry != ''], lastint) #shrink size
            final_seq.append(codes)
            
            # Tracking progress
            if count % 100000 == 0:
                print("Completed {} visit embeddings in {}".format(count, (time.time() - start_time)))
            count +=1
            
    return final_seq

codes_list_grouped = get_visit_embedding(PATIENT_DATA_FILE)
codes_list_grouped.pop(0) #remove the first -1

Completed 100000 visit embeddings in 1.3099403381347656


[-1]

In [33]:
print(len(med2vec_dict+1)) #need this number to run med2vec add 1 

1399


In [34]:
max_int = 0
for lis in codes_list:
    if max(lis) > max_int:
        max_int = max(lis)
print(max_int)

12068


### Step 5: Pickle Grouped Output

In [35]:
filename = 'visit_list_grouped.seqs'
pickle_out = open(filename, 'wb')
pickle.dump(codes_list_grouped, pickle_out, protocol=2)
pickle_out.close()

In [12]:
print(codes_list[0:50])

[[1, 2, 3], [4], [5, 6], [7, 8, 9], [-1], [6], [10, 11, 12, 13, 14, 15, 16], [17], [18, 19], [20, 21], [22, 18, 11, 23], [24], [25], [26, 27, 28], [29, 11, 30], [11, 19], [11, 12, 28], [31, 16, 23], [27, 12, 2, 28], [28, 11], [18, 32, 33, 12, 11, 14, 15, 16], [34], [18, 35, 12, 36], [37, 7], [-1], [38, 39, 9, 40, 41], [38, 39, 42], [43, 39, 44, 45, 46, 9], [38], [38], [47, 9, 48, 49, 50, 14], [-1], [6, 51, 52], [53, 54], [55], [56, 57], [-1], [58], [59, 60], [54, 61], [62, 63, 64, 65, 6], [-1], [66, 67], [68, 69], [2, 59, 70, 67, 71, 72], [58, 9], [59, 73], [-1], [74, 39, 75, 53, 76, 77, 23, 78], [79, 80, 9, 76, 81, 82, 83, 84]]


### Step 6: Run Med2Vec.py with python2.7