# Word2Vec Model

This notebook contains the code needed to train and test a model that uses word2vec embeddings. Please refer to `word2vec_embeddings` to see how the embeddings were trained.

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import random
import pickle
import csv
import collections
from tqdm import tqdm

## Step 0: Initialize PySpark

In [5]:
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

In [6]:
# Build the Spark configuration: local mode on all cores, UI on port 4050,
# and 35G each for executor memory, driver memory and the max collected result size.
conf = (SparkConf()
        .set("spark.ui.port", "4050")
        .setMaster('local[*]')
        .set('spark.executor.memory', '35G')
        .set('spark.driver.memory', '35G')
        .set('spark.driver.maxResultSize', '35G'))
# Create the context. getOrCreate() reuses a live context, so re-running this cell
# does not fail with "Cannot run multiple SparkContexts at once". If a context
# already exists, the settings above are not re-applied to it.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()
# Enabling Arrow is what makes the conversion from pandas to Spark DataFrame really fast.
# NOTE(review): newer Spark releases spell this key
# "spark.sql.execution.arrow.pyspark.enabled" -- confirm against the installed version.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
# Echo the driver memory setting through the public accessor to confirm it was applied.
sc.getConf().get('spark.driver.memory')

'35G'

In [7]:
spark

## Step 1: Read in relevant data files

In [31]:
# Root directory that holds every data artefact read by this notebook.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
dirPath = '/home/ubuntu/Biomed-Data-Science-NLP-Project/Data/'
# Saved PySpark Word2Vec model (loaded with Word2VecModel.load below).
embedFilePath = dirPath + 'pyspark_w2v_embeddingSize_100'
# Pickled training corpus (loaded with pickle below).
corpusFilePath = dirPath + 'train_stringed_CCSR_sentences.pkl'

### Read in embeddings

In [82]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml import Pipeline
# Load the previously trained Word2Vec model from disk.
loaded_model = Word2VecModel.load(embedFilePath)
# getVectors() yields a Spark DataFrame with one (word, vector) row per vocabulary entry.
df_embeddings = loaded_model.getVectors()
# Row count of that DataFrame, i.e. the number of words that have an embedding.
df_embeddings.count()

502

In [88]:
df_embeddings.show()

+------+--------------------+
|  word|              vector|
+------+--------------------+
|inj029|[-0.2791734635829...|
|skn005|[0.13488520681858...|
|inj033|[-0.1691723316907...|
|inj070|[-0.3290005326271...|
|nvs003|[-0.2571149170398...|
|cir002|[-0.2463666051626...|
|nvs018|[0.13281399011611...|
|ear002|[0.54433143138885...|
|neo028|[-0.0892063081264...|
|cir017|[-0.0426498651504...|
|neo039|[0.08893523365259...|
|inj050|[0.54959970712661...|
|ext014|[0.30336469411849...|
|inj074|[-0.0489733181893...|
|prg028|[0.29435864090919...|
|inj044|[0.34294834733009...|
|gen020|[-0.2292252480983...|
|dig018|[0.30665281414985...|
|cir039|[0.07407408207654...|
|pnl010|[0.08308684080839...|
+------+--------------------+
only showing top 20 rows



### Read in train corpus

In [13]:
# Load the pickled training corpus into memory as `corpus`.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles produced by this project's own pipeline.
with open(corpusFilePath, 'rb') as handle:
    corpus = pickle.load(handle)

In [20]:
# Report the number of corpus entries in millions (each entry is counted as one visit).
print("{} million visits".format(len(corpus)/1e6))

18.766186 million visits


In [29]:
# Wrap the in-memory corpus in a single-column pandas frame and hand it straight
# to Spark, so `df_corpus` only ever refers to the Spark DataFrame.
df_corpus = spark.createDataFrame(
    pd.DataFrame(corpus, columns=['sentences'])
)
# Preview the first five sentences.
df_corpus.show(n=5)

+--------------------+
|           sentences|
+--------------------+
|MBD012 MBD001 MBD002|
|PRG028 SYM004 SYM...|
|SYM006 FAC021 FAC021|
|CIR009 CIR019 CIR...|
|RSP008 SYM010 CIR...|
+--------------------+
only showing top 5 rows



In [41]:
# Split each sentence string into a list of tokens. In the recorded output the
# tokens come out lower-cased, which matches the lower-case words listed in the
# embedding vocabulary above.
tokenizer = Tokenizer(inputCol="sentences", outputCol="tokens")
# Keep only the token lists; the raw sentence column is not needed downstream.
tokenized_corpus = tokenizer.transform(df_corpus).select("tokens")
tokenized_corpus.show(5)
# Row count of the tokenized corpus, displayed as the cell result.
tokenized_corpus.count()

+--------------------+
|              tokens|
+--------------------+
|[mbd012, mbd001, ...|
|[prg028, sym004, ...|
|[sym006, fac021, ...|
|[cir009, cir019, ...|
|[rsp008, sym010, ...|
+--------------------+
only showing top 5 rows



18766186

### Read in train data

In [24]:
# Read the training split as CSV with a header row, letting Spark infer column types.
train = (spark.read
         .format("csv")
         .options(sep=",", inferSchema="true", header="true")
         .load(dirPath + "train"))
# Number of training rows.
train.count()

18766186

### Read in test data

In [23]:
# Read the test split as CSV with a header row, letting Spark infer column types.
test = (spark.read
        .format("csv")
        .options(sep=",", inferSchema="true", header="true")
        .load(dirPath + "test"))
# Number of test rows.
test.count()

4691547

## Turn Corpus into embedding

In [84]:
# Apply the loaded Word2Vec model to every token list; the result keeps `tokens`
# and adds a `features` vector column (see the output below).
# NOTE(review): per the Spark ML docs the feature vector should be the average of
# the word vectors in each token list -- confirm for the installed Spark version.
res = loaded_model.transform(tokenized_corpus)
res.show()

+--------------------+--------------------+
|              tokens|            features|
+--------------------+--------------------+
|[mbd012, mbd001, ...|[0.04541126700739...|
|[prg028, sym004, ...|[0.11673879437148...|
|[sym006, fac021, ...|[-0.1344170595208...|
|[cir009, cir019, ...|[0.05347454361617...|
|[rsp008, sym010, ...|[-0.0048551074827...|
|    [gen017, gen017]|[0.05020602792501...|
|    [mus025, mus010]|[0.06714359391480...|
|            [mbd005]|[-0.1537506878376...|
|            [cir012]|[0.28262704610824...|
|[inj011, inj027, ...|[0.20975265997861...|
|    [sym011, cir007]|[0.11789469188079...|
|            [fac025]|[-0.1113107800483...|
|[gen004, cir007, ...|[0.07823335627714...|
|[dig021, gen002, ...|[-0.0201907082726...|
|[fac001, inj017, ...|[0.08521811524406...|
|[mus026, inj030, ...|[0.02278881277889...|
|            [inj017]|[0.02689004875719...|
|    [dig002, fac025]|[0.31509736180305...|
|[inj003, cir012, ...|[-0.0570662468671...|
|            [rsp003]|[0.3761155

## Add demographic information

- May need to move to 100 dimensional vector
- Think of some spark methods to use


## Section to hopefully be replaced by word2Vec outcome

In [7]:
def get_word_index():
    """Build a ``{ccsr_code: index}`` vocabulary from the module-level ``corpus``.

    The ICD -> CCSR mapping is read from ``ICD_to_CCSR_20201_1.csv``: for every
    row (no header row is skipped), column 0 is the key and column 6 the value,
    each with its first and last character stripped (presumably surrounding
    quote characters -- confirm against the file).

    Each element of every sentence in ``corpus`` is looked up in that mapping.
    Sentences of length 1 are skipped and elements without a mapping are
    ignored. CCSR codes are numbered from 0 in the order they are first seen.

    NOTE(review): this iterates the *elements* of each sentence, so it expects
    sentences to be sequences of ICD codes. If ``corpus`` holds plain strings,
    the loop walks single characters -- confirm which corpus is in scope.

    Returns:
        dict: CCSR code -> integer index.
    """
    dirPath = '/home/ubuntu/Biomed-Data-Science-NLP-Project/Data/'
    CCSR_filepath = dirPath + 'ICD_to_CCSR_20201_1.csv'

    icd_to_ccsr_code = {}  # Maps icd_code to ccsr_code
    with open(CCSR_filepath, newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            icd_to_ccsr_code[row[0][1:-1]] = row[6][1:-1]

    # Only first-seen order matters for the index, so number the codes directly
    # instead of accumulating counts that are never read.
    word_index = {}
    for sentence in tqdm(corpus):
        if len(sentence) == 1:
            continue
        for ICD_code in sentence:
            # Explicit lookup instead of a bare `except: pass`: unmapped codes are
            # skipped, while genuine errors are no longer silently swallowed.
            ccsr_code = icd_to_ccsr_code.get(ICD_code)
            if ccsr_code is not None:
                word_index.setdefault(ccsr_code, len(word_index))
    return word_index
# Build the vocabulary. get_word_index() reads the module-level `corpus`, so that
# variable must already be loaded when this runs.
word_index = get_word_index()

27977932it [01:11, 391447.87it/s]


## Step 2: Read in patient data

In [8]:
def read_patient_data():
    """Read the patient visit table from ``B220_SAA_v1.csv`` into pandas.

    Only a subset of columns is loaded, selected by position: 0, 3, 5, 6, 7 and
    the 25 columns at positions 16 through 40. Every value is read as a string.

    Returns:
        pandas.DataFrame: the selected columns, all with ``str`` dtype.
    """
    data_dir = '/home/ubuntu/Biomed-Data-Science-NLP-Project/Data/'
    wanted_columns = [0, 3, 5, 6, 7]
    wanted_columns.extend(range(16, 41))
    return pd.read_csv(data_dir + 'B220_SAA_v1.csv', dtype=str, usecols=wanted_columns)
# Load the patient visit table once; later cells refer to it as `patient_data`.
patient_data = read_patient_data()

In [9]:
patient_data

Unnamed: 0,ID,Date,Age,Sex,Race,Dx10_prin,Dx10_1,Dx10_2,Dx10_3,Dx10_4,...,Dx10_15,Dx10_16,Dx10_17,Dx10_18,Dx10_19,Dx10_20,Dx10_21,Dx10_22,Dx10_23,Dx10_24
0,1,2016-06-05,35,F,White,S300XXA,M542,S199XXA,,,...,,,,,,,,,,
1,1,2017-07-16,36,F,White,N938,,,,,...,,,,,,,,,,
2,1,2017-08-15,36,F,White,F10129,N390,,,,...,,,,,,,,,,
3,1,2018-07-12,37,F,White,R0789,R0602,F17210,,,...,,,,,,,,,,
4,2,2015-12-29,42,M,Hispanic,N390,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27977927,14493203,2017-08-07,1,F,Hispanic,R509,J111,,,,...,,,,,,,,,,
27977928,14493203,2017-11-24,1,F,Hispanic,S53002A,,,,,...,,,,,,,,,,
27977929,14493203,2018-01-21,1,F,Hispanic,R112,,,,,...,,,,,,,,,,
27977930,14493203,2018-07-28,2,F,Hispanic,R509,,,,,...,,,,,,,,,,


In [None]:
def get_y_dset(patient_data):
    """Label each visit by the gap, in days, to the same patient's next row.

    Rows are grouped by ``ID`` and each row's ``Date`` is compared with the
    following row of the same group, in the frame's existing row order (the
    frame is not sorted here). The absolute gap in whole days is mapped to:

    * ``1`` -- gap of 30 days or less
    * ``0`` -- gap of more than 30 days
    * ``2`` -- no following row for that ``ID`` (gap is missing)

    The input frame is not modified. Dates are parsed into a local Series
    rather than written back with ``.loc[:, 'Date'] = ...``, which warns on a
    sliced frame and, on recent pandas, leaves the column as object dtype so
    that ``.dt`` fails.

    Args:
        patient_data: DataFrame with an ``ID`` column and a ``Date`` column that
            ``pd.to_datetime`` can parse.

    Returns:
        numpy.ndarray: float array of labels, one per input row.
    """
    visit_dates = pd.to_datetime(patient_data['Date'])
    # diff(periods=-1) subtracts the *next* row within each ID, hence the abs().
    gap_days = (
        visit_dates.groupby(patient_data['ID'])
        .diff(periods=-1)
        .abs()
        .dt.days
        .to_numpy(dtype=float)
    )
    # Conditions are checked in order, so missing gaps are claimed by label 2
    # before the <= 30 comparison is considered.
    return np.select(
        [np.isnan(gap_days), gap_days <= 30],
        [2.0, 1.0],
        default=0.0,
    )
# Time the label computation. get_y_dset only reads the 'ID' and 'Date' columns,
# so select them by name instead of by position.
start = time.time()
y_dset = get_y_dset(patient_data[['ID', 'Date']])
end = time.time()
print("Took {} seconds".format(end - start))
y_dset.shape

In [104]:
# NOTE(review): abandoned debugging draft. The complete get_y_labels is defined in
# the next cell; this cell is a candidate for deletion.
def get_y_labels(patient_data):
    # Converts the 'Date' entry in place; there is no return statement, so the
    # function returns None.
    patient_data['Date'] = pd.to_datetime(patient_data['Date'])


# Passes a single row (iloc[0, :]) and unpacks two values. Because the function
# above returns None, this raises the TypeError recorded in this cell's output.
y_dset, num_labels = get_y_labels(patient_data.iloc[0,:])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


TypeError: cannot unpack non-iterable NoneType object

In [103]:
def get_y_labels(patient_data):
    """Label each visit by the gap, in days, to the same patient's next row.

    Rows are grouped by ``ID`` and each row's ``Date`` is compared with the
    following row of the same group, in the frame's existing row order. The
    absolute gap in whole days is mapped to:

    * ``1`` -- gap of 30 days or less
    * ``0`` -- gap of more than 30 days
    * ``2`` -- no following row for that ``ID`` (gap is missing)

    Side effect: the ``Date`` column of ``patient_data`` is converted to
    datetime in place.

    The labels are computed with vectorised operations rather than a per-row
    Python loop, which is far too slow on a table with tens of millions of rows.

    Args:
        patient_data: DataFrame with an ``ID`` column and a ``Date`` column that
            ``pd.to_datetime`` can parse.

    Returns:
        tuple: ``(labels, count)`` where ``labels`` is a list of ints (one per
        row) and ``count`` is ``len(labels)``.
    """
    patient_data['Date'] = pd.to_datetime(patient_data['Date'])  # convert date column to date-time type
    # diff(periods=-1) subtracts the *next* row within each ID, hence the abs().
    gap_days = patient_data.groupby(['ID'])['Date'].diff(periods=-1).abs().dt.days
    # Conditions are checked in order, so missing gaps are claimed by label 2
    # before the <= 30 comparison is considered.
    labels = np.select(
        [gap_days.isna().to_numpy(), (gap_days <= 30).to_numpy()],
        [2, 1],
        default=0,
    )
    y_label = labels.tolist()
    return y_label, len(y_label)
# Compute labels for the full table. This rebinds `y_dset` to a Python list (the
# get_y_dset cell above binds it to a NumPy array) and converts
# patient_data['Date'] to datetime in place.
y_dset, num_labels = get_y_labels(patient_data)

KeyboardInterrupt: 

In [None]:
-