# Prepare Data For Models

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import random
import pickle
import csv
import collections
from tqdm import tqdm

In [2]:
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark import SparkContext, SparkConf

In [3]:
# Build the Spark configuration in a single fluent chain: local mode on all
# cores, UI on port 4050, and 35G for executor memory, driver memory and the
# maximum size of results collected to the driver.
conf = (
    SparkConf()
    .set("spark.ui.port", "4050")
    .setMaster('local[*]')
    .set('spark.executor.memory', '35G')
    .set('spark.driver.memory', '35G')
    .set('spark.driver.maxResultSize', '35G')
)

# Create the context first so the session below picks up this configuration.
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

# Arrow is what makes the pandas <-> Spark DataFrame conversion really fast.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# Echo the driver memory setting as the cell output (sanity check).
sc._conf.get('spark.driver.memory')

'35G'

In [4]:
# Display the active SparkSession as the cell output.
spark

# Step 1: Read word2vec learned embeddings

In [5]:
from pyspark.ml.feature import Tokenizer, Word2Vec, Word2VecModel, OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Root directory for everything this notebook reads and writes: the raw
# "train"/"test" CSV folders, "W2V_Models/" and "ModelData/".
# NOTE(review): absolute, machine-specific path -- change it here when running
# on another machine.
dirPath = '/home/ubuntu/BioMedProject/Data/'

In [6]:
# Dimensionality of the word2vec ICD-code embeddings to load. It selects the
# saved model file below and fixes how many "features[i]" columns later cells
# create. Sizes listed by the original author: 16, 32, 64, 128, 256.
EMBED_LENGTH = 256 # 16, 32, 64, 128, 256

# Path of the saved Word2VecModel for the chosen embedding size.
embedFilePath = dirPath + "W2V_Models/w2v_ICD_embed_{}.model".format(EMBED_LENGTH)

# Load the fitted model; `loaded_model` is used later to embed each visit.
loaded_model = Word2VecModel.load(embedFilePath)
# DataFrame of the learned vectors (kept for inspection; not used further in
# the cells visible here).
df_embeddings = loaded_model.getVectors()

# Step 2: Read the train and test data with ALL patient visits

In [7]:
# Pickle files holding the train/test corpora: one space-separated string of
# truncated ICD codes per visit. They are written in Step 2.1 and reloaded in
# Step 2.2.
trainCorpusFilename = dirPath + "ModelData/trainCorpusAllVisits.pkl"
testCorpusFilename = dirPath + "ModelData/testCorpusAllVisits.pkl"

In [8]:
# 1. Read in the train/test datasets.
# Both splits are comma-separated CSV folders with a header row; column types
# are inferred by Spark. The reader options are shared so the two reads cannot
# drift apart.
csv_options = dict(format="csv", sep=",", inferSchema="true", header="true")

train = spark.read.load(dirPath + "train", **csv_options)
test = spark.read.load(dirPath + "test", **csv_options)

## Step 2.1 SKIP TO NEXT SECTION: The following 3 cells are a one-time job; they have already been run and their output saved to disk

In [9]:
def _icd_rows_to_sentences(rows):
    """Turn rows of ICD codes into one space-separated "sentence" per row.

    Each string cell is truncated to its first 3 characters; non-string cells
    (e.g. missing values) are skipped. A row with no string cells yields "".

    Parameters
    ----------
    rows : iterable of iterables
        One inner iterable of cell values per visit.

    Returns
    -------
    list of str
        One sentence per input row, in the same order.
    """
    return [
        " ".join(code[:3] for code in row if isinstance(code, str))
        for row in rows
    ]


# 2. Filter for the columns with ICD codes (positions 7..31 of the CSV schema)
#    and convert to pandas DataFrames on the driver.
trainICDCodes = train.select(train.columns[7:32]).toPandas()
testICDCodes = test.select(test.columns[7:32]).toPandas()

# 3./4. Convert each visit's codes into a single space-separated string.
trainCorpus = _icd_rows_to_sentences(trainICDCodes.values.tolist())
testCorpus = _icd_rows_to_sentences(testICDCodes.values.tolist())

In [10]:
# 5. Persist the train and test corpora so the expensive extraction above
#    only has to run once.
with open(trainCorpusFilename, 'wb') as train_out:
    pickle.dump(trainCorpus, train_out)

with open(testCorpusFilename, 'wb') as test_out:
    pickle.dump(testCorpus, test_out)

In [11]:
# Drop the in-memory corpora; they are reloaded from the pickles in Step 2.2.
del trainCorpus, testCorpus

## Step 2.2 Load the saved train and test data with all visits

In [9]:
# Reload the corpora written in Step 2.1.
# NOTE: pickle.load can execute arbitrary code -- only load files that this
# project produced itself.
with open(trainCorpusFilename, 'rb') as train_in:
    corpus_train = pickle.load(train_in)

with open(testCorpusFilename, 'rb') as test_in:
    corpus_test = pickle.load(test_in)

In [10]:
# Report how many visits each split contains, in millions.
n_train_millions = len(corpus_train)/1e6
n_test_millions = len(corpus_test)/1e6

print("{} million TRAIN visits".format(n_train_millions))
print("{} million TEST visits".format(n_test_millions))

22.002111 million TRAIN visits
5.500527 million TEST visits


# Step 3: Make transformations on the train and test dataframes

## Step 3.1 Transformations for train data

In [11]:
# ---------------------------------------------------------------------------
# Part A: visit "sentences" of ICD codes -> one embedding column per dimension
# ---------------------------------------------------------------------------
df_corpus_train = pd.DataFrame(corpus_train, columns = ['sentences'])
df_corpus_train = spark.createDataFrame(df_corpus_train)

tokenizer = Tokenizer(inputCol="sentences", outputCol="tokens")
tokenized_corpus_train = tokenizer.transform(df_corpus_train).select("tokens")

# The word2vec model writes one vector per visit into the 'features' column.
res_train = loaded_model.transform(tokenized_corpus_train)
# Convert the ML vector to a plain float array so that each dimension can be
# pulled out into its own column below.
to_array = F.udf(lambda v: v.toArray().tolist(), T.ArrayType(T.FloatType()))
res_train = res_train.withColumn('features', to_array('features'))

# Resulting columns are named "features[0]" ... "features[EMBED_LENGTH - 1]".
X_train = res_train.select([F.col("features")[i] for i in range(EMBED_LENGTH)])

# ---------------------------------------------------------------------------
# Part B: demographic columns
# ---------------------------------------------------------------------------
train = train.select("ID", "Visit", "Visits", "Age", "Sex", "Race", "Label")

# For each column (Sex and Race), StringIndexer converts each distinct string
# to a double index, e.g. for Sex: "M" = 0.0 and "F" = 1.0 (by default the most
# frequent label gets index 0.0). The same transformation is applied to the
# Race column. The indexers are fitted on the training data.
indexers = [StringIndexer(inputCol=column, outputCol=column+"_NUMERIC").fit(train) for column in ['Sex', 'Race']]

pipeline = Pipeline(stages=indexers)
train = pipeline.fit(train).transform(train)
# We remove the original "Sex" and "Race" columns, which contain the string
# versions.
train = train.drop("Sex", "Race")

# Convert the Sex and Race index columns from double to integer and give them
# back their original names.
train = train.withColumn("Sex", train["Sex_NUMERIC"].cast(T.IntegerType()))
train = train.withColumn("Race", train["Race_NUMERIC"].cast(T.IntegerType()))
train = train.drop("Sex_NUMERIC", "Race_NUMERIC")

# Age is treated as a single numeric feature rather than one-hot encoded:
# it is rounded to the NEAREST multiple of 5 and scaled by 1/100, stored as a
# float (e.g. 67 -> 65 -> 0.65, 68 -> 70 -> 0.70).
# F.round is spelled out because `from pyspark.sql.functions import *` shadows
# Python's builtin round().
train = train.withColumn("Age", (5/100*F.round(train["Age"] / 5)).cast(T.FloatType()))

# One-hot encode the integer Sex/Race indices; the encoder is fitted on train.
encoder = OneHotEncoder(inputCols=["Sex", "Race"], outputCols=["Sex_Encoded", "Race_Encoded"])
model_train = encoder.fit(train)
train = model_train.transform(train)

train = train.select("Sex_Encoded", "Race_Encoded", "Age", "Label")

# ---------------------------------------------------------------------------
# Part C: positional join of embeddings and demographics
# ---------------------------------------------------------------------------
# The two frames share no key, so rows are matched by position. Ordering a
# window by a constant (lit(1)) leaves the row order undefined, which can pair
# embeddings with the wrong visit. Instead, capture each frame's current row
# order with monotonically_increasing_id() (increasing with partition order
# and with position inside a partition) and number the rows by that.
# NOTE: this still assumes corpus_train is in the same row order as the train
# CSV it was extracted from.
X_train = X_train.withColumn("_row_order", F.monotonically_increasing_id())
train = train.withColumn("_row_order", F.monotonically_increasing_id())

w = Window.orderBy("_row_order")
X_train = X_train.withColumn("id", F.row_number().over(w)).drop("_row_order")
train = train.withColumn("id", F.row_number().over(w)).drop("_row_order")

master_df_train = X_train.join(train, "id", "outer")
master_df_train = master_df_train.drop("id")

## Step 3.2: Transformations for test data

In [12]:
# ---------------------------------------------------------------------------
# Part A: visit "sentences" of ICD codes -> one embedding column per dimension
# ---------------------------------------------------------------------------
df_corpus_test = pd.DataFrame(corpus_test, columns = ['sentences'])
df_corpus_test = spark.createDataFrame(df_corpus_test)

tokenizer = Tokenizer(inputCol="sentences", outputCol="tokens")
tokenized_corpus_test = tokenizer.transform(df_corpus_test).select("tokens")

# The word2vec model writes one vector per visit into the 'features' column.
res_test = loaded_model.transform(tokenized_corpus_test)

# Convert the ML vector to a plain float array so that each dimension can be
# pulled out into its own column below.
to_array = F.udf(lambda v: v.toArray().tolist(), T.ArrayType(T.FloatType()))
res_test = res_test.withColumn('features', to_array('features'))

# Resulting columns are named "features[0]" ... "features[EMBED_LENGTH - 1]".
X_test = res_test.select([F.col("features")[i] for i in range(EMBED_LENGTH)])

# ---------------------------------------------------------------------------
# Part B: demographic columns
# ---------------------------------------------------------------------------
test = test.select("ID", "Visit", "Visits", "Age", "Sex", "Race", "Label")

# Apply the StringIndexer models that were fitted on the TRAIN data in
# Step 3.1 (`indexers`) instead of fitting new ones on test. Re-fitting on
# test can assign different indices to the same Sex/Race value (indices are
# frequency-ordered), which would make train and test features inconsistent.
# A fitted StringIndexerModel raises by default on a label it never saw during
# fitting, so an unexpected category in test surfaces as an error here.
for fitted_indexer in indexers:
    test = fitted_indexer.transform(test)
test = test.drop("Sex", "Race")

# Convert the Sex and Race index columns from double to integer and give them
# back their original names.
test = test.withColumn("Sex", test["Sex_NUMERIC"].cast(T.IntegerType()))
test = test.withColumn("Race", test["Race_NUMERIC"].cast(T.IntegerType()))
test = test.drop("Sex_NUMERIC", "Race_NUMERIC")

# Age: round to the NEAREST multiple of 5 and scale by 1/100, stored as a
# float (e.g. 67 -> 0.65, 68 -> 0.70). F.round is spelled out because the star
# import from pyspark.sql.functions shadows Python's builtin round().
test = test.withColumn("Age", (5/100*F.round(test["Age"] / 5)).cast(T.FloatType()))

# Reuse the one-hot encoder fitted on TRAIN in Step 3.1 (`model_train`) so the
# encoded vectors have the same width and category layout as in training.
test = model_train.transform(test)

test = test.select("Sex_Encoded", "Race_Encoded", "Age", "Label")

# ---------------------------------------------------------------------------
# Part C: positional join of embeddings and demographics
# ---------------------------------------------------------------------------
# The two frames share no key, so rows are matched by position. Ordering a
# window by a constant (lit(1)) leaves the row order undefined, which can pair
# embeddings with the wrong visit. Instead, capture each frame's current row
# order with monotonically_increasing_id() and number the rows by that.
# NOTE: this still assumes corpus_test is in the same row order as the test
# CSV it was extracted from.
X_test = X_test.withColumn("_row_order", F.monotonically_increasing_id())
test = test.withColumn("_row_order", F.monotonically_increasing_id())

w = Window.orderBy("_row_order")
X_test = X_test.withColumn("id", F.row_number().over(w)).drop("_row_order")
test = test.withColumn("id", F.row_number().over(w)).drop("_row_order")

master_df_test = X_test.join(test, "id", "outer")
master_df_test = master_df_test.drop("id")

# Step 4: Collate the data into Pyspark-able format 

## Make the data ready for some baseline binary classifiers like Random Forest, Logistic regression, Gradient Boosted Decision Trees etc. See [here](https://towardsdatascience.com/machine-learning-with-pyspark-and-mllib-solving-a-binary-classification-problem-96396065d2aa)

In [13]:
# Feature columns fed to the assembler: every embedding dimension followed by
# the demographic features.
embedding_cols = ['features[{}]'.format(i) for i in range(EMBED_LENGTH)]
demographic_cols = ["Sex_Encoded", "Race_Encoded", "Age"]
inputCols = embedding_cols + demographic_cols

# # for 0 embedding length -- use only demographic columns
# master_df = train
# inputCols = ["Sex_Encoded", "Race_Encoded", "Age"]

# Assemble ALL the features into a single vector column named "features".
assembler = VectorAssembler(inputCols=inputCols, outputCol="features")
stages = [assembler]

In [14]:
# Run the assembler pipeline over the joined train frame and keep only the
# label and the assembled feature vector.
pipeline = Pipeline(stages = stages)
pipelineModel_train = pipeline.fit(master_df_train)
fitting_ready_df_train = (
    pipelineModel_train
    .transform(master_df_train)
    .select(['Label', 'features'])
)
fitting_ready_df_train.printSchema()

root
 |-- Label: integer (nullable = true)
 |-- features: vector (nullable = true)



In [15]:
# Run the assembler pipeline over the joined test frame and keep only the
# label and the assembled feature vector.
pipeline = Pipeline(stages = stages)
pipelineModel_test = pipeline.fit(master_df_test)
fitting_ready_df_test = (
    pipelineModel_test
    .transform(master_df_test)
    .select(['Label', 'features'])
)
fitting_ready_df_test.printSchema()

root
 |-- Label: integer (nullable = true)
 |-- features: vector (nullable = true)



In [16]:
# Drop the references to the intermediate frames; only the fitting-ready
# frames are needed from here on.
train = test = master_df_train = master_df_test = None

In [17]:
# Output locations for the model-ready train/test frames (written as parquet
# by the following cells). The embedding size is part of the name so runs with
# different EMBED_LENGTH values do not overwrite each other.
modelReadyTrainFilename = dirPath + f'ModelData/modelReadyTrain_{EMBED_LENGTH}.parquet'
modelReadyTestFilename = dirPath + f'ModelData/modelReadyTest_{EMBED_LENGTH}.parquet'

In [18]:
# Write the model-ready TEST frame as parquet (replacing any previous output)
# and report how long the write took.
save_started = time.time()
(
    fitting_ready_df_test.write
    .mode("overwrite")
    .format("parquet")
    .save(modelReadyTestFilename)
)
save_finished = time.time()
print("Saving took {} minutes".format((save_finished-save_started)/60))

Saving took 5.642933615048727 minutes


In [20]:
# 5. Save final dataframes: write the model-ready TRAIN frame as parquet
#    (replacing any previous output) and report how long the write took.
save_started = time.time()
(
    fitting_ready_df_train.write
    .mode("overwrite")
    .format("parquet")
    .save(modelReadyTrainFilename)
)
save_finished = time.time()
print("Saving took {} minutes".format((save_finished-save_started)/60))

Saving took 26.053846804300942 minutes
