# H20 Models

In [1]:
import pandas as pd
import numpy as np
import time
import random
import pickle
import csv
import collections
from tqdm import tqdm

In [2]:
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark import SparkContext, SparkConf
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import Tokenizer, Word2Vec, Word2VecModel, OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [3]:
# create the session
conf = SparkConf().set("spark.ui.port", "4050")
conf = (conf.setMaster('local[*]')
        .set('spark.executor.memory', '50G')
        .set('spark.driver.memory', '100G')
        .set('spark.driver.maxResultSize', '50G'))
# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
# arrow enabling is what makes the conversion from pandas to spark dataframe really fast
sc._conf.get('spark.driver.memory')

'100G'

In [4]:
import h2o
from h2o.estimators import H2OXGBoostEstimator
h2o.init(max_mem_size = '300G')

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_292"; OpenJDK Runtime Environment (build 1.8.0_292-8u292-b10-0ubuntu1~18.04-b10); OpenJDK 64-Bit Server VM (build 25.292-b10, mixed mode)
  Starting server from /home/ubuntu/anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp_wp1czr8
  JVM stdout: /tmp/tmp_wp1czr8/h2o_ubuntu_started_from_python.out
  JVM stderr: /tmp/tmp_wp1czr8/h2o_ubuntu_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.6
H2O_cluster_version_age:,5 days
H2O_cluster_name:,H2O_from_python_ubuntu_k8k9s6
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,266.7 Gb
H2O_cluster_total_cores:,48
H2O_cluster_allowed_cores:,48


## Load in Data

In [5]:
dirPath = '/home/ubuntu/BioMedProject/Data/'
trainCorpusFilename = dirPath + "ModelData/trainCorpusAllVisits.pkl"
testCorpusFilename = dirPath + "ModelData/testCorpusAllVisits.pkl"

In [6]:
# Load Embeddings
EMBED_LENGTH = 128 # 16, 32, 64, 128, 256

#embedFilePath = dirPath + f"W2V_Models/w2v_ICD_embed_{EMBED_LENGTH}.model"
embedFilePath = dirPath + f"W2V_Models/w2v_ICD_Combined_Visits_embed_{EMBED_LENGTH}.model"
loaded_model = Word2VecModel.load(embedFilePath)

# Read train/test datasets
train = spark.read.load(dirPath + "train",
                     format="csv", sep=",", inferSchema="true", header="true")
test = spark.read.load(dirPath + "test",
                     format="csv", sep=",", inferSchema="true", header="true")

# Read in train/test corpus
with open(trainCorpusFilename, 'rb') as handle:
    corpus_train = pickle.load(handle)
    
with open(testCorpusFilename, 'rb') as handle:
    corpus_test = pickle.load(handle)

print("{} million TRAIN visits".format(len(corpus_train)/1e6))
print("{} million TEST visits".format(len(corpus_test)/1e6))

22.002111 million TRAIN visits
5.500527 million TEST visits


## Prepare train/test ICD features

In [7]:
# Train ICD features
start = time.time()
sentences_train = pd.DataFrame(corpus_train, columns = ['sentences'])
sentences_train = spark.createDataFrame(sentences_train)

tokenizer = Tokenizer(inputCol="sentences", outputCol="tokens")
tokenized_sentences_train = tokenizer.transform(sentences_train).select("tokens")

to_array = F.udf(lambda v: v.toArray().tolist(), T.ArrayType(T.FloatType()))
features_train = loaded_model.transform(tokenized_sentences_train)
features_train = features_train.withColumn('features', to_array('features'))
features_train = features_train.select([F.col("features")[i] for i in range(EMBED_LENGTH)])
features_train = features_train.toPandas()

del sentences_train, tokenized_sentences_train
end = time.time()
print(f"Cell took {np.round(end-start, 2)} seconds")
features_train.head(5)

Cell took 101.31 seconds


Unnamed: 0,features[0],features[1],features[2],features[3],features[4],features[5],features[6],features[7],features[8],features[9],...,features[118],features[119],features[120],features[121],features[122],features[123],features[124],features[125],features[126],features[127]
0,0.249005,0.183955,-0.066878,-0.179438,-0.21233,0.131907,-0.026677,0.131161,-0.07102,-0.269541,...,0.110372,-0.015842,-0.048696,-0.021769,0.172742,0.03551,-0.148079,0.124246,-0.045076,0.009939
1,-0.160411,0.059638,-0.068753,0.184218,0.041867,0.014402,0.092871,0.011807,0.070516,-0.03142,...,0.0309,0.033065,-0.011398,0.188823,-0.075263,0.068672,-0.004747,0.018791,0.318191,0.149578
2,0.090658,0.043877,0.05517,-0.014114,0.130936,0.1601,-0.145066,-0.120486,0.064123,0.023467,...,0.002414,0.082141,-0.145741,0.019065,-0.205346,-0.149748,0.013014,-0.019415,0.073258,0.084563
3,0.039653,-0.02986,0.055955,0.005946,0.077048,0.012998,0.080631,-0.053908,-0.096958,-0.068201,...,0.06968,-0.099999,-0.087965,0.005548,-0.092047,0.018882,-0.039858,0.074794,0.093909,-0.031947
4,-0.052766,0.270228,0.033832,0.363306,-0.408001,0.005488,0.396168,0.034025,-0.136719,0.125028,...,-0.230156,0.218363,-0.536752,0.209291,0.731626,-0.143321,-0.621784,-1.224659,0.417851,0.79262


In [8]:
# Test ICD features
start = time.time()
sentences_test = pd.DataFrame(corpus_test, columns = ['sentences'])
sentences_test = spark.createDataFrame(sentences_test)

tokenizer = Tokenizer(inputCol="sentences", outputCol="tokens")
tokenized_sentences_test = tokenizer.transform(sentences_test).select("tokens")

to_array = F.udf(lambda v: v.toArray().tolist(), T.ArrayType(T.FloatType()))
features_test = loaded_model.transform(tokenized_sentences_test)
features_test = features_test.withColumn('features', to_array('features'))
features_test = features_test.select([F.col("features")[i] for i in range(EMBED_LENGTH)])
features_test = features_test.toPandas()

del sentences_test, tokenized_sentences_test
end = time.time()
print(f"Cell took {np.round(end-start, 2)} seconds")
features_test.head(5)

Cell took 23.68 seconds


Unnamed: 0,features[0],features[1],features[2],features[3],features[4],features[5],features[6],features[7],features[8],features[9],...,features[118],features[119],features[120],features[121],features[122],features[123],features[124],features[125],features[126],features[127]
0,-0.11726,0.000754,-0.032524,-0.096162,0.01284,-0.053684,-0.065955,-0.005485,-0.381163,0.085953,...,-0.069967,0.081969,0.062178,-0.239276,-0.065737,-0.041535,0.038996,-0.321499,-0.163752,0.095138
1,-0.012651,-0.049794,-0.050745,0.010635,-0.054866,0.095023,0.031784,-0.007406,-0.046581,-0.067744,...,0.048496,0.04111,-0.069603,-0.154994,-0.152459,-0.081574,0.056118,-0.01611,-0.073609,0.073931
2,0.026388,0.000131,-0.064715,0.040941,0.040159,0.069498,0.036796,-0.038671,-0.094336,-0.043618,...,-0.041855,-0.109371,-0.007898,0.053661,-0.061049,-0.04904,-0.041497,-0.00325,0.01162,0.0074
3,-0.183064,-0.04839,-0.100955,0.115407,0.01406,-0.013707,-0.119278,-0.036564,-0.251277,0.112947,...,-0.101023,-0.002932,0.17837,-0.123095,-0.155423,0.071806,0.017828,-0.286469,-0.135956,0.087659
4,-0.025092,-0.17317,-0.040595,0.011849,-0.036236,0.019903,0.047024,-0.008753,-0.194318,-0.020572,...,-0.136777,0.164928,0.070392,-0.053444,-0.111523,-0.11182,-0.049053,-0.050538,-0.042853,0.109765


## Prepare demographic features

In [9]:
start = time.time()
demographic_train = train.select("Age", "Sex", "Race", "Label")
indexers = [StringIndexer(inputCol=column, outputCol=column+"_NUMERIC").fit(demographic_train) for column in ['Sex', 'Race']]
pipeline = Pipeline(stages=indexers)
demographic_train = pipeline.fit(demographic_train).transform(demographic_train)
demographic_train = demographic_train.drop("Sex", "Race")
demographic_train = demographic_train.withColumn("Sex", demographic_train["Sex_NUMERIC"].cast(T.IntegerType()))
demographic_train = demographic_train.withColumn("Race", demographic_train["Race_NUMERIC"].cast(T.IntegerType()))
demographic_train = demographic_train.drop("Sex_NUMERIC", "Race_NUMERIC")
demographic_train = demographic_train.withColumn("Age", (5/100*round(train["Age"] / 5 )).cast(T.FloatType()))
df_demographic_train = demographic_train.toPandas()
df_demographic_train = pd.get_dummies(df_demographic_train, columns=["Sex", "Race"])

del demographic_train
end = time.time()

print(f"Cell took {np.round(end-start, 2)} seconds")
df_demographic_train.head(5)

Cell took 9.03 seconds


Unnamed: 0,Age,Label,Sex_0,Sex_1,Sex_2,Race_0,Race_1,Race_2,Race_3,Race_4,Race_5,Race_6
0,0.0,0,0,1,0,1,0,0,0,0,0,0
1,0.55,1,0,1,0,1,0,0,0,0,0,0
2,0.7,0,0,1,0,0,0,1,0,0,0,0
3,0.9,1,0,1,0,1,0,0,0,0,0,0
4,0.55,1,1,0,0,0,1,0,0,0,0,0


In [10]:
start = time.time()
demographic_test = test.select("Age", "Sex", "Race", "Label")
indexers = [StringIndexer(inputCol=column, outputCol=column+"_NUMERIC").fit(demographic_test) for column in ['Sex', 'Race']]
pipeline = Pipeline(stages=indexers)
demographic_test = pipeline.fit(demographic_test).transform(demographic_test)
demographic_test = demographic_test.drop("Sex", "Race")
demographic_test = demographic_test.withColumn("Sex", demographic_test["Sex_NUMERIC"].cast(T.IntegerType()))
demographic_test = demographic_test.withColumn("Race", demographic_test["Race_NUMERIC"].cast(T.IntegerType()))
demographic_test = demographic_test.drop("Sex_NUMERIC", "Race_NUMERIC")
demographic_test = demographic_test.withColumn("Age", (5/100*round(test["Age"] / 5 )).cast(T.FloatType()))
df_demographic_test = demographic_test.toPandas()
df_demographic_test = pd.get_dummies(df_demographic_test, columns=["Sex", "Race"])

del demographic_test
end = time.time()

print(f"Cell took {np.round(end-start, 2)} seconds")
df_demographic_test.head(5)

Cell took 2.38 seconds


Unnamed: 0,Age,Label,Sex_0,Sex_1,Sex_2,Race_0,Race_1,Race_2,Race_3,Race_4,Race_5,Race_6
0,0.2,0,1,0,0,1,0,0,0,0,0,0
1,0.25,0,1,0,0,0,0,1,0,0,0,0
2,0.8,1,1,0,0,1,0,0,0,0,0,0
3,0.1,0,0,1,0,0,1,0,0,0,0,0
4,0.5,1,1,0,0,0,0,0,1,0,0,0


## Concentate ICD and Demographic features

In [11]:
df_train = pd.concat([df_demographic_train, features_train], axis=1)
df_test = pd.concat([df_demographic_test, features_test], axis=1)

del df_demographic_train, df_demographic_test, features_train, features_test

## Convert to H20 Frame

In [12]:
start = time.time()
hf_test = h2o.H2OFrame(df_test)
end = time.time()
print(f"Converting test dataframe took {(np.round((end-start)/60, 2))} min")

Parse progress: |█████████████████████████████████████████████████████████| 100%
Converting test dataframe took 15.47 min


In [13]:
start = time.time()
hf_train = h2o.H2OFrame(df_train)
end = time.time()
print(f"Converting train dataframe took {(np.round((end-start)/60, 2))} min")

Parse progress: |█████████████████████████████████████████████████████████| 100%
Converting train dataframe took 62.0 min


## Prepare for H20 Models

In [14]:
hf_train['Label'] = hf_train['Label'].asfactor()
hf_test['Label'] = hf_test['Label'].asfactor()
response = "Label"
predictors = [predictor for predictor in hf_train.columns if predictor != 'Label']

In [15]:
train, valid = hf_train.split_frame(ratios=[.8], seed=1234)

## XGBoost Estimator

In [16]:
from h2o.estimators import H2OXGBoostEstimator
start = time.time()
xgb_model = H2OXGBoostEstimator(seed=1234)
xgb_model.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
end = time.time()
print(f"Cell took {np.round((end-start)/60, 2)} min")

xgboost Model Build progress: |███████████████████████████████████████████| 100%
Cell took 28.37 min


## Save Model

In [17]:
#h2o.save_model(model=xgb_model, path=dirPath + f"H20Models/XGBoost/_embed_{EMBED_LENGTH}", force=True)
h2o.save_model(model=xgb_model, path=dirPath + f"H20Models/XGBoost/combined_vists_embed_{EMBED_LENGTH}", force=True)

'/home/ubuntu/BioMedProject/Data/H20Models/XGBoost/combined_vists_embed_128/XGBoost_model_python_1629901629920_1'

## Save Model Performance

In [18]:
train_perf = xgb_model.model_performance()
test_perf = xgb_model.model_performance(hf_test)

In [19]:
def saveResults(preformance, csvFile, embed_len, dataset='train', printResults=False):
    logLoss = preformance.logloss()
    mse = preformance.mse()
    auc = preformance.auc()
    aucpr = preformance.aucpr()
    recall = preformance.recall()[0][0]
    precision = preformance.precision()[0][0]
    specificity = preformance.specificity()[0][0]
    accuracy = preformance.accuracy()[0][1]
    
    if dataset == 'train':
        csvFile.write(f"{embed_len}, {accuracy}, {auc}, {aucpr}, {logLoss}, {mse}, {recall}, {precision}, {specificity}, ")
    elif dataset == 'test':
        csvFile.write(f"{accuracy}, {auc}, {aucpr}, {logLoss}, {mse}, {recall}, {precision}, {specificity}\n")
    else:
        raise ValueError(f'dataset can only be \'train\' or \'test\'. \'{dataset}\' is incorrect.')
        
    if printResults:
        print(f"{dataset} Log loss: {logLoss}")
        print(f"{dataset} MSE: {mse}")
        print(f"{dataset} AUC: {auc}")
        print(f"{dataset} AUCPR: {aucpr}")
        print(f"{dataset} Recall: {recall}")
        print(f"{dataset} Precision: {precision}")
        print(f"{dataset} Specificity: {specificity}")
        print(f"{dataset} Accuracy: {accuracy}")

## Save model perfromance

In [20]:
#csvSavePath = dirPath + "h2o_XGB_ModelResults.csv"
csvSavePath = dirPath + "h2o_XGB_Combined_Visits_Model_Results.csv"
saveFile = open(csvSavePath, 'a')
saveResults(train_perf, saveFile, EMBED_LENGTH, dataset='train')
saveResults(test_perf, saveFile, EMBED_LENGTH, dataset='test')
saveFile.close()

In [98]:
pd.read_csv(dirPath + "h2o_XGB_Model_Results.csv") # Visits treated independently

Unnamed: 0,Embed Length,Train Accuracy,Train AUROC,Train AUROC_PR,Train Log Loss,Train MSE,Train Recall,Train Precision,Train Specificity,Test Accuracy,Test AUROC,Test AUROC_PR,Test Log_Loss,Test MSE,Test Recall,Test Precision,Test Specificity
0,16,0.657415,0.633367,0.494201,0.626402,0.2181,0.05466,0.877313,0.877313,0.657076,0.632496,0.493671,0.626825,0.218294,0.054958,0.876259,0.876259
1,32,0.660098,0.638536,0.501433,0.62422,0.217081,0.03648,0.852382,0.852382,0.65966,0.637549,0.500643,0.62472,0.217311,0.039834,0.852101,0.852101
2,64,0.660904,0.640139,0.503978,0.62346,0.216732,0.037797,0.921237,0.921237,0.660425,0.639131,0.503236,0.623972,0.216965,0.037954,0.920407,0.920407
3,128,0.662056,0.642023,0.50688,0.622557,0.216313,0.031975,0.879746,0.879746,0.661555,0.640687,0.505796,0.623181,0.216597,0.03254,0.883464,0.883464


In [126]:
pd.read_csv(dirPath + "h2o_XGB_Combined_Visits_Model_Results.csv") # Visits combined

Unnamed: 0,Embed Length,Train Accuracy,Train AUROC,Train AUROC_PR,Train Log Loss,Train MSE,Train Recall,Train Precision,Train Specificity,Test Accuracy,Test AUROC,Test AUROC_PR,Test Log_Loss,Test MSE,Test Recall,Test Precision,Test Specificity
0,16,0.657055,0.632326,0.492812,0.626953,0.218326,0.067188,0.818538,0.845964,0.656714,0.631512,0.492305,0.627348,0.21851,0.067548,0.82067,0.846085
1,32,0.65944,0.636886,0.499256,0.624955,0.217409,0.072857,0.82144,0.863385,0.658959,0.635927,0.498442,0.625446,0.217635,0.066733,0.812861,0.862946
2,64,0.660785,0.63988,0.5034,0.623634,0.216806,0.04668,0.888765,0.888765,0.660365,0.638877,0.502471,0.624164,0.217047,0.047363,0.897623,0.897623
3,128,0.66173,0.641524,0.505842,0.622873,0.216452,0.066173,0.862756,0.877116,0.661176,0.640296,0.504836,0.623467,0.216724,0.06617,0.862564,0.876771


## Analyze Predictions

In [99]:
!ls Data/H20Models/XGBoost

combined_vists_embed_128  combined_vists_embed_32  embed_128  embed_32
combined_vists_embed_16   combined_vists_embed_64  embed_16   embed_64


In [24]:
xgb_model = h2o.load_model('/home/ubuntu/BioMedProject/Data/H20Models/XGBoost/combined_vists_embed_16/XGBoost_model_python_1629824610670_1')
pred = xgb_model.predict(hf_test)
true_values = df_test['Label'].values.tolist()
pred_values = h2o.as_list(pred['predict']).values.tolist()
pred_values = [val[0] for val in pred_values]
TP = 0
FP = 0
TN = 0
FN = 0
for i in range(len(true_values)):
    if true_values[i] == pred_values[i]:
        if true_values[i] == 0:
            TN += 1
        else:
            TP += 1
    else:
        if true_values[i] == 0:
            FP += 1
        else:
            FN += 1
total = TP + FP + TN + FN
print(f"True pos: {TP} ({TP / total})")
print(f"True neg: {TN} ({TN / total})")
print(f"False pos: {FP} ({FP / total})")
print(f"False neg: {FN} ({FN / total})")

xgboost prediction progress: |████████████████████████████████████████████| 100%
True pos: 1926265 (0.3501964448133788)
True neg: 183318 (0.03332735208826354)
False pos: 3331441 (0.6056585123570887)
False neg: 59503 (0.010817690741268973)
