# Run ML Models

In [1]:
import pandas as pd
import numpy as np
import time
import random
import pickle
import csv
import collections
from tqdm import tqdm

In [2]:
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark import SparkContext, SparkConf
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [3]:
# create the session
conf = SparkConf().set("spark.ui.port", "4050")
conf = (conf.setMaster('local[*]')
        .set('spark.executor.memory', '50G')
        .set('spark.driver.memory', '100G')
        .set('spark.driver.maxResultSize', '50G'))
# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
# arrow enabling is what makes the conversion from pandas to spark dataframe really fast
sc._conf.get('spark.driver.memory')

'100G'

In [4]:
spark

## Linear SVC

In [None]:
def trainSVMModel(train, test, model_id, embed, maxIter, reg, txt_filepath, csv_filepath):
    # Train a Linear SVC model.
    start = time.time()
    svm = LinearSVC(labelCol="Label", featuresCol="features")
    svm.setMaxIter(maxIter)
    svm.setRegParam(reg)
    svm_model = svm.fit(train)
    end = time.time()
    
    # Save model
    model_path = dirPath + f"ModelResults/LinearSVC_EMBED{embed}_MAXITER{maxIter}_REG{reg}_AgeModif.model"
    svm_model.write().overwrite().save(model_path)
    
    # Predictions
    predictions_train = svm_model.transform(train)
    predictions_test = svm_model.transform(test)
    
    # Accuracy and AUROC
    train_accuracy = computeAccuracy(predictions_train)
    test_accuracy = computeAccuracy(predictions_test)
    train_auroc = computeAUROC(predictions_train)
    test_auroc = computeAUROC(predictions_test)
    
    timeElapsed = np.round((end-start)/60, 2)
    
    # Save results to readable text file 
    txt_filepath.write(f"{model_id}) SVM MODEL: {embed} embed length, {maxIter} max iter, {reg} reg\n")
    txt_filepath.write(f"\t Time: {timeElapsed} minutes\n")
    txt_filepath.write(f"\t TRAIN Accuracy: {train_accuracy}\n")
    txt_filepath.write(f"\t TRAIN AUROC:    {train_auroc}\n")
    txt_filepath.write(f"\t TEST  Accuracy: {test_accuracy}\n")
    txt_filepath.write(f"\t TEST  AUROC:    {test_auroc}\n")
    txt_filepath.write("\n")
    
    # Save results to csv file
    csv_result = f"{model_id}, {embed}, {maxIter}, {reg}, {timeElapsed}, {train_accuracy}, {train_auroc}, {test_accuracy}, {test_auroc}\n"
    csv_filepath.write(csv_result)
    

In [None]:
def computeAccuracy(predictions):
    evaluator = MulticlassClassificationEvaluator(
        labelCol="Label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    return accuracy

def computeAUROC(predictions):
    evaluator = BinaryClassificationEvaluator(labelCol="Label", rawPredictionCol="prediction", metricName="areaUnderROC")
    auroc = evaluator.evaluate(predictions)
    return auroc

In [None]:
from pyspark.ml.classification import LinearSVC, LinearSVCModel

dirPath = '/home/ubuntu/BioMedProject/Data/'
resultsFilePath = open(dirPath + 'SVM_Results_Readable.txt', "w")
csvFilePath = open(dirPath + 'SVM_Results.csv', "w")

EMBED_DIM =  [16, 32, 64, 128] #, 256]
MAX_ITER = [5, 10]
REG_PARAM = [0.0, 0.01, 0.1, 0.3]

model_id = 0
for embed in EMBED_DIM:
    
    trainFilepath = dirPath + f'ModelData/modelReadyTrain_{embed}.parquet'
    testFilepath = dirPath + f'ModelData/modelReadyTest_{embed}.parquet'
    df_test = spark.read.load(testFilepath, format="parquet", inferSchema="true", header="true")
    df_train = spark.read.load(trainFilepath, format="parquet", inferSchema="true", header="true")
    
    for maxIter in MAX_ITER:
        for reg in REG_PARAM:
            model_id += 1
            t1 = time.time()
            trainSVMModel(df_train, df_test, model_id, embed, maxIter, reg, resultsFilePath, csvFilePath)
            t2 = time.time()
            print(f"id: {model_id}, {embed} embed, {maxIter} max iter, {reg} reg, {(t2 - t1)/60} min")

csvFilePath.close()
resultsFilePath.close()

In [None]:
!cat 'Data/SVM_Results_Readable.txt'

## GBT classifier

In [None]:
def trainGBTModel(train, test, model_id, embed, maxDepth, txt_filepath, csv_filepath):
    # Train a GBT model.
    start = time.time()
    svm = GBTClassifier(labelCol="Label", featuresCol="features")
    svm.setMaxDepth(maxDepth)
    svm_model = svm.fit(train)
    end = time.time()
    
    # Save model
    model_path = dirPath + f"ModelResults/GBT_EMBED{embed}_MAXDEPTH{maxDepth}.model"
    svm_model.write().overwrite().save(model_path)
    
    # Predictions
    predictions_train = svm_model.transform(train)
    predictions_test = svm_model.transform(test)
    
    # Accuracy and AUROC
    train_accuracy = computeAccuracy(predictions_train)
    test_accuracy = computeAccuracy(predictions_test)
    train_auroc = computeAUROC(predictions_train)
    test_auroc = computeAUROC(predictions_test)
    
    timeElapsed = np.round((end-start)/60, 2)
    
    # Save results to readable text file 
    txt_filepath.write(f"{model_id}) GBT MODEL: {embed} embed length, {maxDepth} max depth,\n")
    txt_filepath.write(f"\t Time: {timeElapsed} minutes\n")
    txt_filepath.write(f"\t TRAIN Accuracy: {train_accuracy}\n")
    txt_filepath.write(f"\t TRAIN AUROC:    {train_auroc}\n")
    txt_filepath.write(f"\t TEST  Accuracy: {test_accuracy}\n")
    txt_filepath.write(f"\t TEST  AUROC:    {test_auroc}\n")
    txt_filepath.write("\n")
    
    # Save results to csv file
    csv_result = f"{model_id}, {embed}, {maxDepth}, {timeElapsed}, {train_accuracy}, {train_auroc}, {test_accuracy}, {test_auroc}\n"
    csv_filepath.write(csv_result)
    

In [None]:
def computeAccuracy(predictions):
    evaluator = MulticlassClassificationEvaluator(
        labelCol="Label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    return accuracy

def computeAUROC(predictions):
    evaluator = BinaryClassificationEvaluator(labelCol="Label", rawPredictionCol="prediction", metricName="areaUnderROC")
    auroc = evaluator.evaluate(predictions)
    return auroc

In [None]:
from pyspark.ml.classification import GBTClassifier, GBTClassificationModel

dirPath = '/home/ubuntu/BioMedProject/Data/'
resultsFilePath = open(dirPath + 'GBT_Results_Readable.txt', "a") # append mode since models 1-5 were already run
csvFilePath = open(dirPath + 'GBT_Results.csv', "a")

EMBED_DIM =  [16, 32, 64, 128] #, 256]
MAXDEPTH_PARAM = [5, 10] #[15, 25, 50, 100]

model_id = 1
for embed in EMBED_DIM:
    
    trainFilepath = dirPath + f'ModelData/modelReadyTrain_{embed}.parquet'
    testFilepath = dirPath + f'ModelData/modelReadyTest_{embed}.parquet'
    df_test = spark.read.load(testFilepath, format="parquet", inferSchema="true", header="true")
    df_train = spark.read.load(trainFilepath, format="parquet", inferSchema="true", header="true")
    
    for maxDepth in MAXDEPTH_PARAM:
        model_id += 1
        if model_id < 6: continue
        trainGBTModel(df_train, df_test, model_id, embed, maxDepth, resultsFilePath, csvFilePath)
        print(f"{model_id}, {embed}, {maxDepth}")

csvFilePath.close()
resultsFilePath.close()


In [None]:
!cat 'Data/GBT_Results_Readable.txt'

## Random Forest Classifer

In [5]:
def trainRFModel(train, test, model_id, embed, maxDepth, numTrees, txt_filepath, csv_filepath):
    # Train a RF model.
    start = time.time()
    rf = RandomForestClassifier(labelCol="Label", featuresCol="features")
    rf.setMaxDepth(maxDepth)
    rf.setNumTrees(numTrees)
    rf_model = rf.fit(train)
    end = time.time()
    
    # Save model
    model_path = dirPath + f"ModelResults/RF_EMBED{embed}_MAXDEPTH{maxDepth}_NUMTREES{numTrees}.model"
    rf_model.write().overwrite().save(model_path)
    
    # Predictions
    predictions_train = rf_model.transform(train)
    predictions_test = rf_model.transform(test)
    
    # Accuracy and AUROC
    train_accuracy = computeAccuracy(predictions_train)
    test_accuracy = computeAccuracy(predictions_test)
    train_auroc = computeAUROC(predictions_train)
    test_auroc = computeAUROC(predictions_test)
    
    timeElapsed = np.round((end-start)/60, 2)
    
    # Save results to readable text file 
    txt_filepath.write(f"{model_id}) RF MODEL: {embed} embed length, {maxDepth} max depth, {numTrees} num trees\n")
    txt_filepath.write(f"\t Time: {timeElapsed} minutes\n")
    txt_filepath.write(f"\t TRAIN Accuracy: {train_accuracy}\n")
    txt_filepath.write(f"\t TRAIN AUROC:    {train_auroc}\n")
    txt_filepath.write(f"\t TEST  Accuracy: {test_accuracy}\n")
    txt_filepath.write(f"\t TEST  AUROC:    {test_auroc}\n")
    txt_filepath.write("\n")
    
    # Save results to csv file
    csv_result = f"{model_id}, {embed}, {maxDepth}, {numTrees}, {timeElapsed}, {train_accuracy}, {train_auroc}, {test_accuracy}, {test_auroc}\n"
    csv_filepath.write(csv_result)
    

In [6]:
def computeAccuracy(predictions):
    evaluator = MulticlassClassificationEvaluator(
        labelCol="Label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    return accuracy

def computeAUROC(predictions):
    evaluator = BinaryClassificationEvaluator(labelCol="Label", rawPredictionCol="prediction", metricName="areaUnderROC")
    auroc = evaluator.evaluate(predictions)
    return auroc

In [7]:
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel

dirPath = '/home/ubuntu/BioMedProject/Data/'
resultsFilePath = open(dirPath + 'RF_Results_Readable.txt', "a")
csvFilePath = open(dirPath + 'RF_Results.csv', "a")

EMBED_DIM =  [16, 32, 64, 128] #, 256]
MAXDEPTH_PARAM = [15]
NUMBER_TREES = [20]

model_id = 4
for embed in EMBED_DIM:
    
    trainFilepath = dirPath + f'ModelData/modelReadyTrain_{embed}.parquet'
    testFilepath = dirPath + f'ModelData/modelReadyTest_{embed}.parquet'
    df_test = spark.read.load(testFilepath, format="parquet", inferSchema="true", header="true")
    df_train = spark.read.load(trainFilepath, format="parquet", inferSchema="true", header="true")
    
    for maxDepth in MAXDEPTH_PARAM:
        for numTrees in NUMBER_TREES:
            model_id += 1
            t1 = time.time()
            trainRFModel(df_train, df_test, model_id, embed, maxDepth, numTrees, resultsFilePath, csvFilePath)
            t2 = time.time()
            print(f"id: {model_id}, {embed} embed, {maxDepth} depth, {numTrees} trees, {(t2 - t1)/60} min")

csvFilePath.close()
resultsFilePath.close()

id: 5, 16 embed, 15 depth, 20 trees, 10.404978326956432 min
id: 6, 32 embed, 15 depth, 20 trees, 8.197274919350942 min
id: 7, 64 embed, 15 depth, 20 trees, 10.073543004194896 min
id: 8, 128 embed, 15 depth, 20 trees, 9.943018615245819 min


In [None]:
!cat 'Data/RF_Results_Readable.txt'

## Logistic Regression

In [14]:
def trainLRModel(train, test, model_id, embed, maxIter, reg, txt_filepath, csv_filepath):
    # Train a Logistic Regression model.
    start = time.time()
    lr = LogisticRegression(labelCol="Label", featuresCol="features", maxIter=maxIter, regParam=reg)
    lr_model = lr.fit(train)
    end = time.time()
    
    # Save model
    model_path = dirPath + f"ModelResults/LogisticReg_EMBED{embed}_MAXITER{maxIter}_REG{reg}.model"
    lr_model.write().overwrite().save(model_path)
    
    # Predictions
    predictions_train = lr_model.transform(train)
    predictions_test = lr_model.transform(test)
    
    # Accuracy and AUROC
    train_accuracy = computeAccuracy(predictions_train)
    test_accuracy = computeAccuracy(predictions_test)
    train_auroc = computeAUROC(predictions_train)
    test_auroc = computeAUROC(predictions_test)
    
    timeElapsed = np.round((end-start)/60, 2)
    
    # Save results to readable text file 
    txt_filepath.write(f"{model_id}) LR MODEL: {embed} embed length, {maxIter} max iter, {reg} reg\n")
    txt_filepath.write(f"\t Time: {timeElapsed} minutes\n")
    txt_filepath.write(f"\t TRAIN Accuracy: {train_accuracy}\n")
    txt_filepath.write(f"\t TRAIN AUROC:    {train_auroc}\n")
    txt_filepath.write(f"\t TEST  Accuracy: {test_accuracy}\n")
    txt_filepath.write(f"\t TEST  AUROC:    {test_auroc}\n")
    txt_filepath.write("\n")
    
    # Save results to csv file
    csv_result = f"{model_id}, {embed}, {maxIter}, {reg}, {timeElapsed}, {train_accuracy}, {train_auroc}, {test_accuracy}, {test_auroc}\n"
    csv_filepath.write(csv_result)   

In [13]:
def computeAccuracy(predictions):
    evaluator = MulticlassClassificationEvaluator(
        labelCol="Label", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    return accuracy

def computeAUROC(predictions):
    evaluator = BinaryClassificationEvaluator(labelCol="Label", rawPredictionCol="prediction", metricName="areaUnderROC")
    auroc = evaluator.evaluate(predictions)
    return auroc

In [None]:
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel

dirPath = '/home/ubuntu/BioMedProject/Data/'
resultsFilePath = open(dirPath + 'LR_Results_Readable.txt', "w")
csvFilePath = open(dirPath + 'LR_Results.csv', "w")

EMBED_DIM =  [16, 32, 64, 128] #, 256]
MAX_ITER = [5, 10]
REG_PARAM = [0.0, 0.01, 0.1, 0.3]

model_id = 0
for embed in EMBED_DIM:
    
    trainFilepath = dirPath + f'ModelData/modelReadyTrain_{embed}.parquet'
    testFilepath = dirPath + f'ModelData/modelReadyTest_{embed}.parquet'
    df_test = spark.read.load(testFilepath, format="parquet", inferSchema="true", header="true")
    df_train = spark.read.load(trainFilepath, format="parquet", inferSchema="true", header="true")
    
    for maxIter in MAX_ITER:
        for reg in REG_PARAM:
            model_id += 1
            t1 = time.time()
            trainLRModel(df_train, df_test, model_id, embed, maxIter, reg, resultsFilePath, csvFilePath)
            t2 = time.time()
            print(f"id: {model_id}, {embed} embed, {maxIter} max iter, {reg} reg, {(t2 - t1)/60} min")

csvFilePath.close()
resultsFilePath.close()

id: 1, 16 embed, 5 max iter, 0.0 reg, 0.9101653933525086 min
id: 2, 16 embed, 5 max iter, 0.01 reg, 0.889290964603424 min
id: 3, 16 embed, 5 max iter, 0.1 reg, 0.8832027792930603 min
id: 4, 16 embed, 5 max iter, 0.3 reg, 0.8853726545969646 min
id: 5, 16 embed, 10 max iter, 0.0 reg, 0.9196015278498332 min
id: 6, 16 embed, 10 max iter, 0.01 reg, 0.9145431280136108 min
id: 7, 16 embed, 10 max iter, 0.1 reg, 0.9206568638483683 min
id: 8, 16 embed, 10 max iter, 0.3 reg, 0.9217487851778666 min
id: 9, 32 embed, 5 max iter, 0.0 reg, 0.7806439717610677 min
id: 10, 32 embed, 5 max iter, 0.01 reg, 0.8238039294878642 min
id: 11, 32 embed, 5 max iter, 0.1 reg, 0.7682849685351054 min
id: 12, 32 embed, 5 max iter, 0.3 reg, 0.8196273366610209 min
id: 13, 32 embed, 10 max iter, 0.0 reg, 0.7896199504534404 min
id: 14, 32 embed, 10 max iter, 0.01 reg, 0.7854067087173462 min
id: 15, 32 embed, 10 max iter, 0.1 reg, 0.7849666873613993 min
id: 16, 32 embed, 10 max iter, 0.3 reg, 0.7926952878634135 min
id: 17

# Results

## SVM Classifier

In [27]:
headers = ["Model ID", "Embed Length", "Max Iter", "Reg", "Time Elapsed (min)", "Train Acc", "Train AUROC", "Test Acc", "Test AUROC"]
results = pd.read_csv(dirPath + 'SVM_Results.csv', header=None, names = headers)
results.sort_values(by = "Test AUROC", ascending = False)

Unnamed: 0,Model ID,Embed Length,Max Iter,Reg,Time Elapsed (min),Train Acc,Train AUROC,Test Acc,Test AUROC
0,1,16,25,0.0,0.56,0.639212,0.5,0.638986,0.5
1,2,16,25,0.01,0.53,0.639212,0.5,0.638986,0.5
20,21,128,25,0.001,1.41,0.639212,0.5,0.638986,0.5
21,22,128,50,0.0,2.21,0.639212,0.5,0.638986,0.5
22,23,128,50,0.01,2.27,0.639212,0.5,0.638986,0.5
23,24,128,50,0.001,2.48,0.639212,0.5,0.638986,0.5
24,25,128,100,0.0,3.66,0.639212,0.5,0.638986,0.5
25,26,128,100,0.01,3.66,0.639212,0.5,0.638986,0.5
26,27,128,100,0.001,4.3,0.639212,0.5,0.638986,0.5
27,28,256,25,0.0,8.26,0.639212,0.5,0.638986,0.5


## Random Forest Results

In [26]:
headers = ["Model ID", "Embed Length", "Max Depth", "Num Trees", "Time Elapsed (min)", "Train Acc", "Train AUROC", "Test Acc", "Test AUROC"]
results = pd.read_csv(dirPath + 'RF_Results.csv', header=None, names = headers)
results.sort_values(by = "Test AUROC", ascending = False)

Unnamed: 0,Model ID,Embed Length,Max Depth,Num Trees,Time Elapsed (min),Train Acc,Train AUROC,Test Acc,Test AUROC
1,2,32,15,10,2.77,0.658717,0.544697,0.656772,0.542763
6,7,64,15,20,8.05,0.65921,0.544596,0.657311,0.542689
5,6,32,15,20,6.21,0.658667,0.544395,0.656926,0.542651
2,3,64,15,10,3.7,0.658907,0.543688,0.656553,0.541311
4,5,16,15,20,7.75,0.657236,0.541462,0.655572,0.539854
0,1,16,15,10,3.12,0.656387,0.539635,0.654925,0.53823
3,4,128,15,10,4.88,0.639495,0.500424,0.638924,0.500023
7,8,128,15,20,8.1,0.639317,0.500155,0.638967,0.500005


## GBT Classifier Result

In [24]:
headers = ["Model ID", "Embed Length", "Max Depth", "Time Elapsed (min)", "Train Acc", "Train AUROC", "Test Acc", "Test AUROC"]
results = pd.read_csv(dirPath + 'GBT_Results.csv', header=None, names = headers)
results.sort_values(by = "Test AUROC", ascending = False)

Unnamed: 0,Model ID,Embed Length,Max Depth,Time Elapsed (min),Train Acc,Train AUROC,Test Acc,Test AUROC
2,3,16,15,29.3,0.668448,0.568967,0.65721,0.556877
4,5,32,10,4.41,0.659822,0.556354,0.65865,0.555257
1,2,16,10,4.08,0.656931,0.551535,0.656052,0.550858
0,1,16,5,2.64,0.651978,0.537596,0.651794,0.537644
3,4,32,5,2.39,0.651743,0.535896,0.651522,0.535888


## Logistic Regression Results

In [25]:
headers = ["Model ID", "Embed Length", "Max Iter", "Reg", "Time Elapsed (min)", "Train Acc", "Train AUROC", "Test Acc", "Test AUROC"]
results = pd.read_csv(dirPath + 'LR_Results.csv', header=None, names = headers)
results.sort_values(by = "Test AUROC", ascending = False)

Unnamed: 0,Model ID,Embed Length,Max Iter,Reg,Time Elapsed (min),Train Acc,Train AUROC,Test Acc,Test AUROC
20,21,64,10,0.0,0.47,0.649434,0.531101,0.649108,0.530974
16,17,64,5,0.0,0.43,0.648659,0.530546,0.648335,0.530429
17,18,64,5,0.01,0.43,0.648619,0.529753,0.648292,0.529629
21,22,64,10,0.01,0.48,0.649342,0.529708,0.648944,0.529502
12,13,32,10,0.0,0.29,0.648136,0.527471,0.647865,0.527416
8,9,32,5,0.0,0.28,0.64773,0.526526,0.647459,0.526454
13,14,32,10,0.01,0.28,0.64804,0.526243,0.647809,0.52622
4,5,16,10,0.0,0.3,0.647858,0.525754,0.6476,0.525686
9,10,32,5,0.01,0.27,0.64767,0.525681,0.647467,0.525682
0,1,16,5,0.0,0.29,0.647345,0.524798,0.647074,0.524736
