In [None]:
from pyspark.sql import *
from pyspark.ml import *

from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml import Pipeline

from builtins import round

In [None]:
# Notebook configuration.
#
# Two input sets are supported. They differ in the window size used to build
# them and in the number of digits the final score is rounded to; everything
# else is shared. Change `dataset` to switch between them instead of
# commenting blocks in and out.
DATASET_SETTINGS = {
    'input_1': {'n_digits': 4},  # Window_size = 5
    'input_2': {'n_digits': 3},  # Window_size = 10
}

dataset = 'input_1'  # one of the keys of DATASET_SETTINGS

train_folder = dataset + '/train'
valid_folder = dataset + '/valid'
num_trees = [10, 100]  # numTrees grid for the random forest
n_fold = 5             # number of cross-validation folds
n_digits = DATASET_SETTINGS[dataset]['n_digits']  # rounding of reported scores
max_depth = [5, 10]  # added according to Diane's reply in piazza


In [None]:
# Create (or reuse) the Spark session used to read the input data.
# The builder chain is wrapped in parentheses rather than joined with
# backslashes: a comment placed after a line-continuation backslash is a
# syntax error, which made this cell fail to run.
ss = (
    SparkSession.builder
    # No need for the PostgreSQL jar actually; kept to preserve the original setup.
    .config('spark.driver.extraClassPath', 'postgresql-42.2.18.jar')
    .config("spark.executor.memory", "12g")
    .config("spark.driver.memory", "12g")
    .config("spark.executor.cores", 6)
    .config('spark.executor.instances', 5)
    .getOrCreate()
)

# QUESTION 1

In [None]:
# Load both parquet splits the same way: read, spread over 8 partitions, and
# mark for caching so later passes can reuse them.
train_df, valid_df = (
    ss.read.parquet(folder).repartition(8).cache()
    for folder in (train_folder, valid_folder)
)

In [None]:
# Report the number of rows in the training split.
train_rows = train_df.count()
print(train_rows)

In [None]:
# Report the number of rows in the validation split.
valid_rows = valid_df.count()
print(valid_rows)

# QUESTION 2

In [None]:
# train_df.show(4)

In [None]:
# Random forest tuned by k-fold cross-validation over the number of trees.
# NOTE(review): no seed is passed to the classifier or the cross-validator, so
# both rely on library defaults — confirm whether reproducible runs are needed.
rf = RandomForestClassifier()

# areaUnderROC is default  https://spark.apache.org/docs/latest/api/java/index.html?org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.html
evaluator = BinaryClassificationEvaluator()

# Only numTrees is searched; every other hyper-parameter keeps its default.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(rf.numTrees, num_trees)
paraGrid = grid_builder.build()

cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paraGrid,
    evaluator=evaluator,
    numFolds=n_fold,
)

# Fit on the training split; the result exposes the winning model as `bestModel`.
cvmodel = cv.fit(train_df)

In [None]:
# Score the validation split with the best random forest found by cross-validation.
best_rf_model = cvmodel.bestModel
rfpredicts = best_rf_model.transform(valid_df)

In [None]:
# Header line naming the model whose results follow.
model_label = 'RandomForestClassifier'
print(model_label)

In [None]:
# Show the number of trees of the selected random forest.
# NOTE(review): `getNumTrees` is read as an attribute, not called, exactly as
# before — presumably a property on the fitted model; confirm for the Spark
# version in use.
selected_rf = cvmodel.bestModel
print(selected_rf.getNumTrees)

In [None]:
# Evaluate the random-forest predictions and print the score rounded to
# `n_digits` decimals.
rf_score = evaluator.evaluate(rfpredicts)
print(round(rf_score, n_digits))

# QUESTION 3

In [None]:
# Gradient-boosted trees tuned by k-fold cross-validation over the tree depth.
# This cell rebinds `evaluator`, `cv` and `cvmodel`, replacing the objects
# created for the random forest.
# NOTE(review): no seed is passed to the classifier or the cross-validator, so
# both rely on library defaults — confirm whether reproducible runs are needed.
GBT = GBTClassifier()

# areaUnderROC is default
evaluator = BinaryClassificationEvaluator()

# Only maxDepth is searched; every other hyper-parameter keeps its default.
depth_grid_builder = ParamGridBuilder()
depth_grid_builder = depth_grid_builder.addGrid(GBT.maxDepth, max_depth)
paramGrid = depth_grid_builder.build()

cv = CrossValidator(
    estimator=GBT,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=n_fold,
)

# Fit on the training split; the result exposes the winning model as `bestModel`.
cvmodel = cv.fit(train_df)

In [None]:
# Score the validation split with the best GBT model found by cross-validation.
best_gbt_model = cvmodel.bestModel
GBTpredicts = best_gbt_model.transform(valid_df)

In [None]:
# Header line naming the model whose results follow.
model_label = 'GBTClassifier'
print(model_label)

In [None]:
# Show the maximum tree depth of the selected GBT model.
selected_gbt = cvmodel.bestModel
print(selected_gbt.getMaxDepth())

In [None]:
# Evaluate the GBT predictions and print the score rounded to `n_digits`
# decimals.
gbt_score = evaluator.evaluate(GBTpredicts)
print(round(gbt_score, n_digits))