## Flight Delay Prediction - Tree-based Algorithms
### Machine Learning At Scale
***Team 22 - Chenlin Ye, Hongsuk Nam and Swati Akella***

### Load training data

In [3]:
# Root DBFS folder for this team; later cells append parquet file names to it.
TEAM_PATH = 'dbfs:/mnt/w261/team22/'
# List the shared mount to confirm which folders are available.
display(dbutils.fs.ls('dbfs:/mnt/w261/'))

path,name,size
dbfs:/mnt/w261/flightDelay/,flightDelay/,0
dbfs:/mnt/w261/team22/,team22/,0


In [4]:
# Saved decision-tree pipeline location, derived from TEAM_PATH so the two
# paths cannot drift apart (resolves to 'dbfs:/mnt/w261/team22/model/dt').
MODEL_DT = TEAM_PATH + 'model/dt'

# Show what is stored under the model folder.
display(dbutils.fs.ls(MODEL_DT))

path,name,size
dbfs:/mnt/w261/team22/model/dt/metadata/,metadata/,0
dbfs:/mnt/w261/team22/model/dt/stages/,stages/,0


In [5]:
# Read the three pre-split datasets from parquet.
# The "header" reader option only applies to CSV sources and has no effect on
# parquet, so it is not set here.
trainRDD = spark.read.parquet(TEAM_PATH + "trainRDD.parquet")
validationRDD = spark.read.parquet(TEAM_PATH + "validationRDD.parquet")
testRDD = spark.read.parquet(TEAM_PATH + "testRDD.parquet")

# Report the number of records in each split (each count() runs a Spark job).
for split_name, split_df in (("train", trainRDD),
                             ("validation", validationRDD),
                             ("test", testRDD)):
    print(f"... {split_name} dataset has {split_df.count()} records for evaluation")

### ML Pipeline

##### Register SQL views and drop unused columns

In [8]:
# Register each split as a temp view under its own variable name so it can be
# queried with Spark SQL.
for view_name, split_df in (("trainRDD", trainRDD),
                            ("testRDD", testRDD),
                            ("validationRDD", validationRDD)):
    split_df.createOrReplaceTempView(view_name)

In [9]:
# Remove the TAIL_NUM column from every split before building the tree models.
trainRDD_tree, testRDD_tree, validationRDD_tree = (
    split_df.drop('TAIL_NUM') for split_df in (trainRDD, testRDD, validationRDD)
)

# Expose the reduced splits as temp views for SQL access.
for view_name, split_df in (("trainRDD_tree", trainRDD_tree),
                            ("testRDD_tree", testRDD_tree),
                            ("validationRDD_tree", validationRDD_tree)):
    split_df.createOrReplaceTempView(view_name)

In [10]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import PipelineModel
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.feature import PCA

import numpy as np
import pandas as pd

#### Decision tree

In [12]:
# Print column names and Spark dtypes of the training split; the pipeline cells
# below pick numeric vs. categorical features from these dtypes.
trainRDD_tree.printSchema()

##### Decision Tree Classifier (single tree; maxDepth=10)

In [14]:
# Split the feature columns by Spark dtype. DEP_DEL15 is the target and is
# never used as a feature.
numericCols = []
categoricalCols = []
for feature, dataType in trainRDD_tree.dtypes:
    if feature == "DEP_DEL15":
        continue
    if dataType in ("double", "int"):
        numericCols.append(feature)
    elif dataType == "string":
        categoricalCols.append(feature)

# String columns are only indexed (no one-hot encoding is needed for a tree).
# handleInvalid="keep" keeps rows with invalid/unseen values instead of failing.
indexOutputCols = [name + "Index" for name in categoricalCols]
stringIndexer = StringIndexer(inputCols=categoricalCols,
                              outputCols=indexOutputCols,
                              handleInvalid="keep")

# The target is indexed into the 'label' column the classifier trains on.
label_stringIndexer = StringIndexer(inputCol='DEP_DEL15', outputCol='label')

# One feature vector: indexed categorical columns first, then the numeric ones.
assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

# Single decision tree of depth 10.
# maxBins needs to be >= the max number of categories of any categorical feature.
dt = DecisionTreeClassifier(featuresCol='features', labelCol='label',
                            maxDepth=10, maxBins=36174)

# Assemble and fit the pipeline on the full training split.
stages = [stringIndexer, label_stringIndexer, vecAssembler, dt]
pipeline = Pipeline(stages=stages)
pipelineModel_dt = pipeline.fit(trainRDD_tree)

Model evaluation

In [16]:
# Metrics - part 1: score the VALIDATION split with the single-tree pipeline.
predictions = pipelineModel_dt.transform(validationRDD_tree)
evaluator = BinaryClassificationEvaluator()

# Metrics - part 2: all confusion-matrix cells in ONE Spark job, instead of five
# separate filter(...).count() passes over the uncached predictions.
# NOTE(review): 'prediction' is in StringIndexer label space; this assumes index 1
# corresponds to DEP_DEL15 = 1 (i.e. on-time is the more frequent class) - confirm.
cells = predictions.selectExpr(
    "count(CASE WHEN DEP_DEL15 = 1 AND prediction = 1 THEN 1 END) AS tp",
    "count(CASE WHEN DEP_DEL15 = 0 AND prediction = 0 THEN 1 END) AS tn",
    "count(CASE WHEN DEP_DEL15 = 0 AND prediction = 1 THEN 1 END) AS fp",
    "count(CASE WHEN DEP_DEL15 = 1 AND prediction = 0 THEN 1 END) AS fn",
    "count(*) AS total",
).first()
tp, tn, fp, fn, total = (cells[name] for name in ("tp", "tn", "fp", "fn", "total"))
recall = float(tp) / (tp + fn)

# Metrics - part 3: rows are the actual class, columns the predicted class.
confusion_matrix = pd.DataFrame(
    [[tp, fn], [fp, tn]],
    index=['Actual: delay', 'Actual: on-time'],
    columns=['Prediction: delay', 'Prediction: on-time'],
)

# These numbers come from the validation split, so they are labelled as such.
auc_roc = evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderROC'})
auc_pr = evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderPR'})
print("Validation Area Under ROC: ", "{:.2f}".format(auc_roc))
print("Validation Area Under Precision-Recall Curve: ", "{:.2f}".format(auc_pr))

print("True positive rate: {:.2%}".format(tp/(tp + fn)))
print("True negative rate: {:.2%}".format(tn/(tn + fp)))
print("False positive rate: {:.2%}".format(fp/(fp + tn)))
print("False negative rate: {:.2%}".format(fn/(tp + fn)))
print("Recall: {:.2%}".format(recall))

print("########### Confusion Matrix ###########")
print(confusion_matrix)

Extract feature importance

In [18]:
# Last pipeline stage is the fitted tree; the one before it is the VectorAssembler
# whose input columns name the entries of the feature vector.
featureImportance = pipelineModel_dt.stages[-1].featureImportances
va = pipelineModel_dt.stages[-2]

# Convert the Spark ML vector to a dense array explicitly before pairing each
# importance with its input column.
importance_df = pd.DataFrame(
    list(zip(va.getInputCols(), featureImportance.toArray())),
    columns=["feature", "importance"],
)
importance_df.sort_values(by="importance", ascending=False)

Unnamed: 0,feature,importance
11,PR_ARR_DEL15Index,0.645834
5,CRS_DEP_TIME_HOURIndex,0.245068
9,OP_CARRIER_FL_NUMIndex,0.036103
6,OP_UNIQUE_CARRIERIndex,0.022503
2,ORIGINIndex,0.019611
4,CRS_ARR_TIME_HOURIndex,0.009559
3,DESTIndex,0.007757
19,DISTANCE,0.004674
18,DEW_TEMP,0.003483
14,PRECIPITATION,0.002183


##### Feature Selection - based on DT's feature importance
- features with importance level lower than 0.0006 are dropped
- ```OP_CARRIER_FL_NUM``` is also dropped since it can be a proxy for origin/destination

In [20]:
# Features kept after the importance-based selection, plus the DEP_DEL15 target.
selected_ftr = [
    'PR_ARR_DEL15', 'CRS_DEP_TIME_HOUR', 'OP_UNIQUE_CARRIER', 'ORIGIN',
    'CRS_ARR_TIME_HOUR', 'DEST', 'DISTANCE', 'DEW_TEMP', 'PRECIPITATION',
    'MONTH', 'TMP_TEMP', 'DAY_OF_WEEK', 'DEP_DEL15',
]

# Project every split onto the selected columns and mark it for caching.
trainRDD_tree_ftr, validationRDD_tree_ftr, testRDD_tree_ftr = (
    split_df.select(selected_ftr).cache()
    for split_df in (trainRDD_tree, validationRDD_tree, testRDD_tree)
)

##### Decision Tree - cross validation (main notebook)

In [22]:
# Decision tree whose maxDepth / maxBins are chosen by cross-validation below.
dt = DecisionTreeClassifier(featuresCol='features', labelCol='label')

# Split the feature columns by Spark dtype; DEP_DEL15 is the target, never a feature.
numericCols = [feature for (feature, dataType) in trainRDD_tree_ftr.dtypes
               if dataType in ("double", "int") and feature != "DEP_DEL15"]
categoricalCols = [feature for (feature, dataType) in trainRDD_tree_ftr.dtypes
                   if dataType == "string" and feature != "DEP_DEL15"]

# Categorical columns are indexed only (no one-hot encoding for trees).
indexOutputCols = [x + "Index" for x in categoricalCols]
stringIndexer = StringIndexer(inputCols=categoricalCols, outputCols=indexOutputCols, handleInvalid="keep")
label_stringIndexer = StringIndexer(inputCol='DEP_DEL15', outputCol='label')

# Assemble indexed categorical and numeric columns into one feature vector.
assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

# ML pipeline
stages = [stringIndexer, label_stringIndexer, vecAssembler, dt]
pipeline = Pipeline(stages=stages)

# Hyper-parameter grid.
# NOTE(review): every maxBins candidate presumably has to be >= the largest
# category count among the selected categorical features - confirm.
paramGrid = (ParamGridBuilder()
  .addGrid(dt.maxDepth, [2, 4, 6, 10, 12])
  .addGrid(dt.maxBins, [400, 800, 1200])
  .build())

# Model-selection metric: ROC AUC computed from the tree's 'rawPrediction'
# scores. Scoring the hard 0/1 'prediction' column instead collapses the ROC
# curve to a single operating point and makes candidates hard to tell apart;
# it is also inconsistent with the evaluation cells, which use the default
# 'rawPrediction' column.
evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC',
                                          rawPredictionCol='rawPrediction',
                                          labelCol='label')

# 3-fold cross-validation; the seed fixes the fold assignment.
cv = CrossValidator(estimator=pipeline,
                    evaluator=evaluator,
                    estimatorParamMaps=paramGrid,
                    numFolds=3,
                    parallelism=3,
                    seed=42)

cvModel_dt = cv.fit(trainRDD_tree_ftr)
dt_bestModel = cvModel_dt.bestModel

# inspect results
# list(zip(cvModel_dt.getEstimatorParamMaps(), cvModel_dt.avgMetrics))

In [23]:
# Parameters of the tree inside the best cross-validated pipeline (last stage).
# PipelineModel is already imported in the notebook's import cell and is not
# needed here, so it is not re-imported.
param_dict = dt_bestModel.stages[-1].extractParamMap()

# Re-key by plain parameter name; the param map is keyed by Param objects.
sane_dict = {param.name: value for param, value in param_dict.items()}

best_maxDepth = sane_dict["maxDepth"]
best_maxBins = sane_dict["maxBins"]

print(best_maxDepth)
print(best_maxBins)

Model evaluation (DT with cross validation)

In [25]:
# Metrics - part 1: score the test split with the best cross-validated tree.
predictions = dt_bestModel.transform(testRDD_tree_ftr)
evaluator = BinaryClassificationEvaluator()

# Metrics - part 2: all confusion-matrix cells in ONE Spark job, instead of five
# separate filter(...).count() passes over the uncached predictions.
# NOTE(review): 'prediction' is in StringIndexer label space; this assumes index 1
# corresponds to DEP_DEL15 = 1 (i.e. on-time is the more frequent class) - confirm.
cells = predictions.selectExpr(
    "count(CASE WHEN DEP_DEL15 = 1 AND prediction = 1 THEN 1 END) AS tp",
    "count(CASE WHEN DEP_DEL15 = 0 AND prediction = 0 THEN 1 END) AS tn",
    "count(CASE WHEN DEP_DEL15 = 0 AND prediction = 1 THEN 1 END) AS fp",
    "count(CASE WHEN DEP_DEL15 = 1 AND prediction = 0 THEN 1 END) AS fn",
    "count(*) AS total",
).first()
tp, tn, fp, fn, total = (cells[name] for name in ("tp", "tn", "fp", "fn", "total"))
recall = float(tp) / (tp + fn)

# Metrics - part 3: rows are the actual class, columns the predicted class.
confusion_matrix = pd.DataFrame(
    [[tp, fn], [fp, tn]],
    index=['Actual: delay', 'Actual: on-time'],
    columns=['Prediction: delay', 'Prediction: on-time'],
)

auc_roc = evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderROC'})
auc_pr = evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderPR'})
print("Test Area Under ROC: ", "{:.2f}".format(auc_roc))
print("Test Area Under Precision-Recall Curve: ", "{:.2f}".format(auc_pr))

print("True positive rate: {:.2%}".format(tp/(tp + fn)))
print("True negative rate: {:.2%}".format(tn/(tn + fp)))
print("False positive rate: {:.2%}".format(fp/(fp + tn)))
print("False negative rate: {:.2%}".format(fn/(tp + fn)))
print("Recall: {:.2%}".format(recall))

print("########### Confusion Matrix ###########")
print(confusion_matrix)

Feature importance

In [27]:
# Last pipeline stage is the fitted tree; the one before it is the VectorAssembler
# whose input columns name the entries of the feature vector.
featureImportance = dt_bestModel.stages[-1].featureImportances
va = dt_bestModel.stages[-2]

# Convert the Spark ML vector to a dense array explicitly before pairing each
# importance with its input column.
importance_df = pd.DataFrame(
    list(zip(va.getInputCols(), featureImportance.toArray())),
    columns=["feature", "importance"],
)
importance_df.sort_values(by="importance", ascending=False)

Unnamed: 0,feature,importance
0,PR_ARR_DEL15Index,0.704536
1,CRS_DEP_TIME_HOURIndex,0.261126
2,OP_UNIQUE_CARRIERIndex,0.020187
3,ORIGINIndex,0.010011
8,DISTANCE,0.003115
5,DESTIndex,0.000489
7,DAY_OF_WEEKIndex,0.000345
9,DEW_TEMP,0.000191
4,CRS_ARR_TIME_HOURIndex,0.0
6,MONTHIndex,0.0


##### Decision tree (single tree, maxDepth=10)
- Drop features with importance lower than 0.001
- Keep flight number

In [29]:
# Second feature set: keeps the flight number (OP_CARRIER_FL_NUM), plus the
# DEP_DEL15 target.
selected_ftr = [
    'PR_ARR_DEL15', 'CRS_DEP_TIME_HOUR', 'OP_CARRIER_FL_NUM', 'OP_UNIQUE_CARRIER',
    'ORIGIN', 'CRS_ARR_TIME_HOUR', 'DEST', 'DISTANCE', 'DEW_TEMP',
    'PRECIPITATION', 'MONTH', 'DEP_DEL15',
]

# Project every split onto the selected columns (not cached).
trainRDD_tree_ftr2, validationRDD_tree_ftr2, testRDD_tree_ftr2 = (
    split_df.select(selected_ftr)
    for split_df in (trainRDD_tree, validationRDD_tree, testRDD_tree)
)

In [30]:
# Split the feature columns by Spark dtype. DEP_DEL15 is the target and is
# never used as a feature.
numericCols = []
categoricalCols = []
for feature, dataType in trainRDD_tree_ftr2.dtypes:
    if feature == "DEP_DEL15":
        continue
    if dataType in ("double", "int"):
        numericCols.append(feature)
    elif dataType == "string":
        categoricalCols.append(feature)

# String columns are only indexed (no one-hot encoding is needed for a tree).
indexOutputCols = [name + "Index" for name in categoricalCols]
stringIndexer = StringIndexer(inputCols=categoricalCols,
                              outputCols=indexOutputCols,
                              handleInvalid="keep")

# The target is indexed into the 'label' column the classifier trains on.
label_stringIndexer = StringIndexer(inputCol='DEP_DEL15', outputCol='label')

# One feature vector: indexed categorical columns first, then the numeric ones.
assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

# Single decision tree of depth 10.
# maxBins needs to be >= the max number of categories of any categorical feature.
dt = DecisionTreeClassifier(featuresCol='features', labelCol='label',
                            maxDepth=10, maxBins=36174)

# Assemble and fit the pipeline on the reduced training split.
stages = [stringIndexer, label_stringIndexer, vecAssembler, dt]
pipeline = Pipeline(stages=stages)
pipelineModel_dt2 = pipeline.fit(trainRDD_tree_ftr2)

Model evaluation

In [32]:
# Metrics - part 1: score the VALIDATION split with the second single-tree pipeline.
predictions = pipelineModel_dt2.transform(validationRDD_tree_ftr2)
evaluator = BinaryClassificationEvaluator()

# Metrics - part 2: all confusion-matrix cells in ONE Spark job, instead of five
# separate filter(...).count() passes over the uncached predictions.
# NOTE(review): 'prediction' is in StringIndexer label space; this assumes index 1
# corresponds to DEP_DEL15 = 1 (i.e. on-time is the more frequent class) - confirm.
cells = predictions.selectExpr(
    "count(CASE WHEN DEP_DEL15 = 1 AND prediction = 1 THEN 1 END) AS tp",
    "count(CASE WHEN DEP_DEL15 = 0 AND prediction = 0 THEN 1 END) AS tn",
    "count(CASE WHEN DEP_DEL15 = 0 AND prediction = 1 THEN 1 END) AS fp",
    "count(CASE WHEN DEP_DEL15 = 1 AND prediction = 0 THEN 1 END) AS fn",
    "count(*) AS total",
).first()
tp, tn, fp, fn, total = (cells[name] for name in ("tp", "tn", "fp", "fn", "total"))
recall = float(tp) / (tp + fn)

# Metrics - part 3: rows are the actual class, columns the predicted class.
confusion_matrix = pd.DataFrame(
    [[tp, fn], [fp, tn]],
    index=['Actual: delay', 'Actual: on-time'],
    columns=['Prediction: delay', 'Prediction: on-time'],
)

# These numbers come from the validation split, so they are labelled as such.
auc_roc = evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderROC'})
auc_pr = evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderPR'})
print("Validation Area Under ROC: ", "{:.2f}".format(auc_roc))
print("Validation Area Under Precision-Recall Curve: ", "{:.2f}".format(auc_pr))

print("True positive rate: {:.2%}".format(tp/(tp + fn)))
print("True negative rate: {:.2%}".format(tn/(tn + fp)))
print("False positive rate: {:.2%}".format(fp/(fp + tn)))
print("False negative rate: {:.2%}".format(fn/(tp + fn)))
print("Recall: {:.2%}".format(recall))

print("########### Confusion Matrix ###########")
print(confusion_matrix)

#### Random forest

##### Random forest with optimized parameters based on DT (main notebook)

In [35]:
# Feature set for the random forest (same columns as the cross-validated
# decision tree), plus the DEP_DEL15 target.
selected_ftr = [
    'PR_ARR_DEL15', 'CRS_DEP_TIME_HOUR', 'OP_UNIQUE_CARRIER', 'ORIGIN',
    'CRS_ARR_TIME_HOUR', 'DEST', 'DISTANCE', 'DEW_TEMP', 'PRECIPITATION',
    'MONTH', 'TMP_TEMP', 'DAY_OF_WEEK', 'DEP_DEL15',
]

# Project every split onto the selected columns and mark it for caching.
trainRDD_tree_ftr, validationRDD_tree_ftr, testRDD_tree_ftr = (
    split_df.select(selected_ftr).cache()
    for split_df in (trainRDD_tree, validationRDD_tree, testRDD_tree)
)

In [36]:
# Random forest; maxDepth / numTrees are chosen by cross-validation below.
# A fixed seed makes the forest's random sampling repeatable between runs.
rf = RandomForestClassifier(featuresCol='features', labelCol='label', maxBins=800, seed=42)

# Split the feature columns by Spark dtype; DEP_DEL15 is the target, never a feature.
numericCols = [feature for (feature, dataType) in trainRDD_tree_ftr.dtypes
               if dataType in ("double", "int") and feature != "DEP_DEL15"]
categoricalCols = [feature for (feature, dataType) in trainRDD_tree_ftr.dtypes
                   if dataType == "string" and feature != "DEP_DEL15"]

# Categorical columns are indexed only (no one-hot encoding for tree models).
indexOutputCols = [x + "Index" for x in categoricalCols]
stringIndexer = StringIndexer(inputCols=categoricalCols, outputCols=indexOutputCols, handleInvalid="keep")
label_stringIndexer = StringIndexer(inputCol='DEP_DEL15', outputCol='label')

# Assemble indexed categorical and numeric columns into one feature vector.
assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

# ML pipeline
pipeline = Pipeline(stages=[stringIndexer, label_stringIndexer, vecAssembler, rf])

# Hyper-parameter grid
paramGrid = (ParamGridBuilder()
  .addGrid(rf.maxDepth, [4, 6, 10])
  .addGrid(rf.numTrees, [10, 50, 100])
  .build())

# Model-selection metric: ROC AUC computed from the forest's 'rawPrediction'
# scores. Scoring the hard 0/1 'prediction' column instead collapses the ROC
# curve to a single operating point and makes candidates hard to tell apart;
# it is also inconsistent with the evaluation cells, which use the default
# 'rawPrediction' column.
evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC',
                                          rawPredictionCol='rawPrediction',
                                          labelCol='label')

# 3-fold cross-validation; the seed fixes the fold assignment.
cv = CrossValidator(estimator=pipeline,
                    evaluator=evaluator,
                    estimatorParamMaps=paramGrid,
                    numFolds=3,
                    parallelism=3,
                    seed=42)

cvModel_rf2 = cv.fit(trainRDD_tree_ftr)
# Name kept for the evaluation cell that follows, although this model comes
# from CrossValidator rather than TrainValidationSplit.
rf_bestModel_tvs = cvModel_rf2.bestModel

# inspect results
# list(zip(cvModel_rf2.getEstimatorParamMaps(), cvModel_rf2.avgMetrics))

In [37]:
# Metrics - part 1: score the test split with the best cross-validated forest.
predictions = rf_bestModel_tvs.transform(testRDD_tree_ftr)
evaluator = BinaryClassificationEvaluator()

# Metrics - part 2: all confusion-matrix cells in ONE Spark job, instead of five
# separate filter(...).count() passes over the uncached predictions.
# NOTE(review): 'prediction' is in StringIndexer label space; this assumes index 1
# corresponds to DEP_DEL15 = 1 (i.e. on-time is the more frequent class) - confirm.
cells = predictions.selectExpr(
    "count(CASE WHEN DEP_DEL15 = 1 AND prediction = 1 THEN 1 END) AS tp",
    "count(CASE WHEN DEP_DEL15 = 0 AND prediction = 0 THEN 1 END) AS tn",
    "count(CASE WHEN DEP_DEL15 = 0 AND prediction = 1 THEN 1 END) AS fp",
    "count(CASE WHEN DEP_DEL15 = 1 AND prediction = 0 THEN 1 END) AS fn",
    "count(*) AS total",
).first()
tp, tn, fp, fn, total = (cells[name] for name in ("tp", "tn", "fp", "fn", "total"))
recall = float(tp) / (tp + fn)

# Metrics - part 3: rows are the actual class, columns the predicted class.
confusion_matrix = pd.DataFrame(
    [[tp, fn], [fp, tn]],
    index=['Actual: delay', 'Actual: on-time'],
    columns=['Prediction: delay', 'Prediction: on-time'],
)

auc_roc = evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderROC'})
auc_pr = evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderPR'})
print("Test Area Under ROC: ", "{:.2f}".format(auc_roc))
print("Test Area Under Precision-Recall Curve: ", "{:.2f}".format(auc_pr))

print("True positive rate: {:.2%}".format(tp/(tp + fn)))
print("True negative rate: {:.2%}".format(tn/(tn + fp)))
print("False positive rate: {:.2%}".format(fp/(fp + tn)))
print("False negative rate: {:.2%}".format(fn/(tp + fn)))
print("Recall: {:.2%}".format(recall))

print("########### Confusion Matrix ###########")
print(confusion_matrix)

##### Random Forest with down-sampling

Under-sample the on-time flights so that the training set has roughly as many on-time as delayed flights

In [40]:
# Split the training data by class.
trainRDD_tree_delay = trainRDD_tree.filter("DEP_DEL15 = 1")
trainRDD_tree_ontime = trainRDD_tree.filter("DEP_DEL15 = 0")

delay_count = trainRDD_tree_delay.count()
ontime_count = trainRDD_tree_ontime.count()

# Fraction of on-time rows to keep so that the kept on-time rows roughly match
# the number of delayed rows. The denominator is the ON-TIME count (dividing by
# the total row count would keep fewer on-time rows than there are delays).
# Capped at 1.0 because sampling without replacement cannot exceed the class size.
sampleRatio = min(1.0, float(delay_count) / ontime_count) if ontime_count else 0.0

# Seeded so the under-sampled training set is the same on every run.
ontimeSampleDf = trainRDD_tree_ontime.sample(withReplacement=False, fraction=sampleRatio, seed=42)

# union() is the non-deprecated equivalent of unionAll().
trainRDD_tree_undersample = trainRDD_tree_delay.union(ontimeSampleDf)

In [41]:
# Feature set for the down-sampled random forest (adds more weather columns),
# plus the DEP_DEL15 target.
selected_ftr = ['PR_ARR_DEL15', 'CRS_DEP_TIME_HOUR', 'OP_UNIQUE_CARRIER', 'ORIGIN', 'CRS_ARR_TIME_HOUR',
                'DEST', 'DISTANCE', 'DEW_TEMP', 'PRECIPITATION', 'TMP_TEMP', 'DAY_OF_WEEK', 'MONTH',
                'ALTIMETER_SET', 'CLOUD_BASE_HEIGHT', 'VIS_DISTANCE', 'WND_SPEED', 'DEP_DEL15']

# Train on the UNDER-SAMPLED frame built in the previous cell; selecting from
# trainRDD_tree here would leave trainRDD_tree_undersample unused and fit this
# "down-sampling" model on the full, imbalanced training data.
trainRDD_tree_ftr3 = trainRDD_tree_undersample.select(selected_ftr).cache()

# Validation and test keep their original class balance.
validationRDD_tree_ftr3 = validationRDD_tree.select(selected_ftr).cache()
testRDD_tree_ftr3 = testRDD_tree.select(selected_ftr).cache()

In [42]:
# Random forest; maxDepth / numTrees are chosen by a train/validation split below.
# A fixed seed makes the forest's random sampling repeatable between runs.
rf = RandomForestClassifier(featuresCol='features', labelCol='label', maxBins=800, seed=42)

# Split the feature columns by Spark dtype; DEP_DEL15 is the target, never a feature.
numericCols = [feature for (feature, dataType) in trainRDD_tree_ftr3.dtypes
               if dataType in ("double", "int") and feature != "DEP_DEL15"]
categoricalCols = [feature for (feature, dataType) in trainRDD_tree_ftr3.dtypes
                   if dataType == "string" and feature != "DEP_DEL15"]

# Categorical columns are indexed only (no one-hot encoding for tree models).
indexOutputCols = [x + "Index" for x in categoricalCols]
stringIndexer = StringIndexer(inputCols=categoricalCols, outputCols=indexOutputCols, handleInvalid="keep")
label_stringIndexer = StringIndexer(inputCol='DEP_DEL15', outputCol='label')

# Assemble indexed categorical and numeric columns into one feature vector.
assemblerInputs = indexOutputCols + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

# ML pipeline
pipeline = Pipeline(stages=[stringIndexer, label_stringIndexer, vecAssembler, rf])

# Hyper-parameter grid
paramGrid = (ParamGridBuilder()
  .addGrid(rf.maxDepth, [4, 6, 10])
  .addGrid(rf.numTrees, [10, 50, 100])
  .build())

# Model-selection metric: ROC AUC computed from the forest's 'rawPrediction'
# scores. Scoring the hard 0/1 'prediction' column instead collapses the ROC
# curve to a single operating point and makes candidates hard to tell apart;
# it is also inconsistent with the evaluation cells, which use the default
# 'rawPrediction' column.
evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC',
                                          rawPredictionCol='rawPrediction',
                                          labelCol='label')

# Less expensive than k-fold CV: a single train/validation pair.
# 80% of the data is used for training, 20% for validation; the seed fixes the
# split, matching the seed=42 used by the cross-validation cells.
tvs = TrainValidationSplit(estimator=pipeline,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator,
                           trainRatio=0.8,
                           seed=42)

tvsModel_rf = tvs.fit(trainRDD_tree_ftr3)
# NOTE: this rebinds rf_bestModel_tvs, which the cross-validated random-forest
# cell earlier in the notebook also assigns.
rf_bestModel_tvs = tvsModel_rf.bestModel

# inspect results (TrainValidationSplitModel exposes validationMetrics, not avgMetrics)
# list(zip(tvsModel_rf.getEstimatorParamMaps(), tvsModel_rf.validationMetrics))

In [43]:
# Metrics - part 1: score the VALIDATION split with the best forest from the
# train/validation split search.
predictions = rf_bestModel_tvs.transform(validationRDD_tree_ftr3)
evaluator = BinaryClassificationEvaluator()

# Metrics - part 2: all confusion-matrix cells in ONE Spark job, instead of five
# separate filter(...).count() passes over the uncached predictions.
# NOTE(review): 'prediction' is in StringIndexer label space; this assumes index 1
# corresponds to DEP_DEL15 = 1 (i.e. on-time is the more frequent class in the
# data the label indexer was fitted on) - confirm.
cells = predictions.selectExpr(
    "count(CASE WHEN DEP_DEL15 = 1 AND prediction = 1 THEN 1 END) AS tp",
    "count(CASE WHEN DEP_DEL15 = 0 AND prediction = 0 THEN 1 END) AS tn",
    "count(CASE WHEN DEP_DEL15 = 0 AND prediction = 1 THEN 1 END) AS fp",
    "count(CASE WHEN DEP_DEL15 = 1 AND prediction = 0 THEN 1 END) AS fn",
    "count(*) AS total",
).first()
tp, tn, fp, fn, total = (cells[name] for name in ("tp", "tn", "fp", "fn", "total"))
recall = float(tp) / (tp + fn)

# Metrics - part 3: rows are the actual class, columns the predicted class.
confusion_matrix = pd.DataFrame(
    [[tp, fn], [fp, tn]],
    index=['Actual: delay', 'Actual: on-time'],
    columns=['Prediction: delay', 'Prediction: on-time'],
)

# These numbers come from the validation split, so they are labelled as such.
auc_roc = evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderROC'})
auc_pr = evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderPR'})
print("Validation Area Under ROC: ", "{:.2f}".format(auc_roc))
print("Validation Area Under Precision-Recall Curve: ", "{:.2f}".format(auc_pr))

print("True positive rate: {:.2%}".format(tp/(tp + fn)))
print("True negative rate: {:.2%}".format(tn/(tn + fp)))
print("False positive rate: {:.2%}".format(fp/(fp + tn)))
print("False negative rate: {:.2%}".format(fn/(tp + fn)))
print("Recall: {:.2%}".format(recall))

print("########### Confusion Matrix ###########")
print(confusion_matrix)