### Import packages

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.pipeline import PipelineModel
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

### Data Extraction

##### Data directory & team folder

In [5]:
# Root folder of the shared project datasets on DBFS.
DATA_PATH = "dbfs:/mnt/mids-w261/data/datasets_final_project/"

# List what is available under the data root.
data_root_listing = dbutils.fs.ls(DATA_PATH)
display(data_root_listing)

path,name,size
dbfs:/mnt/mids-w261/data/datasets_final_project/airlines_data/,airlines_data/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/allstate-claims-severity.zip,allstate-claims-severity.zip,51204863
dbfs:/mnt/mids-w261/data/datasets_final_project/dac.tar.gz,dac.tar.gz,4576820670
dbfs:/mnt/mids-w261/data/datasets_final_project/kdd-cup-2014-predicting-excitement-at-donors-choose.zip,kdd-cup-2014-predicting-excitement-at-donors-choose.zip,971133938
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data/,parquet_airlines_data/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data_3m/,parquet_airlines_data_3m/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/parquet_airlines_data_6m/,parquet_airlines_data_6m/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/porto-seguro-safe-driver-prediction.zip,porto-seguro-safe-driver-prediction.zip,80247571
dbfs:/mnt/mids-w261/data/datasets_final_project/walmart-recruiting-trip-type-classification.zip,walmart-recruiting-trip-type-classification.zip,11510035
dbfs:/mnt/mids-w261/data/datasets_final_project/weather_data/,weather_data/,0


In [6]:
# List the weather parquet folders under the data root.
weather_dir = f"{DATA_PATH}weather_data"
display(dbutils.fs.ls(weather_dir))

path,name,size
dbfs:/mnt/mids-w261/data/datasets_final_project/weather_data/weather-miss.parquet/,weather-miss.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/weather_data/weather2015a.parquet/,weather2015a.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/weather_data/weather2016a.parquet/,weather2016a.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/weather_data/weather2017a.parquet/,weather2017a.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/weather_data/weather2018a.parquet/,weather2018a.parquet/,0
dbfs:/mnt/mids-w261/data/datasets_final_project/weather_data/weather2019a.parquet/,weather2019a.parquet/,0


In [7]:
# Command that creates the team folder (left commented out):
#   dbutils.fs.mkdirs('dbfs:/mnt/w261/team22')
TEAM_PATH = 'dbfs:/mnt/w261/team22/'

# List the parent mount to confirm the team folder is present.
mount_listing = dbutils.fs.ls('dbfs:/mnt/w261/')
display(mount_listing)

path,name,size
dbfs:/mnt/w261/flightDelay/,flightDelay/,0
dbfs:/mnt/w261/team22/,team22/,0


In [8]:
# Team working folder; list its current contents.
TEAM_PATH = 'dbfs:/mnt/w261/team22/'
team_dir = TEAM_PATH.rstrip('/')  # same path as before, without the trailing slash
display(dbutils.fs.ls(team_dir))

path,name,size
dbfs:/mnt/w261/team22/airlines_6m_cleaned.parquet/,airlines_6m_cleaned.parquet/,0
dbfs:/mnt/w261/team22/test/,test/,0
dbfs:/mnt/w261/team22/testweather_transformed.parquet/,testweather_transformed.parquet/,0
dbfs:/mnt/w261/team22/weather.parquet/,weather.parquet/,0
dbfs:/mnt/w261/team22/weather_column_transform.parquet/,weather_column_transform.parquet/,0
dbfs:/mnt/w261/team22/weather_stations.parquet/,weather_stations.parquet/,0
dbfs:/mnt/w261/team22/weather_timestamp.parquet/,weather_timestamp.parquet/,0
dbfs:/mnt/w261/team22/weather_us.parquet/,weather_us.parquet/,0
dbfs:/mnt/w261/team22/weather_us_stations.parquet/,weather_us_stations.parquet/,0


In [9]:
# Commands that create the model folders (left commented out):
#   dbutils.fs.mkdirs('dbfs:/mnt/w261/team22/model')
#   dbutils.fs.mkdirs('dbfs:/mnt/w261/team22/model/lr')
#   dbutils.fs.mkdirs('dbfs:/mnt/w261/team22/model/dt')
#   dbutils.fs.mkdirs('dbfs:/mnt/w261/team22/model/gbt')

# One persistence location per model family.
MODEL_LR, MODEL_DT, MODEL_GBT = (
    'dbfs:/mnt/w261/team22/model/' + family for family in ('lr', 'dt', 'gbt')
)

##### Check point - trainRDD, validationRDD, testRDD

In [11]:
# Load the persisted train / validation / test splits from the team folder.
trainRDD, validationRDD, testRDD = (
    spark.read.option("header", "true").parquet(f"{TEAM_PATH}{split}RDD.parquet")
    for split in ("train", "validation", "test")
)

# Report the size of each split (every count() runs a Spark job).
for split, frame in (("train", trainRDD), ("validation", validationRDD), ("test", testRDD)):
    print(f"... {split} dataset has {frame.count()} records for evaluation")

### ML Pipeline

#### GBT Model Exploration

- Implementation (Scalability)
1. We excluded features with a large number of distinct categorical values (e.g. TAIL_NUM, OP_CARRIER_FL_NUM), because they slowed the notebook down considerably (not scalable).
2. We cached our DataFrames to save time and make the notebook more scalable.

In [14]:
# Columns excluded from the GBT experiments because of their many distinct values.
high_cardinality_cols = ("TAIL_NUM", "OP_CARRIER_FL_NUM")

trainRDD_GBT = trainRDD.drop(*high_cardinality_cols)
validationRDD_GBT = validationRDD.drop(*high_cardinality_cols)
testRDD_GBT = testRDD.drop(*high_cardinality_cols)

# Mark the three reduced DataFrames for caching; they are reused by the cells below.
trainRDD_GBT.cache()
validationRDD_GBT.cache()
testRDD_GBT.cache()

In [15]:
# Print the column names and types of the reduced training DataFrame.
trainRDD_GBT.printSchema()

In [16]:
# Feature columns for the GBT pipeline, split by how they are staged:
# categoricals are string-indexed, numerics are fed to the assembler as they are.
categoricals_gbt = """
    YEAR DAY_OF_WEEK ORIGIN DEST
    CRS_ARR_TIME_HOUR CRS_DEP_TIME_HOUR
    OP_UNIQUE_CARRIER
    MONTH DAY_OF_MONTH QUARTER
    PR_ARR_DEL15
""".split()

numerics_gbt = """
    CLOUD_BASE_HEIGHT ALTIMETER_SET PRECIPITATION SNOW
    SLP_PRESSURE TMP_TEMP DEW_TEMP DISTANCE
    CLOUD_COVERAGE VIS_DISTANCE WND_SPEED
""".split()

##### Model tuning #1

In [18]:
# Build and tune the GBT pipeline on the full training split:
# index categoricals and the label, assemble the feature vector, then run a
# 3-fold cross-validated grid search over tree depth and boosting iterations.

GBT_SEED = 42  # fixed seed so fold assignment and boosting are repeatable across runs

# Index every categorical column; handleInvalid="keep" sends unseen values to an extra bucket.
inputCol_gbt = [x + "Index" for x in categoricals_gbt]
indexers_gbt = StringIndexer(inputCols=categoricals_gbt, outputCols=inputCol_gbt, handleInvalid="keep")

# Index the target. StringIndexer orders labels by descending frequency by default,
# which makes the meaning of label 1.0 depend on the class balance of the training
# data. Alphabetical order pins DEP_DEL15 = 0 -> 0.0 and DEP_DEL15 = 1 -> 1.0.
label_indexers_gbt = StringIndexer(inputCol="DEP_DEL15",
                                   outputCol="label",
                                   stringOrderType="alphabetAsc")

featureCols_gbt = inputCol_gbt + numerics_gbt

# Define vector assemblers
vector_gbt = VectorAssembler(inputCols=featureCols_gbt, outputCol="features")

# Define a GBT model.
# maxBins has to be >= the largest category count among the categorical features.
gbt = GBTClassifier(featuresCol="features",
                    labelCol="label",
                    lossType="logistic",
                    maxBins=400,
                    seed=GBT_SEED)

# Chain indexers, assembler and GBT in a Pipeline
stages_gbt = [indexers_gbt, label_indexers_gbt, vector_gbt, gbt]
pipeline_gbt = Pipeline(stages=stages_gbt)

# Build the parameter grid for model tuning (3 x 3 = 9 candidate settings)
paramGrid_gbt = (ParamGridBuilder()
                 .addGrid(gbt.maxDepth, [2, 5, 10])
                 .addGrid(gbt.maxIter, [10, 20, 50])
                 .build())

# Cross validation; the evaluator's default metric is areaUnderROC.
cv_gbt = CrossValidator(estimator=pipeline_gbt,
                        estimatorParamMaps=paramGrid_gbt,
                        evaluator=BinaryClassificationEvaluator(),
                        numFolds=3,
                        seed=GBT_SEED)

# Train the tuned model and keep the best pipeline found by the search
cvModel_gbt = cv_gbt.fit(trainRDD_GBT)
cv_gbt_model = cvModel_gbt.bestModel

In [19]:
# Persist the best pipeline from tuning #1 at MODEL_GBT, replacing whatever is stored there.
cv_gbt_model.write().overwrite().save(MODEL_GBT)

In [20]:
# Read the persisted pipeline back from MODEL_GBT and rebind cv_gbt_model to it.
cv_gbt_model = PipelineModel.load(MODEL_GBT)

##### Result #1

- In this round we tuned the hyperparameters with a parameter grid over different max depths and max iterations, and used CrossValidator so that the selected model generalizes. We took the best model from this search and applied it to the test dataset to see the result.

Validation dataset

Precision / Recall / F1-score

In [24]:
# Score the validation split with the best model from tuning #1.
validation_gbt2 = cv_gbt_model.transform(validationRDD_GBT)

# Bring labels and predictions to the driver in ONE pass. Two separate collect()
# calls run two Spark jobs with no guarantee that rows come back in the same
# order, and they hand sklearn lists of Row objects instead of flat 1-D arrays.
val_scored2 = validation_gbt2.select("label", "prediction").toPandas()
val_true2 = val_scored2["label"]
val_pred2 = val_scored2["prediction"]

# Rows of the report are keyed by the indexed label values (0.0 / 1.0).
print(classification_report(val_true2, val_pred2))

ROC / PR / Confusion matrix

In [26]:
# ROC / PR areas, class-wise rates and confusion matrix for tuning #1 on the validation split.
evaluator = BinaryClassificationEvaluator()

# Stage 1 of the fitted pipeline is the label indexer. Look up which label index it
# assigned to the delayed class (DEP_DEL15 == 1) instead of assuming it is 1.0:
# the default StringIndexer ordering depends on class frequencies in the training data.
label_names = cv_gbt_model.stages[1].labels
delay_idx = float(next(i for i, name in enumerate(label_names) if float(name) == 1.0))

# All four confusion-matrix cells in a single Spark job instead of four count() jobs.
cells = validation_gbt2.selectExpr(
    f"count(CASE WHEN DEP_DEL15 = 1 AND prediction = {delay_idx} THEN 1 END) AS tp",
    f"count(CASE WHEN DEP_DEL15 = 0 AND prediction <> {delay_idx} THEN 1 END) AS tn",
    f"count(CASE WHEN DEP_DEL15 = 0 AND prediction = {delay_idx} THEN 1 END) AS fp",
    f"count(CASE WHEN DEP_DEL15 = 1 AND prediction <> {delay_idx} THEN 1 END) AS fn",
).first()
tp, tn, fp, fn = (cells[name] for name in ("tp", "tn", "fp", "fn"))

data = {"Actual: delay": [tp, fn], "Actual: on-time": [fp, tn]}
# Named confusion_df so the sklearn `confusion_matrix` import is not shadowed.
confusion_df = pd.DataFrame.from_dict(data, orient="index", columns=["Prediction: delay", "Prediction: on-time"])

# These figures are for the validation split, so they are labelled as such.
# NOTE: both areas treat label 1.0 as the positive class.
print("Validation Area Under ROC: ", "{:.2f}".format(evaluator.evaluate(validation_gbt2, {evaluator.metricName: "areaUnderROC"})))
print("Validation Area Under Precision-Recall Curve: ", "{:.2f}".format(evaluator.evaluate(validation_gbt2, {evaluator.metricName: "areaUnderPR"})))

print("True positive rate: {:.2%}".format(tp/(tp+fn)))
print("True negative rate: {:.2%}".format(tn/(tn+fp)))
print("False positive rate: {:.2%}".format(fp/(tn+fp)))
print("False negative rate: {:.2%}".format(fn/(tp+fn)))

print("########### Confusion Matrix ###########")
print(confusion_df)

Hyperparameters for best model

In [28]:
# Reload the persisted pipeline and read the tuned settings from its last stage
# (the fitted GBT classifier). PipelineModel is already imported at the top.
loaded_model = PipelineModel.load(MODEL_GBT)
param_dict = loaded_model.stages[-1].extractParamMap()

# Re-key the param map by plain parameter name for easy lookup.
sane_dict = {param.name: value for param, value in param_dict.items()}

best_maxDepth, best_maxIter = sane_dict["maxDepth"], sane_dict["maxIter"]
print("max depth for best model: ", best_maxDepth)
print("max iteration for best model: ", best_maxIter)

##### Model tuning #2

In [30]:
# Down-sample the on-time (majority) class so both classes have roughly the
# same number of training rows.
UNDERSAMPLE_SEED = 42  # fixed seed so the sampled training set is repeatable

trainRDD_gbt_delay = trainRDD_GBT.filter("DEP_DEL15 = 1")
trainRDD_gbt_ontime = trainRDD_GBT.filter("DEP_DEL15 = 0")

n_delay = trainRDD_gbt_delay.count()
n_ontime = trainRDD_gbt_ontime.count()
if n_delay == 0 or n_ontime == 0:
    raise ValueError(
        f"Cannot balance classes: {n_delay} delayed and {n_ontime} on-time rows in the training data"
    )

# Fraction of on-time rows to keep so that about n_delay of them remain.
# Dividing by the total row count instead would keep too few on-time rows and
# leave the classes unbalanced. Capped at 1.0 because sampling is without replacement.
ratio = min(1.0, n_delay / n_ontime)
ontime_df = trainRDD_gbt_ontime.sample(withReplacement=False, fraction=ratio, seed=UNDERSAMPLE_SEED)

# NOTE: with near-equal class counts, a frequency-ordered label indexer may give
# either class index 1.0; downstream label indexing should use a fixed order.
trainRDD_gbt_undersample = trainRDD_gbt_delay.union(ontime_df)

In [31]:
# Build and tune the GBT pipeline on the down-sampled training data:
# index categoricals and the label, assemble the feature vector, then run a
# 3-fold cross-validated grid search over tree depth and boosting iterations.

GBT_SEED = 42  # fixed seed so fold assignment and boosting are repeatable across runs

# Index every categorical column; handleInvalid="keep" sends unseen values to an extra bucket.
inputCol_gbt = [x + "Index" for x in categoricals_gbt]
indexers_gbt = StringIndexer(inputCols=categoricals_gbt, outputCols=inputCol_gbt, handleInvalid="keep")

# Index the target. StringIndexer orders labels by descending frequency by default,
# so on down-sampled data the delayed class can become index 0.0 and every
# `prediction == 1` check downstream would then count on-time flights as delays.
# Alphabetical order pins DEP_DEL15 = 0 -> 0.0 and DEP_DEL15 = 1 -> 1.0.
label_indexers_gbt = StringIndexer(inputCol="DEP_DEL15",
                                   outputCol="label",
                                   stringOrderType="alphabetAsc")

featureCols_gbt = inputCol_gbt + numerics_gbt

# Define vector assemblers
vector_gbt = VectorAssembler(inputCols=featureCols_gbt, outputCol="features")

# Define a GBT model.
# maxBins has to be >= the largest category count among the categorical features.
gbt = GBTClassifier(featuresCol="features",
                    labelCol="label",
                    lossType="logistic",
                    maxBins=400,
                    seed=GBT_SEED)

# Chain indexers, assembler and GBT in a Pipeline
stages_gbt = [indexers_gbt, label_indexers_gbt, vector_gbt, gbt]
pipeline_gbt = Pipeline(stages=stages_gbt)

# Build the parameter grid for model tuning (3 x 3 = 9 candidate settings)
paramGrid_gbt = (ParamGridBuilder()
                 .addGrid(gbt.maxDepth, [2, 5, 10])
                 .addGrid(gbt.maxIter, [10, 20, 50])
                 .build())

# Cross validation; the evaluator's default metric is areaUnderROC.
cv_gbt = CrossValidator(estimator=pipeline_gbt,
                        estimatorParamMaps=paramGrid_gbt,
                        evaluator=BinaryClassificationEvaluator(),
                        numFolds=3,
                        seed=GBT_SEED)

# Train the tuned model and keep the best pipeline found by the search
cvModel_gbt_undersample = cv_gbt.fit(trainRDD_gbt_undersample)
cv_gbt_model_undersample = cvModel_gbt_undersample.bestModel

In [32]:
# Persist the best pipeline from tuning #2 at MODEL_GBT.
# NOTE(review): this is the same path the tuning #1 model was saved to, and
# overwrite() replaces it. After this cell, PipelineModel.load(MODEL_GBT) returns
# the down-sampled model. Consider a separate path for this model.
cv_gbt_model_undersample.write().overwrite().save(MODEL_GBT)

In [33]:
# Read the persisted pipeline back from MODEL_GBT and rebind cv_gbt_model_undersample to it.
cv_gbt_model_undersample = PipelineModel.load(MODEL_GBT)

##### Result #2

- In this round we down-sampled the training data to compensate for the imbalance between delayed (1) and on-time (0) labels. Although we got a higher area under ROC, the false positive rate (which is important for our business problem) was too high, so we didn't pick this model.

Validation dataset

Precision / Recall / F1-score

In [37]:
# Precision / Recall / F1-score
# Score the validation split with the best model from tuning #2 (down-sampled training).
validation_gbt3 = cv_gbt_model_undersample.transform(validationRDD_GBT)

# Bring labels and predictions to the driver in ONE pass. Two separate collect()
# calls run two Spark jobs with no guarantee that rows come back in the same
# order, and they hand sklearn lists of Row objects instead of flat 1-D arrays.
val_scored3 = validation_gbt3.select("label", "prediction").toPandas()
val_true3 = val_scored3["label"]
val_pred3 = val_scored3["prediction"]

# Rows of the report are keyed by the indexed label values (0.0 / 1.0).
print(classification_report(val_true3, val_pred3))

ROC / PR / Confusion matrix

In [39]:
# ROC / PR areas, class-wise rates and confusion matrix for tuning #2 on the validation split.
evaluator = BinaryClassificationEvaluator()

# Stage 1 of the fitted pipeline is the label indexer. Look up which label index it
# assigned to the delayed class (DEP_DEL15 == 1) instead of assuming it is 1.0.
# With the default frequency ordering and down-sampled training data, the delayed
# class can be index 0.0, which would silently swap the rates below.
label_names = cv_gbt_model_undersample.stages[1].labels
delay_idx = float(next(i for i, name in enumerate(label_names) if float(name) == 1.0))

# All four confusion-matrix cells in a single Spark job instead of four count() jobs.
cells = validation_gbt3.selectExpr(
    f"count(CASE WHEN DEP_DEL15 = 1 AND prediction = {delay_idx} THEN 1 END) AS tp",
    f"count(CASE WHEN DEP_DEL15 = 0 AND prediction <> {delay_idx} THEN 1 END) AS tn",
    f"count(CASE WHEN DEP_DEL15 = 0 AND prediction = {delay_idx} THEN 1 END) AS fp",
    f"count(CASE WHEN DEP_DEL15 = 1 AND prediction <> {delay_idx} THEN 1 END) AS fn",
).first()
tp, tn, fp, fn = (cells[name] for name in ("tp", "tn", "fp", "fn"))

data = {"Actual: delay": [tp, fn], "Actual: on-time": [fp, tn]}
# Named confusion_df so the sklearn `confusion_matrix` import is not shadowed.
confusion_df = pd.DataFrame.from_dict(data, orient="index", columns=["Prediction: delay", "Prediction: on-time"])

# These figures are for the validation split, so they are labelled as such.
# NOTE: both areas treat label 1.0 as the positive class, which is the delayed
# class only when delay_idx == 1.0.
print("Validation Area Under ROC: ", "{:.2f}".format(evaluator.evaluate(validation_gbt3, {evaluator.metricName: "areaUnderROC"})))
print("Validation Area Under Precision-Recall Curve: ", "{:.2f}".format(evaluator.evaluate(validation_gbt3, {evaluator.metricName: "areaUnderPR"})))

print("True positive rate: {:.2%}".format(tp/(tp+fn)))
print("True negative rate: {:.2%}".format(tn/(tn+fp)))
print("False positive rate: {:.2%}".format(fp/(tn+fp)))
print("False negative rate: {:.2%}".format(fn/(tp+fn)))

print("########### Confusion Matrix ###########")
print(confusion_df)

Hyperparameters for best model

In [41]:
# Reload the pipeline currently stored at MODEL_GBT and read the tuned settings
# from its last stage. PipelineModel is already imported at the top.
loaded_model = PipelineModel.load(MODEL_GBT)
final_stage = loaded_model.stages[-1]
param_dict = final_stage.extractParamMap()

# Re-key the param map by plain parameter name for easy lookup.
sane_dict = dict((param.name, value) for param, value in param_dict.items())

best_maxDepth = sane_dict["maxDepth"]
best_maxIter = sane_dict["maxIter"]
print("max depth for best model: ", best_maxDepth)
print("max iteration for best model: ", best_maxIter)

#### GBT Final Model Result with Test Data

Precision / Recall / F1-score

In [44]:
# Precision / Recall / F1-score
# Score the held-out test split with the selected model (best model from tuning #1).
test_gbt = cv_gbt_model.transform(testRDD_GBT)

# Bring labels and predictions to the driver in ONE pass. Two separate collect()
# calls run two Spark jobs with no guarantee that rows come back in the same
# order, and they hand sklearn lists of Row objects instead of flat 1-D arrays.
test_scored = test_gbt.select("label", "prediction").toPandas()
test_true = test_scored["label"]
test_pred = test_scored["prediction"]

# Rows of the report are keyed by the indexed label values (0.0 / 1.0).
print(classification_report(test_true, test_pred))

ROC / PR / Confusion matrix

In [46]:
# ROC / PR areas, class-wise rates and confusion matrix for the selected model on the test split.
evaluator = BinaryClassificationEvaluator()

# Stage 1 of the fitted pipeline is the label indexer. Look up which label index it
# assigned to the delayed class (DEP_DEL15 == 1) instead of assuming it is 1.0:
# the default StringIndexer ordering depends on class frequencies in the training data.
label_names = cv_gbt_model.stages[1].labels
delay_idx = float(next(i for i, name in enumerate(label_names) if float(name) == 1.0))

# All four confusion-matrix cells in a single Spark job instead of four count() jobs.
cells = test_gbt.selectExpr(
    f"count(CASE WHEN DEP_DEL15 = 1 AND prediction = {delay_idx} THEN 1 END) AS tp",
    f"count(CASE WHEN DEP_DEL15 = 0 AND prediction <> {delay_idx} THEN 1 END) AS tn",
    f"count(CASE WHEN DEP_DEL15 = 0 AND prediction = {delay_idx} THEN 1 END) AS fp",
    f"count(CASE WHEN DEP_DEL15 = 1 AND prediction <> {delay_idx} THEN 1 END) AS fn",
).first()
tp, tn, fp, fn = (cells[name] for name in ("tp", "tn", "fp", "fn"))

data = {"Actual: delay": [tp, fn], "Actual: on-time": [fp, tn]}
# Named confusion_df so the sklearn `confusion_matrix` import is not shadowed.
confusion_df = pd.DataFrame.from_dict(data, orient="index", columns=["Prediction: delay", "Prediction: on-time"])

# NOTE: both areas treat label 1.0 as the positive class.
print("Test Area Under ROC: ", "{:.2f}".format(evaluator.evaluate(test_gbt, {evaluator.metricName: "areaUnderROC"})))
print("Test Area Under Precision-Recall Curve: ", "{:.2f}".format(evaluator.evaluate(test_gbt, {evaluator.metricName: "areaUnderPR"})))

print("True positive rate: {:.2%}".format(tp/(tp+fn)))
print("True negative rate: {:.2%}".format(tn/(tn+fp)))
print("False positive rate: {:.2%}".format(fp/(tn+fp)))
print("False negative rate: {:.2%}".format(fn/(tp+fn)))

print("########### Confusion Matrix ###########")
print(confusion_df)

Hyperparameters for best model

In [48]:
# Report the tuned hyperparameters of the FINAL model, i.e. the cv_gbt_model that
# was scored on the test split above.
# Read them from the in-memory model rather than reloading MODEL_GBT: that path
# was overwritten when the down-sampled model from tuning #2 was saved, so a
# reload here would report the hyperparameters of the model that was NOT selected.
loaded_model = cv_gbt_model

# The last pipeline stage is the fitted GBT classifier.
param_dict = loaded_model.stages[-1].extractParamMap()

# Re-key the param map by plain parameter name for easy lookup.
sane_dict = {param.name: value for param, value in param_dict.items()}

best_maxDepth = sane_dict["maxDepth"]
best_maxIter = sane_dict["maxIter"]
print("max depth for best model: ", best_maxDepth)
print("max iteration for best model: ", best_maxIter)