# Flights Delay Prediction using PySpark

## Imports

### Imports and session building

In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

from pyspark.ml import Pipeline
#from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer, MinMaxScaler
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [2]:
spark = SparkSession.builder.master("local[*]").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/09/19 21:07:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Data importing

In [3]:
csv = spark.read.csv('flights.csv', inferSchema=True, header=True)
csv.show(3)

                                                                                

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 3 rows



#### converting departure delay into boolean label field to use in classification model
specifically a flight that departed late by 30 mins is marked as 1 and filghts that are departed early or late by less than 30 mins are marked as 0

In [4]:
data = csv.select("DayOfWeek", "DayOfMonth", "Carrier", "OriginAirportID", "DestAirportID", "ArrDelay", ((col("DepDelay") > 15).cast("Int").alias("label")))
data.show(3)

+---------+----------+-------+---------------+-------------+--------+-----+
|DayOfWeek|DayOfMonth|Carrier|OriginAirportID|DestAirportID|ArrDelay|label|
+---------+----------+-------+---------------+-------------+--------+-----+
|        5|        19|     DL|          11433|        13303|       1|    0|
|        5|        19|     DL|          14869|        12478|      -8|    0|
|        5|        19|     DL|          14057|        14869|     -15|    0|
+---------+----------+-------+---------------+-------------+--------+-----+
only showing top 3 rows



## Train Test split

I will use 70% of the data for training, and reserve 30% for testing. In the testing data, the label column is renamed to trueLabel so I can use it later to compare predicted labels with known actual values.

In [5]:
splits = data.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1].withColumnRenamed("label", "trueLabel")

print("Training Rows:", train.count(), "\n", "Testing Rows:", test.count())



Training Rows: 1892749 
 Testing Rows: 809469


                                                                                

In [6]:
train.show(3)

[Stage 10:>                                                         (0 + 1) / 1]

+---------+----------+-------+---------------+-------------+--------+-----+
|DayOfWeek|DayOfMonth|Carrier|OriginAirportID|DestAirportID|ArrDelay|label|
+---------+----------+-------+---------------+-------------+--------+-----+
|        1|         1|     9E|          10397|        10693|      -6|    0|
|        1|         1|     9E|          10397|        12191|     -18|    0|
|        1|         1|     9E|          10397|        12191|     -18|    0|
+---------+----------+-------+---------------+-------------+--------+-----+
only showing top 3 rows



                                                                                

In [7]:
test.show(3)

[Stage 11:>                                                         (0 + 1) / 1]

+---------+----------+-------+---------------+-------------+--------+---------+
|DayOfWeek|DayOfMonth|Carrier|OriginAirportID|DestAirportID|ArrDelay|trueLabel|
+---------+----------+-------+---------------+-------------+--------+---------+
|        1|         1|     9E|          10423|        13487|     -10|        0|
|        1|         1|     9E|          10423|        14869|     -31|        0|
|        1|         1|     9E|          10529|        11193|     -10|        0|
+---------+----------+-------+---------------+-------------+--------+---------+
only showing top 3 rows



                                                                                

## Pipepine Building

A pipeline consists of a series of transformer and estimator stages that typically prepare a DataFrame for modeling and then train a predictive model. In this case, you will create a pipeline with seven stages:

* A StringIndexer estimator that converts string values to indexes for categorical features

* A VectorAssembler that combines categorical features into a single vector

* A VectorIndexer that creates indexes for a vector of categorical features

* A VectorAssembler that creates a vector of continuous numeric features

* A MinMaxScaler that normalizes continuous numeric features

* A VectorAssembler that creates a vector of categorical and continuous features

* A 3 Classifiers that trains a classification model.

In [8]:
strIdx = StringIndexer(inputCol = "Carrier", 
                        outputCol = "CarrierIdx")

catVect = VectorAssembler(inputCols = ["CarrierIdx", "DayOfMonth", "DayOfWeek", "OriginAirportID", "DestAirportID","ArrDelay"], 
                          outputCol="catFeatures")

catIdx = VectorIndexer( inputCol = catVect.getOutputCol(), 
                        outputCol = "idxCatFeatures")

numVect = VectorAssembler(inputCols = ["ArrDelay"], 
                          outputCol="numFeatures")

minMax = MinMaxScaler(inputCol = numVect.getOutputCol(), 
                      outputCol="normFeatures")

featVect = VectorAssembler(inputCols=["idxCatFeatures", "normFeatures"], 
                            outputCol="features")

## Classification models

#### Logistic Regression , DecisionTreeClassifier , Gradient Boost classifier trains a classification model.

In [9]:
lr = LogisticRegression(labelCol="label",featuresCol="features",maxIter=10,regParam=0.3)
dtc = DecisionTreeClassifier(labelCol="label",featuresCol="features")
gbt = GBTClassifier(labelCol='label',featuresCol='features')

#### Creating 3 different pipeline to train all 3 model on train part of data.

In [10]:
pipeline_lr = Pipeline(stages=[strIdx, catVect, catIdx, numVect, minMax, featVect, lr])
pipeline_dtc = Pipeline(stages=[strIdx, catVect, catIdx, numVect, minMax, featVect, dtc])
pipeline_gbt = Pipeline(stages=[strIdx, catVect, catIdx, numVect, minMax, featVect, gbt])

#### Running the pipeline to train the model on train part of data.

In [11]:
plModel_lr = pipeline_lr.fit(train)

[Stage 20:>                                                         (0 + 4) / 4]

23/09/19 21:09:05 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/09/19 21:09:05 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


                                                                                

In [12]:
plModel_dtc = pipeline_dtc.fit(train)

                                                                                

In [13]:
plModel_gbt = pipeline_gbt.fit(train)

                                                                                

### Prediction

Transform the test data with all of the stages and the trained model in the pipeline to generate label predictions.

In [14]:
prediction_lr = plModel_lr.transform(test)
predicted_lr = prediction_lr.select("features", "prediction", "trueLabel")
# predicted_lr.show(100, truncate=False)

In [15]:
prediction_dtc = plModel_dtc.transform(test)
predicted_dtc = prediction_dtc.select("features", "prediction", "trueLabel")
# predicted_dtc.show(100, truncate=False)

In [16]:
prediction_gbt = plModel_gbt.transform(test)
predicted_gbt = prediction_gbt.select("features", "prediction", "trueLabel")
# predicted_gbt.show(100, truncate=False)

In [17]:
tp = float(predicted_lr.filter("prediction == 1.0 AND truelabel == 1").count())
fp = float(predicted_lr.filter("prediction == 1.0 AND truelabel == 0").count())
tn = float(predicted_lr.filter("prediction == 0.0 AND truelabel == 0").count())
fn = float(predicted_lr.filter("prediction == 0.0 AND truelabel == 1").count())
pr = tp / (tp + fp)
re = tp / (tp + fn)
metrics = spark.createDataFrame([ ("TP", tp),
                                  ("FP", fp),
                                  ("TN", tn),
                                  ("FN", fn),
                                  ("Precision", pr),
                                  ("Recall", re),
                                  ("F1", 2*pr*re/(re+pr))],
                                  
                                  ["metric", "value"]
                                )
metrics.show()

[Stage 276:>                                                        (0 + 1) / 1]

+---------+------------------+
|   metric|             value|
+---------+------------------+
|       TP|           34313.0|
|       FP|             629.0|
|       TN|          652598.0|
|       FN|          121929.0|
|Precision|0.9819987407704196|
|   Recall|0.2196144442595461|
|       F1|0.3589526320194159|
+---------+------------------+



                                                                                

### AUROC

In [19]:
binary_evaluator = BinaryClassificationEvaluator(labelCol="trueLabel", rawPredictionCol="rawPrediction", metricName="areaUnderROC")

auroc_lr = binary_evaluator.evaluate(prediction_lr)
auroc_dtc = binary_evaluator.evaluate(prediction_dtc)
auroc_gbt = binary_evaluator.evaluate(prediction_gbt)

print("AUR LR= ", auroc_lr)
print("AUR DTC= ", auroc_dtc)
print("AUR GBT= ", auroc_gbt)

                                                                                

AUR LR=  0.9580035733283679
AUR DTC=  0.6601904434620858
AUR GBT=  0.9609189355031346


### Acuuracy , Precision , Recall , F1 score

In [20]:
acc_evaluator = MulticlassClassificationEvaluator(labelCol="trueLabel", predictionCol="prediction", metricName="accuracy")
precision_evaluator = MulticlassClassificationEvaluator(labelCol="trueLabel", predictionCol="prediction", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="trueLabel", predictionCol="prediction", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="trueLabel", predictionCol="prediction", metricName="f1")

In [26]:
lr_acc = acc_evaluator.evaluate(prediction_lr)
lr_precision = precision_evaluator.evaluate(prediction_lr)
lr_rc = recall_evaluator.evaluate(prediction_lr)
lr_f1 = f1_evaluator.evaluate(prediction_lr)

                                                                                

In [27]:
dtc_acc = acc_evaluator.evaluate(prediction_dtc)
dtc_precision = precision_evaluator.evaluate(prediction_dtc)
dtc_rc = recall_evaluator.evaluate(prediction_dtc)
dtc_f1 = f1_evaluator.evaluate(prediction_dtc)

                                                                                

In [28]:
gbt_acc = acc_evaluator.evaluate(prediction_gbt)
gbt_precision = precision_evaluator.evaluate(prediction_gbt)
gbt_rc = recall_evaluator.evaluate(prediction_gbt)
gbt_f1 = f1_evaluator.evaluate(prediction_gbt)


                                                                                

### All model comparison

In [29]:
metrics = spark.createDataFrame([ ("Logistic Regression",lr_acc, lr_precision,lr_rc,lr_f1,auroc_lr) ,
                                  ("Decision Tree Classification",dtc_acc, dtc_precision,dtc_rc,dtc_f1,auroc_dtc) ,
                                  ("Gradient Boost Classifier",gbt_acc, gbt_precision, gbt_rc ,gbt_f1,auroc_gbt) 
                                ],

                                ["Model Name","Accuracy", "Precision","Recall","F1","AUROC"]
                              )
metrics.show()

+--------------------+------------------+------------------+------------------+------------------+------------------+
|          Model Name|          Accuracy|         Precision|            Recall|                F1|             AUROC|
+--------------------+------------------+------------------+------------------+------------------+------------------+
| Logistic Regression|0.8485945724913493|0.8694872277664414|0.8485945724913493|0.8069952722062542|0.9580035733283679|
|Decision Tree Cla...|0.9217647618376985|0.9192679801489796|0.9217647618376985|0.9192237385001706|0.6601904434620858|
|Gradient Boost Cl...|0.9232472151496846|0.9211126517148436|0.9232472151496846|  0.92158120386093|0.9609189355031346|
+--------------------+------------------+------------------+------------------+------------------+------------------+

