# Flights Delay Prediction using PySpark

## Imports

### Imports and session building

In [60]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

from pyspark.ml import Pipeline
#from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer, MinMaxScaler
from pyspark.ml.regression import LinearRegression , RandomForestRegressor , DecisionTreeRegressor, GBTRegressor
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator , RegressionEvaluator

In [2]:
# Start (or reuse) a Spark session running in local mode with the "local[*]" master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/09/29 19:54:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Data importing

In [3]:
# Load the flight records; the first row holds column names and column
# types are inferred from the data.
csv = spark.read.csv(
    "flights.csv",
    header=True,
    inferSchema=True,
)
csv.show(3)

                                                                                

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 3 rows



#### converting departure delay into boolean label field to use in classification model
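Specifically, a flight that departed 30 or more minutes late is marked as 1, and flights that departed early or less than 30 minutes late are marked as 0. (Note: the cell below only selects and reorders the columns; no boolean label is created there, and the models later in this notebook use `DepDelay` directly as a regression target.)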

In [4]:
# Keep the modelling columns, with the two delay columns placed last.
data = csv.select(
    [
        "DayOfWeek",
        "DayOfMonth",
        "Carrier",
        "OriginAirportID",
        "DestAirportID",
        "ArrDelay",
        "DepDelay",
    ]
)
data.show(3)

+---------+----------+-------+---------------+-------------+--------+--------+
|DayOfWeek|DayOfMonth|Carrier|OriginAirportID|DestAirportID|ArrDelay|DepDelay|
+---------+----------+-------+---------------+-------------+--------+--------+
|        5|        19|     DL|          11433|        13303|       1|      -3|
|        5|        19|     DL|          14869|        12478|      -8|       0|
|        5|        19|     DL|          14057|        14869|     -15|      -4|
+---------+----------+-------+---------------+-------------+--------+--------+
only showing top 3 rows



## Train Test split

I will use 70% of the data for training and reserve 30% for testing. In the testing data, the `DepDelay` label column is renamed to `trueLabel` so that predicted values can later be compared with the known actual values.

In [24]:
# Split 70/30 into training and testing sets. A fixed seed keeps the split
# (and therefore every downstream metric) reproducible across re-runs.
splits = data.randomSplit([0.7, 0.3], seed=42)
train = splits[0]

# In the held-out set the regression target is exposed as "trueLabel" so that
# the evaluators can compare it against the "prediction" column. Previously
# this assignment was commented out, which left `test` undefined on a fresh
# kernel and made the print below (and every later cell using `test`) fail.
test = splits[1].withColumnRenamed("DepDelay", "trueLabel")

print("Training Rows:", train.count(), "\n", "Testing Rows:", test.count())



Training Rows: 1891799 
 Testing Rows: 811823


                                                                                

In [25]:
# Preview the first three training rows (the target column is still "DepDelay").
train.show(3)

[Stage 42:>                                                         (0 + 1) / 1]

+---------+----------+-------+---------------+-------------+--------+--------+
|DayOfWeek|DayOfMonth|Carrier|OriginAirportID|DestAirportID|ArrDelay|DepDelay|
+---------+----------+-------+---------------+-------------+--------+--------+
|        1|         1|     9E|          10397|        12191|     -18|      -2|
|        1|         1|     9E|          10397|        12264|     -25|      -3|
|        1|         1|     9E|          10397|        12264|       6|      -5|
+---------+----------+-------+---------------+-------------+--------+--------+
only showing top 3 rows



                                                                                

In [26]:
# Preview the first three rows of the held-out test set.
test.show(3)

[Stage 43:>                                                         (0 + 1) / 1]

+---------+----------+-------+---------------+-------------+--------+---------+
|DayOfWeek|DayOfMonth|Carrier|OriginAirportID|DestAirportID|ArrDelay|trueLabel|
+---------+----------+-------+---------------+-------------+--------+---------+
|        1|         1|     9E|          10397|        12191|     -18|       -3|
|        1|         1|     9E|          10397|        13851|      -7|       -2|
|        1|         1|     9E|          10397|        13851|       2|        2|
+---------+----------+-------+---------------+-------------+--------+---------+
only showing top 3 rows



                                                                                

## Pipeline and Model Building

A pipeline consists of a series of transformer and estimator stages that typically prepare a DataFrame for modeling and then train a predictive model. In this case, you will create a pipeline with seven stages:

### Pipeline Items

* A StringIndexer estimator that converts string values to indexes for categorical features

* A VectorAssembler that combines categorical features into a single vector

* A VectorIndexer that creates indexes for a vector of categorical features

* A VectorAssembler that creates a vector of continuous numeric features

* A MinMaxScaler that normalizes continuous numeric features

* A VectorAssembler that creates a vector of categorical and continuous features

* A regressor (one of four: linear regression, random forest, decision tree, or gradient-boosted trees) that trains a regression model.

In [27]:
# Map the string carrier code to a numeric index column.
strIdx = StringIndexer(inputCol="Carrier", outputCol="CarrierIdx")

# Assemble the indexed carrier, calendar fields, airport IDs and ArrDelay
# into a single vector column.
# NOTE(review): ArrDelay is listed here and again in numVect below, and the
# regressors defined later read "catFeatures" directly rather than the final
# "features" column - confirm which feature vector is actually intended.
catVect = VectorAssembler(
    inputCols=[
        "CarrierIdx",
        "DayOfMonth",
        "DayOfWeek",
        "OriginAirportID",
        "DestAirportID",
        "ArrDelay",
    ],
    outputCol="catFeatures",
)

# Index the assembled vector produced by catVect.
catIdx = VectorIndexer(inputCol="catFeatures", outputCol="idxCatFeatures")

# Put the continuous ArrDelay value into its own vector so it can be scaled.
numVect = VectorAssembler(inputCols=["ArrDelay"], outputCol="numFeatures")

# Rescale the numeric vector with min-max normalisation.
minMax = MinMaxScaler(inputCol="numFeatures", outputCol="normFeatures")

# Concatenate the indexed and the normalised vectors into "features".
featVect = VectorAssembler(
    inputCols=["idxCatFeatures", "normFeatures"],
    outputCol="features",
)

### Using 4 Regression Models

In [61]:
# Four regressors that all predict DepDelay from the "catFeatures" vector.
lr = LinearRegression(
    labelCol="DepDelay",
    featuresCol="catFeatures",
)
rf = RandomForestRegressor(
    labelCol="DepDelay",
    featuresCol="catFeatures",
    numTrees=15,
)
dt = DecisionTreeRegressor(
    labelCol="DepDelay",
    featuresCol="catFeatures",
    maxDepth=5,
)
gbt = GBTRegressor(
    labelCol="DepDelay",
    featuresCol="catFeatures",
    maxIter=10,
)

In [62]:
# Every pipeline shares the same six preprocessing stages and differs only in
# the final estimator.
feature_stages = [strIdx, catVect, catIdx, numVect, minMax, featVect]

pipeline_lr = Pipeline(stages=feature_stages + [lr])
pipeline_rf = Pipeline(stages=feature_stages + [rf])
pipeline_dt = Pipeline(stages=feature_stages + [dt])
pipeline_gbt = Pipeline(stages=feature_stages + [gbt])

### Fitting the Model on Train Data

In [63]:
# Fit the preprocessing stages and the linear regression on the training set.
plModel_lr = pipeline_lr.fit(train)

                                                                                

23/09/29 20:33:09 WARN Instrumentation: [87ce971a] regParam is zero, which might cause numerical instability and overfitting.


                                                                                

In [64]:
# Fit the random forest pipeline on the training set.
# NOTE: the fitted model is bound to `plModel_rd` (not `plModel_rf`); the
# prediction cell refers to it by this exact name.
plModel_rd = pipeline_rf.fit(train)

[Stage 127:>                                                        (0 + 4) / 4]

23/09/29 20:34:04 WARN MemoryStore: Not enough space to cache rdd_507_1 in memory! (computed 67.3 MiB so far)
23/09/29 20:34:04 WARN BlockManager: Persisting block rdd_507_1 to disk instead.


                                                                                

In [65]:
# Fit the decision tree pipeline on the training set.
plModel_dt = pipeline_dt.fit(train)

                                                                                

In [66]:
# Fit the gradient-boosted trees pipeline on the training set.
plModel_gbt = pipeline_gbt.fit(train)

                                                                                

## Making Predictions

### Making prediction on test

In [68]:
# Score the held-out test set with each fitted pipeline, in the order
# linear regression, random forest, decision tree, gradient-boosted trees.
prediction_lr, prediction_rf, prediction_dt, prediction_gbt = (
    fitted_pipeline.transform(test)
    for fitted_pipeline in (plModel_lr, plModel_rd, plModel_dt, plModel_gbt)
)


### Calculating Evaluation Metric

In [75]:
# One RegressionEvaluator per metric; each compares the "prediction" column
# with the "trueLabel" column of a scored DataFrame.
evaluator_mae, evaluator_mse, evaluator_r2 = (
    RegressionEvaluator(
        labelCol="trueLabel",
        predictionCol="prediction",
        metricName=metric_name,
    )
    for metric_name in ("mae", "mse", "r2")
)

# Metrics for the linear regression model.
mae_lr, mse_lr, r2_lr = (
    evaluator.evaluate(prediction_lr)
    for evaluator in (evaluator_mae, evaluator_mse, evaluator_r2)
)

                                                                                

In [69]:
# Metrics for the random forest model (MAE, MSE, R2 - evaluated in that order).
mae_rf, mse_rf, r2_rf = (
    evaluator.evaluate(prediction_rf)
    for evaluator in (evaluator_mae, evaluator_mse, evaluator_r2)
)

                                                                                

In [70]:
# Metrics for the decision tree model (MAE, MSE, R2 - evaluated in that order).
mae_dt, mse_dt, r2_dt = (
    evaluator.evaluate(prediction_dt)
    for evaluator in (evaluator_mae, evaluator_mse, evaluator_r2)
)

                                                                                

In [73]:
# Metrics for the gradient-boosted trees model (MAE, MSE, R2 - evaluated in that order).
mae_gbt, mse_gbt, r2_gbt = (
    evaluator.evaluate(prediction_gbt)
    for evaluator in (evaluator_mae, evaluator_mse, evaluator_r2)
)

                                                                                

#### Evaluation Metric

In [76]:
# Collect the three metrics of every model into one comparison table.
metrics = spark.createDataFrame(
    [
        ("Linear Regression", mae_lr, mse_lr, r2_lr),
        ("Random Forest", mae_rf, mse_rf, r2_rf),
        ("Decision Tree Regression", mae_dt, mse_dt, r2_dt),
        ("Gradient Boost Regressor", mae_gbt, mse_gbt, r2_gbt),
    ],
    schema=["Model Name", "MAE", "MSE", "R2"],
)
metrics.show()

[Stage 281:>                                                        (0 + 1) / 1]

+--------------------+-----------------+------------------+------------------+
|          Model Name|              MAE|               MSE|                R2|
+--------------------+-----------------+------------------+------------------+
|   Linear Regression|8.364091696765529|150.55199715628916|0.8831821518290619|
|       Random Forest|9.837720626903483| 421.2451051510495|0.6731431819851453|
|Decision Tree Reg...|7.639199489024753| 327.1529296471188|0.7461521468591279|
|Gradient Boost Re...|7.473643727135636| 320.1897720435061|0.7515550714505879|
+--------------------+-----------------+------------------+------------------+



                                                                                

### Showing prediction of each model 

In [None]:
# Disabled preview cell: takes the first 10 (trueLabel, prediction) pairs per
# model, renaming "prediction" after the model that produced it. Each output
# is built from the prediction frame of the matching model.
# lr_output = prediction_lr.select("trueLabel", "prediction").withColumnRenamed("prediction", "lr_prediction").limit(10)
# dt_output = prediction_dt.select("trueLabel", "prediction").withColumnRenamed("prediction", "dt_prediction").limit(10)
# rf_output = prediction_rf.select("trueLabel", "prediction").withColumnRenamed("prediction", "rf_prediction").limit(10)
# gbt_output = prediction_gbt.select("trueLabel", "prediction").withColumnRenamed("prediction", "gbt_prediction").limit(10)

23/09/29 21:05:51 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.rpc.RpcTimeoutException: Futures timed out after [10000 milliseconds]. This timeout is controlled by spark.executor.heartbeatInterval
	at org.apache.spark.rpc.RpcTimeout.org$apache$spark$rpc$RpcTimeout$$createRpcTimeoutException(RpcTimeout.scala:47)
	at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcTimeout.scala:62)
	at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcTimeout.scala:58)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:76)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:103)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1053)
	at org.apache.spark.executor.Executor.$anonfun$heartbeater$1(Executor.scala:238)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java

In [None]:
# NOTE(review): this disabled cell joins on "trueLabel", i.e. on the label
# *value*, not on a row identifier. Rows that share a label value would be
# matched with each other, so the result is not a per-flight comparison;
# a unique row key would be needed before enabling it.
# # Join the predictions from all four models based on label
# combined_predictions = lr_output \
#     .join(dt_output, on="trueLabel", how="inner") \
#     .join(rf_output, on="trueLabel", how="inner") \
#     .join(gbt_output, on="trueLabel", how="inner")

# # Show the combined predictions
# combined_predictions.show(5)

## Conclusion