In [1]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator, MulticlassClassificationEvaluator

from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor, GeneralizedLinearRegression, IsotonicRegression
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, LinearSVC, NaiveBayes



In [2]:
spark = SparkSession.builder.appName("AirlineStatusModel_xiomara_1").config("spark.jars",\
        "path/to/your/jars").config("spark.driver.extraJavaOptions", "--illegal-access=warn").getOrCreate() 
spark.sparkContext

23/11/18 19:09:14 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
df = spark.read.csv("gs://msca-bdp-student-gcs/Group7_Final_Project/flight_status/airline_status_preprocessed.csv", 
                                     header = True, inferSchema = True)

                                                                                

In [4]:
df = df.withColumn("departure_delay_15", col("departure_delay_15").cast("double"))

In [5]:
df.printSchema()

root
 |-- flight_date: timestamp (nullable = true)
 |-- airline: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)
 |-- cancelled: boolean (nullable = true)
 |-- diverted: boolean (nullable = true)
 |-- crs_departure_time: integer (nullable = true)
 |-- departure_time: integer (nullable = true)
 |-- departure_delay_mins: double (nullable = true)
 |-- departure_delay: double (nullable = true)
 |-- arrival_time: integer (nullable = true)
 |-- arrival_delay_mins: double (nullable = true)
 |-- air_time: double (nullable = true)
 |-- crs_elapsed_time: double (nullable = true)
 |-- actual_elapsed_time: double (nullable = true)
 |-- distance: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day_of_month: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- marketing_airline_network: string (nullable = true)
 |-- flight_nu

In [34]:
df.select(['flight_date', 'origin', 'destination', 'airline', 'departure_time_formatted', 'arrival_time','departure_delay','departure_delay_mins']).show()

+-------------------+------+-----------+-----------------+--------------+------------+---------------+--------------------+
|        flight_date|origin|destination|          airline|departure_time|arrival_time|departure_delay|departure_delay_mins|
+-------------------+------+-----------+-----------------+--------------+------------+---------------+--------------------+
|2018-01-23 00:00:00|   ABY|        ATL|Endeavor Air Inc.|          1157|        1256|           -5.0|                 0.0|
|2018-01-24 00:00:00|   ABY|        ATL|Endeavor Air Inc.|          1157|        1258|           -5.0|                 0.0|
|2018-01-25 00:00:00|   ABY|        ATL|Endeavor Air Inc.|          1153|        1302|           -9.0|                 0.0|
|2018-01-26 00:00:00|   ABY|        ATL|Endeavor Air Inc.|          1150|        1253|          -12.0|                 0.0|
|2018-01-27 00:00:00|   ABY|        ATL|Endeavor Air Inc.|          1355|        1459|           -5.0|                 0.0|
|2018-01

In [6]:
# StringIndexer: Convert origin, destination, airlines, and day of week cols to numerical indices
indexer_departure = StringIndexer(inputCol="origin", outputCol="departure_index")
indexer_arrival = StringIndexer(inputCol="destination", outputCol="arrival_index")
indexer_airline = StringIndexer(inputCol="airline", outputCol="airline_index")
indexer_day_of_week = StringIndexer(inputCol="day_of_week", outputCol="day_of_week_index")

# OneHotEncoder: Convert numerical indices to sparse vectors
encoder_departure = OneHotEncoder(inputCol="departure_index", outputCol="departure_onehot")
encoder_arrival = OneHotEncoder(inputCol="arrival_index", outputCol="arrival_onehot")
encoder_airline = OneHotEncoder(inputCol="airline_index", outputCol="airline_onehot")
encoder_day_of_week = OneHotEncoder(inputCol="day_of_week_index", outputCol="day_of_week_onehot")


In [7]:
# Assemble feature vector
df = df.na.drop(subset=["origin", "destination", "airline", "day_of_week", \
                        "taxi_out", "crs_elapsed_time", "distance", "departure_delay"])

selected_features = ["departure_onehot", "arrival_onehot", "airline_onehot","day_of_week_onehot",\
                     "taxi_out","crs_elapsed_time", "distance"]

assembler = VectorAssembler(
    inputCols= selected_features,
    outputCol="features"
)

# Create a pipeline
pipeline = Pipeline(stages=[indexer_departure, indexer_arrival, indexer_airline, indexer_day_of_week,
                            encoder_departure, encoder_arrival, encoder_airline, encoder_day_of_week,
                            assembler])


# Fit and transform the data
model = pipeline.fit(df)
transformed_df = model.transform(df)



                                                                                

In [8]:
# Display the result
transformed_df.select("origin", "destination", "airline", "day_of_week", "features").show(truncate=False)

+------+-----------+-----------------+-----------+---------------------------------------------------------------------+
|origin|destination|airline          |day_of_week|features                                                             |
+------+-----------+-----------------+-----------+---------------------------------------------------------------------+
|ABY   |ATL        |Endeavor Air Inc.|2          |(810,[273,387,782,806,807,808,809],[1.0,1.0,1.0,1.0,14.0,62.0,145.0])|
|ABY   |ATL        |Endeavor Air Inc.|3          |(810,[273,387,782,805,807,808,809],[1.0,1.0,1.0,1.0,13.0,62.0,145.0])|
|ABY   |ATL        |Endeavor Air Inc.|4          |(810,[273,387,782,803,807,808,809],[1.0,1.0,1.0,1.0,18.0,62.0,145.0])|
|ABY   |ATL        |Endeavor Air Inc.|5          |(810,[273,387,782,801,807,808,809],[1.0,1.0,1.0,1.0,17.0,62.0,145.0])|
|ABY   |ATL        |Endeavor Air Inc.|6          |(810,[273,387,782,807,808,809],[1.0,1.0,1.0,17.0,60.0,145.0])        |
|ABY   |ATL        |Endeavor Air

In [9]:
#sampling data
sampled_df = transformed_df.sample(fraction=0.1, seed=42) 

train_df, test_df = sampled_df.randomSplit([0.8, 0.2], seed=42)


In [10]:
train_df.count()

23/11/18 19:13:14 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

1954879

### Started with a logistic regression w/binary departure_delay_15_column - not very useful

In [11]:
lr = LogisticRegression(featuresCol="features", labelCol="departure_delay_15", maxIter=10)
lr_model = lr.fit(train_df)

                                                                                

In [13]:
predictions_lr = lr_model.transform(test_df)

# Select the columns of interest
selected_columns = ["features", "departure_delay_15", "prediction", "probability"]
result = predictions_lr.select(selected_columns)

# Show the results
# result.show()

In [14]:
# Decision Tree
dt = DecisionTreeClassifier(featuresCol='features', labelCol='departure_delay_15', maxDepth=5)
dt_model = dt.fit(train_df)
predictions_dt = dt_model.transform(test_df)


                                                                                

In [15]:
# Random Forest
rf = RandomForestClassifier(featuresCol='features', labelCol='departure_delay_15', numTrees=10)
rf_model = rf.fit(train_df)
predictions_rf = rf_model.transform(test_df)

                                                                                

In [16]:
# GBT
gbt = GBTClassifier(featuresCol='features', labelCol='departure_delay_15', maxIter=10)
gbt_model = gbt.fit(train_df)
predictions_gbt = gbt_model.transform(test_df)

                                                                                

In [17]:
# Naive Bayes -- requires non-negative features 
# nb = NaiveBayes(featuresCol='features', labelCol='departure_delay_15')
# nb_model = nb.fit(train_df)
# predictions_nb = nb_model.transform(test_df)

In [18]:
# Assuming your label column is named 'departure_delay_15'
binary_evaluator = BinaryClassificationEvaluator(labelCol='departure_delay_15')
multiclass_evaluator = MulticlassClassificationEvaluator(labelCol='departure_delay_15', metricName='f1')


models_classification = {
    "Logistic Regression": predictions_lr,
    "Decision Trees": predictions_dt,
    "Random Forest": predictions_rf,
    "GBT": predictions_gbt
#     "Naive Bayes": predictions_nb
}

for model_name, predictions in models_classification.items():
    auc = binary_evaluator.evaluate(predictions, {binary_evaluator.metricName: "areaUnderROC"})
    accuracy = multiclass_evaluator.evaluate(predictions)
    f1_score = multiclass_evaluator.evaluate(predictions, {multiclass_evaluator.metricName: "f1"})
    
    print(f"{model_name} - AUC: {auc:.4f}, Accuracy: {accuracy:.4f}, F1 Score: {f1_score:.4f}")

                                                                                

Logistic Regression - AUC: 0.6084, Accuracy: 0.7610, F1 Score: 0.7610


                                                                                

Decision Trees - AUC: 0.4562, Accuracy: 0.7602, F1 Score: 0.7602


                                                                                

Random Forest - AUC: 0.5465, Accuracy: 0.7601, F1 Score: 0.7601




GBT - AUC: 0.5984, Accuracy: 0.7603, F1 Score: 0.7603


                                                                                

In [19]:
# Create a BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="departure_delay_15")

# Evaluate the model on test data
accuracy = evaluator.evaluate(predictions)
f1_score = evaluator.evaluate(predictions)

# Print the accuracy
print("Accuracy: {:.2%}".format(accuracy))
print("F1 Score: {:.2%}".format(f1_score))

                                                                                

Accuracy: 59.84%
F1 Score: 59.84%


### Switch to column departure_delay as label and changed in linear regression

In [35]:
# Create a Linear Regression model
lr = LinearRegression(featuresCol='features', labelCol='departure_delay', regParam=0.3, elasticNetParam=0.8, \
                      maxIter=10)
lr_model = lr.fit(train_df)

# Make predictions on the test set
predictions_lr = lr_model.transform(test_df)

# # Evaluate the regression model
# regression_evaluator = RegressionEvaluator(labelCol="departure_delay_mins", metricName="rmse")
# rmse = regression_evaluator.evaluate(predictions_lr)

# print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

                                                                                

In [36]:
# Decision Tree Regression
dt_reg = DecisionTreeRegressor(featuresCol='features', labelCol='departure_delay', maxDepth=5)
dt_model_reg = dt_reg.fit(train_df)
predictions_dt_reg = dt_model_reg.transform(test_df)

                                                                                

In [37]:
# Random Forest Regression
rf_reg = RandomForestRegressor(featuresCol='features', labelCol='departure_delay', numTrees=10)
rf_model_reg = rf_reg.fit(train_df)
predictions_rf_reg = rf_model_reg.transform(test_df)

                                                                                

In [38]:
# GBT Regression
gbt_reg = GBTRegressor(featuresCol='features', labelCol='departure_delay', maxIter=10)
gbt_model_reg = gbt_reg.fit(train_df)
predictions_gbt_reg = gbt_model_reg.transform(test_df)

                                                                                

In [39]:
# Generalized Linear Regression
glr = GeneralizedLinearRegression(featuresCol='features', labelCol='departure_delay', family='gaussian', link='identity', maxIter=10)
glr_model = glr.fit(train_df)
predictions_glr = glr_model.transform(test_df)

23/11/19 00:19:43 WARN Instrumentation: [76fd41ce] regParam is zero, which might cause numerical instability and overfitting.
23/11/19 00:21:29 WARN Instrumentation: [76fd41ce] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.


In [None]:
# Isotonic Regression
iso_reg = IsotonicRegression(featuresCol='features', labelCol='departure_delay')
iso_model = iso_reg.fit(train_df)
predictions_iso_reg = iso_model.transform(test_df)



In [42]:
# Regression evaluations
regression_evaluator_rmse = RegressionEvaluator(labelCol="departure_delay", predictionCol="prediction", metricName="rmse")
regression_evaluator_mae = RegressionEvaluator(labelCol="departure_delay", predictionCol="prediction", metricName="mae")
regression_evaluator_r2 = RegressionEvaluator(labelCol="departure_delay", predictionCol="prediction", metricName="r2")

# Regression metrics
models_regression = {
    "Linear Regression": predictions_lr,
    "Decision Trees": predictions_dt_reg,
    "Random Forest": predictions_rf_reg,
    "GBT": predictions_gbt_reg,
    "Generalized Linear Regression": predictions_glr,
    "Isotonic Regression": predictions_iso_reg
}

for model_name, predictions in models_regression.items():
    rmse = regression_evaluator_rmse.evaluate(predictions)
    mae = regression_evaluator_mae.evaluate(predictions)
    r2 = regression_evaluator_r2.evaluate(predictions)
    
    print(f"{model_name} - RMSE: {rmse:.4f}, MAE: {mae:.4f}, R-squared: {r2:.4f}")

                                                                                

Linear Regression - RMSE: 46.8996, MAE: 19.9823, R-squared: 0.0061


                                                                                

Decision Trees - RMSE: 47.0317, MAE: 20.0168, R-squared: 0.0005


                                                                                

Random Forest - RMSE: 46.9388, MAE: 20.0193, R-squared: 0.0045


                                                                                

GBT - RMSE: 46.9454, MAE: 19.9637, R-squared: 0.0042


                                                                                

Generalized Linear Regression - RMSE: 46.8564, MAE: 19.8955, R-squared: 0.0079




Isotonic Regression - RMSE: 80.3208, MAE: 68.1365, R-squared: -1.9151


                                                                                

### R_2 is very low so going to take aways airlines and see if it helps...

In [43]:
selected_features = ["departure_onehot", "arrival_onehot","day_of_week_onehot",\
                     "taxi_out","crs_elapsed_time", "distance"]

assembler = VectorAssembler(
    inputCols= selected_features,
    outputCol="features"
)

# Create a pipeline
pipeline = Pipeline(stages=[indexer_departure, indexer_arrival, indexer_day_of_week,
                            encoder_departure, encoder_arrival, encoder_day_of_week,
                            assembler])


# Fit and transform the data
model = pipeline.fit(df)
transformed_df = model.transform(df)

                                                                                

In [44]:
#sampling data
sampled_df = transformed_df.sample(fraction=0.1, seed=42) 

train_df, test_df = sampled_df.randomSplit([0.8, 0.2], seed=42)

In [45]:
# Create a Linear Regression model
lr = LinearRegression(featuresCol='features', labelCol='departure_delay', regParam=0.3, elasticNetParam=0.8, maxIter=10)
lr_model = lr.fit(train_df)

# Make predictions on the test set
predictions_lr = lr_model.transform(test_df)

# Decision Tree Regression
dt_reg = DecisionTreeRegressor(featuresCol='features', labelCol='departure_delay', maxDepth=5)
dt_model_reg = dt_reg.fit(train_df)
predictions_dt_reg = dt_model_reg.transform(test_df)

# Random Forest Regression
rf_reg = RandomForestRegressor(featuresCol='features', labelCol='departure_delay', numTrees=10)
rf_model_reg = rf_reg.fit(train_df)
predictions_rf_reg = rf_model_reg.transform(test_df)

# GBT Regression
gbt_reg = GBTRegressor(featuresCol='features', labelCol='departure_delay', maxIter=10)
gbt_model_reg = gbt_reg.fit(train_df)
predictions_gbt_reg = gbt_model_reg.transform(test_df)

# Generalized Linear Regression
glr = GeneralizedLinearRegression(featuresCol='features', labelCol='departure_delay', family='gaussian', link='identity', maxIter=10)
glr_model = glr.fit(train_df)
predictions_glr = glr_model.transform(test_df)

# Isotonic Regression
iso_reg = IsotonicRegression(featuresCol='features', labelCol='departure_delay')
iso_model = iso_reg.fit(train_df)
predictions_iso_reg = iso_model.transform(test_df)



23/11/19 03:13:48 WARN Instrumentation: [f0dedb90] regParam is zero, which might cause numerical instability and overfitting.
23/11/19 03:15:39 WARN Instrumentation: [f0dedb90] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
                                                                                

In [46]:
# Regression evaluations
regression_evaluator_rmse = RegressionEvaluator(labelCol="departure_delay", predictionCol="prediction", metricName="rmse")
regression_evaluator_mae = RegressionEvaluator(labelCol="departure_delay", predictionCol="prediction", metricName="mae")
regression_evaluator_r2 = RegressionEvaluator(labelCol="departure_delay", predictionCol="prediction", metricName="r2")

# Regression metrics
models_regression = {
    "Linear Regression": predictions_lr,
    "Decision Trees": predictions_dt_reg,
    "Random Forest": predictions_rf_reg,
    "GBT": predictions_gbt_reg,
    "Generalized Linear Regression": predictions_glr,
    "Isotonic Regression": predictions_iso_reg
}

for model_name, predictions in models_regression.items():
    rmse = regression_evaluator_rmse.evaluate(predictions)
    mae = regression_evaluator_mae.evaluate(predictions)
    r2 = regression_evaluator_r2.evaluate(predictions)
    
    print(f"{model_name} - RMSE: {rmse:.4f}, MAE: {mae:.4f}, R-squared: {r2:.4f}")

                                                                                

Linear Regression - RMSE: 46.8996, MAE: 19.9823, R-squared: 0.0061


                                                                                

Decision Trees - RMSE: 47.0317, MAE: 20.0168, R-squared: 0.0005


                                                                                

Random Forest - RMSE: 46.9388, MAE: 20.0193, R-squared: 0.0045


                                                                                

GBT - RMSE: 46.9454, MAE: 19.9637, R-squared: 0.0042


                                                                                

Generalized Linear Regression - RMSE: 46.8564, MAE: 19.8955, R-squared: 0.0079




Isotonic Regression - RMSE: 80.3208, MAE: 68.1365, R-squared: -1.9151


                                                                                

### arrival_delay_mins