In [1]:
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor, DecisionTreeRegressor, \
                                  RandomForestRegressor, LinearRegression, \
                                  GeneralizedLinearRegression

In [2]:
spark = SparkSession.builder.appName("FlightPriceRegression").getOrCreate() 
spark.conf.set("spark.sql.legacy.timeParserPolicy", "CORRECTED")
spark.sparkContext

In [3]:
price_df = spark.read.parquet("gs://msca-bdp-student-gcs/Group7_Final_Project/flight_pricing/preprocessed_prices.parquet", 
                                     header=True, inferSchema=True)

                                                                                

In [4]:
price_df.printSchema()

root
 |-- flight_date: date (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)
 |-- fare_basis_code: string (nullable = true)
 |-- travel_duration: string (nullable = true)
 |-- elapsed_days: integer (nullable = true)
 |-- is_basic_economy: boolean (nullable = true)
 |-- is_refundable: boolean (nullable = true)
 |-- is_nonstop: boolean (nullable = true)
 |-- base_fare: double (nullable = true)
 |-- total_fare: double (nullable = true)
 |-- seats_remaining: integer (nullable = true)
 |-- total_distance: integer (nullable = true)
 |-- departure_time: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- airline_name: string (nullable = true)
 |-- airline_code: string (nullable = true)
 |-- equipment_description: string (nullable = true)
 |-- distance: string (nullable = true)
 |-- cabin_code: string (nullable = true)
 |-- flight_year: integer (nullable = true)
 |-- flight_month: integer (nullable = true)
 |-- flight_day:

In [5]:
price_df = price_df.withColumn("is_basic_economy", col("is_basic_economy").cast("int"))
price_df = price_df.withColumn("is_refundable", col("is_refundable").cast("int"))
price_df = price_df.withColumn("is_nonstop", col("is_nonstop").cast("int"))

price_df = price_df.select('flight_month',
                           'flight_day',
                           'is_basic_economy', 
                           'is_nonstop',
                           'is_refundable',
                           'elapsed_days', 
                           'total_distance', 
                           'travel_duration_minutes',
                           'num_stops',
                           'day_of_week_index',
                           'final_arrival_hour',
                           'final_arrival_minute',
                           'initial_departure_hour',
                           'initial_departure_minute',
                           'total_fare')

In [6]:
price_df.printSchema()

root
 |-- flight_month: integer (nullable = true)
 |-- flight_day: integer (nullable = true)
 |-- is_basic_economy: integer (nullable = true)
 |-- is_nonstop: integer (nullable = true)
 |-- is_refundable: integer (nullable = true)
 |-- elapsed_days: integer (nullable = true)
 |-- total_distance: integer (nullable = true)
 |-- travel_duration_minutes: integer (nullable = true)
 |-- num_stops: integer (nullable = true)
 |-- day_of_week_index: integer (nullable = true)
 |-- final_arrival_hour: integer (nullable = true)
 |-- final_arrival_minute: integer (nullable = true)
 |-- initial_departure_hour: integer (nullable = true)
 |-- initial_departure_minute: integer (nullable = true)
 |-- total_fare: double (nullable = true)



In [7]:
missing_counts = price_df.select([sum(col(c).isNull().cast("int")).alias(c) for c in price_df.columns])
missing_counts.show()



+------------+----------+----------------+----------+-------------+------------+--------------+-----------------------+---------+-----------------+------------------+--------------------+----------------------+------------------------+----------+
|flight_month|flight_day|is_basic_economy|is_nonstop|is_refundable|elapsed_days|total_distance|travel_duration_minutes|num_stops|day_of_week_index|final_arrival_hour|final_arrival_minute|initial_departure_hour|initial_departure_minute|total_fare|
+------------+----------+----------------+----------+-------------+------------+--------------+-----------------------+---------+-----------------+------------------+--------------------+----------------------+------------------------+----------+
|           0|         0|               0|         0|            0|           0|       6094532|                      0|        0|                0|                 0|                   0|                     0|                       0|         0|
+-----------

                                                                                

In [8]:
price_df_cleaned = price_df.na.drop()

missing_counts = price_df_cleaned.select([sum(col(c).isNull().cast("int")).alias(c) for c in price_df_cleaned.columns])
missing_counts.show()



+------------+----------+----------------+----------+-------------+------------+--------------+-----------------------+---------+-----------------+------------------+--------------------+----------------------+------------------------+----------+
|flight_month|flight_day|is_basic_economy|is_nonstop|is_refundable|elapsed_days|total_distance|travel_duration_minutes|num_stops|day_of_week_index|final_arrival_hour|final_arrival_minute|initial_departure_hour|initial_departure_minute|total_fare|
+------------+----------+----------------+----------+-------------+------------+--------------+-----------------------+---------+-----------------+------------------+--------------------+----------------------+------------------------+----------+
|           0|         0|               0|         0|            0|           0|             0|                      0|        0|                0|                 0|                   0|                     0|                       0|         0|
+-----------

                                                                                

In [9]:
feature_columns = ['flight_month',
                   'flight_day',
                   'is_basic_economy', 
                   'is_nonstop',
                   'is_refundable',
                   'elapsed_days', 
                   'total_distance', 
                   'travel_duration_minutes',
                   'num_stops',
                   'day_of_week_index',
                   'final_arrival_hour',
                   'final_arrival_minute',
                   'initial_departure_hour',
                   'initial_departure_minute']

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
vprice_df = assembler.transform(price_df_cleaned)
vprice_df = vprice_df.select(['features', 'total_fare'])
vprice_df.show(5)

23/11/19 21:35:12 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 7:>                                                          (0 + 1) / 1]

+--------------------+----------+
|            features|total_fare|
+--------------------+----------+
|(14,[0,1,3,6,7,10...|     415.6|
|[10.0,16.0,0.0,1....|     415.6|
|[10.0,16.0,0.0,0....|    419.01|
|[10.0,16.0,0.0,1....|     428.6|
|(14,[0,1,6,7,8,10...|     437.6|
+--------------------+----------+
only showing top 5 rows



                                                                                

In [10]:
(train_data, test_data) = vprice_df.randomSplit([0.8, 0.2], seed=42)

In [11]:
lr = LinearRegression(featuresCol='features', labelCol='total_fare')
lr_model = lr.fit(train_data)
lr_predictions = lr_model.transform(test_data)

23/11/19 21:35:15 WARN org.apache.spark.ml.util.Instrumentation: [80d47564] regParam is zero, which might cause numerical instability and overfitting.
23/11/19 21:36:26 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/11/19 21:36:26 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
23/11/19 21:36:26 WARN com.github.fommil.netlib.LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
23/11/19 21:36:26 WARN com.github.fommil.netlib.LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK
                                                                                

In [12]:
dt = DecisionTreeRegressor(featuresCol='features', labelCol='total_fare')
dt_model = dt.fit(train_data)
dt_predictions = dt_model.transform(test_data)

                                                                                

In [13]:
rf = RandomForestRegressor(featuresCol='features', labelCol='total_fare', numTrees=10)
rf_model = rf.fit(train_data)
rf_predictions = rf_model.transform(test_data)

                                                                                

In [14]:
gbt = GBTRegressor(featuresCol='features', labelCol='total_fare', maxIter=10)
gbt_model = gbt.fit(train_data)
gbt_predictions = gbt_model.transform(test_data)

                                                                                

In [15]:
glr = GeneralizedLinearRegression(featuresCol='features', labelCol='total_fare', 
                                  family='gaussian', link='identity', maxIter=10)
glr_model = glr.fit(train_data)
glr_predictions = glr_model.transform(test_data)

23/11/19 21:51:30 WARN org.apache.spark.ml.util.Instrumentation: [8afccc18] regParam is zero, which might cause numerical instability and overfitting.
                                                                                

In [16]:
rmse_evaluator = RegressionEvaluator(labelCol="total_fare", predictionCol="prediction", metricName="rmse")
r2_evaluator = RegressionEvaluator(labelCol="total_fare", predictionCol="prediction", metricName="r2")

models_classification = {"Linear Regression": lr_predictions,
                         "Generalized Linear Regression": glr_predictions,
                         "Decision Trees": dt_predictions,
                         "Random Forest": rf_predictions,
                         "GBT": gbt_predictions}


for model_name, predictions in models_classification.items():
    rmse = rmse_evaluator.evaluate(predictions)
    r2 = r2_evaluator.evaluate(predictions)
    
    print(f"{model_name} - RMSE: {rmse}, R2: {r2}")

                                                                                

Linear Regression - RMSE: 150.06988151438003, R2: 0.4210931352122307


                                                                                

Generalized Linear Regression - RMSE: 150.06988151438003, R2: 0.42109313521223035


                                                                                

Decision Trees - RMSE: 146.2013527777852, R2: 0.45055477628950535


                                                                                

Random Forest - RMSE: 145.73854013403027, R2: 0.4540279002146702




GBT - RMSE: 140.7731622236668, R2: 0.49059717040046413


                                                                                

In [None]:
lr_predictions_model = lr_predictions.select('prediction').toPandas()
glr_predictions_model = glr_predictions.select('prediction').toPandas()
dt_predictions_model = dt_predictions.select('prediction').toPandas()
rf_predictions_model = rf_predictions.select('prediction').toPandas()
gbt_predictions_model = gbt_predictions.select('prediction').toPandas()
actual_values = test_data.select('total_fare').toPandas()

result_df = pd.concat([lr_predictions_model, dt_predictions_model, gbt_predictions_model, actual_values], axis=1)
result_df.columns = ['LR Prediction Model', 
                     'GLR Prediction Model', 
                     'DT Prediction Model', 
                     'RF Prediction Model',
                     'GBT Prediction Model', 
                     'Actual Value']

plt.figure(figsize=(12, 8))
plt.scatter(result_df['Actual Value'], result_df['LR Prediction Model'], label='Linear Regression', alpha=0.5)
plt.scatter(result_df['Actual Value'], result_df['GLR Prediction Model'], label='Generalized Linear Regression', alpha=0.5)
plt.scatter(result_df['Actual Value'], result_df['DT Prediction Model'], label='Decision Trees', alpha=0.5)
plt.scatter(result_df['Actual Value'], result_df['RF Prediction Model'], label='Random Forest', alpha=0.5)
plt.scatter(result_df['Actual Value'], result_df['GBT Prediction Model'], label='Gradient-Boosted Trees', alpha=0.5)

plt.title('Predicted vs Actual Values')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.legend()
plt.show()

                                                                                