In [1]:
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

In [2]:
# Create (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("FlightPricePrep")
    .getOrCreate()
)
# Select the CORRECTED (non-legacy) policy for datetime pattern parsing.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "CORRECTED")
# Last expression of the cell: display the underlying SparkContext.
spark.sparkContext

In [3]:
# Load the raw itineraries. Parquet files store their own schema, so the
# CSV-only reader options `header` / `inferSchema` are not passed here.
price_df = spark.read.parquet(
    "gs://msca-bdp-student-gcs/Group7_Final_Project/flight_pricing/itineraries.parquet"
)

                                                                                

In [4]:
# Preview a single row of the freshly loaded data.
price_df.show(n=1)

23/11/17 21:33:17 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+----------+----------+---------------+------------------+-------------+--------------+-----------+--------------+------------+---------+--------+---------+--------------+-------------------+---------------------------------+------------------------+-------------------------------+----------------------+--------------------------+----------------------------+--------------------+-------------------+----------------------------+-------------------------+----------------+-----------------+
|               legId|searchDate|flightDate|startingAirport|destinationAirport|fareBasisCode|travelDuration|elapsedDays|isBasicEconomy|isRefundable|isNonStop|baseFare|totalFare|seatsRemaining|totalTravelDistance|segmentsDepartureTimeEpochSeconds|segmentsDepartureTimeRaw|segmentsArrivalTimeEpochSeconds|segmentsArrivalTimeRaw|segmentsArrivalAirportCode|segmentsDepartureAirportCode| segmentsAirlineName|segmentsAirlineCode|segmentsEquipmentDescription|segmentsDurationInSeconds|segments

                                                                                

In [5]:
# Inspect the column names and types of the loaded data.
price_df.printSchema()

root
 |-- legId: string (nullable = true)
 |-- searchDate: string (nullable = true)
 |-- flightDate: string (nullable = true)
 |-- startingAirport: string (nullable = true)
 |-- destinationAirport: string (nullable = true)
 |-- fareBasisCode: string (nullable = true)
 |-- travelDuration: string (nullable = true)
 |-- elapsedDays: integer (nullable = true)
 |-- isBasicEconomy: boolean (nullable = true)
 |-- isRefundable: boolean (nullable = true)
 |-- isNonStop: boolean (nullable = true)
 |-- baseFare: double (nullable = true)
 |-- totalFare: double (nullable = true)
 |-- seatsRemaining: integer (nullable = true)
 |-- totalTravelDistance: integer (nullable = true)
 |-- segmentsDepartureTimeEpochSeconds: string (nullable = true)
 |-- segmentsDepartureTimeRaw: string (nullable = true)
 |-- segmentsArrivalTimeEpochSeconds: string (nullable = true)
 |-- segmentsArrivalTimeRaw: string (nullable = true)
 |-- segmentsArrivalAirportCode: string (nullable = true)
 |-- segmentsDepartureAirportCod

In [6]:
# Columns that are removed before any feature engineering takes place.
unused_columns = [
    "legId",
    "searchDate",
    "segmentsDurationInSeconds",
    "segmentsArrivalTimeEpochSeconds",
    "segmentsDepartureTimeEpochSeconds",
    "segmentsArrivalAirportCode",
    "segmentsDepartureAirportCode",
]
price_df = price_df.drop(*unused_columns)

In [7]:
# Map the original camelCase column names to snake_case names.
# 'segmentsDurationInSeconds' is intentionally absent: that column is dropped
# before this step, so renaming it was a silent no-op.
column_renames = {
    'flightDate': 'flight_date',
    'startingAirport': 'origin',
    'destinationAirport': 'destination',
    'fareBasisCode': 'fare_basis_code',
    'travelDuration': 'travel_duration',
    'elapsedDays': 'elapsed_days',
    'isBasicEconomy': 'is_basic_economy',
    'isRefundable': 'is_refundable',
    'isNonStop': 'is_nonstop',
    'baseFare': 'base_fare',
    'totalFare': 'total_fare',
    'seatsRemaining': 'seats_remaining',
    'totalTravelDistance': 'total_distance',
    'segmentsDepartureTimeRaw': 'departure_time',
    'segmentsArrivalTimeRaw': 'arrival_time',
    'segmentsAirlineName': 'airline_name',
    'segmentsAirlineCode': 'airline_code',
    'segmentsEquipmentDescription': 'equipment_description',
    'segmentsDistance': 'distance',
    'segmentsCabinCode': 'cabin_code',
}

# withColumnRenamed leaves the frame unchanged when the source column is
# missing, so applying the mapping one entry at a time is safe.
for old_name, new_name in column_renames.items():
    price_df = price_df.withColumnRenamed(old_name, new_name)

In [8]:
# Convert the flight date from its string form into a real date column.
price_df = price_df.withColumn(
    "flight_date",
    F.to_date(F.col("flight_date"), "yyyy-MM-dd"),
)

In [9]:
# Split the flight date into separate year / month / day-of-month columns.
price_df = (
    price_df
    .withColumn("flight_year", F.year(F.col("flight_date")))
    .withColumn("flight_month", F.month(F.col("flight_date")))
    .withColumn("flight_day", F.dayofmonth(F.col("flight_date")))
)

price_df.show(n=1)

+-----------+------+-----------+---------------+---------------+------------+----------------+-------------+----------+---------+----------+---------------+--------------+--------------------+--------------------+--------------------+------------+---------------------+----------+------------+-----------+------------+----------+
|flight_date|origin|destination|fare_basis_code|travel_duration|elapsed_days|is_basic_economy|is_refundable|is_nonstop|base_fare|total_fare|seats_remaining|total_distance|      departure_time|        arrival_time|        airline_name|airline_code|equipment_description|  distance|  cabin_code|flight_year|flight_month|flight_day|
+-----------+------+-----------+---------------+---------------+------------+----------------+-------------+----------+---------+----------+---------------+--------------+--------------------+--------------------+--------------------+------------+---------------------+----------+------------+-----------+------------+----------+
| 2022-05-

                                                                                

In [10]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

def _iso8601_total_seconds(duration):
    """Return the full length of an ISO 8601 duration string in whole seconds.

    Missing values (None / NaN) are passed through as None so that the
    resulting Spark column is null instead of the UDF raising.
    """
    if duration is None or pd.isna(duration):
        return None
    # total_seconds() includes the day component; `.seconds` alone only holds
    # the sub-day remainder and would silently drop any full days.
    return int(pd.Timedelta(duration).total_seconds())


def _iso8601_total_minutes(duration):
    """Return the full length of an ISO 8601 duration string in whole minutes."""
    total_seconds = _iso8601_total_seconds(duration)
    return None if total_seconds is None else total_seconds // 60


# Define UDF for minutes
@F.pandas_udf(IntegerType())
def parse_iso8601_duration_minutes(str_duration: pd.Series) -> pd.Series:
    """Convert a Series of ISO 8601 duration strings to total whole minutes.

    Days, hours, minutes and seconds all contribute to the result; missing
    inputs yield missing outputs.
    """
    return str_duration.apply(_iso8601_total_minutes)

# Define UDF for seconds
@F.pandas_udf(IntegerType())
def parse_iso8601_duration_seconds(str_duration: pd.Series) -> pd.Series:
    """Convert a Series of ISO 8601 duration strings to total whole seconds.

    Unlike the previous implementation, durations of one day or more keep
    their hour/minute/second remainder instead of being clamped to 86400.
    """
    return str_duration.apply(_iso8601_total_seconds)

# Apply UDFs to create new columns
price_df = price_df.withColumn("travel_duration_minutes", parse_iso8601_duration_minutes(F.col("travel_duration")))
price_df = price_df.withColumn("travel_duration_seconds", parse_iso8601_duration_seconds(F.col("travel_duration")))

# Show the updated DataFrame
price_df.show(1)

[Stage 3:>                                                          (0 + 1) / 1]

+-----------+------+-----------+---------------+---------------+------------+----------------+-------------+----------+---------+----------+---------------+--------------+--------------------+--------------------+--------------------+------------+---------------------+----------+------------+-----------+------------+----------+-----------------------+-----------------------+
|flight_date|origin|destination|fare_basis_code|travel_duration|elapsed_days|is_basic_economy|is_refundable|is_nonstop|base_fare|total_fare|seats_remaining|total_distance|      departure_time|        arrival_time|        airline_name|airline_code|equipment_description|  distance|  cabin_code|flight_year|flight_month|flight_day|travel_duration_minutes|travel_duration_seconds|
+-----------+------+-----------+---------------+---------------+------------+----------------+-------------+----------+---------+----------+---------------+--------------+--------------------+--------------------+--------------------+----------

                                                                                

In [11]:
# Count the number of '||' delimiters in the departure-time string.
# The pattern is a raw string so the backslashes reach Spark's regex engine
# unchanged; '\|' inside a normal Python string is an invalid escape sequence
# and triggers a warning on recent Python versions.
stop_count = F.size(F.split(price_df['departure_time'], r'\|\|')) - 1

# Add the count as a new column
price_df = price_df.withColumn("num_stops", stop_count)

# Show the updated DataFrame
price_df.show(5)

[Stage 4:>                                                          (0 + 1) / 1]

+-----------+------+-----------+---------------+---------------+------------+----------------+-------------+----------+---------+----------+---------------+--------------+--------------------+--------------------+--------------------+------------+---------------------+----------+------------+-----------+------------+----------+-----------------------+-----------------------+---------+
|flight_date|origin|destination|fare_basis_code|travel_duration|elapsed_days|is_basic_economy|is_refundable|is_nonstop|base_fare|total_fare|seats_remaining|total_distance|      departure_time|        arrival_time|        airline_name|airline_code|equipment_description|  distance|  cabin_code|flight_year|flight_month|flight_day|travel_duration_minutes|travel_duration_seconds|num_stops|
+-----------+------+-----------+---------------+---------------+------------+----------------+-------------+----------+---------+----------+---------------+--------------+--------------------+--------------------+-----------

                                                                                

In [12]:
# Split the '||'-delimited strings once each (raw-string regex, so the
# backslashes are not treated as Python escape sequences).
departure_segments = F.split("departure_time", r"\|\|")
arrival_segments = F.split("arrival_time", r"\|\|")

# First element of the departure list.
price_df = price_df.withColumn("initial_departure_datetime", departure_segments[0])
# element_at with index -1 selects the last element of the arrival list,
# replacing the previous getItem(size - 1) lookup with a Column argument.
price_df = price_df.withColumn("final_arrival_datetime", F.element_at(arrival_segments, -1))

price_df.show(1)

[Stage 5:>                                                          (0 + 1) / 1]

+-----------+------+-----------+---------------+---------------+------------+----------------+-------------+----------+---------+----------+---------------+--------------+--------------------+--------------------+--------------------+------------+---------------------+----------+------------+-----------+------------+----------+-----------------------+-----------------------+---------+--------------------------+----------------------+
|flight_date|origin|destination|fare_basis_code|travel_duration|elapsed_days|is_basic_economy|is_refundable|is_nonstop|base_fare|total_fare|seats_remaining|total_distance|      departure_time|        arrival_time|        airline_name|airline_code|equipment_description|  distance|  cabin_code|flight_year|flight_month|flight_day|travel_duration_minutes|travel_duration_seconds|num_stops|initial_departure_datetime|final_arrival_datetime|
+-----------+------+-----------+---------------+---------------+------------+----------------+-------------+----------+-----

                                                                                

In [13]:
# Pattern shared by both datetime strings (fractional seconds plus UTC offset).
timestamp_pattern = "yyyy-MM-dd'T'HH:mm:ss.SSSXXX"

# Parse the first-departure and last-arrival strings into timestamp columns,
# then expose their hour and minute components as separate columns.
# NOTE(review): hour()/minute() are taken from the parsed timestamp; if local
# airport time is what the features should hold, confirm how the session
# time zone interacts with the offset in the source strings.
price_df = (
    price_df
    .withColumn("initial_departure_datetime", F.to_timestamp("initial_departure_datetime", timestamp_pattern))
    .withColumn("final_arrival_datetime", F.to_timestamp("final_arrival_datetime", timestamp_pattern))
    .withColumn("initial_departure_hour", F.hour("initial_departure_datetime"))
    .withColumn("initial_departure_minute", F.minute("initial_departure_datetime"))
    .withColumn("final_arrival_hour", F.hour("final_arrival_datetime"))
    .withColumn("final_arrival_minute", F.minute("final_arrival_datetime"))
)

price_df.show(n=1)

[Stage 6:>                                                          (0 + 1) / 1]

+-----------+------+-----------+---------------+---------------+------------+----------------+-------------+----------+---------+----------+---------------+--------------+--------------------+--------------------+--------------------+------------+---------------------+----------+------------+-----------+------------+----------+-----------------------+-----------------------+---------+--------------------------+----------------------+----------------------+------------------------+------------------+--------------------+
|flight_date|origin|destination|fare_basis_code|travel_duration|elapsed_days|is_basic_economy|is_refundable|is_nonstop|base_fare|total_fare|seats_remaining|total_distance|      departure_time|        arrival_time|        airline_name|airline_code|equipment_description|  distance|  cabin_code|flight_year|flight_month|flight_day|travel_duration_minutes|travel_duration_seconds|num_stops|initial_departure_datetime|final_arrival_datetime|initial_departure_hour|initial_depart

                                                                                

In [14]:
# Format the flight date with the 'EEEE' pattern to get the day-of-week text.
weekday_name = F.date_format(F.col('flight_date'), 'EEEE')
price_df = price_df.withColumn('day_of_week', weekday_name)

price_df.show(n=1)

[Stage 7:>                                                          (0 + 1) / 1]

+-----------+------+-----------+---------------+---------------+------------+----------------+-------------+----------+---------+----------+---------------+--------------+--------------------+--------------------+--------------------+------------+---------------------+----------+------------+-----------+------------+----------+-----------------------+-----------------------+---------+--------------------------+----------------------+----------------------+------------------------+------------------+--------------------+-----------+
|flight_date|origin|destination|fare_basis_code|travel_duration|elapsed_days|is_basic_economy|is_refundable|is_nonstop|base_fare|total_fare|seats_remaining|total_distance|      departure_time|        arrival_time|        airline_name|airline_code|equipment_description|  distance|  cabin_code|flight_year|flight_month|flight_day|travel_duration_minutes|travel_duration_seconds|num_stops|initial_departure_datetime|final_arrival_datetime|initial_departure_hour|in

                                                                                

In [15]:
# Day name -> numeric index (Sunday = 0 ... Saturday = 6).
category_mapping = {
    'Sunday': 0,
    'Monday': 1,
    'Tuesday': 2,
    'Wednesday': 3,
    'Thursday': 4,
    'Friday': 5,
    'Saturday': 6
}

# Turn the Python dict into a Spark map literal (key1, value1, key2, value2, ...)
# so the lookup runs natively instead of through a Python UDF that was declared
# as StringType while actually returning integers.
day_index_lookup = F.create_map(
    *[F.lit(item) for pair in category_mapping.items() for item in pair]
)

# Look up each row's day name; the cast keeps the column an integer as before.
price_df = price_df.withColumn(
    'day_of_week_index',
    day_index_lookup[F.col('day_of_week')].cast("int"),
)
price_df.show(5)

[Stage 8:>                                                          (0 + 1) / 1]

+-----------+------+-----------+---------------+---------------+------------+----------------+-------------+----------+---------+----------+---------------+--------------+--------------------+--------------------+--------------------+------------+---------------------+----------+------------+-----------+------------+----------+-----------------------+-----------------------+---------+--------------------------+----------------------+----------------------+------------------------+------------------+--------------------+-----------+-----------------+
|flight_date|origin|destination|fare_basis_code|travel_duration|elapsed_days|is_basic_economy|is_refundable|is_nonstop|base_fare|total_fare|seats_remaining|total_distance|      departure_time|        arrival_time|        airline_name|airline_code|equipment_description|  distance|  cabin_code|flight_year|flight_month|flight_day|travel_duration_minutes|travel_duration_seconds|num_stops|initial_departure_datetime|final_arrival_datetime|initial

                                                                                

In [16]:
# Print the column names and types after the transformations above.
price_df.printSchema()

root
 |-- flight_date: date (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)
 |-- fare_basis_code: string (nullable = true)
 |-- travel_duration: string (nullable = true)
 |-- elapsed_days: integer (nullable = true)
 |-- is_basic_economy: boolean (nullable = true)
 |-- is_refundable: boolean (nullable = true)
 |-- is_nonstop: boolean (nullable = true)
 |-- base_fare: double (nullable = true)
 |-- total_fare: double (nullable = true)
 |-- seats_remaining: integer (nullable = true)
 |-- total_distance: integer (nullable = true)
 |-- departure_time: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- airline_name: string (nullable = true)
 |-- airline_code: string (nullable = true)
 |-- equipment_description: string (nullable = true)
 |-- distance: string (nullable = true)
 |-- cabin_code: string (nullable = true)
 |-- flight_year: integer (nullable = true)
 |-- flight_month: integer (nullable = true)
 |-- flight_day:

In [17]:
# Destination for the preprocessed data set.
parquet_output_path = "gs://msca-bdp-student-gcs/Group7_Final_Project/flight_pricing/preprocessed_prices.parquet"
# Without an explicit save mode the write fails when the path already exists,
# which breaks a Restart & Run All. Overwrite makes the cell re-runnable.
price_df.write.mode("overwrite").parquet(parquet_output_path)

                                                                                