Create a Spark application

In [16]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("DataAssessment").config("spark.sql.legacy.timeParserPolicy", "LEGACY").getOrCreate()

Change configuration settings on Spark 

In [17]:
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '4g'), ('spark.app.name', 'Spark Updated Conf'), ('spark.executor.cores', '4'), ('spark.cores.max', '4'), ('spark.driver.memory','4g')])

Store the Spark Context in a variable. Sometimes the Context is provided directly from the environment

In [18]:
sc = spark.sparkContext

Load the Data

In [19]:
flight_df = spark.read.csv("gs://msca-bdp-student-gcs/itineraries.csv", header=True, inferSchema=True)

                                                                                

Initial Data Inspection

In [20]:
flight_df.show(5)

+--------------------+----------+----------+---------------+------------------+-------------+--------------+-----------+--------------+------------+---------+--------+---------+--------------+-------------------+---------------------------------+------------------------+-------------------------------+----------------------+--------------------------+----------------------------+-------------------+-------------------+----------------------------+-------------------------+----------------+-----------------+
|               legId|searchDate|flightDate|startingAirport|destinationAirport|fareBasisCode|travelDuration|elapsedDays|isBasicEconomy|isRefundable|isNonStop|baseFare|totalFare|seatsRemaining|totalTravelDistance|segmentsDepartureTimeEpochSeconds|segmentsDepartureTimeRaw|segmentsArrivalTimeEpochSeconds|segmentsArrivalTimeRaw|segmentsArrivalAirportCode|segmentsDepartureAirportCode|segmentsAirlineName|segmentsAirlineCode|segmentsEquipmentDescription|segmentsDurationInSeconds|segmentsDi

Data Schema

In [21]:
flight_df.printSchema()

root
 |-- legId: string (nullable = true)
 |-- searchDate: string (nullable = true)
 |-- flightDate: string (nullable = true)
 |-- startingAirport: string (nullable = true)
 |-- destinationAirport: string (nullable = true)
 |-- fareBasisCode: string (nullable = true)
 |-- travelDuration: string (nullable = true)
 |-- elapsedDays: integer (nullable = true)
 |-- isBasicEconomy: boolean (nullable = true)
 |-- isRefundable: boolean (nullable = true)
 |-- isNonStop: boolean (nullable = true)
 |-- baseFare: double (nullable = true)
 |-- totalFare: double (nullable = true)
 |-- seatsRemaining: integer (nullable = true)
 |-- totalTravelDistance: integer (nullable = true)
 |-- segmentsDepartureTimeEpochSeconds: string (nullable = true)
 |-- segmentsDepartureTimeRaw: timestamp (nullable = true)
 |-- segmentsArrivalTimeEpochSeconds: string (nullable = true)
 |-- segmentsArrivalTimeRaw: timestamp (nullable = true)
 |-- segmentsArrivalAirportCode: string (nullable = true)
 |-- segmentsDepartureAirp

Convert columns to Datetime and Timestamp

In [22]:
from pyspark.sql.functions import to_date, to_timestamp

flight_df = flight_df.withColumn("flightDate", to_date(flight_df["flightDate"], "yyyy-MM-dd"))
flight_df = flight_df.withColumn("searchDate", to_date(flight_df["searchDate"], "yyyy-MM-dd"))
flight_df = flight_df.withColumn("segmentsDepartureTimeRaw", to_timestamp(flight_df["segmentsDepartureTimeRaw"], "yyyy-MM-dd'T'HH:mm:ss.SSSXXX"))
flight_df = flight_df.withColumn("segmentsArrivalTimeRaw", to_timestamp(flight_df["segmentsArrivalTimeRaw"], "yyyy-MM-dd'T'HH:mm:ss.SSSXXX"))

Verify Datetime and Timestamp columns

In [23]:
flight_df.printSchema()
flight_df.select("flightDate", "searchDate", "segmentsDepartureTimeRaw", "segmentsArrivalTimeRaw").show(5)

root
 |-- legId: string (nullable = true)
 |-- searchDate: date (nullable = true)
 |-- flightDate: date (nullable = true)
 |-- startingAirport: string (nullable = true)
 |-- destinationAirport: string (nullable = true)
 |-- fareBasisCode: string (nullable = true)
 |-- travelDuration: string (nullable = true)
 |-- elapsedDays: integer (nullable = true)
 |-- isBasicEconomy: boolean (nullable = true)
 |-- isRefundable: boolean (nullable = true)
 |-- isNonStop: boolean (nullable = true)
 |-- baseFare: double (nullable = true)
 |-- totalFare: double (nullable = true)
 |-- seatsRemaining: integer (nullable = true)
 |-- totalTravelDistance: integer (nullable = true)
 |-- segmentsDepartureTimeEpochSeconds: string (nullable = true)
 |-- segmentsDepartureTimeRaw: timestamp (nullable = true)
 |-- segmentsArrivalTimeEpochSeconds: string (nullable = true)
 |-- segmentsArrivalTimeRaw: timestamp (nullable = true)
 |-- segmentsArrivalAirportCode: string (nullable = true)
 |-- segmentsDepartureAirportC

Summary Statistics

In [24]:
flight_df.describe().show()

                                                                                

+-------+--------------------+---------------+------------------+-------------+--------------+-------------------+------------------+------------------+------------------+-------------------+---------------------------------+-------------------------------+--------------------------+----------------------------+--------------------+-------------------+----------------------------+-------------------------+--------------------+--------------------+
|summary|               legId|startingAirport|destinationAirport|fareBasisCode|travelDuration|        elapsedDays|          baseFare|         totalFare|    seatsRemaining|totalTravelDistance|segmentsDepartureTimeEpochSeconds|segmentsArrivalTimeEpochSeconds|segmentsArrivalAirportCode|segmentsDepartureAirportCode| segmentsAirlineName|segmentsAirlineCode|segmentsEquipmentDescription|segmentsDurationInSeconds|    segmentsDistance|   segmentsCabinCode|
+-------+--------------------+---------------+------------------+-------------+--------------+--

Missing Values

In [25]:
from pyspark.sql.functions import col, isnan, when, count

missing_values = flight_df.select([count(when(col(c).isNull(), c)).alias(c) for c in flight_df.columns])
missing_values.show()



+-----+----------+----------+---------------+------------------+-------------+--------------+-----------+--------------+------------+---------+--------+---------+--------------+-------------------+---------------------------------+------------------------+-------------------------------+----------------------+--------------------------+----------------------------+-------------------+-------------------+----------------------------+-------------------------+----------------+-----------------+
|legId|searchDate|flightDate|startingAirport|destinationAirport|fareBasisCode|travelDuration|elapsedDays|isBasicEconomy|isRefundable|isNonStop|baseFare|totalFare|seatsRemaining|totalTravelDistance|segmentsDepartureTimeEpochSeconds|segmentsDepartureTimeRaw|segmentsArrivalTimeEpochSeconds|segmentsArrivalTimeRaw|segmentsArrivalAirportCode|segmentsDepartureAirportCode|segmentsAirlineName|segmentsAirlineCode|segmentsEquipmentDescription|segmentsDurationInSeconds|segmentsDistance|segmentsCabinCode|
+---

                                                                                

Unique Values

In [28]:
from pyspark.sql.functions import countDistinct, col

categorical_columns = ["startingAirport", "destinationAirport", "fareBasisCode", "segmentsCabinCode"]
unique_values = flight_df.agg(*[countDistinct(col(c)).alias(c) for c in flight_df.columns if c in categorical_columns])
unique_values.show()



+---------------+------------------+-------------+-----------------+
|startingAirport|destinationAirport|fareBasisCode|segmentsCabinCode|
+---------------+------------------+-------------+-----------------+
|             16|                16|        21062|               71|
+---------------+------------------+-------------+-----------------+



                                                                                

Date Correlations

In [None]:
numerical_columns = ["travelDuration", "elapsedDays", "baseFare", "totalFare", "seatsRemaining", "totalTravelDistance"]
correlation_matrix = flight_df.select(*numerical_columns).toPandas().corr()

Exception in thread "IPC Parameter Sending Thread #5" java.lang.OutOfMemoryError: GC overhead limit exceeded
	at java.util.concurrent.locks.AbstractQueuedSynchronizer.enq(AbstractQueuedSynchronizer.java:587)
	at java.util.concurrent.locks.AbstractQueuedSynchronizer.addWaiter(AbstractQueuedSynchronizer.java:616)
	at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1199)
	at java.util.concurrent.locks.ReentrantLock$NonfairSync.lock(ReentrantLock.java:209)
	at java.util.concurrent.locks.ReentrantLock.lock(ReentrantLock.java:285)
	at java.util.concurrent.ThreadPoolExecutor.processWorkerExit(ThreadPoolExecutor.java:1006)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1167)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
23/11/07 20:58:11 ERROR com.codahale.metrics.ScheduledReporter: Exception thrown from StatsdReporter#report. Exception wa

Py4JJavaError: An error occurred while calling o838.collectToPython.
: java.lang.OutOfMemoryError: GC overhead limit exceeded


Histogram of baseFare Distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

pandas_df = flight_df.select("baseFare").toPandas()

# Plot a histogram of the 'baseFare' column
plt.figure(figsize=(8, 6))
sns.histplot(pandas_df['baseFare'], bins=20, kde=True)
plt.title("Distribution of Base Fare")
plt.xlabel("Base Fare")
plt.ylabel("Frequency")
plt.show()

                                                                                

Box Plot of totalFare by isBasicEconomy

In [None]:
pandas_df = flight_df.select("isBasicEconomy", "totalFare").toPandas()

# Plot a box plot of 'totalFare' by 'isBasicEconomy'
plt.figure(figsize=(8, 6))
sns.boxplot(data=pandas_df, x="isBasicEconomy", y="totalFare")
plt.title("Total Fare by Basic Economy")
plt.xlabel("Is Basic Economy")
plt.ylabel("Total Fare")
plt.show()

Scatter Plot of totalTravelDistance vs. travelDuration

In [None]:
pandas_df = flight_df.select("totalTravelDistance", "travelDuration").toPandas()

# Plot a scatter plot of 'totalTravelDistance' vs. 'travelDuration'
plt.figure(figsize=(8, 6))
plt.scatter(pandas_df['totalTravelDistance'], pandas_df['travelDuration'], alpha=0.5)
plt.title("Total Travel Distance vs. Travel Duration")
plt.xlabel("Total Travel Distance")
plt.ylabel("Travel Duration")
plt.show()