In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Airline").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/17 12:06:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, mean, corr


# Load the CSV file into a DataFrame
file_path = "/Users/sampreethshetty/Downloads/Airlines.csv"  # Replace with your file path
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Display the first few rows of the dataset
print("Initial Dataset:")
df.show(5)

Initial Dataset:
+---+-------+------+-----------+---------+---------+----+------+-----+
| id|Airline|Flight|AirportFrom|AirportTo|DayOfWeek|Time|Length|Delay|
+---+-------+------+-----------+---------+---------+----+------+-----+
|  1|     CO|   269|        SFO|      IAH|        3|  15|   205|    1|
|  2|     US|  1558|        PHX|      CLT|        3|  15|   222|    1|
|  3|     AA|  2400|        LAX|      DFW|        3|  20|   165|    1|
|  4|     AA|  2466|        SFO|      DFW|        3|  20|   195|    1|
|  5|     AS|   108|        ANC|      SEA|        3|  30|   202|    0|
+---+-------+------+-----------+---------+---------+----+------+-----+
only showing top 5 rows



In [3]:
# Filter flights that were delayed (Delay == 1)
delayed_flights_df = df.filter(col("Delay") == 1)

print("Flights with Delay:")
delayed_flights_df.show(5)

Flights with Delay:
+---+-------+------+-----------+---------+---------+----+------+-----+
| id|Airline|Flight|AirportFrom|AirportTo|DayOfWeek|Time|Length|Delay|
+---+-------+------+-----------+---------+---------+----+------+-----+
|  1|     CO|   269|        SFO|      IAH|        3|  15|   205|    1|
|  2|     US|  1558|        PHX|      CLT|        3|  15|   222|    1|
|  3|     AA|  2400|        LAX|      DFW|        3|  20|   165|    1|
|  4|     AA|  2466|        SFO|      DFW|        3|  20|   195|    1|
|  6|     CO|  1094|        LAX|      IAH|        3|  30|   181|    1|
+---+-------+------+-----------+---------+---------+----+------+-----+
only showing top 5 rows



In [4]:
# Calculate the average flight length for delayed vs. non-delayed flights
avg_length_delayed = df.filter(col("Delay") == 1).agg(mean("Length").alias("Avg_Length_Delayed"))
avg_length_non_delayed = df.filter(col("Delay") == 0).agg(mean("Length").alias("Avg_Length_NonDelayed"))


In [5]:
# Display average flight lengths
print("Average Flight Length for Delayed Flights:")
avg_length_delayed.show()

Average Flight Length for Delayed Flights:
+------------------+
|Avg_Length_Delayed|
+------------------+
| 135.3696974994173|
+------------------+



In [6]:
print("Average Flight Length for Non-Delayed Flights:")
avg_length_non_delayed.show()

Average Flight Length for Non-Delayed Flights:
+---------------------+
|Avg_Length_NonDelayed|
+---------------------+
|    129.6575944690909|
+---------------------+



In [7]:
# Analyze correlation between Length and Delay
# Since Delay is binary, we can treat it as a numerical variable for correlation
correlation = df.stat.corr("Length", "Delay")

print(f"Correlation between Flight Length and Delay: {correlation}")



Correlation between Flight Length and Delay: 0.040489471608066915


In [12]:
from pyspark.sql.functions import avg, desc, concat_ws

# Task 1: Group by airline carrier and compute average delay
avg_delay_by_airline = df.groupBy("Airline") \
    .agg(avg("Delay").alias("Avg_Delay")) \
    .orderBy(desc("Avg_Delay"))

print("Average Delay by Airline:")
avg_delay_by_airline.show()


Average Delay by Airline:
+-------+-------------------+
|Airline|          Avg_Delay|
+-------+-------------------+
|     WN| 0.6977586958138942|
|     CO| 0.5661994507055592|
|     B6|0.46703842756183744|
|     OO| 0.4528992716997652|
|     DL|  0.450475877912701|
|     F9|0.44903965303593557|
|     EV| 0.4022084837222599|
|     9E|0.39766025331141835|
|     AA|0.38847029963203084|
|     XE|0.37894364839683864|
|     MQ|0.34809452260620133|
|     AS| 0.3392903844477378|
|     US|0.33597101449275363|
|     UA|0.32390745501285345|
|     HA| 0.3201864467551094|
|     FL|0.30129159264416383|
|     OH| 0.2772763262074426|
|     YV| 0.2429143897996357|
+-------+-------------------+



In [13]:
# Task 2: Determine the top 5 routes with the highest average delay
# Combine 'AirportFrom' and 'AirportTo' to create a 'Route' column
df_with_route = df.withColumn("Route", concat_ws("->", col("AirportFrom"), col("AirportTo")))

# Group by Route and compute average delay
avg_delay_by_route = df_with_route.groupBy("Route") \
    .agg(avg("Delay").alias("Avg_Delay")) \
    .orderBy(desc("Avg_Delay"))

# Show the top 5 routes with the highest average delay
print("Top 5 Routes with Highest Average Delay:")
avg_delay_by_route.show(5)


Top 5 Routes with Highest Average Delay:
+--------+---------+
|   Route|Avg_Delay|
+--------+---------+
|PIT->CRW|      1.0|
|STL->SAN|      1.0|
|LAS->BUF|      1.0|
|MSP->PVD|      1.0|
|STL->RSW|      1.0|
+--------+---------+
only showing top 5 rows



In [14]:
spark.stop()