In [1]:
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('AirlinesAndFlight').getOrCreate()

spark

In [7]:
path = 'Datasets/'
airlines = spark.read.csv(path + 'airlines.csv',inferSchema=True, header=True)
airlines.limit(5).toPandas()

Unnamed: 0,Code,Description
0,19031,Mackey International Inc.: MAC
1,19032,Munz Northern Airlines Inc.: XY
2,19033,Cochise Airlines Inc.: COC
3,19034,Golden Gate Airlines Inc.: GSA
4,19035,Aeromech Inc.: RZZ


In [10]:
airlines.createTempView('airline_view')

AnalysisException: Temporary view 'airline_view' already exists;

In [11]:
airline_data = spark.sql('select * from airline_view')

In [12]:
airline_data.show(5)

+-----+--------------------+
| Code|         Description|
+-----+--------------------+
|19031|Mackey Internatio...|
|19032|Munz Northern Air...|
|19033|Cochise Airlines ...|
|19034|Golden Gate Airli...|
|19035|  Aeromech Inc.: RZZ|
+-----+--------------------+
only showing top 5 rows



In [13]:
path = 'Datasets/'
flights = spark.read.csv(path + 'flights.csv',inferSchema=True, header=True)
flights.limit(5).toPandas()

Unnamed: 0,date,airlines,flight_number,origin,destination,departure,departure_delay,arrival,arrival_delay,air_time,distance
0,2014-04-01,19805,1,JFK,LAX,854,-6.0,1217,2.0,355.0,2475.0
1,2014-04-01,19805,2,LAX,JFK,944,14.0,1736,-29.0,269.0,2475.0
2,2014-04-01,19805,3,JFK,LAX,1224,-6.0,1614,39.0,371.0,2475.0
3,2014-04-01,19805,4,LAX,JFK,1240,25.0,2028,-27.0,264.0,2475.0
4,2014-04-01,19805,5,DFW,HNL,1300,-5.0,1650,15.0,510.0,3784.0


In [14]:
flights.createTempView('flights_view')

In [16]:
flights_data = spark.sql('select * from flights_view')

In [17]:
flights_data.show(5)

+----------+--------+-------------+------+-----------+---------+---------------+-------+-------------+--------+--------+
|      date|airlines|flight_number|origin|destination|departure|departure_delay|arrival|arrival_delay|air_time|distance|
+----------+--------+-------------+------+-----------+---------+---------------+-------+-------------+--------+--------+
|2014-04-01|   19805|            1|   JFK|        LAX|      854|           -6.0|   1217|          2.0|   355.0|  2475.0|
|2014-04-01|   19805|            2|   LAX|        JFK|      944|           14.0|   1736|        -29.0|   269.0|  2475.0|
|2014-04-01|   19805|            3|   JFK|        LAX|     1224|           -6.0|   1614|         39.0|   371.0|  2475.0|
|2014-04-01|   19805|            4|   LAX|        JFK|     1240|           25.0|   2028|        -27.0|   264.0|  2475.0|
|2014-04-01|   19805|            5|   DFW|        HNL|     1300|           -5.0|   1650|         15.0|   510.0|  3784.0|
+----------+--------+-----------

In [18]:
from pyspark.sql.functions import broadcast
airline_flights_data = flights_data.join(broadcast(airline_data),
                                        flights_data.airlines == airline_data.Code)

In [20]:
airline_flights_data.limit(5).toPandas()

Unnamed: 0,date,airlines,flight_number,origin,destination,departure,departure_delay,arrival,arrival_delay,air_time,distance,Code,Description
0,2014-04-01,19805,1,JFK,LAX,854,-6.0,1217,2.0,355.0,2475.0,19805,American Airlines Inc.: AA
1,2014-04-01,19805,2,LAX,JFK,944,14.0,1736,-29.0,269.0,2475.0,19805,American Airlines Inc.: AA
2,2014-04-01,19805,3,JFK,LAX,1224,-6.0,1614,39.0,371.0,2475.0,19805,American Airlines Inc.: AA
3,2014-04-01,19805,4,LAX,JFK,1240,25.0,2028,-27.0,264.0,2475.0,19805,American Airlines Inc.: AA
4,2014-04-01,19805,5,DFW,HNL,1300,-5.0,1650,15.0,510.0,3784.0,19805,American Airlines Inc.: AA


In [24]:
flights_count = spark.sql('select count(*) from flights_view')
airlines_count = spark.sql('select count(*) from airline_view')

In [26]:
flights_count.collect()[0][0], airlines_count.collect()[0][0]

(476881, 1579)

In [29]:
flights_delayed = spark.sql("select airlines, date,origin, destination,departure_delay from flights_view where departure_delay>0")

In [30]:
flights_delayed.limit(5).toPandas()

Unnamed: 0,airlines,date,origin,destination,departure_delay
0,19805,2014-04-01,LAX,JFK,14.0
1,19805,2014-04-01,LAX,JFK,25.0
2,19805,2014-04-01,OGG,DFW,126.0
3,19805,2014-04-01,DFW,OGG,125.0
4,19805,2014-04-01,HNL,DFW,4.0


In [31]:
flights_delayed.createTempView('delayed_flights')

In [44]:
flights_delayed_by_airlines = spark.sql("""select airlines, departure_delay from delayed_flights
                                       JOIN airline_view ON airline_view.Code = delayed_flights.airlines
                                       ORDER BY departure_delay DESC""")

In [42]:
flights_delayed_by_airlines.limit(5).toPandas()

Unnamed: 0,airlines,departure_delay
0,19805,1696.0
1,19805,1529.0
2,19805,1489.0
3,19805,1476.0
4,19805,1456.0
