In [0]:
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType
from pyspark.sql import SQLContext
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

import pandas as pd
# Build a SQLContext from `sc`. `sc` is not defined in this notebook; presumably it is
# the SparkContext injected by the notebook runtime — TODO confirm.
# NOTE(review): `sqlContext` is not referenced by the later cells shown here (they use `spark`).
sqlContext = SQLContext(sc)


## FLIGHT_COUNT

##### Goal:
On a per-day basis, compute the total number of flights departing from and arriving at a given airport.

##### Hypothesis:
The number of flights to/from an airport tells us how busy that airport is on a given day. This feature would be highly correlated with the size/popularity of an airport; we hope to see whether it is also strongly correlated with `DEP_DEL15`.


Note: Our data only deals with flights that were neither diverted nor cancelled

In [0]:
# Read the airlines parquet part files from the team's DBFS mount.
# NOTE(review): "header" is normally a CSV reader option; presumably it has no effect
# on a parquet read — confirm before removing.
airlines_path = "dbfs:/mnt/mids-w261/team20SSDK/cleaned_data/airlines/airlines_latest_utc/part-00*.parquet"
airlines = spark.read.option("header", "true").parquet(airlines_path)

# Report the size of the loaded dataset: row count and column count.
n_flights = airlines.count()
n_columns = len(airlines.columns)
print("Number of flights (2015 - 2019):  ", n_flights)
print("Number of data columns:  ", n_columns)

display(airlines)

YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,ARR_TIME_BLK,CANCELLED,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,FLIGHTS,DISTANCE,DISTANCE_GROUP,DIV_AIRPORT_LANDINGS,ORIGIN_TZ,DEST_TZ,DEP_MIN,DEP_HOUR,ARR_MIN,ARR_HOUR,ORIGIN_TS,ORIGIN_UTC,DEST_TS,DEST_UTC
2019,3,7,31,3,2019-07-31,WN,19393,WN,N908WN,555,10693,1069302,30693,BNA,"Nashville, TN",TN,47,Tennessee,54,13487,1348702,31650,MSP,"Minneapolis, MN",MN,27,Minnesota,63,1045,1058,13.0,13.0,0.0,0,1000-1059,10.0,1108,1254,4.0,1255,1258,3.0,3.0,0.0,0,1200-1259,0.0,0.0,130.0,120.0,106.0,1.0,695.0,3,0,America/Chicago,America/Chicago,45,10,55,12,2019-07-31T10:45:00.000+0000,2019-07-31T15:45:00.000+0000,2019-07-31T12:55:00.000+0000,2019-07-31T17:55:00.000+0000
2019,3,7,31,3,2019-07-31,WN,19393,WN,N415WN,2300,10693,1069302,30693,BNA,"Nashville, TN",TN,47,Tennessee,54,13487,1348702,31650,MSP,"Minneapolis, MN",MN,27,Minnesota,63,1815,1813,-2.0,0.0,0.0,-1,1800-1859,9.0,1822,2006,11.0,2020,2017,-3.0,0.0,0.0,-1,2000-2059,0.0,0.0,125.0,124.0,104.0,1.0,695.0,3,0,America/Chicago,America/Chicago,15,18,20,20,2019-07-31T18:15:00.000+0000,2019-07-31T23:15:00.000+0000,2019-07-31T20:20:00.000+0000,2019-08-01T01:20:00.000+0000
2019,3,7,31,3,2019-07-31,WN,19393,WN,N966WN,443,10693,1069302,30693,BNA,"Nashville, TN",TN,47,Tennessee,54,13495,1349505,33495,MSY,"New Orleans, LA",LA,22,Louisiana,72,1945,1953,8.0,8.0,0.0,0,1900-1959,12.0,2005,2110,2.0,2105,2112,7.0,7.0,0.0,0,2100-2159,0.0,0.0,80.0,79.0,65.0,1.0,471.0,2,0,America/Chicago,America/Chicago,45,19,5,21,2019-07-31T19:45:00.000+0000,2019-08-01T00:45:00.000+0000,2019-07-31T21:05:00.000+0000,2019-08-01T02:05:00.000+0000
2019,3,7,31,3,2019-07-31,WN,19393,WN,N403WN,1371,10693,1069302,30693,BNA,"Nashville, TN",TN,47,Tennessee,54,13495,1349505,33495,MSY,"New Orleans, LA",LA,22,Louisiana,72,1330,1337,7.0,7.0,0.0,0,1300-1359,9.0,1346,1451,4.0,1505,1455,-10.0,0.0,0.0,-1,1500-1559,0.0,0.0,95.0,78.0,65.0,1.0,471.0,2,0,America/Chicago,America/Chicago,30,13,5,15,2019-07-31T13:30:00.000+0000,2019-07-31T18:30:00.000+0000,2019-07-31T15:05:00.000+0000,2019-07-31T20:05:00.000+0000
2019,3,7,31,3,2019-07-31,WN,19393,WN,N423WN,2483,10693,1069302,30693,BNA,"Nashville, TN",TN,47,Tennessee,54,13495,1349505,33495,MSY,"New Orleans, LA",LA,22,Louisiana,72,745,750,5.0,5.0,0.0,0,0700-0759,10.0,800,903,2.0,915,905,-10.0,0.0,0.0,-1,0900-0959,0.0,0.0,90.0,75.0,63.0,1.0,471.0,2,0,America/Chicago,America/Chicago,45,7,15,9,2019-07-31T07:45:00.000+0000,2019-07-31T12:45:00.000+0000,2019-07-31T09:15:00.000+0000,2019-07-31T14:15:00.000+0000
2019,3,7,31,3,2019-07-31,WN,19393,WN,N413WN,2514,10693,1069302,30693,BNA,"Nashville, TN",TN,47,Tennessee,54,13495,1349505,33495,MSY,"New Orleans, LA",LA,22,Louisiana,72,1645,1647,2.0,2.0,0.0,0,1600-1659,9.0,1656,1803,7.0,1815,1810,-5.0,0.0,0.0,-1,1800-1859,0.0,0.0,90.0,83.0,67.0,1.0,471.0,2,0,America/Chicago,America/Chicago,45,16,15,18,2019-07-31T16:45:00.000+0000,2019-07-31T21:45:00.000+0000,2019-07-31T18:15:00.000+0000,2019-07-31T23:15:00.000+0000
2019,3,7,31,3,2019-07-31,WN,19393,WN,N7718B,1078,10693,1069302,30693,BNA,"Nashville, TN",TN,47,Tennessee,54,13796,1379608,32457,OAK,"Oakland, CA",CA,6,California,91,750,840,50.0,50.0,1.0,3,0700-0759,8.0,848,1056,4.0,1025,1100,35.0,35.0,1.0,2,1000-1059,0.0,0.0,275.0,260.0,248.0,1.0,1959.0,8,0,America/Chicago,America/Los_Angeles,50,7,25,10,2019-07-31T07:50:00.000+0000,2019-07-31T12:50:00.000+0000,2019-07-31T10:25:00.000+0000,2019-07-31T17:25:00.000+0000
2019,3,7,31,3,2019-07-31,WN,19393,WN,N200WN,17,10693,1069302,30693,BNA,"Nashville, TN",TN,47,Tennessee,54,14100,1410005,34100,PHL,"Philadelphia, PA",PA,42,Pennsylvania,23,1910,1908,-2.0,0.0,0.0,-1,1900-1959,9.0,1917,2151,6.0,2200,2157,-3.0,0.0,0.0,-1,2200-2259,0.0,0.0,110.0,109.0,94.0,1.0,675.0,3,0,America/Chicago,America/New_York,10,19,0,22,2019-07-31T19:10:00.000+0000,2019-08-01T00:10:00.000+0000,2019-07-31T22:00:00.000+0000,2019-08-01T02:00:00.000+0000
2019,3,7,31,3,2019-07-31,WN,19393,WN,N7717D,599,10693,1069302,30693,BNA,"Nashville, TN",TN,47,Tennessee,54,14100,1410005,34100,PHL,"Philadelphia, PA",PA,42,Pennsylvania,23,1025,1042,17.0,17.0,1.0,1,1000-1059,11.0,1053,1328,6.0,1330,1334,4.0,4.0,0.0,0,1300-1359,0.0,0.0,125.0,112.0,95.0,1.0,675.0,3,0,America/Chicago,America/New_York,25,10,30,13,2019-07-31T10:25:00.000+0000,2019-07-31T15:25:00.000+0000,2019-07-31T13:30:00.000+0000,2019-07-31T17:30:00.000+0000
2019,3,7,31,3,2019-07-31,WN,19393,WN,N241WN,1162,10693,1069302,30693,BNA,"Nashville, TN",TN,47,Tennessee,54,14100,1410005,34100,PHL,"Philadelphia, PA",PA,42,Pennsylvania,23,2205,2230,25.0,25.0,1.0,1,2200-2259,11.0,2241,106,3.0,100,109,9.0,9.0,0.0,0,0001-0559,0.0,0.0,115.0,99.0,85.0,1.0,675.0,3,0,America/Chicago,America/New_York,5,22,0,1,2019-07-31T22:05:00.000+0000,2019-08-01T03:05:00.000+0000,2019-08-01T01:00:00.000+0000,2019-08-01T05:00:00.000+0000


In [0]:
# Print the column names and types of the loaded airlines DataFrame.
airlines.printSchema()

Because our feature is a sum built off of scheduled flight data, we are able to build this feature safely on data across all years.

#### First perform the group on origin and date to get the counts

In [0]:
# Count flights per (date, origin airport). The date is the calendar date of the
# ORIGIN_UTC timestamp, so days are bucketed in UTC rather than airport-local time.
departure_date = f.to_date("ORIGIN_UTC").alias("date")
orgin_date_agg = (
    airlines
    .groupBy(departure_date, "ORIGIN")
    .agg(count("*").alias("count"))
)

# Number of (date, origin) groups, followed by a sample of the rows.
print(orgin_date_agg.count())
orgin_date_agg.show()

In [0]:
# Show any aggregate row that has a null in one of its three columns;
# an empty result means the origin aggregate is complete.
origin_has_null = col("date").isNull() | col("ORIGIN").isNull() | col("count").isNull()
display(orgin_date_agg.where(origin_has_null))

date,ORIGIN,count


#### Next perform the group on destination and date to get the counts

In [0]:
# Count flights per (date, destination airport). The date is the calendar date of the
# DEST_UTC timestamp, so days are bucketed in UTC rather than airport-local time.
arrival_date = f.to_date("DEST_UTC").alias("date")
dest_date_agg = (
    airlines
    .groupBy(arrival_date, "DEST")
    .agg(count("*").alias("count"))
)

# Number of (date, destination) groups, followed by a sample of the rows.
print(dest_date_agg.count())
dest_date_agg.show()

In [0]:
# Rename DEST to ORIGIN so this frame carries the same column names as
# orgin_date_agg before the two are unioned. From this cell onward the
# airport column of dest_date_agg is called "ORIGIN".
dest_date_agg = dest_date_agg.withColumnRenamed("DEST", "ORIGIN")
display(dest_date_agg)

date,ORIGIN,count
2015-06-10,ORD,919
2015-06-17,PHL,210
2015-06-26,IAD,117
2015-06-03,DAY,27
2015-06-23,CLE,112
2015-06-12,SYR,16
2015-06-13,PWM,21
2015-06-03,SRQ,9
2015-06-01,FNT,17
2015-06-07,GSO,16


In [0]:
# Show any destination-aggregate row that has a null in one of its three columns;
# an empty result means the aggregate is complete.
# The airport column is addressed as "ORIGIN" because the preceding cell renamed
# "DEST" to "ORIGIN"; filtering on col("DEST") at this point cannot be resolved.
dest_has_null = col("date").isNull() | col("ORIGIN").isNull() | col("count").isNull()
display(dest_date_agg.where(dest_has_null))

date,ORIGIN,count


#### Union the two datasets so that a second aggregation can sum the departure and arrival counts per airport per day

In [0]:
# Intermediate step: stack the departure and arrival aggregates on top of each other.
# An airport/date pair can still appear twice here (once per input) until the
# aggregation in a later cell sums them. Both inputs have columns (date, ORIGIN, count).
stacked_counts = orgin_date_agg.union(dest_date_agg)
flights_by_day = stacked_counts.withColumnRenamed("ORIGIN", "IATA")

print(flights_by_day.count())
display(flights_by_day)

date,IATA,count
2018-07-15,LIH,43
2018-07-24,BTV,26
2018-07-26,ATW,15
2018-07-07,RAP,19
2018-07-17,HNL,142
2018-07-12,CLE,147
2018-07-07,MVY,7
2018-07-08,TYS,48
2018-07-21,DCA,313
2018-07-19,EWR,423


In [0]:
# Sanity check that the union simply stacked the two aggregates: its row count must
# equal the sum of the input row counts. The counts are computed from the data rather
# than typed in by hand, so the check stays valid if the input changes.
# Run this before the aggregation cell below, which re-binds `flights_by_day`.
expected_stacked_rows = orgin_date_agg.count() + dest_date_agg.count()
assert flights_by_day.count() == expected_stacked_rows, "union did not stack the two aggregates"
expected_stacked_rows

In [0]:
# Final step: collapse the stacked rows so each (date, IATA) pair has a single row
# whose flight_count is the sum of its departure and arrival counts.
per_airport_day = flights_by_day.groupBy("date", "IATA")
flights_by_day = per_airport_day.agg(f.sum("count").alias("flight_count"))

print(flights_by_day.count())
display(flights_by_day)

date,IATA,flight_count
2019-01-11,BTV,50
2019-01-02,MKE,159
2019-01-02,PWM,57
2019-01-05,MAF,43
2019-01-12,PIE,43
2019-01-29,SCK,6
2019-01-23,FLL,496
2019-01-20,MKE,119
2019-01-05,MSN,53
2019-01-26,PBI,153


In [0]:
# The final table has one row per (date, IATA) pair present in either input, so its
# row count must be at least the larger input and at most the sum of both inputs.
# Checked against the data instead of a hand-typed number.
n_origin_rows = orgin_date_agg.count()
n_dest_rows = dest_date_agg.count()
n_final_rows = flights_by_day.count()
assert max(n_origin_rows, n_dest_rows) <= n_final_rows <= n_origin_rows + n_dest_rows, \
    "unexpected row count after aggregating the stacked table"
n_final_rows

In [0]:
# Verify the final table has no nulls: collect every row with a null in any of the
# three columns, print how many there are, and show them.
any_null = col("date").isNull() | col("IATA").isNull() | col("flight_count").isNull()
missing_ones = flights_by_day.where(any_null)
print(missing_ones.count())
display(missing_ones)

date,IATA,flight_count


#### Write the `flights_per_day` table to DBFS for use in data joining at a later stage.

In [0]:
# Remove any previous copy of the output directory, then write the per-day
# flight counts to the same location as parquet.
flights_per_day_path = "dbfs:/mnt/mids-w261/team20SSDK/cleaned_data/airlines/flights_per_day"
dbutils.fs.rm(flights_per_day_path, True)  # second argument: presumably the recursive-delete flag — confirm
flights_by_day.write.parquet(flights_per_day_path)

In [0]:
# Print the column names and types of the table that was just written.
flights_by_day.printSchema()

In [0]:
# Show a sample of the final (date, IATA, flight_count) table.
display(flights_by_day)

date,IATA,flight_count
2016-08-07,HNL,284
2016-09-01,LAS,823
2016-08-13,ORD,1270
2016-09-01,DCA,447
2016-08-13,BNA,240
2016-08-14,SJC,208
2016-08-18,CMH,139
2016-08-13,SAT,145
2016-08-04,DTW,722
2016-08-11,BWI,543
