In [0]:
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType
from pyspark.sql import SQLContext, Window
from pyspark.sql.functions import concat, col, hour, minute, lpad, rpad, substring, year, month, dayofmonth, lit, to_timestamp, expr
import numpy as np
import plotly.express as px

# Legacy SQLContext wrapper built from the SparkContext `sc`.
# NOTE(review): `sc` is not defined in the visible cells -- presumably injected by the notebook runtime; confirm.
# The cells shown here issue their queries through `spark` directly rather than through `sqlContext`.
sqlContext = SQLContext(sc)

import networkx as nx

In [0]:
# Read the cleaned airlines data and expose it to Spark SQL as the `airlines` view.
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0.
airlines_final = spark.read.parquet("dbfs:/mnt/mids-w261/team20SSDK/cleaned_data/final/airlines_final")
airlines_final.createOrReplaceTempView('airlines')

In [0]:
# Initialize an empty directed graph; later cells add airports as nodes and
# origin -> destination pairs as directed edges.
G = nx.DiGraph()

In [0]:
# Gather every airport code that appears as an origin or as a destination.
# union() matches columns by position, so the combined single column keeps the name ORIGIN;
# it does not de-duplicate, so an airport present on both sides appears twice.
origin_airports = airlines_final.select('ORIGIN').distinct()
dest_airports = airlines_final.select('DEST').distinct()
nodes = origin_airports.union(dest_airports)

In [0]:
# Register every airport as a node of G. The rows of `nodes` expose the airport code
# through the ORIGIN field; adding a code that is already present leaves the graph unchanged.
G.add_nodes_from(airport_row.ORIGIN for airport_row in nodes.collect())

In [0]:
# Add one directed edge per distinct (origin, destination) pair found in the airlines data.
od_pairs = airlines_final.select('ORIGIN', 'DEST').distinct().collect()
G.add_edges_from((pair.ORIGIN, pair.DEST) for pair in od_pairs)

In [0]:
# Compute betweenness centrality for every airport node of the route graph G,
# using the networkx defaults. `bc` is consumed below as a node -> score mapping.
bc = nx.betweenness_centrality(G)

In [0]:
# Get the top 10 most central nodes: rank airports by betweenness centrality
# (highest first) and slice so that only the 10 best-ranked (airport, score) pairs are shown
# instead of the full ranking of every airport.
sorted(bc.items(), key=lambda item: item[1], reverse=True)[:10]

In [0]:
# -----
# Now calculate the count of delayed arrivals and departures for the top 10 airports

In [0]:
# Training data, exposed to Spark SQL as the `train` view.
# Parquet files carry their own schema, so the CSV-only "header" option is not passed;
# createOrReplaceTempView replaces the deprecated registerTempTable.
train = spark.read.parquet("dbfs:/mnt/mids-w261/team20SSDK/strategy/model_datasets/train")
train.createOrReplaceTempView('train')

In [0]:
# Test data, exposed to Spark SQL as the `test` view.
# Parquet files carry their own schema, so the CSV-only "header" option is not passed;
# createOrReplaceTempView replaces the deprecated registerTempTable.
test = spark.read.parquet("dbfs:/mnt/mids-w261/team20SSDK/strategy/model_datasets/test")
test.createOrReplaceTempView('test')

YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,ARR_TIME_BLK,CANCELLED,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,FLIGHTS,DISTANCE,DISTANCE_GROUP,DIV_AIRPORT_LANDINGS,ORIGIN_TZ,DEST_TZ,DEP_MIN,DEP_HOUR,ARR_MIN,ARR_HOUR,ORIGIN_TS,ORIGIN_UTC,DEST_TS,DEST_UTC,ORIGIN_FLIGHT_COUNT,DEST_FLIGHT_COUNT,ORIGIN_STATION,ORIGIN_STATION_NAME,DEST_STATION,DEST_STATION_NAME,PAGERANK,ORIGIN_UTC_ADJ_MIN,ORIGIN_UTC_ADJ_MAX,ORIGIN_MAX_DATE,DEST_MAX_DATE,DELAYS_SO_FAR,CRS_ELAPSED_TIME_AVG_DIFF,WEST_TO_EAST,AVG_WND_SPEED_ORIGIN,AVG_CIG_HEIGHT_ORIGIN,MIN_CIG_HEIGHT_ORIGIN,AVG_VIS_DIS_ORIGIN,MIN_VIS_DIS_ORIGIN,AVG_TMP_DEG_ORIGIN,AVG_DEW_DEG_ORIGIN,AVG_SLP_ORIGIN,AVG_WND_SPEED_DEST,AVG_CIG_HEIGHT_DEST,MIN_CIG_HEIGHT_DEST,AVG_VIS_DIS_DEST,MIN_VIS_DIS_DEST,AVG_TMP_DEG_DEST,AVG_DEW_DEG_DEST,AVG_SLP_DEST
2019,4,11,16,6,2019-11-16,G4,20368,G4,217NV,2985,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14685,1468502,34685,SAV,"Savannah, GA",GA,13,Georgia,34,932,916,-16.0,0.0,0.0,-2,0900-0959,8.0,924,1102,7.0,1129,1109,-20.0,0.0,0.0,-2,1100-1159,0.0,0.0,117.0,113.0,98.0,1.0,669.0,3,0,America/New_York,America/New_York,32,9,29,11,2019-11-16T09:32:00.000+0000,2019-11-16T14:32:00.000+0000,2019-11-16T11:29:00.000+0000,2019-11-16T16:29:00.000+0000,25,93,72517014737,LEHIGH VALLEY INTERNATIONAL A,72207003822,SAVANNAH/HILTON HEAD INTL AIR,0.0009123668620151712,2019-11-14T12:32:00.000+0000,2019-11-16T12:32:00.000+0000,,,0.0,0.0,0,29.72442613031582,10902.78133872841,0.0,13160.448307473336,0.0,118.1308131720527,56.89295121113973,10168.912909902245,32.36965383496451,11927.960250259315,0.0,14292.956731186803,0.0,196.4743922935723,145.61755005462322,10181.833709909
2019,2,6,19,3,2019-06-19,G4,20368,G4,223NV,2985,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14685,1468502,34685,SAV,"Savannah, GA",GA,13,Georgia,34,932,917,-15.0,0.0,0.0,-1,0900-0959,12.0,929,1105,5.0,1129,1110,-19.0,0.0,0.0,-2,1100-1159,0.0,0.0,117.0,113.0,96.0,1.0,669.0,3,0,America/New_York,America/New_York,32,9,29,11,2019-06-19T09:32:00.000+0000,2019-06-19T13:32:00.000+0000,2019-06-19T11:29:00.000+0000,2019-06-19T15:29:00.000+0000,33,99,72517014737,LEHIGH VALLEY INTERNATIONAL A,72207003822,SAVANNAH/HILTON HEAD INTL AIR,0.0009123668620151712,2019-06-17T11:32:00.000+0000,2019-06-19T11:32:00.000+0000,,,0.0,0.0,0,29.72442613031582,10902.78133872841,0.0,13160.448307473336,0.0,118.1308131720527,56.89295121113973,10168.912909902245,32.36965383496451,11927.960250259315,0.0,14292.956731186803,0.0,196.4743922935723,145.61755005462322,10181.833709909
2019,2,6,26,3,2019-06-26,G4,20368,G4,223NV,2985,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14685,1468502,34685,SAV,"Savannah, GA",GA,13,Georgia,34,932,917,-15.0,0.0,0.0,-1,0900-0959,12.0,929,1102,15.0,1129,1117,-12.0,0.0,0.0,-1,1100-1159,0.0,0.0,117.0,120.0,93.0,1.0,669.0,3,0,America/New_York,America/New_York,32,9,29,11,2019-06-26T09:32:00.000+0000,2019-06-26T13:32:00.000+0000,2019-06-26T11:29:00.000+0000,2019-06-26T15:29:00.000+0000,32,106,72517014737,LEHIGH VALLEY INTERNATIONAL A,72207003822,SAVANNAH/HILTON HEAD INTL AIR,0.0009123668620151712,2019-06-24T11:32:00.000+0000,2019-06-26T11:32:00.000+0000,,,0.0,0.0,0,29.72442613031582,10902.78133872841,0.0,13160.448307473336,0.0,118.1308131720527,56.89295121113973,10168.912909902245,32.36965383496451,11927.960250259315,0.0,14292.956731186803,0.0,196.4743922935723,145.61755005462322,10181.833709909
2019,1,1,10,4,2019-01-10,G4,20368,G4,228NV,889,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14112,1411206,33195,PIE,"St. Petersburg, FL",FL,12,Florida,33,1452,1447,-5.0,0.0,0.0,-1,1400-1459,12.0,1459,1701,7.0,1730,1708,-22.0,0.0,0.0,-2,1700-1759,0.0,0.0,158.0,141.0,122.0,1.0,970.0,4,0,America/New_York,America/New_York,52,14,30,17,2019-01-10T14:52:00.000+0000,2019-01-10T19:52:00.000+0000,2019-01-10T17:30:00.000+0000,2019-01-10T22:30:00.000+0000,26,38,72517014737,LEHIGH VALLEY INTERNATIONAL A,72211612873,ST PETE-CLWTR INTL AIRPORT,0.0009123668620151712,2019-01-08T17:52:00.000+0000,2019-01-10T17:52:00.000+0000,2019-01-10T17:51:00.000+0000,2019-01-10T16:53:00.000+0000,0.0,1.0495626822157362,0,93.0,1372.0,1372.0,16093.0,16093.0,17.0,-89.0,10097.0,31.0,22000.0,22000.0,16093.0,16093.0,128.0,-6.0,10206.0
2019,2,6,9,7,2019-06-09,G4,20368,G4,251NV,2202,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14761,1476106,34761,SFB,"Sanford, FL",FL,12,Florida,33,1457,1447,-10.0,0.0,0.0,-1,1400-1459,10.0,1457,1711,12.0,1716,1723,7.0,7.0,0.0,0,1700-1759,0.0,0.0,139.0,156.0,134.0,1.0,882.0,4,0,America/New_York,America/New_York,57,14,16,17,2019-06-09T14:57:00.000+0000,2019-06-09T18:57:00.000+0000,2019-06-09T17:16:00.000+0000,2019-06-09T21:16:00.000+0000,28,77,72517014737,LEHIGH VALLEY INTERNATIONAL A,72205712854,ORLANDO SANFORD AIRPORT,0.0009123668620151712,2019-06-07T16:57:00.000+0000,2019-06-09T16:57:00.000+0000,,,1.0,-7.314677930306232,0,29.72442613031582,10902.78133872841,0.0,13160.448307473336,0.0,118.1308131720527,56.89295121113973,10168.912909902245,33.34186223071666,14357.086473169538,30.0,14687.406605804024,0.0,225.99600087488724,178.0760513017042,10176.972235997097
2019,1,2,5,2,2019-02-05,G4,20368,G4,256NV,2125,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14761,1476106,34761,SFB,"Sanford, FL",FL,12,Florida,33,1111,1112,1.0,1.0,0.0,0,1100-1159,9.0,1121,1321,4.0,1342,1325,-17.0,0.0,0.0,-2,1300-1359,0.0,0.0,151.0,133.0,120.0,1.0,882.0,4,0,America/New_York,America/New_York,11,11,42,13,2019-02-05T11:11:00.000+0000,2019-02-05T16:11:00.000+0000,2019-02-05T13:42:00.000+0000,2019-02-05T18:42:00.000+0000,19,17,72517014737,LEHIGH VALLEY INTERNATIONAL A,72205712854,ORLANDO SANFORD AIRPORT,0.0009123668620151712,2019-02-03T14:11:00.000+0000,2019-02-05T14:11:00.000+0000,2019-02-05T13:51:00.000+0000,2019-02-05T13:53:00.000+0000,0.0,4.685322069693768,0,0.0,1676.0,1676.0,6437.0,6437.0,50.0,11.0,10156.0,31.0,22000.0,22000.0,16093.0,16093.0,150.0,128.0,10205.0
2019,2,5,13,1,2019-05-13,G4,20368,G4,262NV,1683,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14082,1408202,34082,PGD,"Punta Gorda, FL",FL,12,Florida,33,2029,2051,22.0,22.0,1.0,1,2000-2059,10.0,2101,2320,4.0,2310,2324,14.0,14.0,0.0,0,2300-2359,0.0,0.0,161.0,153.0,139.0,1.0,1018.0,5,0,America/New_York,America/New_York,29,20,10,23,2019-05-13T20:29:00.000+0000,2019-05-14T00:29:00.000+0000,2019-05-13T23:10:00.000+0000,2019-05-14T03:10:00.000+0000,26,9,72517014737,LEHIGH VALLEY INTERNATIONAL A,72203412812,CHARLOTTE COUNTY AIRPORT,0.0009123668620151712,2019-05-11T22:29:00.000+0000,2019-05-13T22:29:00.000+0000,,,0.0,0.4720496894410075,0,29.72442613031582,10902.78133872841,0.0,13160.448307473336,0.0,118.1308131720527,56.89295121113973,10168.912909902245,33.38625059185719,14727.900486633764,30.0,13949.23209694723,0.0,230.97630403660244,188.11947724354,10177.884651494782
2019,1,1,13,7,2019-01-13,G4,20368,G4,274NV,889,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14112,1411206,33195,PIE,"St. Petersburg, FL",FL,12,Florida,33,1452,1445,-7.0,0.0,0.0,-1,1400-1459,9.0,1454,1713,5.0,1730,1718,-12.0,0.0,0.0,-1,1700-1759,0.0,0.0,158.0,153.0,139.0,1.0,970.0,4,0,America/New_York,America/New_York,52,14,30,17,2019-01-13T14:52:00.000+0000,2019-01-13T19:52:00.000+0000,2019-01-13T17:30:00.000+0000,2019-01-13T22:30:00.000+0000,24,46,72517014737,LEHIGH VALLEY INTERNATIONAL A,72211612873,ST PETE-CLWTR INTL AIRPORT,0.0009123668620151712,2019-01-11T17:52:00.000+0000,2019-01-13T17:52:00.000+0000,2019-01-13T17:51:00.000+0000,2019-01-13T16:53:00.000+0000,0.0,1.0495626822157362,0,51.0,3048.0,3048.0,16093.0,16093.0,0.0,-117.0,10273.0,72.0,22000.0,22000.0,16093.0,16093.0,244.0,183.0,10209.0
2019,4,11,1,5,2019-11-01,G4,20368,G4,312NV,2112,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14761,1476107,34761,SFB,"Sanford, FL",FL,12,Florida,33,1941,1932,-9.0,0.0,0.0,-1,1900-1959,7.0,1939,2148,7.0,2210,2155,-15.0,0.0,0.0,-1,2200-2259,0.0,0.0,149.0,143.0,129.0,1.0,882.0,4,0,America/New_York,America/New_York,41,19,10,22,2019-11-01T19:41:00.000+0000,2019-11-01T23:41:00.000+0000,2019-11-01T22:10:00.000+0000,2019-11-02T02:10:00.000+0000,26,38,72517014737,LEHIGH VALLEY INTERNATIONAL A,72205712854,ORLANDO SANFORD AIRPORT,0.0009123668620151712,2019-10-30T21:41:00.000+0000,2019-11-01T21:41:00.000+0000,,,0.0,2.6853220696937683,0,29.72442613031582,10902.78133872841,0.0,13160.448307473336,0.0,118.1308131720527,56.89295121113973,10168.912909902245,33.34186223071666,14357.086473169538,30.0,14687.406605804024,0.0,225.99600087488724,178.0760513017042,10176.972235997097
2019,1,3,18,1,2019-03-18,G4,20368,G4,329NV,2295,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14761,1476106,34761,SFB,"Sanford, FL",FL,12,Florida,33,945,938,-7.0,0.0,0.0,-1,0900-0959,11.0,949,1201,8.0,1212,1209,-3.0,0.0,0.0,-1,1200-1259,0.0,0.0,147.0,151.0,132.0,1.0,882.0,4,0,America/New_York,America/New_York,45,9,12,12,2019-03-18T09:45:00.000+0000,2019-03-18T13:45:00.000+0000,2019-03-18T12:12:00.000+0000,2019-03-18T16:12:00.000+0000,26,75,72517014737,LEHIGH VALLEY INTERNATIONAL A,72205712854,ORLANDO SANFORD AIRPORT,0.0009123668620151712,2019-03-16T11:45:00.000+0000,2019-03-18T11:45:00.000+0000,,,0.0,0.6853220696937683,0,29.72442613031582,10902.78133872841,0.0,13160.448307473336,0.0,118.1308131720527,56.89295121113973,10168.912909902245,33.34186223071666,14357.086473169538,30.0,14687.406605804024,0.0,225.99600087488724,178.0760513017042,10176.972235997097


In [0]:
# Validation data, exposed to Spark SQL as the `validation` view.
# Parquet files carry their own schema, so the CSV-only "header" option is not passed;
# createOrReplaceTempView replaces the deprecated registerTempTable.
validation = spark.read.parquet("dbfs:/mnt/mids-w261/team20SSDK/strategy/model_datasets/validation")
validation.createOrReplaceTempView('validation')

YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,ARR_TIME_BLK,CANCELLED,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,FLIGHTS,DISTANCE,DISTANCE_GROUP,DIV_AIRPORT_LANDINGS,ORIGIN_TZ,DEST_TZ,DEP_MIN,DEP_HOUR,ARR_MIN,ARR_HOUR,ORIGIN_TS,ORIGIN_UTC,DEST_TS,DEST_UTC,ORIGIN_FLIGHT_COUNT,DEST_FLIGHT_COUNT,ORIGIN_STATION,ORIGIN_STATION_NAME,DEST_STATION,DEST_STATION_NAME,PAGERANK,ORIGIN_UTC_ADJ_MIN,ORIGIN_UTC_ADJ_MAX,ORIGIN_MAX_DATE,DEST_MAX_DATE,DELAYS_SO_FAR,CRS_ELAPSED_TIME_AVG_DIFF,WEST_TO_EAST,AVG_WND_SPEED_ORIGIN,AVG_CIG_HEIGHT_ORIGIN,MIN_CIG_HEIGHT_ORIGIN,AVG_VIS_DIS_ORIGIN,MIN_VIS_DIS_ORIGIN,AVG_TMP_DEG_ORIGIN,AVG_DEW_DEG_ORIGIN,AVG_SLP_ORIGIN,AVG_WND_SPEED_DEST,AVG_CIG_HEIGHT_DEST,MIN_CIG_HEIGHT_DEST,AVG_VIS_DIS_DEST,MIN_VIS_DIS_DEST,AVG_TMP_DEG_DEST,AVG_DEW_DEG_DEST,AVG_SLP_DEST
2018,2,5,12,6,2018-05-12,G4,20368,G4,221NV,889,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14112,1411206,33195,PIE,"St. Petersburg, FL",FL,12,Florida,33,1046,1032,-14.0,0.0,0.0,-1,1000-1059,7.0,1039,1254,5.0,1322,1259,-23.0,0.0,0.0,-2,1300-1359,0.0,0.0,156.0,147.0,135.0,1.0,970.0,4,0,America/New_York,America/New_York,46,10,22,13,2018-05-12T10:46:00.000+0000,2018-05-12T14:46:00.000+0000,2018-05-12T13:22:00.000+0000,2018-05-12T17:22:00.000+0000,23,37,72517014737,LEHIGH VALLEY INTERNATIONAL A,72211612873,ST PETE-CLWTR INTL AIRPORT,0.0009123668620151712,2018-05-10T12:46:00.000+0000,2018-05-12T12:46:00.000+0000,2018-05-12T12:00:00.000+0000,2018-05-12T11:53:00.000+0000,0.0,-0.9504373177842638,0,30.46153846153846,3635.4066098234166,91.0,13257.14,4023.0,128.56451612903226,112.4,10196.04435047312,3.934210526315789,17898.383561643837,2896.0,15489.53947368421,4023.0,207.25,174.68421052631578,10196.666666666666
2018,2,6,8,5,2018-06-08,G4,20368,G4,221NV,1775,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,11697,1169706,32467,FLL,"Fort Lauderdale, FL",FL,12,Florida,33,2029,2343,194.0,194.0,1.0,12,2000-2059,9.0,2352,209,7.0,2312,216,184.0,184.0,1.0,12,2300-2359,0.0,0.0,163.0,153.0,137.0,1.0,1041.0,5,0,America/New_York,America/New_York,29,20,12,23,2018-06-08T20:29:00.000+0000,2018-06-09T00:29:00.000+0000,2018-06-08T23:12:00.000+0000,2018-06-09T03:12:00.000+0000,19,548,72517014737,LEHIGH VALLEY INTERNATIONAL A,74783012849,FT LAUD/HOLLYWOOD INTL APT,0.0009123668620151712,2018-06-06T22:29:00.000+0000,2018-06-08T22:29:00.000+0000,2018-06-08T21:51:00.000+0000,2018-06-08T21:53:00.000+0000,0.0,-2.7920000000000016,0,31.53125,21450.23209876543,6096.0,15862.333333333334,12875.0,257.09302325581393,161.6451612903226,10194.045454545454,32.04,12839.322916666666,762.0,15935.333333333334,12875.0,270.6666666666667,225.875,10193.135265700485
2018,3,8,6,1,2018-08-06,G4,20368,G4,224NV,1775,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,11697,1169706,32467,FLL,"Fort Lauderdale, FL",FL,12,Florida,33,2028,2116,48.0,48.0,1.0,3,2000-2059,10.0,2126,2345,5.0,2314,2350,36.0,36.0,1.0,2,2300-2359,0.0,0.0,166.0,154.0,139.0,1.0,1041.0,5,0,America/New_York,America/New_York,28,20,14,23,2018-08-06T20:28:00.000+0000,2018-08-07T00:28:00.000+0000,2018-08-06T23:14:00.000+0000,2018-08-07T03:14:00.000+0000,23,507,72517014737,LEHIGH VALLEY INTERNATIONAL A,74783012849,FT LAUD/HOLLYWOOD INTL APT,0.0009123668620151712,2018-08-04T22:28:00.000+0000,2018-08-06T22:28:00.000+0000,2018-08-06T21:51:00.000+0000,2018-08-06T21:53:00.000+0000,0.0,0.2079999999999984,0,29.5,18085.445501730108,1524.0,15427.617647058823,0.0,297.95652173913044,218.76470588235293,10169.44,56.2,19331.510416666668,3658.0,15887.958333333334,11265.0,304.27777777777777,232.25,10172.296052631578
2018,3,9,10,1,2018-09-10,G4,20368,G4,247NV,1683,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14082,1408202,34082,PGD,"Punta Gorda, FL",FL,12,Florida,33,959,958,-1.0,0.0,0.0,-1,0900-0959,13.0,1011,1227,7.0,1241,1234,-7.0,0.0,0.0,-1,1200-1259,0.0,0.0,162.0,156.0,136.0,1.0,1018.0,5,0,America/New_York,America/New_York,59,9,41,12,2018-09-10T09:59:00.000+0000,2018-09-10T13:59:00.000+0000,2018-09-10T12:41:00.000+0000,2018-09-10T16:41:00.000+0000,23,24,72517014737,LEHIGH VALLEY INTERNATIONAL A,72203412812,CHARLOTTE COUNTY AIRPORT,0.0009123668620151712,2018-09-08T11:59:00.000+0000,2018-09-10T11:59:00.000+0000,2018-09-10T11:51:00.000+0000,2018-09-10T11:53:00.000+0000,0.0,1.4720496894410076,0,51.8208245243129,373.8890824622532,30.0,9506.375725900116,3219.0,137.99410774410774,131.54648526077096,10204.564005130673,2.873015873015873,19101.762711864405,152.0,14265.49152542373,1207.0,239.75903614457832,225.3272727272727,10149.958333333334
2018,1,2,2,5,2018-02-02,G4,20368,G4,253NV,1683,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14082,1408202,34082,PGD,"Punta Gorda, FL",FL,12,Florida,33,2007,1953,-14.0,0.0,0.0,-1,2000-2059,9.0,2002,2226,7.0,2255,2233,-22.0,0.0,0.0,-2,2200-2259,0.0,0.0,168.0,160.0,144.0,1.0,1018.0,5,0,America/New_York,America/New_York,7,20,55,22,2018-02-02T20:07:00.000+0000,2018-02-03T01:07:00.000+0000,2018-02-02T22:55:00.000+0000,2018-02-03T03:55:00.000+0000,23,21,72517014737,LEHIGH VALLEY INTERNATIONAL A,72203412812,CHARLOTTE COUNTY AIRPORT,0.0009123668620151712,2018-01-31T23:07:00.000+0000,2018-02-02T23:07:00.000+0000,2018-02-02T22:51:00.000+0000,2018-02-02T22:53:00.000+0000,0.0,7.472049689441008,0,56.32,12752.25,518.0,14371.944444444443,1207.0,-64.48333333333333,-162.89473684210526,10244.99523809524,31.075949367088608,18698.96923076923,1341.0,15855.942028985508,11000.0,205.73,99.83098591549296,10220.569952305246
2018,3,8,4,6,2018-08-04,G4,20368,G4,310NV,889,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14112,1411206,33195,PIE,"St. Petersburg, FL",FL,12,Florida,33,1753,1830,37.0,37.0,1.0,2,1700-1759,9.0,1839,2054,7.0,2029,2101,32.0,32.0,1.0,2,2000-2059,0.0,0.0,156.0,151.0,135.0,1.0,970.0,4,0,America/New_York,America/New_York,53,17,29,20,2018-08-04T17:53:00.000+0000,2018-08-04T21:53:00.000+0000,2018-08-04T20:29:00.000+0000,2018-08-05T00:29:00.000+0000,19,46,72517014737,LEHIGH VALLEY INTERNATIONAL A,72211612873,ST PETE-CLWTR INTL AIRPORT,0.0009123668620151712,2018-08-02T19:53:00.000+0000,2018-08-04T19:53:00.000+0000,2018-08-04T19:51:00.000+0000,2018-08-04T19:53:00.000+0000,3.0,-0.9504373177842638,0,36.75,21317.466666666667,1524.0,16039.366666666669,14484.0,289.30232558139534,208.4516129032258,10188.227272727272,23.416666666666668,18961.616666666665,1280.0,15829.672727272728,11265.0,330.51666666666665,212.52631578947367,10191.916666666666
2018,3,9,14,5,2018-09-14,G4,20368,G4,426NV,2125,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14761,1476106,34761,SFB,"Sanford, FL",FL,12,Florida,33,1554,1542,-12.0,0.0,0.0,-1,1500-1559,8.0,1550,1740,7.0,1823,1747,-36.0,0.0,0.0,-2,1800-1859,0.0,0.0,149.0,125.0,110.0,1.0,882.0,4,0,America/New_York,America/New_York,54,15,23,18,2018-09-14T15:54:00.000+0000,2018-09-14T19:54:00.000+0000,2018-09-14T18:23:00.000+0000,2018-09-14T22:23:00.000+0000,22,41,72517014737,LEHIGH VALLEY INTERNATIONAL A,72205712854,ORLANDO SANFORD AIRPORT,0.0009123668620151712,2018-09-12T17:54:00.000+0000,2018-09-14T17:54:00.000+0000,2018-09-14T17:51:00.000+0000,2018-09-14T17:53:00.000+0000,1.0,2.6853220696937683,0,38.78947368421053,1070.8333333333333,152.0,15556.583333333334,6437.0,232.26530612244895,207.27027027027023,10224.127367424242,37.36363636363637,15255.53846153846,823.0,15878.422222222222,11265.0,318.41964285714283,228.84615384615384,10123.158986175116
2018,1,1,7,7,2018-01-07,G4,20368,G4,876GA,625,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14761,1476106,34761,SFB,"Sanford, FL",FL,12,Florida,33,1806,1835,29.0,29.0,1.0,1,1800-1859,19.0,1854,2056,7.0,2041,2103,22.0,22.0,1.0,1,2000-2059,0.0,0.0,155.0,148.0,122.0,1.0,882.0,4,0,America/New_York,America/New_York,6,18,41,20,2018-01-07T18:06:00.000+0000,2018-01-07T23:06:00.000+0000,2018-01-07T20:41:00.000+0000,2018-01-08T01:41:00.000+0000,17,64,72517014737,LEHIGH VALLEY INTERNATIONAL A,72205712854,ORLANDO SANFORD AIRPORT,0.0009123668620151712,2018-01-05T21:06:00.000+0000,2018-01-07T21:06:00.000+0000,2018-01-07T20:51:00.000+0000,2018-01-07T20:53:00.000+0000,2.0,8.685322069693768,0,27.32558139534884,21497.321428571428,7925.0,16037.51724137931,14484.0,-83.35849056603773,-187.7333333333333,10338.451612903224,46.73033707865169,14976.76,884.0,16011.169491525425,11265.0,132.93846153846152,26.15873015873016,10285.860310421283
2018,1,1,18,4,2018-01-18,G4,20368,G4,876GA,625,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14761,1476106,34761,SFB,"Sanford, FL",FL,12,Florida,33,1806,1833,27.0,27.0,1.0,1,1800-1859,15.0,1848,2048,7.0,2041,2055,14.0,14.0,0.0,0,2000-2059,0.0,0.0,155.0,142.0,120.0,1.0,882.0,4,0,America/New_York,America/New_York,6,18,41,20,2018-01-18T18:06:00.000+0000,2018-01-18T23:06:00.000+0000,2018-01-18T20:41:00.000+0000,2018-01-19T01:41:00.000+0000,22,64,72517014737,LEHIGH VALLEY INTERNATIONAL A,72205712854,ORLANDO SANFORD AIRPORT,0.0009123668620151712,2018-01-16T21:06:00.000+0000,2018-01-18T21:06:00.000+0000,2018-01-18T20:51:00.000+0000,2018-01-18T20:53:00.000+0000,1.0,8.685322069693768,0,46.5,20737.78787878788,1158.0,15410.30303030303,4828.0,-0.3898305084745763,-115.36111111111111,10160.15625,30.571428571428573,20915.647058823528,762.0,15990.825396825398,11265.0,100.72932330827068,-67.31343283582089,10264.409677419357
2018,2,6,2,6,2018-06-02,G4,20368,G4,876GA,625,10135,1013505,30135,ABE,"Allentown/Bethlehem/Easton, PA",PA,42,Pennsylvania,23,14761,1476106,34761,SFB,"Sanford, FL",FL,12,Florida,33,951,941,-10.0,0.0,0.0,-1,0900-0959,7.0,948,1148,8.0,1216,1156,-20.0,0.0,0.0,-2,1200-1259,0.0,0.0,145.0,135.0,120.0,1.0,882.0,4,0,America/New_York,America/New_York,51,9,16,12,2018-06-02T09:51:00.000+0000,2018-06-02T13:51:00.000+0000,2018-06-02T12:16:00.000+0000,2018-06-02T16:16:00.000+0000,22,59,72517014737,LEHIGH VALLEY INTERNATIONAL A,72205712854,ORLANDO SANFORD AIRPORT,0.0009123668620151712,2018-05-31T11:51:00.000+0000,2018-06-02T11:51:00.000+0000,2018-06-02T11:51:00.000+0000,2018-06-02T10:53:00.000+0000,1.0,-1.3146779303062317,0,17.790697674418606,13764.024390243903,30.0,11971.658536585366,0.0,220.6296296296296,204.28571428571428,10078.48447204969,14.507246376811594,17001.714285714286,122.0,12030.887096774191,1609.0,227.28455284552845,223.11475409836063,10142.239305555557


In [0]:
# Get all of the 2-hour-prior-to-departure times in UTC (ORIGIN_UTC_ADJ_MAX) for each dataset,
# together with DAY_ZERO, the same timestamp truncated to the start of its day.
# The identical steps for train / test / validation live in one helper instead of three copies.
def _build_possible_utcs(dataset):
  """Persist, reload and register the distinct ORIGIN_UTC_ADJ_MAX values of one dataset.

  Args:
    dataset: name of an already registered temp view ('train', 'test' or 'validation').

  Returns:
    The DataFrame read back from parquet, holding the columns ORIGIN_UTC_ADJ_MAX and
    DAY_ZERO. As a side effect it is registered as the temp view `utc_<dataset>`.
  """
  parquet_path = f'dbfs:/mnt/mids-w261/team20SSDK/data/airline/network_congestion/possible_utcs_{dataset}'
  distinct_utcs = spark.sql(f"""
SELECT DISTINCT ORIGIN_UTC_ADJ_MAX FROM {dataset}
""")
  distinct_utcs = distinct_utcs.withColumn("DAY_ZERO", f.date_trunc("day", "ORIGIN_UTC_ADJ_MAX"))
  distinct_utcs.write.mode('overwrite').parquet(parquet_path)

  # read back the parquet file for speed, so later queries start from the stored result
  reloaded_utcs = spark.read.parquet(parquet_path)
  # createOrReplaceTempView replaces the deprecated registerTempTable
  reloaded_utcs.createOrReplaceTempView(f'utc_{dataset}')
  return reloaded_utcs

possible_utcs_train = _build_possible_utcs('train')
possible_utcs_test = _build_possible_utcs('test')
possible_utcs_validation = _build_possible_utcs('validation')

In [0]:
# Get just the necessary columns for the busiest airports to reduce the amount of shuffling:
# each airport is extracted and stored on its own, computed individually later, and the
# per-airport results are added back together afterwards.
# NOTE(review): the airport list is presumably the top 10 by betweenness centrality computed
# earlier -- confirm against that ranking.
# One nested loop covers train / test / validation (previously three copy-pasted loops), and
# progress is printed for every dataset rather than only for validation.
for dataset in ['train', 'test', 'validation']:
  for origin in ['ORD','DFW','ATL','DEN','MSP','ANC','CLT','SLC','DTW','LAX']:
    print(dataset, origin)
    airport = spark.sql(f"""
    SELECT DEP_DEL15, ARR_DEL15, DEST_UTC, ORIGIN FROM {dataset} WHERE ORIGIN = '{origin}' ORDER BY DEST_UTC
  """)
    airport.write.mode('overwrite').parquet(f'dbfs:/mnt/mids-w261/team20SSDK/data/airline/network_congestion/{origin}_{dataset}')

In [0]:
# Compute NETWORK_CONGESTION for every distinct ORIGIN_UTC_ADJ_MAX timestamp: the sum of
# ARR_DEL15 plus the sum of DEP_DEL15 over the airport's flights whose DEST_UTC lies between
# the start of that day (DAY_ZERO) and the timestamp itself; 0 when no flight matches.
# The calculation is divided by airport and by year to keep the shuffle size small.
for dataset in ['train', 'test', 'validation']:
  for origin in ['ORD','DFW','ATL','DEN','MSP','ANC','CLT','SLC','DTW','LAX']:
    # The per-airport extract does not depend on the year, so it is loaded and registered
    # once per airport instead of once per (airport, year).
    origin_airport_data = spark.read.parquet(f'dbfs:/mnt/mids-w261/team20SSDK/data/airline/network_congestion/{origin}_{dataset}')
    origin_airport_data.createOrReplaceTempView('origin_airport')
    for year in range(2015,2021):
      # The timestamp table must be aliased as `utc`: the SELECT / ON / GROUP BY clauses all
      # refer to it by that name, and without the alias those references cannot be resolved.
      hub_delays = spark.sql(f"""
      SELECT utc.ORIGIN_UTC_ADJ_MAX, COALESCE(sum(ord.ARR_DEL15)+sum(ord.DEP_DEL15),0) as NETWORK_CONGESTION
      FROM utc_{dataset} AS utc
      LEFT JOIN origin_airport AS ord ON ord.DEST_UTC BETWEEN utc.DAY_ZERO AND utc.ORIGIN_UTC_ADJ_MAX
      WHERE utc.DAY_ZERO >= '{year}-01-01' AND utc.DAY_ZERO < '{year+1}-01-01'
      GROUP BY utc.ORIGIN_UTC_ADJ_MAX
    """)
      hub_delays.write.mode('overwrite').parquet(f'dbfs:/mnt/mids-w261/team20SSDK/data/airline/network_congestion/{origin}_NC_{year}_{dataset}')

In [0]:
# Combine the per-airport, per-year delay counts into one NETWORK_CONGESTION value per
# ORIGIN_UTC_ADJ_MAX timestamp and write one result per dataset.
for dataset in ['train', 'test', 'validation']:
  # read back the delays from parquet files: airport_groups[airport][year_index]
  airport_groups = []
  for origin in ['ORD','DFW','ATL','DEN','MSP','ANC','CLT','SLC','DTW','LAX']:
    year_airports = []
    for year in range(2015, 2021):
      year_airports.append(spark.read.parquet(f'dbfs:/mnt/mids-w261/team20SSDK/data/airline/network_congestion/{origin}_NC_{year}_{dataset}'))
    airport_groups.append(year_airports)

  # on a year by year basis:
  # join each airport's counts on the timestamp and add them to the running total
  year_groups = []
  for i in range(0, len(airport_groups[0])):
    # Start from the first airport's counts as they are. Joining the first airport with
    # itself here would count its delays twice in the total.
    tmp1 = airport_groups[0][i]
    for group_index in range(1, len(airport_groups)):
      tmp1 = tmp1.withColumnRenamed('NETWORK_CONGESTION','NETWORK_CONGESTION_0').join(airport_groups[group_index][i].withColumnRenamed('NETWORK_CONGESTION','NETWORK_CONGESTION_1'), ['ORIGIN_UTC_ADJ_MAX']).select('ORIGIN_UTC_ADJ_MAX', (col('NETWORK_CONGESTION_0') + col('NETWORK_CONGESTION_1')).alias('NETWORK_CONGESTION'))
    year_groups.append(tmp1)

  # stack the yearly results into a single DataFrame
  final = year_groups[0]
  for year_group in year_groups[1:]:
    final = final.union(year_group)

  # write out the computed columns; the path must be an f-string so that each dataset gets
  # its own directory instead of all three overwriting a directory literally named "{dataset}"
  final.write.mode('overwrite').parquet(f'dbfs:/mnt/mids-w261/team20SSDK/data/airline/network_congestion/{dataset}')
    

In [0]:
# at this point NETWORK_CONGESTION has been joined to the train table
# (the join itself is not visible in these cells; its result is read from project_data/train).
# Registering this DataFrame as 'train' replaces the earlier view of the same name.
# Parquet carries its own schema, so the CSV-only "header" option is not passed, and
# createOrReplaceTempView replaces the deprecated registerTempTable.
train = spark.read.parquet("dbfs:/mnt/mids-w261/team20SSDK/project_data/train")
train.createOrReplaceTempView('train')
# visualize Network Congestion vs Delays: congestion is binned by rounding NETWORK_CONGESTION / 1000,
# and each bin reports the sum of DEP_DEL15 and the number of rows with a non-null DEP_DELAY
spark.sql('''SELECT ROUND(NETWORK_CONGESTION / 1000,0) AS NETWORK_CONGESTION_BINNED, SUM(DEP_DEL15) as DELAY_COUNT, COUNT(DEP_DELAY) as TOTAL_COUNT FROM train GROUP BY ROUND(NETWORK_CONGESTION / 1000,0) ORDER BY NETWORK_CONGESTION_BINNED''').display()

NETWORK_CONGESTION_BINNED,DELAY_COUNT,TOTAL_COUNT
0.0,53417.0,293481
1.0,75233.0,480347
2.0,75652.0,498205
3.0,82065.0,521396
4.0,85067.0,503328
5.0,80601.0,457678
6.0,75219.0,406216
7.0,67629.0,353298
8.0,61353.0,307816
9.0,54426.0,265790


In [0]:
# Binned congestion-vs-delay aggregation over the `train` view, collected to the driver
# as a pandas DataFrame so it can be plotted.
binned_delay_query = '''SELECT ROUND(NETWORK_CONGESTION / 1000,0) AS NETWORK_CONGESTION_BINNED, SUM(DEP_DEL15) as DELAY_COUNT, COUNT(DEP_DELAY) as TOTAL_COUNT FROM train GROUP BY ROUND(NETWORK_CONGESTION / 1000,0) ORDER BY NETWORK_CONGESTION_BINNED'''
binned_delays = spark.sql(binned_delay_query)
network_histogram = binned_delays.toPandas()

In [0]:
# Bar chart of the delay count per network-congestion bin.
# `labels` only takes effect for keys that are column names of the plotted DataFrame; the
# former 'pop' key matched no column, so its text is used as the chart title instead and
# the two plotted columns get readable axis / hover labels.
fig = px.bar(
    network_histogram,
    x='NETWORK_CONGESTION_BINNED',
    y='DELAY_COUNT',
    hover_data=['NETWORK_CONGESTION_BINNED', 'DELAY_COUNT'],
    color='DELAY_COUNT',
    title='Number of delays by amount of congestion',
    labels={
        'NETWORK_CONGESTION_BINNED': 'Network congestion (binned, thousands)',
        'DELAY_COUNT': 'Delayed departures',
    },
    height=400,
)
fig.update_layout(barmode='stack')
fig.show()