In [0]:
from pyspark.sql import functions as f

from pyspark.sql.functions import concat, col
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType
from pyspark.sql import SQLContext

# Build a SQLContext on top of the notebook-provided SparkContext (`sc`).
# NOTE(review): the cells visible here only use `spark`; confirm whether any
# later cell still needs `sqlContext` before removing it.
sqlContext = SQLContext(sparkContext=sc)

# West to East

##### Goal:
Use the longitudes of the origin and destination airports to determine whether the flight is going from West to East.

##### Hypothesis:
We have constructed a number of features that focus on detecting delay propagation through the airline travel network. However, a flight can also recover from a delay, and we want to measure that likelihood as well.  The jet stream blows eastward and often provides a tailwind, which allows aircraft to increase their ground speed without incurring higher fuel costs; fuel costs are typically a limiting factor imposed on pilots by airline policy.  The West to East feature compares origin and destination longitudes to identify flights that are traveling in the same direction as the jet stream, which we hypothesize have a higher probability of recovering from a prior delay.

In [0]:
# Load the flight data from every `part-00*` Parquet shard under the
# airlines_latest_utc directory.
# The path is a plain string: the original f-string had no placeholders and was
# easy to confuse with the `f` alias for pyspark.sql.functions.
# The CSV-only `header` option was dropped because Parquet files carry their own
# schema and the Parquet reader does not use it.
airlines = spark.read.parquet("dbfs:/mnt/mids-w261/team20SSDK/cleaned_data/airlines/airlines_latest_utc/part-00*.parquet")

In [0]:
# Load the airport metadata table. The join cell below reads its `IATA` and
# `lon` columns.
airports = spark.read.format("parquet").load("dbfs:/mnt/mids-w261/team20SSDK/cleaned_data/station/airport_meta")

In [0]:
# Attach the origin and destination airport longitudes to every flight.
# Each lookup renames IATA to the flight-side key (ORIGIN / DEST) so the join
# can be expressed on a shared column name, and casts `lon` to float.
# Both joins use the default inner join, so a flight whose ORIGIN or DEST code
# has no match in the airport metadata is dropped.
# NOTE(review): assumes one airport row per IATA code; duplicates would
# multiply flight rows. Confirm against the airport_meta table.
airlines_with_lons = (
    airlines
    .join(
        airports.select(
            f.col('IATA').alias('ORIGIN'),
            f.col('lon').cast('float').alias('ORIGIN_LONGITUDE'),
        ),
        on=['ORIGIN'],
    )
    .join(
        airports.select(
            f.col('IATA').alias('DEST'),
            f.col('lon').cast('float').alias('DEST_LONGITUDE'),
        ),
        on=['DEST'],
    )
)

In [0]:
# Add WEST_TO_EAST: 1 when the flight's shortest path heads east, otherwise 0.
#
# Comparing the raw longitudes directly mislabels routes that cross the +/-180
# degree meridian: Guam (about 145 E) to Honolulu (about 158 W) is an eastbound
# flight even though the destination longitude is numerically smaller. The
# longitude difference is therefore wrapped into [-180, 180) before its sign is
# tested.
#
# Adding 540 keeps the dividend positive (assuming longitudes lie in
# [-180, 180]), so `%` behaves as a true modulo regardless of Spark's
# sign-of-dividend remainder semantics.
#
# A null longitude makes the comparison null, which falls through to
# `otherwise(0)`, the same outcome as the plain `<` comparison it replaces.
eastward_degrees = ((col('DEST_LONGITUDE') - col('ORIGIN_LONGITUDE') + 540) % 360) - 180
airlines_west_east = airlines_with_lons.withColumn('WEST_TO_EAST', f.when(eastward_degrees > 0, 1).otherwise(0))

In [0]:
# Compare departure delays by direction of travel: for each WEST_TO_EAST value,
# sum DEP_DEL15 (DELAY_COUNT) and count the rows with a non-null DEP_DEL15
# (TOTAL_COUNT).
# `createOrReplaceTempView` replaces `registerTempTable`, which has been
# deprecated since Spark 2.0. It also makes this cell safe to re-run.
# The view name 'data' is kept in case other cells query it.
airlines_west_east.createOrReplaceTempView('data')
spark.sql('''SELECT WEST_TO_EAST, SUM(DEP_DEL15) as DELAY_COUNT, COUNT(DEP_DEL15) as TOTAL_COUNT from data group by WEST_TO_EAST order by west_to_east''').display()

WEST_TO_EAST,DELAY_COUNT,TOTAL_COUNT
0,2814307.0,15585323
1,2844404.0,15585876
