# FlightAware Statistics
This notebook uses raw FlightAware data to generate turnaround statistics by building data engineering pipelines.

In [1]:
import sys
from pyspark.sql import functions as F
from pyspark.sql import SparkSession, Window
import numpy as np

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1575409633942_0002,pyspark3,idle,,,✔


SparkSession available as 'spark'.


In [4]:
# Airline lookup table: read the CSV, then keep exactly one row per non-null ICAO code
airline_csv_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", ",")
    .option("encoding", "utf-8")
)
df_airline = (
    airline_csv_reader
    .load("s3://sita-coe-ds-dev-v1/jupyter/jovyan/airlines_openflight.csv")
    .dropna(how='any', subset=['icao'])
    .dropDuplicates(subset=['icao'])
)

In [5]:
# Airport lookup table: read the CSV, then keep exactly one row per non-null ICAO code
airport_csv_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", ",")
    .option("encoding", "utf-8")
)
df_airport = (
    airport_csv_reader
    .load("s3://sita-coe-ds-dev-v1/jupyter/jovyan/airports_openflight_wgeo.csv")
    .dropna(how='any', subset=['icao'])
    .dropDuplicates(subset=['icao'])
)

In [6]:
#total_count = df_ta.agg(F.count('facility_name')).first()[0]
#df_ta.groupBy('facility_name').agg((100*F.count('facility_name')/total_count).alias('facility_count')).orderBy('facility_count', ascending = False).show(200)

## Departure Table

In [7]:
# Read the predefined FlightAware departure table and register it for Spark SQL
# under the temp-view name "der_dep".
spark = SparkSession.builder.getOrCreate()
departure_table_path = "s3://sita-coe-ds-prod-v1/mart/flightaware/der_departure_latest"
df = spark.read.load(departure_table_path)
df.createOrReplaceTempView("der_dep")

In [8]:
# Select the departure columns used downstream and drop rows without a usable ident
# (null ident, ident equal to the registration, or ident/orig/dest that do not
# contain a run of 3 / 4 / 4 capital letters), then remove exact duplicate rows.
# NOTE(review): `ident != reg` evaluates to NULL when reg is NULL, so rows with a
# missing registration are filtered out as well - confirm this is intended.
# NOTE(review): the RLIKE patterns are unanchored, so they match anywhere in the value.
departure_query = """
               SELECT id, id_timestamp, reg, ident, orig, dest FROM der_dep
               WHERE ident is not null
               AND ident != reg
               AND id is not null
               AND ident RLIKE '[A-Z][A-Z][A-Z]'
               AND orig RLIKE '[A-Z][A-Z][A-Z][A-Z]'
               AND dest RLIKE '[A-Z][A-Z][A-Z][A-Z]'
               """
df_departure = spark.sql(departure_query).dropDuplicates()

In [9]:
# Add calendar columns derived from id_timestamp:
# year, quarter, month and a 'yyyy-MM' label (column name 'year-mon').
dep_ts = df_departure.id_timestamp
df_departure = (
    df_departure
    .withColumn('year', F.year(F.date_trunc('year', dep_ts)))
    .withColumn('quarter', F.quarter(F.date_trunc('quarter', dep_ts)))
    .withColumn('month', F.month(F.date_trunc('mon', dep_ts)))
    .withColumn('year-mon', F.date_format(F.col('id_timestamp'), 'yyyy-MM'))
)

In [10]:
# Derive the airline code from the leading three characters of the flight ident
df_departure = df_departure.withColumn("dep_airline_code", df_departure.ident.substr(0, 3))

# Left-join the (broadcast) airline lookup on the ICAO airline code and keep only
# the airline's country, renamed so it cannot clash with the airport's country.
df_departure = (
    df_departure
    .join(F.broadcast(df_airline), df_departure.dep_airline_code == df_airline.icao, 'left')
    .drop('alias', 'icao', 'callsign', 'iata', 'active')
)
df_departure = df_departure.withColumnRenamed('country', 'dep_airline_country')

# Left-join the (broadcast) airport lookup on the origin ICAO code and prefix the
# airport attributes with 'dep_'.
df_departure = (
    df_departure
    .join(F.broadcast(df_airport), df_departure.orig == df_airport.icao, 'left')
    .drop('iata', 'icao', 'type', 'source')
)
df_departure = (
    df_departure
    .withColumnRenamed('airport', 'dep_airport')
    .withColumnRenamed('GEO', 'dep_geo')
    .withColumnRenamed('city', 'dep_city')
    .withColumnRenamed('country', 'dep_country')
    .withColumnRenamed('latitude', 'dep_lat')
    .withColumnRenamed('longitude', 'dep_lon')
    .withColumnRenamed('altitude', 'dep_alt')
    .withColumnRenamed('dst', 'dep_dst')
    .withColumnRenamed('tz_database', 'dep_tz')
)

# Keep only rows where every per-airport grouping key is known. dep_country is
# checked here (instead of testing dep_airport twice) so the filter covers all
# four keys, matching the arrival-side filter.
df_departure = df_departure.filter('(dep_country is not null) and (dep_city is not null) and (orig is not null) and (dep_airport is not null)')

In [11]:
# Number of distinct flight ids per departure airport and calendar period
dep_group_cols = ['dep_country', 'dep_city', 'orig', 'dep_airport', 'year', 'quarter', 'month', 'year-mon']
dep_counts_cached = (
    df_departure
    .groupBy(*dep_group_cols)
    .agg(F.countDistinct('id').alias('count'))
    .cache()
)
# NOTE(review): the cache is requested on the aggregate above, while the name
# df_dep_perairport ends up bound to the repartitioned frame - confirm that a later
# unpersist() on df_dep_perairport actually releases the cached data.
df_dep_perairport = dep_counts_cached.repartition(1)

In [12]:
# Pivot the per-airport departure counts twice - one column per year (summed) and one
# column per 'yyyy-MM' (first value) - then join both wide tables on the airport keys.
dep_airport_keys = ['dep_country', 'dep_city', 'orig', 'dep_airport']
df_dep_perairport_yearly = (
    df_dep_perairport.groupBy(*dep_airport_keys).pivot('year').agg(F.sum('count'))
)
df_dep_perairport_yearmon = (
    df_dep_perairport.groupBy(*dep_airport_keys).pivot('year-mon').agg(F.first('count'))
)
df_dep_perairport_merge = df_dep_perairport_yearly.join(
    df_dep_perairport_yearmon, dep_airport_keys, 'inner'
)
df_dep_perairport.unpersist()

DataFrame[dep_geo: string, year: int, month: int, year-mon: string, count: bigint]

## Arrival Table

In [13]:
# Read the predefined FlightAware arrival table and register it for Spark SQL
# under the temp-view name "der_arr".
spark = SparkSession.builder.getOrCreate()
arrival_table_path = "s3://sita-coe-ds-prod-v1/mart/flightaware/der_arrival_latest"
df = spark.read.load(arrival_table_path)
df.createOrReplaceTempView("der_arr")

In [14]:
# Select the arrival columns used downstream and drop rows without a usable ident
# (null ident, ident equal to the registration, or ident/dest/orig that do not
# contain a run of 3 / 4 / 4 capital letters), then remove exact duplicate rows.
# NOTE(review): `ident != reg` evaluates to NULL when reg is NULL, so rows with a
# missing registration are filtered out as well - confirm this is intended.
# NOTE(review): the RLIKE patterns are unanchored, so they match anywhere in the value.
arrival_query = """
               SELECT id, id_timestamp, reg, ident, dest, orig FROM der_arr
               WHERE ident is not null
               AND ident != reg
               AND id is not null
               AND ident RLIKE '[A-Z][A-Z][A-Z]'
               AND dest RLIKE '[A-Z][A-Z][A-Z][A-Z]'
               AND orig RLIKE '[A-Z][A-Z][A-Z][A-Z]'
               """
df_arrival = spark.sql(arrival_query).dropDuplicates()

In [15]:
# Add calendar columns derived from the arrival frame's own id_timestamp:
# year, quarter, month and a 'yyyy-MM' label (column name 'year-mon').
# Every column must come from df_arrival; taking the quarter from
# df_departure.id_timestamp would reference a column of a different DataFrame.
arr_ts = df_arrival.id_timestamp
df_arrival = (
    df_arrival
    .withColumn('year', F.year(F.date_trunc('year', arr_ts)))
    .withColumn('quarter', F.quarter(F.date_trunc('quarter', arr_ts)))
    .withColumn('month', F.month(F.date_trunc('mon', arr_ts)))
    .withColumn('year-mon', F.date_format(F.col('id_timestamp'), 'yyyy-MM'))
)

In [16]:
# Derive the airline code from the leading three characters of the flight ident
df_arrival = df_arrival.withColumn("arr_airline_code", df_arrival.ident.substr(0, 3))

# Left-join the airline lookup on the ICAO airline code and keep only the airline's
# country, renamed so it cannot clash with the airport's country. The lookup table
# is broadcast, consistent with the departure-side join.
df_arrival = (
    df_arrival
    .join(F.broadcast(df_airline), df_arrival.arr_airline_code == df_airline.icao, 'left')
    .drop('alias', 'icao', 'callsign', 'iata', 'active')
)
df_arrival = df_arrival.withColumnRenamed('country', 'arr_airline_country')

# Left-join the (broadcast) airport lookup on the destination ICAO code and prefix
# the airport attributes with 'arr_'.
df_arrival = (
    df_arrival
    .join(F.broadcast(df_airport), df_arrival.dest == df_airport.icao, 'left')
    .drop('iata', 'icao', 'type', 'source')
)
df_arrival = (
    df_arrival
    .withColumnRenamed('airport', 'arr_airport')
    .withColumnRenamed('GEO', 'arr_geo')
    .withColumnRenamed('city', 'arr_city')
    .withColumnRenamed('country', 'arr_country')
    .withColumnRenamed('latitude', 'arr_lat')
    .withColumnRenamed('longitude', 'arr_lon')
    .withColumnRenamed('altitude', 'arr_alt')
    .withColumnRenamed('dst', 'arr_dst')
    .withColumnRenamed('tz_database', 'arr_tz')
)

# Keep only rows where every per-airport grouping key is known
df_arrival = df_arrival.filter('(arr_country is not null) and (dest is not null) and (arr_city is not null) and (arr_airport is not null)')

In [17]:
# Number of distinct flight ids per arrival airport and calendar period
arr_group_cols = ['arr_country', 'arr_city', 'dest', 'arr_airport', 'year', 'quarter', 'month', 'year-mon']
arr_counts_cached = (
    df_arrival
    .groupBy(*arr_group_cols)
    .agg(F.countDistinct('id').alias('count'))
    .cache()
)
# NOTE(review): the cache is requested on the aggregate above, while the name
# df_arr_perairport ends up bound to the repartitioned frame - confirm that a later
# unpersist() on df_arr_perairport actually releases the cached data.
df_arr_perairport = arr_counts_cached.repartition(1)

In [18]:
# Pivot the per-airport arrival counts twice - one column per year (summed) and one
# column per 'yyyy-MM' (first value) - then join both wide tables on the airport keys.
arr_airport_keys = ['arr_country', 'arr_city', 'dest', 'arr_airport']
df_arr_perairport_yearly = (
    df_arr_perairport.groupBy(*arr_airport_keys).pivot('year').agg(F.sum('count'))
)
df_arr_perairport_yearmon = (
    df_arr_perairport.groupBy(*arr_airport_keys).pivot('year-mon').agg(F.first('count'))
)
df_arr_perairport_merge = df_arr_perairport_yearly.join(
    df_arr_perairport_yearmon, arr_airport_keys, 'inner'
)
df_arr_perairport.unpersist()

DataFrame[arr_geo: string, year: int, month: int, year-mon: string, count: bigint]

## Combined Table

In [20]:
# Preview the merged per-airport departure table (yearly and year-month pivot columns)
df_dep_perairport_merge.show()

+-----------+--------+--------+--------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
|dep_geo_dep|2017_dep|2018_dep|2019_dep|2017-02_dep|2017-03_dep|2017-04_dep|2017-05_dep|2017-06_dep|2017-07_dep|2017-08_dep|2017-09_dep|2017-10_dep|2017-11_dep|2017-12_dep|2018-01_dep|2018-02_dep|2018-03_dep|2018-04_dep|2018-05_dep|2018-06_dep|2018-07_dep|2018-08_dep|2018-09_dep|2018-10_dep|2018-11_dep|2018-12_dep|2019-01_dep|2019-02_dep|2019-03_dep|2019-04_dep|2019-05_dep|2019-06_dep|2019-07_dep|2019-08_dep|2019-09_dep|2019-10_dep|2019-11_dep|
+-----------+--------+--------+--------+-----------+-----------+-----------+-----------+-----------+--

In [21]:
# Suffix every departure column with '_dep', then give the four airport key columns
# the shared names used for the final join.
# NOTE: this cell reassigns its inputs, so re-running it without first re-running
# the pivot cells would append the suffixes a second time.
dep_column_name_list = [name + '_dep' for name in df_dep_perairport_merge.columns]
df_dep_perairport_merge = df_dep_perairport_merge.toDF(*dep_column_name_list)
df_dep_perairport_merge = (
    df_dep_perairport_merge
    .withColumnRenamed('dep_country_dep', 'country')
    .withColumnRenamed('dep_city_dep', 'city')
    .withColumnRenamed('orig_dep', 'icao')
    .withColumnRenamed('dep_airport_dep', 'airport')
)

# Same treatment for arrivals with the '_arr' suffix. The column list is taken from
# df_arr_perairport_merge, the frame being renamed (not the undefined
# df_arr_percountry_merge, which raises NameError on a fresh kernel).
arr_column_name_list = [name + '_arr' for name in df_arr_perairport_merge.columns]
df_arr_perairport_merge = df_arr_perairport_merge.toDF(*arr_column_name_list)
df_arr_perairport_merge = (
    df_arr_perairport_merge
    .withColumnRenamed('arr_country_arr', 'country')
    .withColumnRenamed('arr_city_arr', 'city')
    .withColumnRenamed('dest_arr', 'icao')
    .withColumnRenamed('arr_airport_arr', 'airport')
)

# Inner join: only airports present in both the departure and the arrival table are kept
df_perairport_combined = df_dep_perairport_merge.join(
    df_arr_perairport_merge, ['country', 'city', 'icao', 'airport'], 'inner'
)

## Appendix

In [25]:
# Write the combined per-airport table as CSV with a header row, from a single partition
combined_csv_writer = (
    df_perairport_combined
    .repartition(1)
    .write.format('csv')
    .option('header', 'true')
    .option("encoding", "utf-8")
)
combined_csv_writer.save('s3://sita-coe-ds-dev-v1/stats/perairport_full')