# FlightAware Statistics
The notebook intends to leverage flightaware raw data and generate the turnaround statisitcs by building data engineering pipelines.

In [1]:
import sys
from pyspark.sql import functions as F
from pyspark.sql import SparkSession, Window
import numpy as np

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
2,application_1578380065523_0003,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.


In [3]:
# prepare airline code schema
df_airline = spark.read.format("csv").option("header", "true").option("delimiter", ",").option("encoding", "utf-8").load("s3://sita-coe-ds-dev-v1/jupyter/jovyan/airlines_openflight.csv")
df_airline = df_airline.dropna(how = 'any',  subset = ['icao'])
df_airline = df_airline.dropDuplicates(subset = ['icao'])
df_airline.count()

5854

In [4]:
# prepare airport code schema
df_airport = spark.read.format("csv").option("header", "true").option("delimiter", ",").option("encoding", "utf-8").load("s3://sita-coe-ds-dev-v1/jupyter/jovyan/airports_openflight.csv")
df_airport = df_airport.dropna(how = 'any',  subset = ['icao'])
df_airport = df_airport.dropDuplicates(subset = ['icao'])

In [4]:
#total_count = df_ta.agg(F.count('facility_name')).first()[0]
#df_ta.groupBy('facility_name').agg((100*F.count('facility_name')/total_count).alias('facility_count')).orderBy('facility_count', ascending = False).show(200)

## Departure Table

In [None]:
# Load FA data from predifined table and create TempView
spark = SparkSession.builder.getOrCreate()
df = spark.read.load("s3://sita-coe-ds-prod-v1/mart/flightaware/der_departure_latest")
df.createOrReplaceTempView("der_dep")

In [None]:
# filter der table by ident (i.e., null, helicopter, private jet etc.)
df_departure = spark.sql("""
               SELECT id, id_timestamp, reg, ident, orig FROM der_dep
               WHERE ident is not null
               AND ident != reg
               AND id is not null
               AND ident RLIKE '[A-Z][A-Z][A-Z]'
               AND orig RLIKE '[A-Z][A-Z][A-Z][A-Z]'
               """)
df_departure = df_departure.dropDuplicates()

In [None]:
# create year column (i.e., year 2017, 2018, 2019)
df_departure = df_departure.withColumn('year', F.year(F.date_trunc('year', df_departure.id_timestamp)))

In [None]:
# derive and filter airline code 
df_departure = df_departure.withColumn("dep_airline_code", df_departure.ident.substr(0,3))
# join airline code
df_departure = df_departure.join(df_airline, df_departure.dep_airline_code == df_airline.icao, 'left').drop('alias', 'icao', 'callsign', 'iata', 'active')
df_departure = df_departure.withColumnRenamed('country', 'dep_airline_country')
# join airport code
df_departure = df_departure.join(df_airport, df_departure.orig == df_airport.icao, 'left').drop('iata', 'icao', 'type', 'source')
df_departure = df_departure.withColumnRenamed('airport', 'dep_airport').withColumnRenamed('city', 'dep_city').withColumnRenamed('country', 'dep_country').withColumnRenamed('latitude', 'dep_lat').withColumnRenamed('longitude', 'dep_lon').withColumnRenamed('altitude', 'dep_alt').withColumnRenamed('dst', 'dep_dst').withColumnRenamed('tz_database', 'dep_tz')

In [None]:
# total number of (unique) flight
df_departure.agg(F.countDistinct('id')).show()

In [None]:
# total number of (unique) flight per airport country
df_dep_flightcount_percountry = df_departure.groupBy('dep_country').agg(F.countDistinct('id').alias('count_dep_flight_percountry'),
                                                                        F.countDistinct('reg').alias('count_reg_flight_percountry'),
                                                                        F.countDistinct('id', F.when((F.col("year") == 2017) & (F.col("year").isNotNull()),True)).alias("count_dep_flight_2017"),
                                                                        F.countDistinct('id', F.when((F.col("year") == 2018) & (F.col("year").isNotNull()),True)).alias("count_dep_flight_2018"),
                                                                        F.countDistinct('id', F.when((F.col("year") == 2019) & (F.col("year").isNotNull()),True)).alias("count_dep_flight_2019")).orderBy('count_dep_flight_percountry',ascending = False)
df_dep_flightcount_percountry = df_dep_flightcount_percountry.filter("dep_country is not null")

In [None]:
# total number of flight by airport city
df_dep_flightcount_percity = df_departure.groupBy('dep_country', 'dep_city').agg(F.countDistinct('id').alias('count_dep_flight_percity'),
                                                                                 F.countDistinct('reg').alias('count_reg_flight_percity'),
                                                                                 F.countDistinct('id', F.when((F.col("year") == 2017) & (F.col("year").isNotNull()),True)).alias("count_dep_flight_2017"),
                                                                                 F.countDistinct('id', F.when((F.col("year") == 2018) & (F.col("year").isNotNull()),True)).alias("count_dep_flight_2018"),
                                                                                 F.countDistinct('id', F.when((F.col("year") == 2019) & (F.col("year").isNotNull()),True)).alias("count_dep_flight_2019"))
df_dep_flightcount_percity = df_dep_flightcount_percity.filter("(dep_country is not null) and (dep_city is not null)")

In [None]:
# total number of flight by airport 
df_dep_flightcount_perairport = df_departure.groupBy('dep_country', 'dep_city','orig', 'dep_airport').agg(F.countDistinct('id').alias('count_dep_flight_perairport'),
                                                                                                          F.countDistinct('reg').alias('count_reg_flight_perairport'),
                                                                                                          F.countDistinct('id', F.when((F.col("year") == 2017) & (F.col("year").isNotNull()),True)).alias("count_dep_flight_2017"),
                                                                                                          F.countDistinct('id', F.when((F.col("year") == 2018) & (F.col("year").isNotNull()),True)).alias("count_dep_flight_2018"),
                                                                                                          F.countDistinct('id', F.when((F.col("year") == 2019) & (F.col("year").isNotNull()),True)).alias("count_dep_flight_2019"))
df_dep_flightcount_perairport = df_dep_flightcount_perairport.filter("(dep_country is not null) and (dep_city is not null) and (dep_airport is not null)")

In [None]:
df_dep_flightcount_percountry.repartition(1).write.format('csv').option('header', 'true').option("encoding", "utf-8").save('s3://sita-coe-ds-dev-v1/stats/percountry')
df_dep_flightcount_percity.repartition(1).write.format('csv').option('header', 'true').option("encoding", "utf-8").save('s3://sita-coe-ds-dev-v1/stats/percity')
df_dep_flightcount_perairport.repartition(1).write.format('csv').option('header', 'true').option("encoding", "utf-8").save('s3://sita-coe-ds-dev-v1/stats/perairport')

## Arrival Table

In [5]:
# Load FA data from predifined table and create TempView
spark = SparkSession.builder.getOrCreate()
df = spark.read.load("s3://sita-coe-ds-prod-v1/mart/flightaware/der_arrival_latest")
df.createOrReplaceTempView("der_arr")

In [6]:
# filter der table by ident (i.e., null, helicopter, private jet etc.)
df_arrival = spark.sql("""
               SELECT id, id_timestamp, reg, ident, dest, orig FROM der_arr
               WHERE ident is not null
               AND ident != reg
               AND id is not null
               AND ident RLIKE '[A-Z][A-Z][A-Z]'
               AND dest RLIKE '[A-Z][A-Z][A-Z][A-Z]'
               """)
df_arrival = df_arrival.dropDuplicates()

In [10]:
df_arrival.show(2)

+--------------------+-------------------+------+------+----+----+
|                  id|       id_timestamp|   reg| ident|dest|orig|
+--------------------+-------------------+------+------+----+----+
|MAU219-1563857148...|2019-07-23 04:45:48| 3BNAU|MAU219|FIMP|FMEE|
|AHY830-1562820362...|2019-07-11 04:46:02|4KAZ80|AHY830|UUWW|UBBG|
+--------------------+-------------------+------+------+----+----+
only showing top 2 rows

In [7]:
# create year column (i.e., year 2017, 2018, 2019)
df_arrival = df_arrival.withColumn('year', F.year(F.date_trunc('year', df_arrival.id_timestamp)))

In [8]:
# derive and filter airline code 
df_arrival = df_arrival.withColumn("arr_airline_code", df_arrival.ident.substr(0,3))
# join airline code
df_arrival = df_arrival.join(df_airline, df_arrival.arr_airline_code == df_airline.icao, 'left').drop('alias', 'icao', 'callsign', 'iata', 'active')
df_arrival = df_arrival.withColumnRenamed('country', 'arr_airline_country')
# join airport code
df_arrival = df_arrival.join(df_airport, df_arrival.dest == df_airport.icao, 'left').drop('iata', 'icao', 'type', 'source')
df_arrival = df_arrival.withColumnRenamed('airport', 'arr_airport').withColumnRenamed('city', 'arr_city').withColumnRenamed('country', 'arr_country').withColumnRenamed('latitude', 'arr_lat').withColumnRenamed('longitude', 'arr_lon').withColumnRenamed('altitude', 'arr_alt').withColumnRenamed('dst', 'arr_dst').withColumnRenamed('tz_database', 'arr_tz')

In [9]:
# total number of flight
df_arrival.agg(F.countDistinct('id')).show()

+------------------+
|count(DISTINCT id)|
+------------------+
|          94549052|
+------------------+

In [14]:
df_arrival.show(2)

+--------------------+-------------------+------+------+----+----+----+----------------+-------------------+-------------------+--------------------+---------+-----------+-------------+-------------+-------+--------+-------+----------------+
|                  id|       id_timestamp|   reg| ident|dest|orig|year|arr_airline_code|            airline|arr_airline_country|         arr_airport| arr_city|arr_country|      arr_lat|      arr_lon|arr_alt|timezone|arr_dst|          arr_tz|
+--------------------+-------------------+------+------+----+----+----+----------------+-------------------+-------------------+--------------------+---------+-----------+-------------+-------------+-------+--------+-------+----------------+
|MAU219-1563857148...|2019-07-23 04:45:48| 3BNAU|MAU219|FIMP|FMEE|2019|             MAU|      Air Mauritius|          Mauritius|Sir Seewoosagur R...|Plaisance|  Mauritius|   -20.430201|    57.683601|    186|       4|      N|Indian/Mauritius|
|AHY830-1562820362...|2019-07-11

In [None]:
# total number of flight per airport country
df_arr_flightcount_percountry = df_arrival.groupBy('arr_country').agg(F.countDistinct('id').alias('count_arr_flight_percountry'),
                                                                      F.countDistinct('reg').alias('count_arr_reg_percountry')
                                                                      F.countDistinct('id', F.when((F.col("year") == 2017) & (F.col("year").isNotNull()),True)).alias("count_arr_flight_2017"),
                                                                      F.countDistinct('id', F.when((F.col("year") == 2018) & (F.col("year").isNotNull()),True)).alias("count_arr_flight_2018"),
                                                                      F.countDistinct('id', F.when((F.col("year") == 2019) & (F.col("year").isNotNull()),True)).alias("count_arr_flight_2019")).orderBy('count_arr_flight_percountry',ascending = False)
df_arr_flightcount_percountry = df_arr_flightcount_percountry.filter("arr_country is not null")

In [None]:
# total number of flight by airport city
df_arr_flightcount_percity = df_arrival.groupBy('arr_country', 'arr_city').agg(F.countDistinct('id').alias('count_arr_flight_percity'),
                                                                               F.countDistinct('reg').alias('count_arr_reg_percity'),
                                                                             F.countDistinct('id', F.when((F.col("year") == 2017) & (F.col("year").isNotNull()),True)).alias("count_arr_flight_2017"),
                                                                             F.countDistinct('id', F.when((F.col("year") == 2018) & (F.col("year").isNotNull()),True)).alias("count_arr_flight_2018"),
                                                                             F.countDistinct('id', F.when((F.col("year") == 2019) & (F.col("year").isNotNull()),True)).alias("count_arr_flight_2019"))
df_arr_flightcount_percity = df_arr_flightcount_percity.filter("(arr_country is not null) and (arr_city is not null)")

In [None]:
# total number of flight by airport 
df_arr_flightcount_perairport = df_arrival.groupBy('arr_country', 'arr_city','dest', 'arr_airport').agg(F.countDistinct('id').alias('count_arr_flight_perairport'),
                                                                                                        F.countDistinct('reg').alias('count_arr_reg_perairport')
                                                                                                       F.countDistinct('id', F.when((F.col("year") == 2017) & (F.col("year").isNotNull()),True)).alias("count_arr_flight_2017"),
                                                                                                       F.countDistinct('id', F.when((F.col("year") == 2018) & (F.col("year").isNotNull()),True)).alias("count_arr_flight_2018"),
                                                                                                       F.countDistinct('id', F.when((F.col("year") == 2019) & (F.col("year").isNotNull()),True)).alias("count_arr_flight_2019"))
df_arr_flightcount_perairport = df_arr_flightcount_perairport.filter("(arr_country is not null) and (arr_city is not null) and (arr_airport is not null)")

In [None]:
df_arr_flightcount_percountry.repartition(1).write.format('csv').option('header', 'true').option("encoding", "utf-8").save('s3://sita-coe-ds-dev-v1/stats/percountry')
df_arr_flightcount_percity.repartition(1).write.format('csv').option('header', 'true').option("encoding", "utf-8").save('s3://sita-coe-ds-dev-v1/stats/percity')
df_arr_flightcount_perairport.repartition(1).write.format('csv').option('header', 'true').option("encoding", "utf-8").save('s3://sita-coe-ds-dev-v1/stats/perairport')

## Combined Table

In [None]:
# combine data into a single dataframe
df_bm = df_flightcount_perairport.join(df_flightcount_percity, ['dest_country', 'dest_city'], 'left')
df_bm = df_bm.join(df_flightcount_percountry, ['dest_country'], 'left')

In [None]:
# outliner removal based on TAT thresholds
#df_ta = df_ta.filter(" (turnaround > 10 and turnaround < 360) or turnaround is null")
#df_ta.count()

## Appendix

In [None]:
#df_dep_flightcount_perairport2017.repartition(1).write.format('csv').option('header', 'true').option("encoding", "utf-8").save('s3://sita-coe-ds-dev-v1/stats7/')
#df_dep_flightcount_perairport2018.repartition(1).write.format('csv').option('header', 'true').option("encoding", "utf-8").save('s3://sita-coe-ds-dev-v1/stats8/')
df_dep_flightcount_perairport2019.repartition(1).write.format('csv').option('header', 'true').option("encoding", "utf-8").save('s3://sita-coe-ds-dev-v1/stats9/')

## Airlines data

In [12]:

df_arr_airlines_wsss = df_arrival.filter(F.col('arr_airport' )== 'Singapore Changi Airport')
df_arr_airlines_wsss =df_arr_airlines_wsss.filter(F.col('year')==2019)

In [13]:
df_arr_airlines_wsss.show(2)

+--------------------+-------------------+-----+------+----+----+----+----------------+-------+-------------------+--------------------+---------+-----------+-------+----------+-------+--------+-------+--------------+
|                  id|       id_timestamp|  reg| ident|dest|orig|year|arr_airline_code|airline|arr_airline_country|         arr_airport| arr_city|arr_country|arr_lat|   arr_lon|arr_alt|timezone|arr_dst|        arr_tz|
+--------------------+-------------------+-----+------+----+----+----+----------------+-------+-------------------+--------------------+---------+-----------+-------+----------+-------+--------+-------+--------------+
|AXM717-1562820351...|2019-07-11 04:45:51|9MAFC|AXM717|WSSS|WMKK|2019|             AXM|AirAsia|           Malaysia|Singapore Changi ...|Singapore|  Singapore|1.35019|103.994003|     22|       8|      N|Asia/Singapore|
|AXM717-1563857134...|2019-07-23 04:45:34|9MAFD|AXM717|WSSS|WMKK|2019|             AXM|AirAsia|           Malaysia|Singapore Cha

In [14]:
df_arr_airlines_wsss.select('arr_airline_code').distinct().rdd.map(lambda r: r[0]).collect()

['BOX', 'TGW', 'TRK', 'RPH', 'CTM', 'RBA', 'ETD', 'SVA', 'MMZ', 'IDA', 'DIR', 'BKP', 'ABW', 'JSA', 'NCT', 'NEP', 'ERF', 'CLX', 'XAX', 'SLK', 'PAL', 'PIC', 'CHB', 'ETH', 'ACA', 'CSH', 'KAL', 'ATG', 'TEN', 'RGE', 'CCA', 'HBH', 'ORF', 'IGO', 'IAM', 'AIR', 'BHS', 'AXM', 'ANA', 'ICV', 'CDG', 'CHH', 'AXY', 'BAV', 'TLM', 'SIN', 'MYU', 'MZT', 'CSC', 'PAC', 'BAW', 'SEP', 'AWQ', 'GZP', 'UBA', 'BTK', 'EVA', 'MLT', 'GIA', 'CSN', 'VDA', 'CXA', 'JAL', 'RSD', 'DXB', 'CKK', 'AAH', 'LMG', 'REU', 'THA', 'CPA', 'VOZ', 'LNI', 'CQH', 'THY', 'DLH', 'ORB', 'FIN', 'RMY', 'VJT', 'ROJ', 'CNX', 'KLM', 'AHK', 'RKS', 'NRS', 'UAL', 'ANG', 'TMG', 'FJI', 'CEB', 'APG', 'AFR', 'TOM', 'TUA', 'CEF', 'CSS', 'ADB', 'KMI', 'FDV', 'ATN', 'DAL', 'GTI', 'MMA', 'IFA', 'QFA', 'ESR', 'CAL', 'FDX', 'VTI', 'JCO', 'MXD', 'DRK', 'AXB', 'BKK', 'AJX', 'HVN', 'LGT', 'ULK', 'UAE', 'MED', 'UBG', 'IJM', 'CES', 'MAS', 'JJA', 'WMK', 'AAR', 'AZA', 'QQE', 'AAB', 'TBJ', 'SWR', 'DUB', 'TRW', 'AIC', 'VIP', 'MAU', 'LOT', 'HVM', 'AOJ', 'GAP', 'CUH'

In [15]:
df_arrival.select('arr_airline_code').distinct().rdd.map(lambda r: r[0]).collect()

['TEU', 'BOX', 'SCW', 'EXY', 'TVJ', 'PEA', 'NBC', 'GIS', 'PHV', 'FAV', 'WQN', 'FMY', 'MIZ', 'DAW', 'KGL', 'YUL', 'RKP', 'DWR', 'PAW', 'NWI', 'JJS', '4RR', 'CLQ', 'SUC', 'FIZ', 'CCK', 'IHD', 'MDX', 'N3D', 'UAB', 'MHF', 'CNU', 'HQJ', 'AKD', 'TLI', 'VRI', 'WNG', 'VKY', 'TGW', 'HEL', 'AMF', 'TRK', 'EGL', 'PHA', 'ULA', 'CXM', 'OPE', 'RRY', 'LWG', 'NVJ', 'VAM', 'APM', 'JAJ', 'IKY', 'IBJ', 'MDL', 'YVG', 'MOR', 'MWJ', 'APX', 'KFO', 'IML', 'GGI', '451', 'INL', 'AAT', 'CUD', 'LLA', 'RPH', 'FOF', '26W', 'AIV', 'CFX', 'DJA', '5XY', 'DNS', 'FDC', 'KAT', 'HAE', 'ELI', 'QDN', 'NNA', 'OLF', 'SXB', 'FWJ', 'XOJ', 'NDL', 'CJT', 'CRL', 'LGL', 'ENT', 'POL', 'MSY', 'CYO', 'FFR', 'LDX', 'MMM', 'FRN', 'DSP', '9HH', 'VLX', 'TXM', 'YTO', 'JNN', 'MGF', 'DTF', 'CGR', 'VII', 'MXX', 'TPD', 'BRB', 'OAU', '03C', '7ZQ', 'E02', 'WSF', 'ONO', '7TT', 'LTE', 'IRM', 'TPC', 'OTA', 'CTM', 'GEG', 'FMI', 'ARL', 'WML', 'KHK', 'JSL', 'RML', 'IFT', 'PSU', 'STF', 'ZON', 'GDP', 'JHE', 'NGS', 'FJS', 'EWS', 'MIS', 'ECS', 'UKK', 'TAQ'