# FlightAware Statistics
This notebook builds data engineering pipelines on top of raw FlightAware data to generate turnaround statistics.

In [1]:
import sys
from pyspark.sql import functions as F
from pyspark.sql.functions import acos, cos, sin, lit, toRadians
from pyspark.sql import SparkSession, Window
import numpy as np

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1581023036489_0001,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
# Airline reference data: read the CSV (first row is the header), then keep
# only rows with an ICAO code and a single row per ICAO code.
df_airline = (
    spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", ",")
    .option("encoding", "utf-8")
    .load("s3://sita-coe-ds-dev-v1/jupyter/jovyan/airlines_openflight.csv")
    .dropna(how="any", subset=["icao"])
    .dropDuplicates(subset=["icao"])
)

In [3]:
# Airport reference data: read the CSV (first row is the header), then keep
# only rows with an ICAO code and a single row per ICAO code.
df_airport = (
    spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", ",")
    .option("encoding", "utf-8")
    .load("s3://sita-coe-ds-dev-v1/jupyter/jovyan/airports_openflight_wgeo_updated_20200129.csv")
    .dropna(how="any", subset=["ICAO"])
    .dropDuplicates(subset=["ICAO"])
)

In [4]:
# Step 1: build geo_seg from GEO, carving out Canada/United States, Africa,
# China and India into their own segments (checked in that order). Every other
# row keeps its GEO value; the GEO column itself is dropped afterwards.
df_airport = df_airport.withColumn(
    'geo_seg',
    F.when(F.col('country').isin('Canada', 'United States'), 'NA(CA&US)')
    .when(F.col('Continent/Region') == 'Africa', 'Africa')
    .when(F.col('country') == 'China', 'China')
    .when(F.col('country') == 'India', 'India')
    .otherwise(F.col('GEO')),
).drop('GEO')

# Step 2: relabel the remaining AMER / APAC / MEIA values so the label states
# what was carved out of them; all other values pass through unchanged.
df_airport = df_airport.withColumn(
    'geo_seg',
    F.when(F.col('geo_seg') == 'AMER', 'AMER(Excl US&CA)')
    .when(F.col('geo_seg') == 'APAC', 'APAC(Excl China)')
    .when(F.col('geo_seg') == 'MEIA', 'MEIA(Excl Africa&India)')
    .otherwise(F.col('geo_seg')),
)

In [5]:
# Aircraft model reference data: read the CSV (first row is the header), then
# keep only rows with an ICAO code and a single row per ICAO code.
df_aircraft = (
    spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", ",")
    .option("encoding", "utf-8")
    .load("s3://sita-coe-ds-dev-v1/jupyter/jovyan/aircrafts_openflight.csv")
    .dropna(how="any", subset=["icao"])
    .dropDuplicates(subset=["icao"])
)

In [6]:
def dist(lat_x, long_x, lat_y, long_y):
    """
    Great-circle distance between two points as a Spark Column expression.

    Uses the spherical law of cosines and scales the central angle by 6371.0
    (the Earth's mean radius in kilometres).

    Inspired from https://stackoverflow.com/questions/38994903/how-to-sum-distances-between-data-points-in-a-dataset-using-pyspark

    Args:
        lat_x, long_x: latitude / longitude of the first point, in degrees
            (column name or Column).
        lat_y, long_y: latitude / longitude of the second point, in degrees
            (column name or Column).

    Returns:
        A Column holding the distance. It is null when any coordinate is null.
    """
    cos_angle = (
        sin(toRadians(lat_x)) * sin(toRadians(lat_y)) +
        cos(toRadians(lat_x)) * cos(toRadians(lat_y)) *
            cos(toRadians(long_x) - toRadians(long_y))
    )
    # Floating-point rounding can push the cosine marginally outside [-1, 1]
    # for identical or near-identical points, which makes acos return NaN.
    # Clamp it back into range. when/otherwise is used rather than
    # least/greatest because the latter skip nulls and would turn missing
    # coordinates into a bogus distance.
    clamped = (
        F.when(cos_angle > 1.0, lit(1.0))
        .when(cos_angle < -1.0, lit(-1.0))
        .otherwise(cos_angle)
    )
    return acos(clamped) * lit(6371.0)

## Data Preparation

In [7]:
# Load FA data from a predefined table and register it as a temp view.
spark = SparkSession.builder.getOrCreate()

# Alternative source kept for reference:
#   s3://sita-coe-ds-prod-v1/mart/flightaware/der_fadooce_latest
# NOTE(review): the recorded error of the following query cell shows that the
# table loaded here has no clock_on / clock_off / aat_arr / adt_dep columns,
# which the downstream cells select. Presumably the alternative source above
# is the one that carries them; confirm before switching.
df = spark.read.load(path="s3://sita-coe-ds-prod-v1/mart/flightaware/position")

# The SQL cells below read this DataFrame under the name `der_dep`.
df.createOrReplaceTempView("der_dep")

In [9]:
# Preview the loaded table (20 rows, long values truncated; the defaults spelled out).
df.show(n=20, truncate=True)

+----------+------------+-----+---------+--------+-------------------+----+-------------------+--------------------+-----------------+---+-------+------+--------------------+---------+-----------+------------+----+-------------------+---------+-----+------+--------+----------+--------------------+-------------------+------------------+------------------+-------------------+----------+-------+------------------+
|air_ground|aircrafttype|  alt|altChange|atcident|              clock|dest|                edt|       facility_hash|    facility_name| gs|heading| hexid|                  id|    ident|        lat|         lon|orig|               pitr|      reg|speed|squawk|    type|updateType|    _input_file_name|_record_ingest_date|record_ingest_time|record_ingest_hour|       id_timestamp|   id_date|id_hour|record_ingest_date|
+----------+------------+-----+---------+--------+-------------------+----+-------------------+--------------------+-----------------+---+-------+------+-----------------

In [8]:
# Strict flight selection from the der_dep view. A row is kept only when:
#   - ident and id are present,
#   - ident differs from reg,
#   - ident contains three consecutive capital letters, and
#   - orig or dest contains four consecutive capital letters.
# Exact duplicate rows are removed afterwards.
# NOTE(review): the recorded output of this cell is an AnalysisException:
# clock_on, clock_off, aat_arr and adt_dep are not columns of the der_dep view.
# NOTE(review): the next cell assigns df_departure again with a looser filter,
# so this result is overwritten; confirm which selection is intended.
departure_sql = """
               SELECT id, id_timestamp, reg, aircrafttype, ident, orig, dest, clock_on, clock_off, aat_arr, adt_dep FROM der_dep
               WHERE ident is not null
               AND ident != reg
               AND id is not null
               AND ident RLIKE '[A-Z][A-Z][A-Z]'
               AND (orig RLIKE '[A-Z][A-Z][A-Z][A-Z]' OR dest RLIKE '[A-Z][A-Z][A-Z][A-Z]')
               """
df_departure = spark.sql(departure_sql).distinct()

"cannot resolve '`clock_on`' given input columns: [der_dep.atcident, der_dep.record_ingest_time, der_dep._record_ingest_date, der_dep.speed, der_dep.gs, der_dep.orig, der_dep.heading, der_dep.ident, der_dep.record_ingest_hour, der_dep.id, der_dep.id_date, der_dep.edt, der_dep.id_timestamp, der_dep.altChange, der_dep.clock, der_dep.lon, der_dep.facility_hash, der_dep.aircrafttype, der_dep.alt, der_dep.updateType, der_dep.reg, der_dep.pitr, der_dep.lat, der_dep.facility_name, der_dep.id_hour, der_dep.squawk, der_dep.type, der_dep.hexid, der_dep.air_ground, der_dep.dest, der_dep.record_ingest_date, der_dep._input_file_name]; line 2 pos 78;\n'Project [id#172, id_timestamp#187, reg#178, aircrafttype#160, ident#173, orig#176, dest#165, 'clock_on, 'clock_off, 'aat_arr, 'adt_dep]\n+- Filter (((isnotnull(ident#173) && NOT (ident#173 = reg#178)) && isnotnull(id#172)) && (ident#173 RLIKE [A-Z][A-Z][A-Z] && (orig#176 RLIKE [A-Z][A-Z][A-Z][A-Z] || dest#165 RLIKE [A-Z][A-Z][A-Z][A-Z])))\n   +- Subqu

In [None]:
# Loose flight selection from the der_dep view: id must be present and orig or
# dest must contain four consecutive capital letters. There is no condition on
# ident in this variant. Exact duplicate rows are removed afterwards.
# NOTE(review): clock_on, clock_off, aat_arr and adt_dep must exist in the
# der_dep view for this query to resolve.
departure_sql = """
               SELECT id, id_timestamp, reg, aircrafttype, ident, orig, dest, clock_on, clock_off, aat_arr, adt_dep FROM der_dep
               WHERE id is not null
               AND (orig RLIKE '[A-Z][A-Z][A-Z][A-Z]' OR dest RLIKE '[A-Z][A-Z][A-Z][A-Z]')
               """
df_departure = spark.sql(departure_sql).distinct()

In [None]:
# Calendar breakdown of id_timestamp: numeric year, quarter and month columns.
df_departure = (
    df_departure
    .withColumn('year', F.year(F.date_trunc('year', df_departure['id_timestamp'])))
    .withColumn('quarter', F.quarter(F.date_trunc('quarter', df_departure['id_timestamp'])))
    .withColumn('month', F.month(F.date_trunc('mon', df_departure['id_timestamp'])))
)

In [None]:
# filter particular year-date coverage
#df_departure = df_departure.filter(" (year == 2018)   ")

In [None]:
# total number of (unique) flight per SITA GEO
#df_departure.agg(F.countDistinct('id').alias('flight_count')).show()

In [None]:
# Restrict the date coverage: 2017 from the second quarter onwards, plus all
# of 2018 and 2019.
df_departure = df_departure.where(
    " (year == 2017 and quarter > 1) or (year == 2018) or (year == 2019)   "
)

In [None]:
# Boolean coverage flags used as grouping keys further down:
#   dep_arr - at least one of aat_arr / adt_dep is populated
#   on_off  - at least one of clock_on / clock_off is populated
df_departure = (
    df_departure
    .withColumn('dep_arr', F.col('aat_arr').isNotNull() | F.col('adt_dep').isNotNull())
    .withColumn('on_off', F.col('clock_on').isNotNull() | F.col('clock_off').isNotNull())
)

In [None]:
# Derive the airline code from ident.
# NOTE(review): Spark's substr is 1-based; the start position 0 is kept as in
# the original. Confirm it yields the leading three characters.
df_departure = df_departure.withColumn("airline_code", df_departure["ident"].substr(0, 3))

# Left-join the (broadcast) airline reference on the ICAO code, discard the
# reference columns that are not needed, and disambiguate the airline's country.
df_departure = (
    df_departure
    .join(F.broadcast(df_airline), df_departure["airline_code"] == df_airline["icao"], 'left')
    .drop('alias', 'icao', 'callsign', 'iata', 'active')
    .withColumnRenamed('country', 'airline_country')
)

In [None]:
# Join origin airport: left-join the (broadcast) airport reference on the
# origin ICAO code and discard the reference columns that are not needed.
df_departure = (
    df_departure
    .join(F.broadcast(df_airport), df_departure["orig"] == df_airport["ICAO"], 'left')
    .drop('IATA', 'ICAO', 'Type', 'Source', 'Timezone', 'dst', 'tz_dataset', 'Continent/Region')
)

# Prefix the remaining airport attributes with orig_ so they cannot clash with
# the destination airport attributes.
for old_name, new_name in [
    ('Airport Names', 'orig_airport'),
    ('geo_seg', 'orig_geo'),
    ('City', 'orig_city'),
    ('Country', 'orig_country'),
    ('Latitude', 'orig_lat'),
    ('Longitude', 'orig_lon'),
    ('Altitude', 'orig_alt'),
    ('Region Code', 'orig_region'),
]:
    df_departure = df_departure.withColumnRenamed(old_name, new_name)

In [None]:
# Join destination airport: left-join the (broadcast) airport reference on the
# destination ICAO code and discard the reference columns that are not needed.
df_departure = (
    df_departure
    .join(F.broadcast(df_airport), df_departure["dest"] == df_airport["ICAO"], 'left')
    .drop('IATA', 'ICAO', 'Type', 'Source', 'Timezone', 'dst', 'tz_dataset', 'Continent/Region')
)

# Prefix the remaining airport attributes with dest_ so they cannot clash with
# the origin airport attributes.
for old_name, new_name in [
    ('Airport Names', 'dest_airport'),
    ('geo_seg', 'dest_geo'),
    ('City', 'dest_city'),
    ('Country', 'dest_country'),
    ('Latitude', 'dest_lat'),
    ('Longitude', 'dest_lon'),
    ('Altitude', 'dest_alt'),
    ('Region Code', 'dest_region'),
]:
    df_departure = df_departure.withColumnRenamed(old_name, new_name)

In [None]:
# Join aircraft model: left-join the (broadcast) aircraft reference on the ICAO
# type code and drop the reference code columns afterwards.
df_departure = (
    df_departure
    .join(F.broadcast(df_aircraft), df_departure["aircrafttype"] == df_aircraft["icao"], 'left')
    .drop('iata', 'icao')
)

In [None]:
# post-join filtering
#df_departure = df_departure.filter('(orig_airport is not null) and (dest_airport is not null) and (aircrafttype is not null) and (orig_city is not null) and (dest_city is not null) and (orig is not null) and (dest is not null) and (orig_airport is not null) and (dest_airport is not null) and (orig_geo is not null) and (dest_geo is not null)')

In [None]:
# Add the origin -> destination distance, then replace missing distances with 0.
df_departure = (
    df_departure
    .withColumn('dist', dist('orig_lat', 'orig_lon', 'dest_lat', 'dest_lon'))
    .na.fill(0, subset=['dist'])
)

In [None]:
# Haul classification by dist:
#   SH: dist <= 1500    MH: 1500 < dist <= 4000    LH: dist > 4000
# NOTE(review): dist was filled with 0 where it could not be computed, so those
# flights are classified as 'SH'; confirm that this is intended.
df_departure = df_departure.withColumn(
    'flight_type',
    F.when(F.col('dist') <= 1500, 'SH')
    .when((F.col('dist') > 1500) & (F.col('dist') <= 4000), 'MH')
    .when(F.col('dist') > 4000, 'LH'),
)

In [None]:
#df_departure.filter("(dist > 0) and ((orig_geo is null) or (dest_geo is null))").count()

## By Geo

In [None]:
# Flight statistics per origin/destination SITA GEO segment.
# The same key list drives both the grouping and the final ordering.
# NOTE(review): aircrafttype is a grouping key, so aircraftmodel_count (a
# distinct count of aircrafttype) cannot exceed 1 in any output row.
geo_keys = ['dep_arr', 'on_off', 'year', 'quarter', 'aircrafttype', 'flight_type', 'orig_geo', 'dest_geo']
df_bygeo = (
    df_departure
    .groupBy(*geo_keys)
    .agg(
        F.countDistinct('id').alias('flight_count'),
        F.countDistinct('reg').alias('reg_count'),
        F.countDistinct('aircrafttype').alias('aircraftmodel_count'),
        F.sum('dist').alias('total_distance'),
    )
    .orderBy(*geo_keys)
)

In [None]:
#df_bygeo.show(100)

## By Country

In [None]:
# Flight statistics per origin/destination country.
# The same key list drives both the grouping and the final ordering.
# NOTE(review): aircrafttype is a grouping key, so aircraftmodel_count (a
# distinct count of aircrafttype) cannot exceed 1 in any output row.
country_keys = ['dep_arr', 'on_off', 'year', 'quarter', 'aircrafttype', 'flight_type', 'orig_country', 'dest_country']
df_bycountry = (
    df_departure
    .groupBy(*country_keys)
    .agg(
        F.countDistinct('id').alias('flight_count'),
        F.countDistinct('reg').alias('reg_count'),
        F.countDistinct('aircrafttype').alias('aircraftmodel_count'),
        F.sum('dist').alias('total_distance'),
    )
    .orderBy(*country_keys)
)

## By Region

In [None]:
# Flight statistics per origin/destination region.
# The same key list drives both the grouping and the final ordering.
# NOTE(review): aircrafttype is a grouping key, so aircraftmodel_count (a
# distinct count of aircrafttype) cannot exceed 1 in any output row.
region_keys = ['dep_arr', 'on_off', 'year', 'quarter', 'aircrafttype', 'flight_type', 'orig_region', 'dest_region']
df_byregion = (
    df_departure
    .groupBy(*region_keys)
    .agg(
        F.countDistinct('id').alias('flight_count'),
        F.countDistinct('reg').alias('reg_count'),
        F.countDistinct('aircrafttype').alias('aircraftmodel_count'),
        F.sum('dist').alias('total_distance'),
    )
    .orderBy(*region_keys)
)

## By Airline

In [None]:
# Flight statistics per airline code.
# The same key list drives both the grouping and the final ordering.
# NOTE(review): aircrafttype is a grouping key, so aircraftmodel_count (a
# distinct count of aircrafttype) cannot exceed 1 in any output row.
airline_keys = ['dep_arr', 'on_off', 'year', 'quarter', 'aircrafttype', 'flight_type', 'airline_code']
df_byairline = (
    df_departure
    .groupBy(*airline_keys)
    .agg(
        F.countDistinct('id').alias('flight_count'),
        F.countDistinct('reg').alias('reg_count'),
        F.countDistinct('aircrafttype').alias('aircraftmodel_count'),
        F.sum('dist').alias('total_distance'),
    )
    .orderBy(*airline_keys)
)

## By Airport

In [None]:
# total number of (unique) flights per origin/destination airport pair
#df_byairport = df_departure.groupBy('year', 'orig_airport', 'dest_airport').agg(F.countDistinct('id').alias('flight_count'),F.countDistinct('aircrafttype').alias('aircraft_count'), F.sum('dist').alias('total_distance')).orderBy( 'year', 'orig_airport','dest_airport')

## Export

In [None]:
# Write each aggregate as a single CSV part file (with header) under its own
# S3 prefix. The by-region prefix previously carried a mistyped date
# ("202000206"); it now matches the "20200206_1" suffix of its siblings.
# NOTE(review): repartition(1) shuffles the rows, so the orderBy applied when
# these frames were built is presumably not preserved in the written files;
# confirm whether sorted output is required.
df_bygeo.repartition(1).write.format('csv').option('header', 'true').option("encoding", "utf-8").save('s3://sita-coe-ds-dev-v1/stats/bygeo_co_20200206_1')
df_bycountry.repartition(1).write.format('csv').option('header', 'true').option("encoding", "utf-8").save('s3://sita-coe-ds-dev-v1/stats/bycountry_co_20200206_1')
df_byregion.repartition(1).write.format('csv').option('header', 'true').option("encoding", "utf-8").save('s3://sita-coe-ds-dev-v1/stats/byregion_co_20200206_1')

In [None]:
# Flight-level rows for a single airline (airline_code 'UAE'), followed by a
# row count as a quick sanity check.
df_airline_icao = df_departure.where(F.col('airline_code') == 'UAE')
df_airline_icao.count()

In [None]:
# df_byairport.repartition(1).write.format('csv').option('header', 'true').option("encoding", "utf-8").save('s3://sita-coe-ds-dev-v1/stats/byairport_co')

In [None]:
# Write the per-airline aggregate and the flight-level UAE subset, each as a
# single CSV part file with a header row.
(
    df_byairline
    .repartition(1)
    .write.format('csv')
    .option('header', 'true')
    .option("encoding", "utf-8")
    .save('s3://sita-coe-ds-dev-v1/stats/byairline_co_20200203')
)
(
    df_airline_icao
    .repartition(1)
    .write.format('csv')
    .option('header', 'true')
    .option("encoding", "utf-8")
    .save('s3://sita-coe-ds-dev-v1/stats/uae_co_20200203')
)