# Turnaround Statistics
The notebook intends to leverage FlightAware raw data and generate the turnaround statistics by building data engineering pipelines.

## Data preparation

In [1]:
import sys
from pyspark.sql import functions as F
from pyspark.sql import SparkSession, Window
import numpy as np

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1574691327262_0001,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
# Load the FA (FlightAware) turnaround data from der_turnaround and register it
# as the Spark SQL temp view "der_ta" so later cells can query it with spark.sql.
spark = SparkSession.builder.getOrCreate()
df = spark.read.load(path="s3://sita-coe-ds-dev-v1/mart/der_turnaround")
df.createOrReplaceTempView(name="der_ta")
# Preview the first 10 rows of the raw table.
df.show(n=10)

+--------------------+------------+----+--------+----+-------------------+-------------------+----+----------------+--------------------+-------------------+----+------+-------+---------+----------+--------------------+-------------------+-------+-----+-----+------+----------+----------+--------------------+-------------------+------------------+------------------+-------+---------------------+-----------------------+--------------------+---------+---------+-------------------+-------------------+-------------------+------------+--------------------+-------+-------+-------+-----------------+-----------------+---------+--------------------+-------------------+-------+-------------+------------+--------+--------------------+-----------------------+----------------------+----------------------+----------------------+--------+-------+----------------+----------------+--------+-------+-------+-------+-------------------+----------------------+---------------------+---------------------+----

In [3]:
# Select the turnaround-relevant columns from der_ta and filter by ident
# (intent: drop null idents, helicopters, private jets, etc.):
#   - ident must be present,
#   - ident must differ from the registration,
#   - ident must contain three consecutive upper-case letters.
# Exact duplicate rows are removed afterwards.
# NOTE(review): the RLIKE pattern is unanchored, so it matches three capitals
# anywhere in ident, not only at its start -- confirm this is intended.
df_ta = spark.sql("""
               SELECT ident, reg, aircrafttype, dest, orig, edt, eta, status, clock_on, clock_off FROM der_ta
               WHERE ident is not null
               AND ident != reg
               AND ident RLIKE '[A-Z][A-Z][A-Z]'
               """).dropDuplicates()

In [4]:
#df_ta = spark.sql("""
#               SELECT ident, reg, dest, orig, edt, eta FROM der_ta
#               """)
#df_ta = df_ta.dropDuplicates()

In [5]:
# Create the turnaround column: minutes elapsed between clock_on and clock_off,
# stored as decimal(8,1). Rows with a non-positive turnaround are discarded;
# rows where turnaround could not be computed (null) are kept.
df_ta = (
    df_ta
    .withColumn(
        'turnaround',
        (
            (F.unix_timestamp(F.col('clock_off')) - F.unix_timestamp(F.col('clock_on'))) / 60
        ).cast('decimal(8,1)'),
    )
    .filter(" turnaround > 0 or turnaround is null")
)

In [6]:
# Outlier removal based on quantile statistics:
#   1. compute the approximate 10th and 90th turnaround percentiles per destination airport,
#   2. attach them to every row of df_ta,
#   3. keep rows inside the [q10, q90] band (rows with a null turnaround are kept too),
#   4. drop the helper percentile columns again.
df_out_stats = (
    df_ta
    .groupBy('dest')
    .agg(
        F.expr('percentile_approx(turnaround, 0.1)').alias('quantile10_ta_perairport'),
        F.expr('percentile_approx(turnaround, 0.9)').alias('quantile90_ta_perairport'),
    )
)
df_ta = (
    df_ta
    .join(df_out_stats, ['dest'], 'left')
    .filter(" ((turnaround >= quantile10_ta_perairport) and (turnaround <= quantile90_ta_perairport)) or turnaround is null")
    .drop('quantile10_ta_perairport', 'quantile90_ta_perairport')
)

In [7]:
# outlier removal based on TAT thresholds
#df_ta = df_ta.filter(" (turnaround > 10 and turnaround < 360) or turnaround is null")
#df_ta.count()

In [8]:
# Create the turnaround month column (month-of-year number), derived from
# clock_on after truncating it to the start of its month.
df_ta = df_ta.withColumn(
    'ta_month',
    F.month(F.date_trunc('mon', F.col('clock_on'))),
)

In [9]:
# Prepare the airport code reference table: read airports_openflight.csv
# (header row, comma-delimited, UTF-8), discard rows without an ICAO code,
# and keep a single row per ICAO code.
df_airport = (
    spark.read
    .format("csv")
    .options(header="true", delimiter=",", encoding="utf-8")
    .load("s3://sita-coe-ds-dev-v1/jupyter/jovyan/airports_openflight.csv")
    .dropna(how='any', subset=['icao'])
    .dropDuplicates(subset=['icao'])
)

In [10]:
# Prepare the airline code reference table: read airlines_openflight.csv
# (header row, comma-delimited, UTF-8), discard rows without an ICAO code,
# and keep a single row per ICAO code.
df_airline = (
    spark.read
    .format("csv")
    .options(header="true", delimiter=",", encoding="utf-8")
    .load("s3://sita-coe-ds-dev-v1/jupyter/jovyan/airlines_openflight.csv")
    .dropna(how='any', subset=['icao'])
    .dropDuplicates(subset=['icao'])
)

In [11]:
# Prepare the aircraft type reference table: read aircrafts_openflight.csv
# (header row, comma-delimited, UTF-8), discard rows without an ICAO code,
# and keep a single row per ICAO code.
df_aircraft = (
    spark.read
    .format("csv")
    .options(header="true", delimiter=",", encoding="utf-8")
    .load("s3://sita-coe-ds-dev-v1/jupyter/jovyan/aircrafts_openflight.csv")
    .dropna(how='any', subset=['icao'])
    .dropDuplicates(subset=['icao'])
)

In [12]:
# Prepare the aircraft category reference table: read aircrafttype_flightpredictor.csv
# (header row, comma-delimited, UTF-8), discard rows without an ICAO code,
# and keep a single row per ICAO code.
df_category = (
    spark.read
    .format("csv")
    .options(header="true", delimiter=",", encoding="utf-8")
    .load("s3://sita-coe-ds-dev-v1/jupyter/jovyan/aircrafttype_flightpredictor.csv")
    .dropna(how='any', subset=['icao'])
    .dropDuplicates(subset=['icao'])
)

## Data engineering (normalized turnaround table)

In [13]:
# Derive airline_code as ident.substr(0, 3), then keep only rows whose dest
# contains four consecutive upper-case letters.
# NOTE(review): Spark substring positions are 1-based; a start position of 0
# appears to behave like 1 (leading three characters) -- confirm.
df_ta = (
    df_ta
    .withColumn("airline_code", F.col("ident").substr(0, 3))
    .filter(" dest RLIKE '[A-Z][A-Z][A-Z][A-Z]' ")
)

In [14]:
# Create 0/1 indicator columns for the turnaround categories (minutes):
#   ta_00_30: <= 30, ta_30_60: (30, 60], ta_60_90: (60, 90],
#   ta_90_120: (90, 120], ta_120: > 120.
# A row with a null turnaround matches no condition and gets 0 in every column.
df_ta = (
    df_ta
    .withColumn(
        'ta_00_30',
        F.when(F.col('turnaround') <= 30, F.lit(1)).otherwise(F.lit(0)),
    )
    .withColumn(
        'ta_30_60',
        F.when((F.col('turnaround') <= 60) & (F.col('turnaround') > 30), F.lit(1)).otherwise(F.lit(0)),
    )
    .withColumn(
        'ta_60_90',
        F.when((F.col('turnaround') <= 90) & (F.col('turnaround') > 60), F.lit(1)).otherwise(F.lit(0)),
    )
    .withColumn(
        'ta_90_120',
        F.when((F.col('turnaround') <= 120) & (F.col('turnaround') > 90), F.lit(1)).otherwise(F.lit(0)),
    )
    .withColumn(
        'ta_120',
        F.when(F.col('turnaround') > 120, F.lit(1)).otherwise(F.lit(0)),
    )
)

In [15]:
# Create the benchmark table: per destination airport, the mean turnaround and
# its approximate 25th / 50th / 90th percentiles.
df_bm_perairport = (
    df_ta
    .groupBy('dest')
    .agg(
        F.avg(F.col("turnaround")).alias('avg_ta_perairport'),
        F.expr('percentile_approx(turnaround, 0.25)').alias('benchmark25_ta_perairport'),
        F.expr('percentile_approx(turnaround, 0.5)').alias('benchmark50_ta_perairport'),
        F.expr('percentile_approx(turnaround, 0.9)').alias('benchmark90_ta_perairport'),
    )
)

In [16]:
# Derive the turnaround status of every row relative to its airport's median
# turnaround (benchmark50_ta_perairport):
#   ta_below: turnaround <  90 % of the median
#   ta_on:    turnaround in [90 %, 110 %) of the median
#   ta_above: turnaround >= 110 % of the median
# A row with a null turnaround matches no condition and gets 0 in all three flags.
df_bm_der = df_ta.join(df_bm_perairport, ['dest'], 'left').drop('avg_ta_perairport')
df_bm_der = df_bm_der.withColumn('ta_below', F.when(F.col('turnaround') < 0.9*F.col('benchmark50_ta_perairport'), F.lit(1)).otherwise(F.lit(0)))
df_bm_der = df_bm_der.withColumn('ta_on', F.when((F.col('turnaround') < 1.1*F.col('benchmark50_ta_perairport')) & (F.col('turnaround') >= 0.9*F.col('benchmark50_ta_perairport')), F.lit(1)).otherwise(F.lit(0)))
df_bm_der = df_bm_der.withColumn('ta_above', F.when(F.col('turnaround') >= 1.1*F.col('benchmark50_ta_perairport'), F.lit(1)).otherwise(F.lit(0)))
# The benchmark columns were only needed to derive the flags above.
# DataFrame.drop silently ignores unknown column names, so the spelling of
# 'benchmark50_ta_perairport' matters: a misspelled name leaves the column in place.
df_bm_der = df_bm_der.drop('benchmark25_ta_perairport', 'benchmark50_ta_perairport', 'benchmark90_ta_perairport')

In [17]:
# Join airline reference data by airline code (left join, so rows without a
# matching airline are kept), drop the airline columns that are not needed,
# and prefix the remaining generic column names with "airline_".
df_mg = (
    df_bm_der
    .join(df_airline, df_bm_der.airline_code == df_airline.icao, 'left')
    .drop('alias', 'iata', 'icao', 'callsign')
    .withColumnRenamed('country', 'airline_country')
    .withColumnRenamed('active', 'airline_active')
)

In [18]:
# Join airport reference data by destination airport code (left join), drop
# the airport columns that are not needed, and rename the remaining airport
# attributes with a "dest_" prefix.
df_mg = (
    df_mg
    .join(df_airport, df_mg.dest == df_airport.icao, 'left')
    .drop('iata', 'icao', 'type', 'source')
    .withColumnRenamed('airport', 'dest_airport')
    .withColumnRenamed('city', 'dest_city')
    .withColumnRenamed('country', 'dest_country')
    .withColumnRenamed('latitude', 'dest_lat')
    .withColumnRenamed('longitude', 'dest_lon')
    .withColumnRenamed('altitude', 'dest_alt')
    .withColumnRenamed('dst', 'dest_dst')
    .withColumnRenamed('tz_database', 'dest_tz')
)

In [19]:
# Join aircraft reference data by aircraft type (left join) and drop the
# reference table's code columns.
df_mg = (
    df_mg
    .join(df_aircraft, df_mg.aircrafttype == df_aircraft.icao, 'left')
    .drop('iata', 'icao')
)

In [20]:
# Join aircraft category data by aircraft type (left join), drop the category
# columns that are not needed, and rename the remaining ones with an
# "aircraft_" prefix.
df_mg = (
    df_mg
    .join(df_category, df_mg.aircrafttype == df_category.icao, 'left')
    .drop('iata', 'icao', 'longitude', 'mac', 'manufacturer', 'model', 'speed', 'wake')
    .withColumnRenamed('category', 'aircraft_category')
    .withColumnRenamed('length', 'aircraft_length')
    .withColumnRenamed('height', 'aircraft_height')
    .withColumnRenamed('wingspan', 'aircraft_winspan')
)

In [None]:
#df_mg.show(100)

## Aggregation by airport

In [None]:
# group by: per airport (dest)
# output columns, in order:
#   num_airlines_perairport              - number of unique airline codes
#   num_flightnum_perairport             - number of unique flight numbers (ident)
#   num_reg_perairport                   - number of unique aircraft registrations (reg)
#   num_aircrafttype_perairport          - number of unique aircraft types
#   num_turnaround_perairport            - number of rows with a non-null turnaround
#   avg_ta_perairport / std_ta_perairport / min_ta_perairport / max_ta_perairport
#                                        - mean, standard deviation, min and max turnaround (mins)
#   num_turnaround_<jan..dec>_perairport - rows with a non-null turnaround per ta_month 1..12
#   quantile25_ta_perairport / quantile50_ta_perairport / quantile90_ta_perairport
#                                        - approximate turnaround percentiles (mins)
#   num_type_<a..f>_perairport           - rows whose aircraft_category equals 0..5 (Type-A .. Type-F)

# Distinct counts and summary statistics of turnaround.
_airport_aggs = [
    F.countDistinct(F.col('airline_code')).alias('num_airlines_perairport'),
    F.countDistinct(F.col('ident')).alias('num_flightnum_perairport'),
    F.countDistinct(F.col('reg')).alias('num_reg_perairport'),
    F.countDistinct(F.col('aircrafttype')).alias('num_aircrafttype_perairport'),
    F.count(F.when(F.col("turnaround").isNotNull(), 0)).alias('num_turnaround_perairport'),
    F.avg(F.col("turnaround")).alias('avg_ta_perairport'),
    F.stddev(F.col("turnaround")).alias('std_ta_perairport'),
    F.min(F.col("turnaround")).alias('min_ta_perairport'),
    F.max(F.col("turnaround")).alias('max_ta_perairport'),
]

# One count per calendar month; F.count only counts the rows where F.when yields a value.
_airport_aggs += [
    F.count(F.when((F.col("ta_month") == month_no) & (F.col("turnaround").isNotNull()), True))
     .alias("num_turnaround_{}_perairport".format(month_abbr))
    for month_no, month_abbr in enumerate(
        ('jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'), start=1)
]

# Approximate percentiles of turnaround.
_airport_aggs += [
    F.expr('percentile_approx(turnaround, 0.25)').alias('quantile25_ta_perairport'),
    F.expr('percentile_approx(turnaround, 0.5)').alias('quantile50_ta_perairport'),
    F.expr('percentile_approx(turnaround, 0.9)').alias('quantile90_ta_perairport'),
]

# One count per aircraft category code 0..5, labelled a..f.
_airport_aggs += [
    F.count(F.when((F.col("aircraft_category") == category_code) & (F.col("aircraft_category").isNotNull()), True))
     .alias("num_type_{}_perairport".format(category_letter))
    for category_code, category_letter in enumerate('abcdef')
]

df_perairport = df_mg.groupBy('dest').agg(*_airport_aggs)

df_perairport.show(100)

In [None]:
# Append airport info (city, country, region etc.) to the per-airport
# statistics and re-order the records.
df_perairport = (
    df_perairport
    .join(df_airport, df_perairport.dest == df_airport.icao, 'left')
    .drop('iata', 'icao', 'type', 'source')
    .withColumnRenamed('airport', 'dest_airport')
    .withColumnRenamed('city', 'dest_city')
    .withColumnRenamed('country', 'dest_country')
)
df_perairport = (
    df_perairport
    # keep only the part of tz_database before the first '/'
    .withColumn('tz_database', F.split(F.col('tz_database'), '/').getItem(0))
    # turn NaN values into nulls
    .replace(float('nan'), None)
    # airports with the most turnarounds first, nulls at the end
    .orderBy(F.col('num_turnaround_perairport').desc_nulls_last())
)
df_perairport.show()

In [None]:
# Sample aggregation: per (dest, airline_code), the percentage share of rows
# flagged ta_below / ta_on / ta_above, rounded to 2 decimals.
# The "on benchmark" flag column is named ta_on (there is no ta_fall column on
# df_mg); the output aliases are kept unchanged for downstream compatibility.
# NOTE(review): a group whose flags are all 0 (e.g. only null turnarounds) has a
# zero denominator -- presumably yields null percentages; confirm.
_ta_flag_total = F.sum(F.col('ta_below')) + F.sum(F.col('ta_on')) + F.sum(F.col('ta_above'))
df_sample = df_mg.groupBy('dest', 'airline_code').agg(
    F.bround(F.sum(F.col('ta_below')) * 100 / _ta_flag_total, 2).alias('ta_lower_percent'),
    F.bround(F.sum(F.col('ta_on')) * 100 / _ta_flag_total, 2).alias('ta_fall_percent'),
    F.bround(F.sum(F.col('ta_above')) * 100 / _ta_flag_total, 2).alias('ta_above_percent'))

In [None]:
# Preview the sample aggregation (default of 20 rows, long values truncated).
df_sample.show(n=20, truncate=True)

In [None]:
# group by: per airport (dest) + per airline (airline_code)
# output columns, in order:
#   num_flightnum_perairline             - number of unique flight numbers (ident)
#   num_reg_perairline                   - number of unique aircraft registrations (reg)
#   num_aircrafttype_perairline          - number of unique aircraft types
#   num_turnaround_perairline            - number of rows with a non-null turnaround
#   avg_ta_perairline / std_ta_perairline / min_ta_perairline / max_ta_perairline
#                                        - mean, standard deviation, min and max turnaround (mins)
#   quantile25_ta_perairline / quantile50_ta_perairline / quantile90_ta_perairline
#                                        - approximate turnaround percentiles (mins)
#   ta_range0_30 .. ta_range120          - percentage share of each turnaround bucket flag
#   ta_below_percent / ta_on_percent / ta_above_percent
#                                        - percentage share of each benchmark status flag
#   num_turnaround_<jan..dec>_perairline - rows with a non-null turnaround per ta_month 1..12
#   num_type_<a..f>_perairline           - rows whose aircraft_category equals 0..5 (Type-A .. Type-F)

# Denominators of the percentage shares: total of all bucket flags / all status flags.
_bucket_total = (F.sum(F.col('ta_00_30')) + F.sum(F.col('ta_30_60')) + F.sum(F.col('ta_60_90'))
                 + F.sum(F.col('ta_90_120')) + F.sum(F.col('ta_120')))
_status_total = F.sum(F.col('ta_below')) + F.sum(F.col('ta_on')) + F.sum(F.col('ta_above'))

# Distinct counts, summary statistics and approximate percentiles of turnaround.
_airline_aggs = [
    F.countDistinct(F.col('ident')).alias('num_flightnum_perairline'),
    F.countDistinct(F.col('reg')).alias('num_reg_perairline'),
    F.countDistinct(F.col('aircrafttype')).alias('num_aircrafttype_perairline'),
    F.count(F.when(F.col("turnaround").isNotNull(), 0)).alias('num_turnaround_perairline'),
    F.avg(F.col("turnaround")).alias('avg_ta_perairline'),
    F.stddev(F.col("turnaround")).alias('std_ta_perairline'),
    F.min(F.col("turnaround")).alias('min_ta_perairline'),
    F.max(F.col("turnaround")).alias('max_ta_perairline'),
    F.expr('percentile_approx(turnaround, 0.25)').alias('quantile25_ta_perairline'),
    F.expr('percentile_approx(turnaround, 0.5)').alias('quantile50_ta_perairline'),
    F.expr('percentile_approx(turnaround, 0.9)').alias('quantile90_ta_perairline'),
]

# Percentage share (rounded to 2 decimals) of each turnaround bucket.
_airline_aggs += [
    F.bround(F.sum(F.col(flag_col)) * 100 / _bucket_total, 2).alias(share_col)
    for flag_col, share_col in (
        ('ta_00_30', 'ta_range0_30'),
        ('ta_30_60', 'ta_range30_60'),
        ('ta_60_90', 'ta_range60_90'),
        ('ta_90_120', 'ta_range90_120'),
        ('ta_120', 'ta_range120'),
    )
]

# Percentage share (rounded to 2 decimals) of each benchmark status.
_airline_aggs += [
    F.bround(F.sum(F.col(flag_col)) * 100 / _status_total, 2).alias(share_col)
    for flag_col, share_col in (
        ('ta_below', 'ta_below_percent'),
        ('ta_on', 'ta_on_percent'),
        ('ta_above', 'ta_above_percent'),
    )
]

# One count per calendar month; F.count only counts the rows where F.when yields a value.
_airline_aggs += [
    F.count(F.when((F.col("ta_month") == month_no) & (F.col("turnaround").isNotNull()), True))
     .alias("num_turnaround_{}_perairline".format(month_abbr))
    for month_no, month_abbr in enumerate(
        ('jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'), start=1)
]

# One count per aircraft category code 0..5, labelled a..f.
_airline_aggs += [
    F.count(F.when((F.col("aircraft_category") == category_code) & (F.col("aircraft_category").isNotNull()), True))
     .alias("num_type_{}_perairline".format(category_letter))
    for category_code, category_letter in enumerate('abcdef')
]

df_perairport_perairline = df_mg.groupBy('dest', 'airline_code').agg(*_airline_aggs)

df_perairport_perairline.show(100)

In [None]:
# Append airport info (city, country, region etc.) to the per-airport-per-airline
# statistics, then add the airport-level turnaround total and benchmark columns.
df_perairport_perairline = (
    df_perairport_perairline
    .join(df_airport, df_perairport_perairline.dest == df_airport.icao, 'left')
    .drop('iata', 'icao', 'type', 'source')
    .withColumnRenamed('airport', 'dest_airport')
    .withColumnRenamed('city', 'dest_city')
    .withColumnRenamed('country', 'dest_country')
)
df_perairport_perairline = (
    df_perairport_perairline
    # keep only the part of tz_database before the first '/'
    .withColumn('tz_database', F.split(F.col('tz_database'), '/').getItem(0))
    # turn NaN values into nulls
    .replace(float('nan'), None)
)
#df_perairport_perairline = df_perairport_perairline.orderBy(df_perairport_perairline.num_turnaround_perairport.desc_nulls_last()) 
# Window spanning every row of one destination airport; the frame is unbounded
# on both sides, so each row receives the total over the whole partition.
_per_dest_window = (
    Window.partitionBy("dest")
    .orderBy('num_turnaround_perairline')
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)
df_perairport_perairline = (
    df_perairport_perairline
    .withColumn('total_ta_perairport', F.sum('num_turnaround_perairline').over(_per_dest_window))
    .join(df_bm_perairport, ['dest'], 'left')
    .drop('benchmark25_ta_perairport', 'benchmark90_ta_perairport')
    # NOTE(review): these flag columns do not appear among the aggregated
    # outputs, so this drop is presumably a no-op -- confirm.
    .drop('ta_below', 'ta_fall', 'ta_above', 'ta_00_30', 'ta_30_60','ta_60_90', 'ta_90_120', 'ta_120')
)
df_perairport_perairline.show()

## Appendix

In [None]:
# Subset of df_ta restricted to rows whose destination airport is CYUL.
df_dest_cyul_arrival = df_ta.where(" (dest ==  'CYUL')")

In [None]:
# group by airport: number of distinct airline codes seen at each destination.
# Uses df_mg, the merged turnaround table built in the data-engineering
# section; no frame named df_merg is defined in this notebook, so referencing
# it fails with a NameError on a fresh kernel.
df_airport_airline = df_mg.groupBy('dest').agg(F.countDistinct('airline_code').alias('airline_count'))
#df_airport_airline = df_airport_airline.join(df_airport, df_airport_airline.dest == df_airport.icao, 'left').drop('iata', 'icao', 'type', 'source')
#df_airport_airline = df_airport_airline.withColumnRenamed('airport', 'dest_airport').withColumnRenamed('city', 'dest_city').withColumnRenamed('country', 'dest_country')
#df_airport_airline = df_airport_airline.withColumn('tz_database', F.split(df_airport_airline.tz_database, '/').getItem(0))
#df_airport_airline = df_airport_airline.orderBy(F.desc('airline_count'))

In [None]:
#df_fa2 = df_fa.withColumn("ident", df_fa.ident.substr(0, 3)).groupBy("ident").count().orderBy(F.desc("count")).show(100)

In [21]:
# Persist the merged turnaround table to S3 as CSV with a header row;
# repartition(1) collapses the data into a single partition before writing.
(
    df_mg
    .repartition(1)
    .write
    .format('csv')
    .options(header='true', encoding='utf-8')
    .save('s3://sita-coe-ds-dev-v1/stats0/')
)

In [None]:
# basic statistics: number of distinct values of selected columns in the raw table
for _stat_col in ('reg', 'dest', 'orig', 'ident', 'edt', 'eta', 'status'):
    df.agg(F.countDistinct(_stat_col)).show()
#df_ap.agg(F.countDistinct('dest')).show()
# Order the per-airport-per-airline table by the airport-level turnaround total
# (busiest airports first, then by dest), and drop the total afterwards since it
# only serves as the sort key here. The total column is named
# total_ta_perairport; ordering by any other name raises an AnalysisException.
df_perairport_perairline = df_perairport_perairline.orderBy(F.desc('total_ta_perairport'), 'dest').drop('total_ta_perairport')
df_perairport_perairline.show(100)
# df_perairport_perairline = df_perairport_perairline.orderBy('dest', F.desc('total_ta_pergroup'))

In [None]:
#df_t = df_ta.filter("dest = 'KPDK'").show()
#w =  Window.partitionBy("dest").orderBy(F.desc("num_turnaround_perairline"))
#df_perairport_perairline.withColumn('e', F.sum('num_turnaround_perairline').over(w)).show(100)
#df_perairport_perairline.groupBy('dest').agg(F.sum('num_turnaround_perairline').alias('sum_ta_perairport')).orderBy(F.desc('sum_ta_perairport')).show(100)