In [1]:
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType
from utils.geolocation import dist

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1580745277028_0001,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
fad = spark.table('prod_flightaware.der_fadooce_latest')


In [3]:
dme = spark.read.load('s3://sita-coe-ds-dev-v1/mart/dme/dme')
airlines_schema = StructType([
    StructField('airline_id', StringType()),
    StructField('name', StringType()),
    StructField('alias', StringType()),
    StructField('iata', StringType()),
    StructField('icao', StringType()),
    StructField('callsign', StringType()),
    StructField('country', StringType()),
    StructField('active', StringType())
])
airlines = spark.read.schema(airlines_schema).csv('s3://sita-coe-ds-prod-v1/jupyter/jovyan/airlines.dat')
iata2icao = airlines.filter(f.col('active')==f.lit('Y')).select('country', 'name', 'iata', 'icao').distinct().filter(f.col('iata').isNotNull()).filter(f.col('icao').isNotNull())

In [4]:
iata2icao.groupBy('country', 'iata').agg(f.count('icao').alias('count')).filter(f.col('count')>1).join(airlines, on=['country', 'iata']).orderBy('iata').show()

+---------+----+-----+----------+--------------------+------------------+----+---------------+------+
|  country|iata|count|airline_id|                name|             alias|icao|       callsign|active|
+---------+----+-----+----------+--------------------+------------------+----+---------------+------+
|    Japan|  JL|    2|      2987|      Japan Airlines|JAL Japan Airlines| JAL|       JAPANAIR|     Y|
|    Japan|  JL|    2|      2988|Japan Airlines Do...|                \N| JAL|         J-BIRD|     Y|
|  Germany|  LH|    2|      3320|           Lufthansa|                \N| DLH|      LUFTHANSA|     Y|
|  Germany|  LH|    2|      3321|     Lufthansa Cargo|                \N| GEC|LUFTHANSA CARGO|     Y|
|    Nepal|  RA|    2|      3637|      Nepal Airlines|                \N| RNA|    ROYAL NEPAL|     Y|
|    Nepal|  RA|    2|      4264|Royal Nepal Airlines|                \N| RNA|    ROYAL NEPAL|     Y|
|Singapore|  SQ|    2|      4435|  Singapore Airlines|                \N| SIA|    

In [5]:
iata2icao = iata2icao.select('country', 'iata', 'icao').distinct()

In [6]:
iata2icao.show()

+--------------+----+----+
|       country|iata|icao|
+--------------+----+----+
|       Germany|  01|  \N|
|     Argentina|  A4| SWD|
|       Romania|  X5| OTJ|
|       Germany|  QW| BWG|
|     Indonesia|  QG|  \N|
|         Italy|  BV| BPA|
|         China|  CA| CCA|
|         Italy|  GJ| EEU|
|       Tunisia|  UG| TUI|
|     Singapore|  TZ| SCO|
|        Greece|  G3| SEH|
|United Kingdom|  GT| GBL|
|          Peru|  T0|  \N|
| United States|  ZA| CYD|
| United States|  RX| RPO|
|   Switzerland|  WK| EDW|
|        Taiwan|  B7| UIA|
|     Indonesia|  RH| RPH|
|       Germany|  24|  \N|
|    Costa Rica|  9V| VC9|
+--------------+----+----+
only showing top 20 rows

In [7]:
print(dme.count())
# There may be duplicates here, but the issue is negligible as per record count.
dme = dme.join(iata2icao, on = f.col('airlinecode') == f.col('iata'))
dme = dme.withColumn('ident1', f.concat('icao', 'FlightNumber'))
dme = dme.withColumn('ident2', f.concat('icao', f.regexp_replace('FlightNumber', '^0*', '')))
print(dme.count())

13167128
13205690

In [8]:
dme.show()

+------+----------+-----------+------------+------+-----------+-----------+-------------+-------------+--------------------+-------------------+-----------------------+--------------------+------------------+--------------+----+----+-------+-------+
|   PNR|flightdate|airlinecode|FlightNumber|gender|nationality|Destination|ClassofTravel|     IntorDom|    _input_file_name|_record_ingest_date|encrypted_PassengerName|encrypted_passportno|record_ingest_date|       country|iata|icao| ident1| ident2|
+------+----------+-----------+------------+------+-----------+-----------+-------------+-------------+--------------------+-------------------+-----------------------+--------------------+------------------+--------------+----+----+-------+-------+
|2BC4D9|2019-01-12|         WZ|        0323|  NULL|       NULL|        EVN|            Y|International|s3://sita-coe-ds-...|2019-01-01 00:00:00|   00000b7cb473b9f0f...|fb329000228cc5a24...|        2019-01-01|        Russia|  WZ| RWZ|RWZ0323| RWZ323|


In [9]:
pax = dme.groupBy('airlinecode', 'FlightNumber', 'flightdate', 'ident1', 'ident2'). \
  agg(f.countDistinct('encrypted_PassengerName').alias('pax')). \
  orderBy('airlinecode', 'FlightNumber', 'flightdate'). \
  filter(f.col('airlinecode').isNotNull()).filter(f.col('FlightNumber').isNotNull())

In [10]:
pax.show(10)

+-----------+------------+----------+-------+------+---+
|airlinecode|FlightNumber|flightdate| ident1|ident2|pax|
+-----------+------------+----------+-------+------+---+
|         3K|        0241|2019-08-21|JSA0241|JSA241|  1|
|         3K|        0761|2019-04-30|JSA0761|JSA761|  2|
|         3K|        0761|2019-07-04|JSA0761|JSA761|  1|
|         3K|        0761|2019-08-06|JSA0761|JSA761|  2|
|         3K|        0761|2019-09-01|JSA0761|JSA761|  2|
|         3U|        0609|2019-03-23|CSC0609|CSC609|  1|
|         4O|        2118|2019-07-24| \N2118|\N2118|  1|
|         5N|        0118|2019-01-01|AUL0118|AUL118|103|
|         5N|        0118|2019-01-02|AUL0118|AUL118|148|
|         5N|        0118|2019-01-03|AUL0118|AUL118|145|
+-----------+------------+----------+-------+------+---+
only showing top 10 rows

In [11]:
pax.filter(f.col('flightdate').between('2019-03-01', '2019-03-31')).agg(f.sum('pax')).show()

+--------+
|sum(pax)|
+--------+
| 1118299|
+--------+

In [12]:
dme.filter(f.col('flightdate').between('2019-03-01', '2019-03-31')).agg(f.count('encrypted_passengername')).show()

+------------------------------+
|count(encrypted_passengername)|
+------------------------------+
|                       1118766|
+------------------------------+

In [3]:
# prepare airline code schema
df_airline = spark.read.format("csv").option("header", "true").option("delimiter", ",").option("encoding", "utf-8").load("s3://sita-coe-ds-dev-v1/jupyter/jovyan/airlines_openflight.csv")
df_airline = df_airline.dropna(how = 'any',  subset = ['icao'])
df_airline = df_airline.dropDuplicates(subset = ['icao'])
df_airline = df_airline.withColumnRenamed('airline', 'airline2')
# prepare airport code schema
df_airport = spark.read.format("csv").option("header", "true").option("delimiter", ",").option("encoding", "utf-8").load("s3://sita-coe-ds-dev-v1/jupyter/jovyan/airports_openflight_wgeo.csv")
df_airport = df_airport.dropna(how = 'any',  subset = ['icao'])
df_airport = df_airport.dropDuplicates(subset = ['icao'])
df_airport = df_airport.withColumn('geo_seg', f.when((f.col('country') == 'Canada') | \
                                                     (f.col('country') == 'United States'), 'NA(Canada&US)'). \
                                   when(f.col('country') == 'China', 'China'). \
                                   when(f.col('country') == 'India', 'India'). \
                                   otherwise(f.col('GEO'))).drop('GEO')
# prepare aircraft model schema
df_aircraft = spark.read.format("csv").option("header", "true").option("delimiter", ",").option("encoding", "utf-8").load("s3://sita-coe-ds-dev-v1/jupyter/jovyan/aircrafts_openflight.csv")
df_aircraft = df_aircraft.dropna(how = 'any',  subset = ['icao'])
df_aircraft = df_aircraft.dropDuplicates(subset = ['icao'])

In [4]:
fad = fad.withColumn('airline', f.col('ident').substr(1, 3))
cargo = ['FDX', 'UPS', 'CLX', 'CLU', 'NCA', 'ABW', 'ADB', 'CKK', 'AZS', 'TPA', 'SAY', 'RSB', 'UNA', 'EFX', 'HLF', 'AHK']
fad = fad.withColumn('flight_type', f.when(f.col('airline').isin(cargo), f.lit('cargo')). \
                     when(f.col('ident').isNotNull() & f.col('id').isNotNull() &
                                            (f.col('ident') != f.col('reg')) &
                                            f.col('ident').rlike('^[A-Z][A-Z][A-Z]') &
                                            f.col('orig').rlike('^[A-Z][A-Z][A-Z][A-Z]$') &
                                            f.col('dest').rlike('^[A-Z][A-Z][A-Z][A-Z]$'),
                                            f.lit('commercial')).otherwise('non-commercial'))
#print(fad.count())
#fad = fad.join(pax, on = (f.col('edt_date') == f.col('flightdate')) &
#               ((f.col('ident')==f.col('ident1')) | (f.col('ident')==f.col('ident2'))))
#print(fad.count())


In [5]:
# join airline code
fad = fad.join(f.broadcast(df_airline), fad.airline == df_airline.icao, 'left').drop('alias', 'icao', 'callsign', 'iata', 'active')
fad = fad.withColumnRenamed('country', 'airline_country')
# join orign airport code
fad = fad.join(f.broadcast(df_airport), fad.orig == df_airport.icao, 'left').drop('iata', 'icao', 'type', 'source', 'timezone', 'dst', 'tz_database')
fad = fad.withColumnRenamed('airport', 'orig_airport').withColumnRenamed('geo_seg', 'orig_geo').withColumnRenamed('city', 'orig_city').withColumnRenamed('country', 'orig_country').withColumnRenamed('latitude', 'orig_lat').withColumnRenamed('longitude', 'orig_lon').withColumnRenamed('altitude', 'orig_alt')
# join destination airport code
fad = fad.join(f.broadcast(df_airport), fad.dest == df_airport.icao, 'left').drop('iata', 'icao', 'type', 'source', 'timezone', 'dst', 'tz_database')
fad = fad.withColumnRenamed('airport', 'dest_airport').withColumnRenamed('geo_seg', 'dest_geo').withColumnRenamed('city', 'dest_city').withColumnRenamed('country', 'dest_country').withColumnRenamed('latitude', 'dest_lat').withColumnRenamed('longitude', 'dest_lon').withColumnRenamed('altitude', 'dest_alt')
# join aircraft model code
fad = fad.join(f.broadcast(df_aircraft), fad.aircrafttype == df_aircraft.icao, 'left').drop('iata', 'icao')

In [6]:
fad = fad.withColumn('edt_year', f.date_format('edt_date', 'yyyy')). \
  withColumn('edt_yearmonth', f.date_format('edt_date', 'yyyyMM')). \
  withColumn('edt_month', f.date_format('edt_date', 'MM')). \
  withColumn('edt_quarter', f.ceil(f.col('edt_month').cast('int')/3)). \
  withColumn('dep_arr', f.col('adt_dep').isNotNull() | f.col('aat_arr').isNotNull()). \
  withColumn('orig', f.when(f.col('flight_type')==f.lit('commercial'), f.col('orig')).otherwise('N/C'))
#fad = fad.filter(f.col('edt_month').between('04', '10'))
fad = fad.withColumn('dist', dist('orig_lat', 'orig_lon', 'dest_lat', 'dest_lon') )

In [7]:
group = ['dep_arr', 'flight_type', 'edt_year']
group_name = '_year_apr_oct'
group_name = '_year_full'
#group = ['dep_arr', 'flight_type', 'edt_year', 'edt_quarter']
#group_name = '_quarter'
group = ['dep_arr', 'flight_type', 'edt_year', 'edt_quarter', 'edt_yearmonth', 'edt_month']
group_name = ''
#group = ['dep_arr', 'flight_type', 'edt_year', 'edt_quarter', 'edt_yearmonth', 'edt_month', 'airline', 'aircrafttype']
#group_name = '_type'
#group = ['dep_arr', 'flight_type', 'edt_year', 'edt_quarter', 'edt_yearmonth', 'edt_month', 'airline', 'aircrafttype', 'orig_geo', 'dest_geo']
#group_name = '_type_geo'
#group = ['dep_arr', 'flight_type', 'edt_year', 'edt_quarter', 'edt_yearmonth', 'edt_month', 'airline', 'reg']
#group_name = '_reg_al_subset'
#group = ['dep_arr', 'flight_type', 'edt_year', 'edt_quarter', 'edt_yearmonth', 'edt_month', 'airline', 'aircrafttype', 'reg']
#group_name = '_type_reg'
#group_name = '_type_reg_al_subset'
#fad = fad.filter(f.col('airline').isin('UAE', 'AFR', 'DAL', 'FDX', 'UPS'))
#group = ['orig', *group]
#group_name = f'{group_name}_orig'
summary = fad. \
  groupBy(*group). \
  agg(f.countDistinct('id').alias('id_count'),
      f.countDistinct('reg').alias('reg_count'),
      f.sum('dist').alias('total_distance')). \
  orderBy(*group)

In [8]:
summary.repartition(1).write.mode('overwrite'). \
  csv(f's3://sita-coe-ds-prod-v1/files_analysis/flightaware_summary{group_name}', header=True)

In [None]:
group = ['dep_arr', 'flight_type', 'edt_year', 'edt_quarter', 'edt_yearmonth', 'edt_month', 'reg']
group_name = f'{group_name}_n_airlines'
n_airlines = summary.groupBy(*group). \
  agg(f.sum('id_count').alias('id_count_reg'),
      f.sum('total_distance').alias('total_distance_reg'),
      f.countDistinct('airline').alias('n_airlines_reg'))
summary.join(n_airlines, on = group, how='inner').filter(f.col('airline').isin('UAE', 'AFR', 'DAL', 'FDX', 'UPS')). \
  repartition(1).write.mode('overwrite'). \
  csv(f's3://sita-coe-ds-prod-v1/files_analysis/flightaware_summary{group_name}', header=True)

In [22]:
fad.filter(f.col('dep_arr')).filter(f.col('flight_type')==f.lit('commercial')).select('edt_year', 'edt_quarter', 'edt_yearmonth', 'edt_month', 'reg').distinct().repartition(1).write.mode('overwrite'). \
  csv(f's3://sita-coe-ds-prod-v1/files_analysis/flightaware_reg{group_name}', header=True)