# Joins on Full Dataset

In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql import SQLContext
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType, TimestampType


# Legacy SQLContext built from `sc`. `sc` is not defined in this notebook —
# presumably it is injected by the notebook runtime (TODO confirm).
# None of the visible cells reference `sqlContext`; all queries below go through `spark.sql`.
sqlContext = SQLContext(sc)

In [0]:
# ---------------------------------------------------------------------------
# Azure Blob Storage configuration
# ---------------------------------------------------------------------------

# Name of the container created in https://portal.azure.com
blob_container = "container1"

# Name of the Storage account created in https://portal.azure.com
storage_account = "w261sp22team12"

# Name of the secret scope created locally with the Databricks CLI
secret_scope = "s1"

# Name of the secret key created locally with the Databricks CLI
secret_key = "k1"

# Root wasbs:// URL that every parquet read/write in this notebook is built on
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"

# Mount point; not referenced by any other visible cell
mount_path = "/mnt/mids-w261"

In [0]:
# SAS Token
# Look up the secret stored under (secret_scope, secret_key) and set it as the
# Spark conf entry `fs.azure.sas.<container>.<account>.blob.core.windows.net`.
# Presumably this is what authenticates the wasbs:// reads and writes below.
# The token is passed straight through and never bound to a notebook variable.
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key),
)

In [0]:
# Set partitions
# Both settings use 1000, the same partition count passed to the
# repartition(1000, 'fl_date') calls and REPARTITION(1000) hints further down.
spark.conf.set("spark.sql.shuffle.partitions", 1000)
spark.conf.set("spark.sql.files.minPartitionNum", 1000)

In [0]:
# Show the active `spark` session object as this cell's output.
spark

In [0]:
def sparkShape(dataFrame):
    """Return the shape of a DataFrame as a `(row_count, column_count)` tuple.

    The row count comes from `dataFrame.count()` and the column count from
    the length of `dataFrame.columns`.
    """
    n_rows = dataFrame.count()
    n_cols = len(dataFrame.columns)
    return n_rows, n_cols

## Data

### Airlines

In [0]:
# Load the airline parquet dataset from blob storage, spread it over 1000
# partitions keyed by fl_date, and mark it for caching.
df_airports_pagerank_full = (
    spark.read
    .parquet(f"{blob_url}/airlines_full_airport_airline_ripple_pagerank")
    .repartition(1000, 'fl_date')
    .persist()
)

In [0]:
# Filter out airports that are not close to any weather station within 500 miles.
# Rows with a NULL origin_icao also fail the NOT IN predicate, so they are dropped too.
df_airports_pagerank = (
    df_airports_pagerank_full
    .filter("(TRIM(origin_icao) NOT IN ('KCDV', 'KSTT', 'KYAK', 'KBET', 'KOGS'))")
    .persist()
)

In [0]:
# Count the retained flights whose origin_icao is populated.
print(
    df_airports_pagerank
    .filter(col('origin_icao').isNotNull())
    .count()
)

In [0]:
# Report (row count, column count) for the filtered airline data, then preview it.
print(sparkShape(df_airports_pagerank))
display(df_airports_pagerank)

year,quarter,month,day_of_month,day_of_week,fl_date,time_zone,origin,origin_icao,origin_city_name,origin_airport_id,origin_state_abr,dest_airport_id,dest_state_abr,dest_city_name,dest,dest_icao,op_unique_carrier,op_carrier_airline_id,op_carrier_fl_num,tail_num,dep_time_blk,arr_time_blk,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,distance_group,dep_delay_new,dep_del15,cancelled,dep_time,arr_time,arr_delay_new,local_tz,local_crs_arr_tz,local_arr_tz,fl_tz,pwnd_date,pwnd_blk,dwnd_blk,total_flights,delayed_flights,avg_delayed_mins,pct_flight_delayed,airline_total_flights,airline_delayed_flights,airline_avg_delayed_mins,airline_pct_flight_delayed,prev_flight_arr_delay,timediff,pagerank
2017,2,5,14,7,2017-05-14,America/Chicago,DFW,KDFW,"Dallas/Fort Worth, TX",11298,TX,15376,AZ,"Tucson, AZ",TUS,KTUS,AA,19805,2202,N3MHAA,1000-1059,1100-1159,1055,1122,147.0,813.0,4,0.0,0.0,0,1047,1108,0.0,2017-05-14T10:55:00.000+0000,2017-05-14T11:22:00.000+0000,2017-05-14T11:22:00.000+0000,2017-05-14T15:55:00.000+0000,2017-05-14,0700-0759,1000-1059,41,6.0,7.95122,14.634146341463415,33,3.0,3.909091,9.090909090909092,0.0,0.0,10.554148003940377
2017,2,5,14,7,2017-05-14,America/Chicago,DFW,KDFW,"Dallas/Fort Worth, TX",11298,TX,14057,OR,"Portland, OR",PDX,KPDX,AA,19805,1017,N3FNAA,1200-1259,1400-1459,1240,1443,243.0,1616.0,7,0.0,0.0,0,1239,1431,0.0,2017-05-14T12:40:00.000+0000,2017-05-14T14:43:00.000+0000,2017-05-14T14:43:00.000+0000,2017-05-14T17:40:00.000+0000,2017-05-14,0900-0959,1200-1259,29,4.0,4.517241,13.79310344827586,24,4.0,5.458333,16.666666666666668,0.0,0.0,10.554148003940377
2017,2,5,14,7,2017-05-14,America/Chicago,DFW,KDFW,"Dallas/Fort Worth, TX",11298,TX,13198,MO,"Kansas City, MO",MCI,KMCI,AA,19805,2394,N026AA,0700-0759,0900-0959,725,904,99.0,460.0,2,34.0,1.0,0,759,927,23.0,2017-05-14T07:25:00.000+0000,2017-05-14T09:04:00.000+0000,2017-05-14T09:27:00.000+0000,2017-05-14T12:25:00.000+0000,2017-05-14,0400-0459,0700-0759,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,10.554148003940377
2017,2,5,14,7,2017-05-14,America/Chicago,DFW,KDFW,"Dallas/Fort Worth, TX",11298,TX,15370,OK,"Tulsa, OK",TUL,KTUL,AA,19805,299,N3AFAA,2200-2259,2300-2359,2210,2314,64.0,237.0,1,23.0,1.0,0,2233,2332,18.0,2017-05-14T22:10:00.000+0000,2017-05-14T23:14:00.000+0000,2017-05-14T23:32:00.000+0000,2017-05-15T03:10:00.000+0000,2017-05-14,1900-1959,2200-2259,19,4.0,13.526316,21.05263157894737,9,1.0,13.777778,11.11111111111111,0.0,0.0,10.554148003940377
2017,2,5,14,7,2017-05-14,America/Chicago,DFW,KDFW,"Dallas/Fort Worth, TX",11298,TX,10397,GA,"Atlanta, GA",ATL,KATL,DL,19790,1890,N355NW,0001-0559,0800-0859,550,855,125.0,731.0,3,0.0,0.0,0,545,851,0.0,2017-05-14T05:50:00.000+0000,2017-05-14T08:55:00.000+0000,2017-05-14T08:55:00.000+0000,2017-05-14T10:50:00.000+0000,2017-05-14,0200-0259,0500-0559,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,10.554148003940377
2017,2,5,14,7,2017-05-14,America/Chicago,DFW,KDFW,"Dallas/Fort Worth, TX",11298,TX,12889,NV,"Las Vegas, NV",LAS,KLAS,AA,19805,1611,N986AA,1100-1159,1200-1259,1105,1207,182.0,1055.0,5,0.0,0.0,0,1105,1200,0.0,2017-05-14T11:05:00.000+0000,2017-05-14T12:07:00.000+0000,2017-05-14T12:07:00.000+0000,2017-05-14T16:05:00.000+0000,2017-05-14,0800-0859,1100-1159,23,2.0,7.478261,8.695652173913043,14,0.0,0.428571,0.0,0.0,0.0,10.554148003940377
2017,2,5,14,7,2017-05-14,America/Chicago,DFW,KDFW,"Dallas/Fort Worth, TX",11298,TX,12892,CA,"Los Angeles, CA",LAX,KLAX,AA,19805,2425,N856AA,1100-1159,1200-1259,1125,1255,210.0,1235.0,5,2.0,0.0,0,1127,1239,0.0,2017-05-14T11:25:00.000+0000,2017-05-14T12:55:00.000+0000,2017-05-14T12:55:00.000+0000,2017-05-14T16:25:00.000+0000,2017-05-14,0800-0859,1100-1159,23,2.0,7.478261,8.695652173913043,14,0.0,0.428571,0.0,0.0,0.0,10.554148003940377
2017,2,5,14,7,2017-05-14,America/Chicago,DFW,KDFW,"Dallas/Fort Worth, TX",11298,TX,10693,TN,"Nashville, TN",BNA,KBNA,AA,19805,2673,N425AA,1100-1159,1200-1259,1100,1246,106.0,631.0,3,0.0,0.0,0,1054,1241,0.0,2017-05-14T11:00:00.000+0000,2017-05-14T12:46:00.000+0000,2017-05-14T12:46:00.000+0000,2017-05-14T16:00:00.000+0000,2017-05-14,0800-0859,1100-1159,23,2.0,7.478261,8.695652173913043,14,0.0,0.428571,0.0,0.0,0.0,10.554148003940377
2017,2,5,14,7,2017-05-14,America/Chicago,DFW,KDFW,"Dallas/Fort Worth, TX",11298,TX,14908,CA,"Santa Ana, CA",SNA,KSNA,AA,19805,2502,N3LRAA,1700-1759,1800-1859,1705,1822,197.0,1205.0,5,0.0,0.0,0,1659,1817,0.0,2017-05-14T17:05:00.000+0000,2017-05-14T18:22:00.000+0000,2017-05-14T18:22:00.000+0000,2017-05-14T22:05:00.000+0000,2017-05-14,1400-1459,1700-1759,40,3.0,9.525,7.5,32,1.0,2.4375,3.125,0.0,0.0,10.554148003940377
2017,2,5,14,7,2017-05-14,America/Chicago,DFW,KDFW,"Dallas/Fort Worth, TX",11298,TX,13244,TN,"Memphis, TN",MEM,KMEM,AA,19805,2267,N3NCAA,2000-2059,2200-2259,2030,2200,90.0,431.0,2,0.0,0.0,0,2026,2150,0.0,2017-05-14T20:30:00.000+0000,2017-05-14T22:00:00.000+0000,2017-05-14T22:00:00.000+0000,2017-05-15T01:30:00.000+0000,2017-05-14,1700-1759,2000-2059,26,3.0,5.192308,11.538461538461538,17,0.0,0.117647,0.0,0.0,0.0,10.554148003940377


In [0]:
# Register the filtered airline data as the SQL temp view `airlines`,
# the name the query in joinWithInterval() selects from.
df_airports_pagerank.createOrReplaceTempView('airlines')

### Weather

In [0]:
# Load the weather parquet dataset from blob storage, spread it over 1000
# partitions keyed by fl_date, and mark it for caching.
df_weather_full = (
    spark.read
    .parquet(f"{blob_url}/weather_full")
    .repartition(1000, 'fl_date')
    .persist()
)

In [0]:
# Count the weather rows that have no airport_icao.
print(
    df_weather_full
    .filter(col('airport_icao').isNull())
    .count()
)

In [0]:
# Report (row count, column count) for the weather data, then preview it.
print(sparkShape(df_weather_full))
display(df_weather_full)

airport_icao,airport_distance_to_weather_station,rank,station,country,date,latitude,longitude,report_type,call_sign,fl_date,wind_directional_angle,wind_directional_qc,wind_directional_type_code,wind_directional_speed_rate,wind_directional_speed_qc,sky_ceiling_height_dimension,sky_ceiling_qc,sky_ceiling_determination_code,sky_ceiling_cavok_code,vis_distance,vis_distance_qc,vis_variability_code,vis_variability_qc,air_temperature,air_temperature_qc,dew_point_temperature,dew_point_qc,sea_level_pressure,sea_level_pressure_qc,lp_period_qty,lp_depth_dimension,lp_condition_code,lp_quality_code,ap_altimeter_setting_rate,ap_altimeter_qc,ap_station_pressure_rate,ap_station_pressure_qc,sky_coverage_code,sky_coverage_code_2,sky_coverage_qc,sky_height_dimension,sky_height_dimension_qc,sky_characteristic_code,sky_convective_cloud_attribute,sky_vertical_datum_attribute,sky_base_height_upper_range_attribute,sky_base_height_lower_range_attribute,ground_observation_code,ground_observation_qc,snow_depth_dimension,snow_depth_condition_code,snow_depth_qc,snow_depth_equivalent_water_depth,snow_depth_water_condition_code,snow_depth_water_qc,weather_obs_source_element,weaher_obs_weather_type_num,weather_obs_weather_type_abb,snow_depth_equivalent_qc
KIMT,0.0,1,72743794893,US,2015-03-09T08:54:00.000+0000,45.81833,-88.11444,FM-15,KIMT,2015-03-09,260.0,5,N,21.0,5,22000.0,5,9,N,16093.0,5,N,5,-11.0,5,-72.0,5,10160.0,5,1.0,0.0,9.0,5,10146.0,5.0,9720.0,5.0,0.0,99.0,1.0,,9.0,9.0,,,,,,,,,,,,,,,,
KSBA,0.0,1,72392523190,US,2015-03-09T18:53:00.000+0000,34.4258,-119.8425,FM-15,KSBA,2015-03-09,190.0,5,N,41.0,5,22000.0,5,9,N,16093.0,5,N,5,172.0,5,117.0,5,10159.0,5,1.0,0.0,9.0,5,10159.0,5.0,10152.0,5.0,0.0,99.0,1.0,,9.0,9.0,,,,,,,,,,,,,,,,
KPVU,0.0,1,72572424174,US,2015-03-09T08:55:00.000+0000,40.21889,-111.72333,FM-15,KPVU,2015-03-09,160.0,5,N,15.0,5,22000.0,5,9,N,16093.0,5,N,5,0.0,C,-40.0,C,,9,,,,,10196.0,5.0,8645.0,5.0,0.0,99.0,1.0,,9.0,9.0,,,,,,,,,,,,,,,,
KAVL,33.925690151506565,2,72314453890,US,2015-03-09T03:15:00.000+0000,35.42806,-81.935,FM-15,KFQD,2015-03-09,,9,C,0.0,5,22000.0,5,9,N,16093.0,5,N,5,148.0,5,-65.0,5,,9,,,,,10220.0,5.0,9828.0,5.0,0.0,99.0,1.0,,9.0,9.0,,,,,,,,,,,,,,,,
KSJC,0.0,1,72494523293,US,2015-03-09T08:53:00.000+0000,37.3591,-121.924,FM-15,KSJC,2015-03-09,330.0,5,N,26.0,5,244.0,5,M,N,16093.0,5,N,5,117.0,5,100.0,5,10181.0,5,1.0,0.0,9.0,5,10183.0,5.0,10165.0,5.0,4.0,99.0,1.0,244.0,5.0,9.0,,AGL,99999.0,99999.0,,,,,,,,,,,,
KTTN,0.0,1,72409514792,US,2015-03-09T14:53:00.000+0000,40.27679,-74.81594,FM-15,KTTN,2015-03-09,300.0,5,N,41.0,5,3048.0,5,M,N,16093.0,5,N,5,83.0,5,-11.0,5,10218.0,5,1.0,0.0,9.0,5,10224.0,5.0,10145.0,5.0,4.0,99.0,1.0,3048.0,5.0,9.0,,AGL,99999.0,99999.0,,,,,,,,,,,,
KTPA,6.3952209937668,2,72037492825,US,2015-03-09T01:35:00.000+0000,27.91556,-82.44917,FM-15,KTPF,2015-03-09,100.0,5,N,36.0,5,22000.0,5,9,N,16093.0,5,N,5,220.0,5,160.0,5,,9,,,,,10224.0,5.0,10221.0,5.0,0.0,99.0,1.0,,9.0,9.0,,,,,,,,,,,,,,,,
KMCI,0.0,1,72446003947,US,2015-03-09T13:53:00.000+0000,39.2972,-94.7306,FM-15,KMCI,2015-03-09,50.0,5,N,36.0,5,22000.0,5,9,N,11265.0,5,N,5,17.0,5,0.0,5,10198.0,5,1.0,0.0,9.0,5,10196.0,5.0,9825.0,5.0,1.0,99.0,1.0,7620.0,5.0,9.0,,AGL,99999.0,99999.0,,,,,,,,,,,,
KCRW,46.075723705767,2,74207963876,US,2015-03-09T15:35:00.000+0000,38.91472,-82.09861,FM-15,K3I2,2015-03-09,220.0,5,N,31.0,5,22000.0,5,9,N,16093.0,5,N,5,110.0,5,20.0,5,,9,,,,,10251.0,5.0,10015.0,5.0,0.0,99.0,1.0,,9.0,9.0,,,,,,,,,,,,,,,,
KDBQ,23.541157224524817,2,72058600183,US,2015-03-09T01:15:00.000+0000,42.683,-90.45,FM-15,KPVB,2015-03-09,190.0,5,N,15.0,5,22000.0,5,9,N,16093.0,5,N,5,25.0,5,-1.0,5,,9,,,,,10196.0,5.0,9825.0,5.0,0.0,99.0,1.0,,9.0,9.0,,,,,,,,,,,,,,,,


In [0]:
# Register the weather data as the SQL temp view `weather`,
# the name the query in joinWithInterval() joins against.
df_weather_full.createOrReplaceTempView('weather')

## Join

### Airlines joined with 2-3 top ranked weather stations

In [0]:
# Join the airline data with the weather data.
# For each flight, we want the weather observations recorded 2-3 hours before its departure.

def joinWithInterval(start=3, end=2):
    """Join each flight to the weather observed in a window before `fl_tz`.

    Runs a Spark SQL INNER JOIN between the `airlines` and `weather` temp
    views. A weather row matches a flight when all of the following hold:

    * `airlines.origin_icao = weather.airport_icao`
    * `airlines.fl_date = weather.fl_date`
    * `weather.date` lies in `[fl_tz - start hours, fl_tz - end hours]`
      (BETWEEN, so both bounds are inclusive)

    Because the join is INNER, flights with no matching weather row are
    absent from the result, and a flight with several matching observations
    appears once per observation.

    Args:
        start: Whole hours before `fl_tz` at which the window opens.
        end: Whole hours before `fl_tz` at which the window closes.

    Returns:
        Spark DataFrame holding the selected flight columns followed by the
        selected weather measurements.

    Raises:
        TypeError: If `start` or `end` is not an `int`. The values are
            spliced into the SQL text, so anything else is rejected up front.
        ValueError: If `start < end`. Such a window is reversed and the
            BETWEEN predicate could never match, which previously produced
            an empty result without any warning.
    """
    # Validate before building the query: the bounds are interpolated into SQL.
    for label, hours in (("start", start), ("end", end)):
        if isinstance(hours, bool) or not isinstance(hours, int):
            raise TypeError(
                f"{label} must be an int number of hours, got {hours!r}"
            )
    if start < end:
        raise ValueError(
            f"start ({start}) must be >= end ({end}): the window runs from "
            f"(fl_tz - start hours) to (fl_tz - end hours)"
        )

    # NOTE(review): the sample rows shown in this notebook suggest that
    # airlines.fl_date is a local calendar date while weather.fl_date follows
    # the UTC `date` timestamp. If that holds, the fl_date equality below can
    # exclude observations (or whole flights) whose window crosses or falls
    # after midnight UTC — confirm against the data before relying on counts.
    query_string = f'''
        SELECT 
            /*+ REPARTITION(1000) */
            
            airlines.fl_tz,
            airlines.origin_icao,
            airlines.dest_icao,
            airlines.tail_num,
            
            airlines.fl_date,
            
            airlines.op_unique_carrier,
            airlines.op_carrier_airline_id,
            airlines.op_carrier_fl_num,
            
            airlines.year,
            airlines.quarter,
            airlines.month,
            airlines.day_of_month,
            airlines.day_of_week,
            airlines.dwnd_blk,
            
            airlines.dep_del15,
            airlines.dep_delay_new,
            
            airlines.distance,
            airlines.total_flights,
            airlines.delayed_flights,
            airlines.avg_delayed_mins,
            airlines.pct_flight_delayed,
            airlines.airline_total_flights,
            airlines.airline_delayed_flights,
            airlines.airline_avg_delayed_mins,
            airlines.airline_pct_flight_delayed,
            airlines.prev_flight_arr_delay,
            airlines.timediff,
            airlines.pagerank,

            weather.wind_directional_angle,
            weather.wind_directional_speed_rate,
            weather.sky_ceiling_height_dimension,
            weather.vis_distance,
            weather.air_temperature,
            weather.dew_point_temperature,
            weather.sea_level_pressure,
            weather.lp_period_qty,
            weather.ap_altimeter_setting_rate,
            weather.sky_height_dimension,
            weather.sky_convective_cloud_attribute,
            weather.snow_depth_dimension,
            weather.snow_depth_equivalent_water_depth
            
        FROM airlines
        INNER JOIN weather ON airlines.origin_icao = weather.airport_icao AND
                              airlines.fl_date = weather.fl_date AND
                              weather.date BETWEEN (fl_tz - INTERVAL {start} hours) AND (fl_tz - INTERVAL {end} hours)
    '''
    return spark.sql(query_string)

In [0]:
# Build the flight/weather join with the default window (start=3, end=2 hours
# before fl_tz) and mark the result for caching.
df_airlines_weather = joinWithInterval().persist()

In [0]:
# Report (row count, column count) for the joined frame, then preview it.
print("airlines with 2 top ranked stations", sparkShape(df_airlines_weather))
display(df_airlines_weather)

fl_tz,origin_icao,dest_icao,tail_num,fl_date,op_unique_carrier,op_carrier_airline_id,op_carrier_fl_num,year,quarter,month,day_of_month,day_of_week,dwnd_blk,dep_del15,dep_delay_new,distance,total_flights,delayed_flights,avg_delayed_mins,pct_flight_delayed,airline_total_flights,airline_delayed_flights,airline_avg_delayed_mins,airline_pct_flight_delayed,prev_flight_arr_delay,timediff,pagerank,wind_directional_angle,wind_directional_speed_rate,sky_ceiling_height_dimension,vis_distance,air_temperature,dew_point_temperature,sea_level_pressure,lp_period_qty,ap_altimeter_setting_rate,sky_height_dimension,sky_convective_cloud_attribute,snow_depth_dimension,snow_depth_equivalent_water_depth
2018-08-29T16:10:00.000+0000,KLAS,KRNO,N423WN,2018-08-29,WN,19393,1449,2018,3,8,29,3,0900-0959,0.0,7.0,345.0,28,0.0,0.0,0.0,10,0.0,0.0,0.0,0.0,0.0,5.480169253343784,,0.0,22000.0,16093.0,256.0,22.0,10111.0,1.0,10139.0,6096.0,,,
2019-12-31T14:40:00.000+0000,KCLT,KALB,N655AW,2019-12-31,AA,19805,1860,2019,4,12,31,2,0900-0959,0.0,0.0,646.0,9,4.0,18.222222,44.44444444444444,2,2.0,63.5,100.0,0.0,0.0,7.228397554396746,240.0,21.0,22000.0,16093.0,44.0,-6.0,10110.0,1.0,10112.0,,,,
2017-12-01T02:20:00.000+0000,KPHX,KSLC,N260SY,2017-11-30,OO,20304,4550,2017,4,11,30,4,1900-1959,0.0,0.0,507.0,31,2.0,4.903226,6.451612903225806,4,0.0,0.0,0.0,0.0,0.0,4.794342675325302,,0.0,22000.0,16093.0,228.0,28.0,10154.0,1.0,10169.0,3353.0,,,
2019-12-11T01:40:00.000+0000,KMSP,KFSD,N825SK,2019-12-10,OO,20304,4129,2019,4,12,10,2,1900-1959,0.0,0.0,196.0,8,2.0,14.5,25.0,2,1.0,34.0,50.0,0.0,0.0,7.159757795682704,260.0,57.0,22000.0,16093.0,-172.0,-228.0,10269.0,1.0,10244.0,,,,
2019-12-26T13:00:00.000+0000,KIAH,KLGA,N87306,2019-12-26,YV,20378,6059,2019,4,12,26,4,0700-0759,0.0,0.0,1416.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,6.192936306999375,,0.0,61.0,402.0,128.0,122.0,10164.0,1.0,10166.0,61.0,,,
2019-08-16T20:35:00.000+0000,KPHL,KORD,N14235,2019-08-16,UA,19977,563,2019,3,8,16,5,1600-1659,0.0,0.0,678.0,20,4.0,8.0,20.0,1,0.0,0.0,0.0,0.0,0.0,4.02709185215769,,0.0,22000.0,16093.0,288.0,211.0,,,10169.0,1981.0,,,
2018-03-22T01:00:00.000+0000,KPHX,KSTL,N8301J,2018-03-21,WN,19393,678,2018,1,3,21,3,1800-1859,1.0,27.0,1262.0,28,4.0,9.75,14.285714285714286,4,0.0,3.25,0.0,0.0,0.0,4.794342675325302,,26.0,7620.0,16093.0,272.0,-89.0,10127.0,1.0,10139.0,5791.0,,,
2015-12-24T19:20:00.000+0000,KATL,KIND,N970DL,2015-12-24,DL,19790,1534,2015,4,12,24,4,1400-1459,1.0,62.0,432.0,36,34.0,193.888889,94.44444444444444,22,22.0,219.681818,100.0,0.0,0.0,9.22422595180868,170.0,26.0,823.0,1609.0,200.0,194.0,10180.0,1.0,10186.0,152.0,,,
2015-10-14T23:56:00.000+0000,KMBS,KORD,N929SW,2015-10-14,OO,20304,5382,2015,4,10,14,3,1900-1959,0.0,0.0,222.0,1,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.3607109286779058,250.0,15.0,22000.0,16093.0,120.0,30.0,,,10132.0,1494.0,,,
2019-05-16T21:05:00.000+0000,KSMF,KGEG,N717SA,2019-05-16,WN,19393,582,2019,2,5,16,4,1400-1459,0.0,1.0,649.0,6,2.0,19.333333,33.333333333333336,3,0.0,0.0,0.0,0.0,0.0,1.524768179935736,160.0,31.0,,14484.0,130.0,110.0,,,10115.0,,,,


In [0]:
# Checkpoint: write the joined result to blob storage as parquet,
# replacing any previous output at this path (mode 'overwrite').
df_airlines_weather.write.mode('overwrite').parquet(f"{blob_url}/airlines_top3_weather_stations")

In [0]:
# Re-read the checkpointed join from blob storage, spread it over 1000
# partitions keyed by fl_date, and mark it for caching. This rebinds
# `df_airlines_weather` to the reloaded copy.
df_airlines_weather = (
    spark.read
    .parquet(f"{blob_url}/airlines_top3_weather_stations")
    .repartition(1000, 'fl_date')
    .persist()
)

In [0]:
# Print (rows, columns) for the filtered flights and for the weather-joined
# frame so the two can be compared side by side.
print("airlines", sparkShape(df_airports_pagerank))
print("airlines weather", sparkShape(df_airlines_weather))

In [0]:
# Register the joined data as the SQL temp view `airlines_weather`,
# the name the query in aggStations() selects from.
df_airlines_weather.createOrReplaceTempView('airlines_weather')

### Aggregate weather data for each flight

In [0]:
def aggStations():
    """Aggregate the `airlines_weather` temp view into one row per flight key.

    Rows are grouped by (fl_tz, origin_icao, dest_icao, tail_num,
    op_carrier_airline_id, op_carrier_fl_num). Within each group:

    * date, carrier and calendar columns are taken with `last(...)`;
    * the delay labels and the flight/airport/airline metrics are averaged,
      several of them under new names (e.g. `total_flights` becomes
      `airport_total_flights_pwnd`, `pagerank` becomes `airport_pagerank`);
    * every selected weather measurement is emitted three times, as
      `avg_*`, `min_*` and `max_*`.

    The query carries a REPARTITION(1000) hint.

    NOTE(review): `avg_wind_directional_angle` is a plain arithmetic mean. If
    the column is a compass bearing in degrees, averaging values on either
    side of north gives a misleading result — confirm whether that matters
    downstream.

    Returns:
        Spark DataFrame produced by `spark.sql` for the aggregation query.
    """
    agg_sql = '''
        SELECT
            /*+ REPARTITION(1000) */

            fl_tz,
            origin_icao,
            dest_icao,
            tail_num,
            
            last(fl_date) AS fl_date,
            last(op_unique_carrier) AS op_unique_carrier,
            last(op_carrier_airline_id) AS op_carrier_airline_id,
            last(op_carrier_fl_num) AS op_carrier_fl_num,
            
            last(year) AS year,
            last(quarter) AS quarter,
            last(month) AS month,
            last(day_of_month) AS day_of_month,
            last(day_of_week) AS day_of_week,
            last(dwnd_blk) AS dwnd_blk,
            
            avg(dep_del15) AS dep_del15,
            avg(dep_delay_new) AS dep_delay_new,
            
            avg(distance) AS distance,
            avg(total_flights) AS airport_total_flights_pwnd,
            avg(delayed_flights) AS airport_delayed_flights_pwnd,
            avg(avg_delayed_mins) AS airport_delayed_mins_pwnd,
            avg(pct_flight_delayed) AS airport_pct_flight_delayed,
            avg(airline_total_flights) AS airline_total_flights_pwnd,
            avg(airline_delayed_flights) AS airline_delayed_flights_pwnd,
            avg(airline_avg_delayed_mins) AS airline_delayed_mins_pwnd,
            avg(airline_pct_flight_delayed) AS airline_pct_flight_delayed,
            avg(prev_flight_arr_delay) AS ripple_effect_delay,
            avg(timediff) AS timediff_prev_flight,
            avg(pagerank) AS airport_pagerank,

            avg(wind_directional_angle) AS avg_wind_directional_angle,
            avg(wind_directional_speed_rate) AS avg_wind_directional_speed_rate,
            avg(sky_ceiling_height_dimension) AS avg_sky_ceiling_height_dimension,
            avg(vis_distance) AS avg_vis_distance,
            avg(air_temperature) AS avg_air_temperature,
            avg(dew_point_temperature) AS avg_dew_point_temperature,
            avg(sea_level_pressure) AS avg_sea_level_pressure,
            avg(lp_period_qty) AS avg_lp_period_qty,
            avg(ap_altimeter_setting_rate) as avg_ap_altimeter_setting_rate,
            avg(sky_height_dimension) AS avg_sky_height_dimension,
            avg(sky_convective_cloud_attribute) AS avg_sky_convective_cloud_attribute,
            avg(snow_depth_dimension) AS avg_snow_depth_dimension,
            avg(snow_depth_equivalent_water_depth) AS avg_snow_depth_equivalent_water_depth,
            
            min(wind_directional_angle) AS min_wind_directional_angle,
            min(wind_directional_speed_rate) AS min_wind_directional_speed_rate,
            min(sky_ceiling_height_dimension) AS min_sky_ceiling_height_dimension,
            min(vis_distance) AS min_vis_distance,
            min(air_temperature) AS min_air_temperature,
            min(dew_point_temperature) AS min_dew_point_temperature,
            min(sea_level_pressure) AS min_sea_level_pressure,
            min(lp_period_qty) AS min_lp_period_qty,
            min(ap_altimeter_setting_rate) AS min_ap_altimeter_setting_rate,
            min(sky_height_dimension) AS min_sky_height_dimension,
            min(sky_convective_cloud_attribute) AS min_sky_convective_cloud_attribute,
            min(snow_depth_dimension) AS min_snow_depth_dimension,
            min(snow_depth_equivalent_water_depth) AS min_snow_depth_equivalent_water_depth,
            
            max(wind_directional_angle) AS max_wind_directional_angle,
            max(wind_directional_speed_rate) AS max_wind_directional_speed_rate,
            max(sky_ceiling_height_dimension) AS max_sky_ceiling_height_dimension,
            max(vis_distance) AS max_vis_distance,
            max(air_temperature) AS max_air_temperature,
            max(dew_point_temperature) AS max_dew_point_temperature,
            max(sea_level_pressure) AS max_sea_level_pressure,
            max(lp_period_qty) AS max_lp_period_qty,
            max(ap_altimeter_setting_rate) AS max_ap_altimeter_setting_rate,
            max(sky_height_dimension) AS max_sky_height_dimension,
            max(sky_convective_cloud_attribute) AS max_sky_convective_cloud_attribute,
            max(snow_depth_dimension) AS max_snow_depth_dimension,
            max(snow_depth_equivalent_water_depth) AS max_snow_depth_equivalent_water_depth   
            
        FROM airlines_weather
        GROUP BY fl_tz, origin_icao, dest_icao, tail_num, op_carrier_airline_id, op_carrier_fl_num
    '''
    return spark.sql(agg_sql)

In [0]:
# Run the per-flight aggregation over `airlines_weather` and mark the result for caching.
df_airlines_weather_agg = aggStations().persist()

In [0]:
# Report (row count, column count) for the aggregated frame.
print("airlines agg stations", sparkShape(df_airlines_weather_agg))

In [0]:
# Preview the aggregated per-flight rows.
display(df_airlines_weather_agg)

fl_tz,origin_icao,dest_icao,tail_num,fl_date,op_unique_carrier,op_carrier_airline_id,op_carrier_fl_num,year,quarter,month,day_of_month,day_of_week,dwnd_blk,dep_del15,dep_delay_new,distance,airport_total_flights_pwnd,airport_delayed_flights_pwnd,airport_delayed_mins_pwnd,airport_pct_flight_delayed,airline_total_flights_pwnd,airline_delayed_flights_pwnd,airline_delayed_mins_pwnd,airline_pct_flight_delayed,ripple_effect_delay,timediff_prev_flight,airport_pagerank,avg_wind_directional_angle,avg_wind_directional_speed_rate,avg_sky_ceiling_height_dimension,avg_vis_distance,avg_air_temperature,avg_dew_point_temperature,avg_sea_level_pressure,avg_lp_period_qty,avg_ap_altimeter_setting_rate,avg_sky_height_dimension,avg_sky_convective_cloud_attribute,avg_snow_depth_dimension,avg_snow_depth_equivalent_water_depth,min_wind_directional_angle,min_wind_directional_speed_rate,min_sky_ceiling_height_dimension,min_vis_distance,min_air_temperature,min_dew_point_temperature,min_sea_level_pressure,min_lp_period_qty,min_ap_altimeter_setting_rate,min_sky_height_dimension,min_sky_convective_cloud_attribute,min_snow_depth_dimension,min_snow_depth_equivalent_water_depth,max_wind_directional_angle,max_wind_directional_speed_rate,max_sky_ceiling_height_dimension,max_vis_distance,max_air_temperature,max_dew_point_temperature,max_sea_level_pressure,max_lp_period_qty,max_ap_altimeter_setting_rate,max_sky_height_dimension,max_sky_convective_cloud_attribute,max_snow_depth_dimension,max_snow_depth_equivalent_water_depth
2016-02-14T23:05:00.000+0000,KSAN,KDAL,N475WN,2016-02-14,WN,19393,2649,2016,1,2,14,7,1500-1559,0.0,0.0,1182.0,13.0,3.0,13.923077,23.07692307692308,7.0,2.0,7.428571,28.571428571428573,0.0,0.0,2.69022650852153,310.0,38.5,22000.0,16093.0,217.0,105.5,10156.5,1.0,10157.5,3901.5,,,,300.0,36.0,22000.0,16093.0,217.0,100.0,10154.0,1.0,10156.0,183.0,,,,320.0,41.0,22000.0,16093.0,217.0,111.0,10159.0,1.0,10159.0,7620.0,,,
2015-10-15T12:39:00.000+0000,KATL,KPNS,N944DL,2015-10-15,DL,19790,2212,2015,4,10,15,4,0800-0859,0.0,0.0,271.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.22422595180868,,0.0,22000.0,16093.0,113.5,88.5,10186.5,1.0,10191.5,,,,,,0.0,22000.0,16093.0,94.0,83.0,10185.0,1.0,10190.0,,,,,,0.0,22000.0,16093.0,133.0,94.0,10188.0,1.0,10193.0,,,,
2017-10-31T12:45:00.000+0000,KDTW,KMBS,N506CA,2017-10-31,OO,20304,4526,2017,4,10,31,2,0800-0859,0.0,0.0,98.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.006187328467007,230.0,62.0,1752.5,15288.5,47.0,-8.5,10127.5,1.0,10125.0,1310.5,,,,230.0,62.0,1676.0,14484.0,44.0,-11.0,10125.0,1.0,10125.0,945.0,,,,230.0,62.0,1829.0,16093.0,50.0,-6.0,10130.0,1.0,10125.0,1676.0,,,
2019-08-29T00:25:00.000+0000,KORD,KSEA,N224AK,2019-08-28,AS,19930,21,2019,3,8,28,3,1900-1959,1.0,100.0,1721.0,44.0,5.0,5.5,11.363636363636363,0.0,0.0,0.0,0.0,0.0,0.0,10.33692724031147,260.0,98.0,22000.0,16093.0,233.0,94.0,10130.0,1.0,10132.0,1676.0,,,,260.0,98.0,22000.0,16093.0,233.0,94.0,10130.0,1.0,10132.0,1676.0,,,,260.0,98.0,22000.0,16093.0,233.0,94.0,10130.0,1.0,10132.0,1676.0,,,
2019-09-17T23:10:00.000+0000,KSFO,KSEA,N113DQ,2019-09-17,DL,19790,2002,2019,3,9,17,2,1600-1659,1.0,62.0,679.0,24.0,10.0,29.958333,41.66666666666666,1.0,0.0,5.0,0.0,0.0,0.0,4.325772152684395,290.0,62.0,22000.0,16093.0,239.0,83.0,10175.0,1.0,10176.0,1829.0,,,,290.0,62.0,22000.0,16093.0,239.0,83.0,10175.0,1.0,10176.0,1829.0,,,,290.0,62.0,22000.0,16093.0,239.0,83.0,10175.0,1.0,10176.0,1829.0,,,
2015-04-21T16:15:00.000+0000,KRDU,KATL,N377NW,2015-04-21,DL,19790,1177,2015,2,4,21,2,1200-1259,0.0,0.0,356.0,8.0,1.0,1.875,12.5,1.0,1.0,15.0,100.0,0.0,0.0,2.3875598457732057,290.0,26.0,22000.0,16093.0,161.0,55.5,10119.5,1.0,10123.5,1158.0,,,,290.0,21.0,22000.0,16093.0,161.0,50.0,10119.0,1.0,10122.0,1158.0,,,,290.0,31.0,22000.0,16093.0,161.0,61.0,10120.0,1.0,10125.0,1158.0,,,
2019-11-20T10:13:00.000+0000,KCRW,KCLT,N505AE,2019-11-20,OH,20397,5365,2019,4,11,20,3,0500-0559,0.0,0.0,221.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6486837329798066,,0.0,610.0,16093.0,50.0,11.0,10160.0,,10159.0,610.0,,,,,0.0,610.0,16093.0,50.0,11.0,10160.0,,10159.0,610.0,,,,,0.0,610.0,16093.0,50.0,11.0,10160.0,,10159.0,610.0,,,
2019-09-01T23:27:00.000+0000,KCLT,KPHL,N332FR,2019-09-01,F9,20436,2314,2019,3,9,1,7,1900-1959,1.0,93.0,449.0,59.0,6.0,8.101695,10.169491525423728,0.0,0.0,0.0,0.0,0.0,0.0,7.228397554396746,30.0,23.5,22000.0,16093.0,305.5,155.5,10193.0,1.0,10201.5,1676.0,,,,30.0,21.0,22000.0,16093.0,300.0,150.0,10193.0,1.0,10200.0,1676.0,,,,30.0,26.0,22000.0,16093.0,311.0,161.0,10193.0,1.0,10203.0,1676.0,,,
2018-05-29T11:50:00.000+0000,KLFT,KIAH,N16981,2018-05-29,EV,20366,4249,2018,2,5,29,2,0600-0659,0.0,0.0,201.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4538052642362111,,0.0,22000.0,16093.0,239.0,214.0,10066.5,1.0,10067.5,,,,,,0.0,22000.0,16093.0,228.0,206.0,10064.0,1.0,10064.0,,,,,,0.0,22000.0,16093.0,250.0,222.0,10069.0,1.0,10071.0,,,,
2016-12-03T13:25:00.000+0000,KIAH,KDEN,N417UA,2016-12-03,UA,19977,75,2016,4,12,3,6,0700-0759,0.0,0.0,862.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.192936306999375,65.0,56.5,335.0,11265.5,100.0,89.0,10131.5,1.0,10132.0,182.5,,,,60.0,51.0,213.0,8047.0,100.0,89.0,10131.0,1.0,10132.0,152.0,,,,70.0,62.0,457.0,14484.0,100.0,89.0,10132.0,1.0,10132.0,213.0,,,


### Checkpoint complete join

In [0]:
# Checkpoint: write the aggregated result to blob storage as parquet,
# replacing any previous output at this path (mode 'overwrite').
df_airlines_weather_agg.write.mode('overwrite').parquet(f"{blob_url}/airlines_agg")

In [0]:
# Re-read the checkpointed aggregate from blob storage, spread it over 1000
# partitions keyed by fl_date, and mark it for caching. This rebinds
# `df_airlines_weather_agg` to the reloaded copy.
df_airlines_weather_agg = (
    spark.read
    .parquet(f"{blob_url}/airlines_agg")
    .repartition(1000, 'fl_date')
    .persist()
)

In [0]:
# Register the aggregated data as the SQL temp view `airlines_agg`.
# No query in the visible cells reads this view.
df_airlines_weather_agg.createOrReplaceTempView('airlines_agg')

In [0]:
# Report (row count, column count) for the reloaded aggregate, then preview it.
print("airlines agg stations", sparkShape(df_airlines_weather_agg))
display(df_airlines_weather_agg)

fl_tz,origin_icao,dest_icao,tail_num,fl_date,op_unique_carrier,op_carrier_airline_id,op_carrier_fl_num,year,quarter,month,day_of_month,day_of_week,dwnd_blk,dep_del15,dep_delay_new,distance,airport_total_flights_pwnd,airport_delayed_flights_pwnd,airport_delayed_mins_pwnd,airport_pct_flight_delayed,airline_total_flights_pwnd,airline_delayed_flights_pwnd,airline_delayed_mins_pwnd,airline_pct_flight_delayed,ripple_effect_delay,timediff_prev_flight,airport_pagerank,avg_wind_directional_angle,avg_wind_directional_speed_rate,avg_sky_ceiling_height_dimension,avg_vis_distance,avg_air_temperature,avg_dew_point_temperature,avg_sea_level_pressure,avg_lp_period_qty,avg_ap_altimeter_setting_rate,avg_sky_height_dimension,avg_sky_convective_cloud_attribute,avg_snow_depth_dimension,avg_snow_depth_equivalent_water_depth,min_wind_directional_angle,min_wind_directional_speed_rate,min_sky_ceiling_height_dimension,min_vis_distance,min_air_temperature,min_dew_point_temperature,min_sea_level_pressure,min_lp_period_qty,min_ap_altimeter_setting_rate,min_sky_height_dimension,min_sky_convective_cloud_attribute,min_snow_depth_dimension,min_snow_depth_equivalent_water_depth,max_wind_directional_angle,max_wind_directional_speed_rate,max_sky_ceiling_height_dimension,max_vis_distance,max_air_temperature,max_dew_point_temperature,max_sea_level_pressure,max_lp_period_qty,max_ap_altimeter_setting_rate,max_sky_height_dimension,max_sky_convective_cloud_attribute,max_snow_depth_dimension,max_snow_depth_equivalent_water_depth
2017-05-14T18:40:00.000+0000,KSTL,KDEN,N715FR,2017-05-14,F9,20436,283,2017,2,5,14,7,1300-1359,0.0,0.0,770.0,5.0,0.0,1.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.547640377423018,255.0,41.0,22000.0,16093.0,255.5,117.0,10145.0,1.0,10152.5,7620.0,,,,250.0,41.0,22000.0,16093.0,250.0,117.0,10145.0,1.0,10149.0,7620.0,,,,260.0,41.0,22000.0,16093.0,261.0,117.0,10145.0,1.0,10156.0,7620.0,,,
2017-05-15T01:35:00.000+0000,KMIA,KSFO,N889AA,2017-05-14,AA,19805,2647,2017,2,5,14,7,2100-2159,0.0,0.0,2585.0,5.0,0.0,2.4,0.0,3.0,0.0,4.0,0.0,69.0,436.0,2.8711367279493127,260.0,46.0,14810.0,16093.0,308.5,214.0,10109.0,1.0,10108.0,1219.0,,,,260.0,46.0,7620.0,16093.0,306.0,206.0,10109.0,1.0,10108.0,1219.0,,,,260.0,46.0,22000.0,16093.0,311.0,222.0,10109.0,1.0,10108.0,1219.0,,,
2017-05-14T13:37:00.000+0000,KATL,KFNT,N931DL,2017-05-14,DL,19790,1767,2017,2,5,14,7,0900-0959,0.0,0.0,645.0,15.0,2.0,3.4,13.333333333333334,2.0,0.0,0.0,0.0,0.0,0.0,9.22422595180868,50.0,15.5,22000.0,16093.0,147.0,141.5,10134.0,1.0,10142.0,,,,,50.0,0.0,22000.0,16093.0,122.0,122.0,10133.0,1.0,10142.0,,,,,50.0,31.0,22000.0,16093.0,172.0,161.0,10135.0,1.0,10142.0,,,,
2017-05-14T20:15:00.000+0000,KSLC,KATL,N536US,2017-05-14,DL,19790,1121,2017,2,5,14,7,1400-1459,0.0,0.0,1590.0,43.0,5.0,7.581394999999999,11.627906976744184,18.0,3.0,13.277778,16.666666666666668,0.0,0.0,5.392209820268163,334.0,32.0,18941.2,16093.0,145.0,21.6,10092.0,1.0,10122.6,2438.0,,,,300.0,26.0,6706.0,16093.0,141.0,-17.0,10092.0,1.0,10119.0,2438.0,,,,350.0,41.0,22000.0,16093.0,150.0,34.0,10092.0,1.0,10125.0,2438.0,,,
2017-05-14T16:30:00.000+0000,KMSP,KHNL,N822NW,2017-05-14,DL,19790,1469,2017,2,5,14,7,1100-1159,0.0,0.0,3972.0,9.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,7.159757795682704,125.0,36.0,22000.0,16093.0,183.5,83.0,10089.5,1.0,10093.0,7620.0,,,,120.0,31.0,22000.0,16093.0,178.0,83.0,10088.0,1.0,10091.0,7620.0,,,,130.0,41.0,22000.0,16093.0,189.0,83.0,10091.0,1.0,10095.0,7620.0,,,
2017-05-14T22:20:00.000+0000,KLAS,KMDW,N8623F,2017-05-14,WN,19393,3995,2017,2,5,14,7,1500-1559,1.0,22.0,1521.0,25.0,2.0,2.56,8.0,17.0,2.0,3.411765,11.764705882352942,0.0,0.0,5.480169253343784,215.0,67.0,22000.0,16093.0,275.0,-102.5,10043.5,1.0,10064.5,7620.0,,,,210.0,67.0,22000.0,16093.0,272.0,-122.0,10036.0,1.0,10058.0,7620.0,,,,220.0,67.0,22000.0,16093.0,278.0,-83.0,10051.0,1.0,10071.0,7620.0,,,
2017-05-14T14:55:00.000+0000,KOAK,KSLC,N261WN,2017-05-14,WN,19393,1307,2017,2,5,14,7,0700-0759,0.0,0.0,588.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0126422257897154,,0.0,22000.0,16093.0,100.0,78.0,10133.0,1.0,10132.0,457.0,,,,,0.0,22000.0,16093.0,100.0,78.0,10133.0,1.0,10132.0,457.0,,,,,0.0,22000.0,16093.0,100.0,78.0,10133.0,1.0,10132.0,457.0,,,
2017-05-15T01:00:00.000+0000,KORD,KBOS,N3HUAA,2017-05-14,AA,19805,2754,2017,2,5,14,7,2000-2059,1.0,25.0,867.0,52.0,8.0,5.711538,15.384615384615383,13.0,2.0,8.076923,15.384615384615383,0.0,0.0,10.33692724031147,40.0,67.0,22000.0,16093.0,156.0,22.0,10132.0,1.0,10132.0,4572.0,,,,40.0,67.0,22000.0,16093.0,156.0,22.0,10132.0,1.0,10132.0,4572.0,,,,40.0,67.0,22000.0,16093.0,156.0,22.0,10132.0,1.0,10132.0,4572.0,,,
2017-05-15T02:35:00.000+0000,KPHX,KSNA,N904WN,2017-05-14,WN,19393,3138,2017,2,5,14,7,1900-1959,0.0,8.0,338.0,24.0,6.0,12.416667,25.0,9.0,3.0,8.666667,33.333333333333336,0.0,0.0,4.794342675325302,280.0,54.0,14505.0,16093.0,325.0,14.0,10042.5,1.0,10062.5,2743.0,,,,260.0,51.0,7010.0,16093.0,317.0,11.0,10037.0,1.0,10054.0,2743.0,,,,300.0,57.0,22000.0,16093.0,333.0,17.0,10048.0,1.0,10071.0,2743.0,,,
2017-05-14T19:31:00.000+0000,KPSC,KSLC,N781CA,2017-05-14,OO,20304,4774,2017,2,5,14,7,1200-1259,0.0,0.0,521.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4685114338053792,240.0,67.0,22000.0,16093.0,144.0,22.0,10161.0,1.0,10163.0,,,,,240.0,67.0,22000.0,16093.0,144.0,22.0,10161.0,1.0,10163.0,,,,,240.0,67.0,22000.0,16093.0,144.0,22.0,10161.0,1.0,10163.0,,,,
