# 2020 Weather

In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql import SQLContext
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType, TimestampType
from pyspark.ml.feature import Imputer


# Legacy SQLContext handle built from the notebook-provided SparkContext `sc`.
# NOTE(review): no visible cell reads `sqlContext`; all queries below go through
# `spark` — confirm nothing else depends on it before removing.
sqlContext = SQLContext(sc)

In [0]:
# Configuration for Blob Storage
# These names are read by later cells (SAS token setup, reads and writes under blob_url).

blob_container = "container1" # The name of your container created in https://portal.azure.com
storage_account = "w261sp22team12" # The name of your Storage account created in https://portal.azure.com
secret_scope = "s1" # The name of the scope created in your local computer using the Databricks CLI
secret_key = "k1" # The name of the secret key created in your local computer using the Databricks CLI 
# Base wasbs:// URL of the container; every blob read/write below is built from it.
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
# Mount point prefix; the station parquet path read later in this notebook starts with the same prefix.
mount_path = "/mnt/mids-w261"

In [0]:
# SAS Token
# Fetch the token from the Databricks secret scope and store it under the Spark
# configuration key for this container / storage account, so it never appears
# in the notebook source.
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

In [0]:
# Set partitions
# 1000 is the same partition count passed to the repartition(1000, ...) calls
# and the REPARTITION(1000) SQL hints used in the rest of this notebook.
spark.conf.set("spark.sql.shuffle.partitions", 1000)
spark.conf.set("spark.sql.files.minPartitionNum", 1000)

In [0]:
#Spark details
spark

In [0]:
def sparkShape(dataFrame):
    """Return ``(row_count, column_count)`` for a Spark DataFrame.

    The row count comes from ``dataFrame.count()``, so calling this runs a
    Spark action; the column count is the length of ``dataFrame.columns``.
    """
    n_rows = dataFrame.count()
    n_cols = len(dataFrame.columns)
    return (n_rows, n_cols)

## Data

### Weather Data

In [0]:
# Explicit read schema for the NOAA hourly weather files.
# Only the six leading columns are typed; every other column is kept as a raw
# nullable string and parsed later in the notebook. Column order matters: it
# must match the order of the fields in the source files.
schema = StructType(
    [
        StructField("STATION", StringType(), True),
        StructField("DATE", TimestampType(), True),
        StructField("SOURCE", ShortType(), True),
        StructField("LATITUDE", DoubleType(), True),
        StructField("LONGITUDE", DoubleType(), True),
        StructField("ELEVATION", DoubleType(), True),
    ]
    + [
        StructField(string_column, StringType(), True)
        for string_column in """
            NAME REPORT_TYPE CALL_SIGN QUALITY_CONTROL WND CIG VIS TMP DEW SLP
            AW1 GA1 GA2 GA3 GA4 GE1 GF1 KA1 KA2 MA1
            MD1 MW1 MW2 OC1 OD1 OD2 REM EQD AW2 AX4
            GD1 AW5 GN1 AJ1 AW3 MK1 KA4 GG3 AN1 RH1
            AU5 HL1 OB1 AT8 AW7 AZ1 CH1 RH3 GK1 IB1
            AX1 CT1 AK1 CN2 OE1 MW5 AO1 KA3 AA3 CR1
            CF2 KB2 GM1 AT5 AY2 MW6 MG1 AH6 AU2 GD2
            AW4 MF1 AA1 AH2 AH3 OE3 AT6 AL2 AL3 AX5
            IB2 AI3 CV3 WA1 GH1 KF1 CU2 CT3 SA1 AU1
            KD2 AI5 GO1 GD3 CG3 AI1 AL1 AW6 MW4 AX6
            CV1 ME1 KC2 CN1 UA1 GD5 UG2 AT3 AT4 GJ1
            MV1 GA5 CT2 CG2 ED1 AE1 CO1 KE1 KB1 AI4
            MW3 KG2 AA2 AX2 AY1 RH2 OE2 CU3 MH1 AM1
            AU4 GA6 KG1 AU3 AT7 KD1 GL1 IA1 GG2 OD3
            UG1 CB1 AI6 CI1 CV2 AZ2 AD1 AH1 WD1 AA4
            KC1 IA2 CF3 AI2 AT1 GD4 AX3 AH4 KB3 CU1
            CN4 AT2 CG1 CF1 GG1 MV2 CW1 GG4 AB1 AH5
            CN3
        """.split()
    ]
)

In [0]:
# Load the full Weather data with the explicit schema, add the calendar date of
# each observation as `fl_date`, spread the rows over 1000 partitions keyed by
# that date, and mark the result for caching.
# NOTE(review): the source is a .tar.gz archive read through the CSV reader —
# confirm the archive framing / header rows are handled as intended downstream.

df_weather = (
    spark.read
    .schema(schema)
    .csv(f'{blob_url}/2020.tar.gz')
    .withColumn('fl_date', F.to_date(F.col('date')))
    .repartition(1000, 'fl_date')
    .persist()
)

In [0]:
print("weather", sparkShape(df_weather))

In [0]:
display(df_weather)

STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,WND,CIG,VIS,TMP,DEW,SLP,AW1,GA1,GA2,GA3,GA4,GE1,GF1,KA1,KA2,MA1,MD1,MW1,MW2,OC1,OD1,OD2,REM,EQD,AW2,AX4,GD1,AW5,GN1,AJ1,AW3,MK1,KA4,GG3,AN1,RH1,AU5,HL1,OB1,AT8,AW7,AZ1,CH1,RH3,GK1,IB1,AX1,CT1,AK1,CN2,OE1,MW5,AO1,KA3,AA3,CR1,CF2,KB2,GM1,AT5,AY2,MW6,MG1,AH6,AU2,GD2,AW4,MF1,AA1,AH2,AH3,OE3,AT6,AL2,AL3,AX5,IB2,AI3,CV3,WA1,GH1,KF1,CU2,CT3,SA1,AU1,KD2,AI5,GO1,GD3,CG3,AI1,AL1,AW6,MW4,AX6,CV1,ME1,KC2,CN1,UA1,GD5,UG2,AT3,AT4,GJ1,MV1,GA5,CT2,CG2,ED1,AE1,CO1,KE1,KB1,AI4,MW3,KG2,AA2,AX2,AY1,RH2,OE2,CU3,MH1,AM1,AU4,GA6,KG1,AU3,AT7,KD1,GL1,IA1,GG2,OD3,UG1,CB1,AI6,CI1,CV2,AZ2,AD1,AH1,WD1,AA4,KC1,IA2,CF3,AI2,AT1,GD4,AX3,AH4,KB3,CU1,CN4,AT2,CG1,CF1,GG1,MV2,CW1,GG4,AB1,AH5,CN3,fl_date
1001099999,2020-03-07T00:00:00.000+0000,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"346,1,N,0085,1","00200,1,C,9",20000199,-701,-971,100121,06999999,,,21061,21061,"06,1,+00200,1,07,1",,,"9,MSL ,+99999,+99999",06991061999002001999999,,"010,M,-0070,1","010,N,-0072,1",999999100001,"3,1,006,1,+999,9",011,01671,90601421999,,,SYN004BUFR,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-03-07
1001099999,2020-03-07T01:00:00.000+0000,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"342,1,N,0091,1",99999999,999999999,-721,-1021,100131,,,,,,,,,,,,,,999999100011,"3,1,009,1,+999,9",,01461,,,,SYN004BUFR,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-03-07
1001099999,2020-03-07T02:00:00.000+0000,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"344,1,N,0072,1",99999999,999999999,-721,-1031,100181,,,,,,,,,,,,,,999999100061,"1,1,014,1,+999,9",,01261,,,,SYN004BUFR,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-03-07
1001099999,2020-03-07T03:00:00.000+0000,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"330,1,N,0100,1","99999,9,9,N",999999999,-751,-1081,100121,,,,,,,,,,,,,,999999100001,"9,9,001,1,+999,9",,,,,,SYN04801001 46/// /3310 11075 21108 30000 40012 50001=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-03-07
1001099999,2020-03-07T04:00:00.000+0000,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"353,1,N,0079,1",99999999,999999999,-751,-1081,100181,,,,,,,,,,,,,,999999100061,"0,1,005,1,+999,9",,01251,,,,SYN004BUFR,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-03-07
1001099999,2020-03-07T05:00:00.000+0000,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"353,1,N,0067,1",99999999,999999999,-771,-1141,100191,,,,,,,,,,,,,,999999100071,"3,1,001,1,+999,9",,01021,,,,SYN004BUFR,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-03-07
1001099999,2020-03-07T06:00:00.000+0000,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"355,1,N,0062,1","00200,1,C,9",20000199,-771,-1181,100221,12999999,24000531,,21061,21061,"06,1,+00200,1,07,1",,,"9,MSL ,+99999,+99999",06991061999002001999999,229,"120,M,-0063,1","120,N,-0079,1",999999100101,"1,1,010,1,+999,9",021,01111,90601131999,,,SYN004BUFR,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-03-07
1001099999,2020-03-07T07:00:00.000+0000,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"345,1,N,0064,1",99999999,999999999,-771,-1181,100251,,,,,,,,,,,,,,999999100131,"2,1,007,1,+999,9",,00991,,,,SYN004BUFR,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-03-07
1001099999,2020-03-07T08:00:00.000+0000,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"341,1,N,0064,1",99999999,999999999,-791,-1151,100281,,,,,,,,,,,,,,999999100161,"3,1,009,1,+999,9",,00881,,,,SYN004BUFR,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-03-07
1001099999,2020-03-07T09:00:00.000+0000,4,70.9333333,-8.6666667,9.0,"JAN MAYEN NOR NAVY, NO",FM-12,99999,V020,"345,1,N,0059,1","00200,1,C,9",5000199,-801,-1161,100281,,,,51031,21031,"07,1,+00200,1,07,1","99,9,+99999,9,10,1","99,9,+99999,9,10,1","9,MSL ,+99999,+99999",07991071999002001999999,,,,999999100161,"2,1,006,1,+999,9",561,00841,,,,SYN004BUFR,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020-03-07


In [0]:
# Checkpoint the loaded (still unfiltered) weather frame as parquet under the
# blob container; 'overwrite' replaces whatever a previous run left there.
df_weather.write.mode('overwrite').parquet(f'{blob_url}/weather_2020_raw')

In [0]:
# Register the loaded frame as the SQL view "weather".
# The same view name is registered again further down for the filtered/split
# frame, so which data "weather" points to depends on the cells run so far.
df_weather.createOrReplaceTempView("weather")

### Station Data

In [0]:
# Weather Station Supplementary Data
# The path is built from the `mount_path` configured at the top of the notebook
# rather than repeating the mount prefix as a literal; the frame is cached
# because it is joined against further down.
df_stations = spark.read.parquet(f"{mount_path}/datasets_final_project/stations_data/*").persist()

# Create a tempview so we can use SQL
df_stations.createOrReplaceTempView("stations")

In [0]:
display(df_stations)

In [0]:
#print("stations", sparkShape(df_stations))

In [0]:
# Read cleaned and transformed airlines data, spread it over 1000 partitions
# keyed by flight date, and mark it for caching.
df_airlines = (
    spark.read
    .parquet(f"{blob_url}/airlines_2020_airport_airline_ripple_agg")
    .repartition(1000, 'fl_date')
    .persist()
)

# Expose the frame to SQL cells under the view name "airlines"
df_airlines.createOrReplaceTempView("airlines")

In [0]:
display(df_airlines)

year,quarter,month,day_of_month,day_of_week,fl_date,time_zone,origin,origin_icao,origin_city_name,origin_airport_id,origin_state_abr,dest_airport_id,dest_state_abr,dest_city_name,dest,dest_icao,op_unique_carrier,op_carrier_airline_id,op_carrier_fl_num,tail_num,dep_time_blk,arr_time_blk,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,distance_group,dep_delay_new,dep_del15,cancelled,dep_time,arr_time,arr_delay_new,local_tz,local_crs_arr_tz,local_arr_tz,fl_tz,pwnd_date,pwnd_blk,dwnd_blk,total_flights,delayed_flights,avg_delayed_mins,pct_flight_delayed,airline_total_flights,airline_delayed_flights,airline_avg_delayed_mins,airline_pct_flight_delayed,prev_flight_arr_delay,timediff
2020,2,6,20,6,2020-06-20,America/Los_Angeles,SFO,KSFO,"San Francisco, CA",14771,CA,11298,TX,"Dallas/Fort Worth, TX",DFW,KDFW,AA,19805,2904,N983AN,0800-0859,1300-1359,800,1328,208.0,1464.0,6,0.0,0.0,0,753,1314,0.0,2020-06-20T08:00:00.000+0000,2020-06-20T13:28:00.000+0000,2020-06-20T13:28:00.000+0000,2020-06-20T15:00:00.000+0000,2020-06-20,0500-0559,0800-0859,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
2020,2,6,20,6,2020-06-20,America/Denver,DEN,KDEN,"Denver, CO",11292,CO,10693,TN,"Nashville, TN",BNA,KBNA,OO,20304,5230,N109SY,1100-1159,1500-1559,1130,1502,152.0,1014.0,5,37.0,1.0,0,1207,1551,49.0,2020-06-20T11:30:00.000+0000,2020-06-20T15:02:00.000+0000,2020-06-20T15:51:00.000+0000,2020-06-20T17:30:00.000+0000,2020-06-20,0800-0859,1100-1159,16,0.0,0.25,0.0,0,0.0,0.0,0.0,0.0,0.0
2020,2,6,20,6,2020-06-20,America/Chicago,BNA,KBNA,"Nashville, TN",10693,TN,12451,FL,"Jacksonville, FL",JAX,KJAX,WN,19393,705,N461WN,1400-1459,1700-1759,1440,1705,85.0,483.0,2,0.0,0.0,0,1435,1651,0.0,2020-06-20T14:40:00.000+0000,2020-06-20T17:05:00.000+0000,2020-06-20T17:05:00.000+0000,2020-06-20T19:40:00.000+0000,2020-06-20,1100-1159,1400-1459,7,0.0,0.0,0.0,4,0.0,0.0,0.0,0.0,0.0
2020,2,6,20,6,2020-06-20,America/Los_Angeles,SMF,KSMF,"Sacramento, CA",14893,CA,13891,CA,"Ontario, CA",ONT,KONT,WN,19393,453,N798SW,1200-1259,1400-1459,1245,1405,80.0,390.0,2,0.0,0.0,0,1242,1356,0.0,2020-06-20T12:45:00.000+0000,2020-06-20T14:05:00.000+0000,2020-06-20T14:05:00.000+0000,2020-06-20T19:45:00.000+0000,2020-06-20,0900-0959,1200-1259,4,0.0,0.0,0.0,4,0.0,0.0,0.0,0.0,0.0
2020,2,6,20,6,2020-06-20,America/New_York,MCO,KMCO,"Orlando, FL",13204,FL,14747,WA,"Seattle, WA",SEA,KSEA,AS,19930,9,N290AK,1700-1759,2000-2059,1735,2022,347.0,2554.0,11,0.0,0.0,0,1715,2013,0.0,2020-06-20T17:35:00.000+0000,2020-06-20T20:22:00.000+0000,2020-06-20T20:22:00.000+0000,2020-06-20T21:35:00.000+0000,2020-06-20,1400-1459,1700-1759,11,3.0,5.727273,27.272727272727277,0,0.0,0.0,0.0,0.0,0.0
2020,2,6,20,6,2020-06-20,America/Chicago,FSM,KFSM,"Fort Smith, AR",11778,AR,11298,TX,"Dallas/Fort Worth, TX",DFW,KDFW,MQ,20398,3315,N697AB,0700-0759,0800-0859,700,804,64.0,227.0,1,0.0,0.0,0,656,754,0.0,2020-06-20T07:00:00.000+0000,2020-06-20T08:04:00.000+0000,2020-06-20T08:04:00.000+0000,2020-06-20T12:00:00.000+0000,2020-06-20,0400-0459,0700-0759,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
2020,2,6,20,6,2020-06-20,America/New_York,MCO,KMCO,"Orlando, FL",13204,FL,12898,PA,"Latrobe, PA",LBE,KLBE,NK,20416,107,N515NK,1000-1059,1200-1259,1030,1249,139.0,823.0,4,0.0,0.0,0,1021,1247,0.0,2020-06-20T10:30:00.000+0000,2020-06-20T12:49:00.000+0000,2020-06-20T12:49:00.000+0000,2020-06-20T14:30:00.000+0000,2020-06-20,0700-0759,1000-1059,19,0.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0
2020,2,6,20,6,2020-06-20,America/Chicago,DFW,KDFW,"Dallas/Fort Worth, TX",11298,TX,14842,TX,"San Angelo, TX",SJT,KSJT,MQ,20398,3341,N803AE,1600-1659,1700-1759,1647,1750,63.0,229.0,1,0.0,0.0,0,1643,1741,0.0,2020-06-20T16:47:00.000+0000,2020-06-20T17:50:00.000+0000,2020-06-20T17:50:00.000+0000,2020-06-20T21:47:00.000+0000,2020-06-20,1300-1359,1600-1659,5,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0
2020,2,6,20,6,2020-06-20,America/Chicago,HSV,KHSV,"Huntsville, AL",12217,AL,10397,GA,"Atlanta, GA",ATL,KATL,DL,19790,2960,N977AT,0700-0759,0800-0859,700,856,56.0,151.0,1,0.0,0.0,0,653,847,0.0,2020-06-20T07:00:00.000+0000,2020-06-20T08:56:00.000+0000,2020-06-20T08:56:00.000+0000,2020-06-20T12:00:00.000+0000,2020-06-20,0400-0459,0700-0759,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
2020,2,6,20,6,2020-06-20,America/Chicago,DFW,KDFW,"Dallas/Fort Worth, TX",11298,TX,10397,GA,"Atlanta, GA",ATL,KATL,AA,19805,333,N753US,1400-1459,1700-1759,1432,1742,130.0,731.0,3,0.0,0.0,0,1422,1720,0.0,2020-06-20T14:32:00.000+0000,2020-06-20T17:42:00.000+0000,2020-06-20T17:42:00.000+0000,2020-06-20T19:32:00.000+0000,2020-06-20,1100-1159,1400-1459,14,2.0,3.928571,14.285714285714286,6,1.0,6.666667,16.666666666666668,0.0,0.0


In [0]:
# Step 1:
#     Find the distinct origin airport codes (ICAO).
#     These are joined with the station data in the next step to keep only
#     weather stations close to the airports.

qs_distinct_airports = '''
    SELECT         
        DISTINCT origin_icao AS distinct_airports
    FROM 
        airlines
'''

# Run the query against the "airlines" view and cache the (small) result.
df_distinct_airports = spark.sql(qs_distinct_airports).persist()

# Make the result available to SQL cells as "distinct_airports".
df_distinct_airports.createOrReplaceTempView("distinct_airports")



In [0]:
print(df_distinct_airports.count())

In [0]:
%sql 

SELECT *
FROM distinct_airports

distinct_airports
KRFD
KACV
KBRO
KGGG
KBGM
KMRY
KACK
KSTC
KABR
KCGI


In [0]:
# Step 2:
#     Join distinct airport origin ICAOs with station data and keep, for each
#     airport, the two nearest stations within `max_distance`.

# Cut-off applied to stations.distance_to_neighbor.
# NOTE(review): the unit of distance_to_neighbor is not visible in this notebook — confirm.
max_distance = 500

# ROW_NUMBER() is ordered by distance first and station_id second: without the
# second key, stations at exactly the same distance from an airport would be
# ranked in arbitrary order, so the "rank_neighbor <= 2" cut could select
# different stations from one run to the next.
qs_join_airport_stations = f'''
    WITH ranking_table AS (
      SELECT
          /*+ REPARTITION(1000) */
          neighbor_call, 
          station_id,
          distance_to_neighbor, 
          ROW_NUMBER() OVER(
            PARTITION BY neighbor_call
            ORDER BY distance_to_neighbor ASC, station_id ASC
          ) AS rank_neighbor
      FROM distinct_airports
      INNER JOIN stations 
        ON distinct_airports.distinct_airports = stations.neighbor_call
      WHERE distance_to_neighbor <= {max_distance}
    )
    
SELECT 
    /*+ REPARTITION(1000) */
    *
FROM 
    ranking_table
WHERE 
    rank_neighbor <= 2
'''

# Create df using query string
df_join_airport_stations = spark.sql(qs_join_airport_stations).persist()

# Create view for SQL
df_join_airport_stations.createOrReplaceTempView("join_airport_stations")

# Find shape
print(sparkShape(df_join_airport_stations)) # 654, 4

### Transformation

In [0]:
# Columns of interest based on EDA.
# "COUNTRY" and "fl_date" are not raw file columns: "fl_date" is added when the
# weather data is loaded and "COUNTRY" is added inside transform_weather_data
# before this list is used in its select().
weather_columns_of_interest = [
    "STATION", "COUNTRY", "DATE", "LATITUDE", "LONGITUDE", "REPORT_TYPE", "CALL_SIGN",
    "WND", "CIG", "VIS", "TMP", "DEW", "SLP", 
    "GA1", "GE1", "GD1", 
    "AA1", "AJ1", "AT1", 
    "IA1", "MA1",
    "fl_date"
]

In [0]:
# Keep US-only data with Report Type FM-15
def transform_weather_data(weather_data, weather_columns_of_interest):
    """Restrict the weather frame to US FM-15 reports and the requested columns.

    A ``COUNTRY`` column is derived from the last two characters of ``NAME``;
    rows are kept when that value is ``'US'`` and the trimmed ``REPORT_TYPE``
    is ``'FM-15'``. Only 'FM-15' is in the filter — no other report type passes.

    Args:
        weather_data: Spark DataFrame with at least ``NAME`` and
            ``REPORT_TYPE`` plus every column named in the selection list.
        weather_columns_of_interest: column names to keep, in output order
            (may include the derived ``COUNTRY``).

    Returns:
        A new Spark DataFrame; the input is not modified.
    """
    with_country = weather_data.withColumn("COUNTRY", F.substring(F.col("NAME"), -2, 2))
    us_only = with_country.filter("COUNTRY = 'US'")
    fm15_only = us_only.filter("(TRIM(REPORT_TYPE) IN ('FM-15'))")
    return fm15_only.select(weather_columns_of_interest)

For unsigned columns, the values "99", "999", "9999", "99999" or "999999" are used to mark missing data. For signed columns, the values "+99", "+9999" or "+99999" are used to mark missing data. Missing values are also indicated by a condition code, discrepancy code or quality code of "9".

In [0]:
# Split the comma separated values for columns of interest
'''
NOAA Source: 
https://www.ncei.noaa.gov/data/global-hourly/doc/isd-format-document.pdf
'''

def split_weather_data_features(weather_data):
    """Explode the packed NOAA ISD columns into one column per sub-field.

    Each of WND, CIG, VIS, TMP, DEW, SLP, AA1, MA1, GD1, GE1, IA1, AJ1 and AT1
    holds several comma-separated sub-fields. Every sub-field becomes its own
    column (numeric ones cast to double), numeric measurements are set to null
    when they equal the field's "missing" sentinel or when the paired flag
    column holds a rejected code, and the packed source columns are dropped.

    No scaling factors are applied: numeric values keep the raw magnitude found
    in the file.

    The AJ1 equivalent-water-depth field is six characters wide, so its missing
    sentinel is 999999 (not 9999 as for the four-character snow depth); it is
    nulled on 999999 here.

    Args:
        weather_data: Spark DataFrame containing the packed columns listed
            above (GA1 is only dropped, never parsed).

    Returns:
        A new Spark DataFrame with the derived columns appended in the order
        they are created below and the packed columns removed.
    """

    def field(packed_column, position):
        # Raw (string) sub-field at `position` of a comma-separated column.
        return F.split(weather_data[packed_column], ',').getItem(position)

    def number(packed_column, position):
        # Same sub-field, cast to double.
        return field(packed_column, position).cast(DoubleType())

    def null_if_missing(value_column, missing_value, flag_column, flag_values=("3", "7", "999")):
        # Null the measurement when it equals its sentinel or its flag is rejected;
        # a null flag leaves a non-sentinel measurement untouched.
        measurement = F.col(value_column)
        is_missing = (measurement == missing_value) | F.col(flag_column).isin(*flag_values)
        return F.when(is_missing, None).otherwise(measurement)

    return (
        weather_data
            # WND (page 7 NOAA)
            .withColumn("wind_directional_angle", number("WND", 0))
            .withColumn("wind_directional_qc", field("WND", 1))
            .withColumn("wind_directional_type_code", field("WND", 2))
            .withColumn("wind_directional_speed_rate", number("WND", 3))
            .withColumn("wind_directional_speed_qc", field("WND", 4))
            .withColumn("wind_directional_angle", null_if_missing("wind_directional_angle", 999.0, "wind_directional_qc"))
            .withColumn("wind_directional_speed_rate", null_if_missing("wind_directional_speed_rate", 9999.0, "wind_directional_speed_qc"))

            # CIG
            .withColumn("sky_ceiling_height_dimension", number("CIG", 0))
            .withColumn("sky_ceiling_qc", field("CIG", 1))
            .withColumn("sky_ceiling_determination_code", field("CIG", 2))
            .withColumn("sky_ceiling_cavok_code", field("CIG", 3))
            # 99999 = Missing
            .withColumn("sky_ceiling_height_dimension", null_if_missing("sky_ceiling_height_dimension", 99999.0, "sky_ceiling_qc"))

            # VIS (page 10 NOAA) ; vis means Visibility Observation ; qc means quality code
            .withColumn("vis_distance", number("VIS", 0))
            .withColumn("vis_distance_qc", field("VIS", 1))
            .withColumn("vis_variability_code", field("VIS", 2))
            .withColumn("vis_variability_qc", field("VIS", 3))
            # Missing value
            .withColumn("vis_distance", null_if_missing("vis_distance", 999999.0, "vis_distance_qc"))

            # TMP (page 10 NOAA)
            .withColumn("air_temperature", number("TMP", 0))
            .withColumn("air_temperature_qc", field("TMP", 1))
            .withColumn("air_temperature", null_if_missing("air_temperature", 9999.0, "air_temperature_qc"))

            # DEW (page 11 NOAA)
            .withColumn("dew_point_temperature", number("DEW", 0))
            .withColumn("dew_point_qc", field("DEW", 1))
            .withColumn("dew_point_temperature", null_if_missing("dew_point_temperature", 9999.0, "dew_point_qc"))

            # SLP (sea level pressure; page 12 NOAA)
            .withColumn("sea_level_pressure", number("SLP", 0))
            .withColumn("sea_level_pressure_qc", field("SLP", 1))
            .withColumn("sea_level_pressure", null_if_missing("sea_level_pressure", 99999.0, "sea_level_pressure_qc"))

            # AA1 (page 13 NOAA) ; lp means liquid precipitation ; qc means quality code
            .withColumn("lp_period_qty", number("AA1", 0))
            .withColumn("lp_depth_dimension", field("AA1", 1))
            .withColumn("lp_condition_code", field("AA1", 2))
            .withColumn("lp_quality_code", field("AA1", 3))
            .withColumn("lp_period_qty", null_if_missing("lp_period_qty", 99.0, "lp_quality_code"))

            # MA1 (page 88 NOAA) ; ap means atmospheric pressure ; qc means quality code
            .withColumn("ap_altimeter_setting_rate", number("MA1", 0))
            .withColumn("ap_altimeter_qc", field("MA1", 1))
            .withColumn("ap_station_pressure_rate", number("MA1", 2))
            .withColumn("ap_station_pressure_qc", field("MA1", 3))
            .withColumn("ap_altimeter_setting_rate", null_if_missing("ap_altimeter_setting_rate", 99999.0, "ap_altimeter_qc"))
            .withColumn("ap_station_pressure_rate", null_if_missing("ap_station_pressure_rate", 99999.0, "ap_station_pressure_qc"))

            # GD1 Sky Cover Summation State Identifiers (page 55 NOAA) ;  qc means quality code
            .withColumn("sky_coverage_code", field("GD1", 0))
            .withColumn("sky_coverage_code_2", field("GD1", 1))
            .withColumn("sky_coverage_qc", field("GD1", 2))
            .withColumn("sky_height_dimension", number("GD1", 3))
            .withColumn("sky_height_dimension_qc", field("GD1", 4))
            .withColumn("sky_characteristic_code", field("GD1", 5))
            .withColumn("sky_height_dimension", null_if_missing("sky_height_dimension", 99999.0, "sky_height_dimension_qc"))

            # GE1 / SKY Condition Observation (page 9 and 57 NOAA)
            .withColumn("sky_convective_cloud_attribute", number("GE1", 0))
            .withColumn("sky_vertical_datum_attribute", field("GE1", 1))
            .withColumn("sky_base_height_upper_range_attribute", field("GE1", 2))
            .withColumn("sky_base_height_lower_range_attribute", field("GE1", 3))
            # GE1 has no quality code: the vertical datum's own missing marker is used as the flag.
            .withColumn("sky_convective_cloud_attribute", null_if_missing("sky_convective_cloud_attribute", 9.0, "sky_vertical_datum_attribute", flag_values=("999999",)))

            # GF1 -- not very useful, lots of missing values

            # IA1 Ground Surface Observation (page 76 NOAA)
            .withColumn("ground_observation_code", field("IA1", 0))
            .withColumn("ground_observation_qc", field("IA1", 1))

            # AJ1 Snow Depth Identifier (page 21 NOAA)
            .withColumn("snow_depth_dimension", number("AJ1", 0))
            .withColumn("snow_depth_condition_code", field("AJ1", 1))
            .withColumn("snow_depth_qc", field("AJ1", 2))
            .withColumn("snow_depth_equivalent_water_depth", number("AJ1", 3))
            .withColumn("snow_depth_water_condition_code", field("AJ1", 4))
            .withColumn("snow_depth_water_qc", field("AJ1", 5))
            .withColumn("snow_depth_dimension", null_if_missing("snow_depth_dimension", 9999.0, "snow_depth_qc"))
            # Six-character field: missing is 999999, and 9999 is a legitimate reading.
            .withColumn("snow_depth_equivalent_water_depth", null_if_missing("snow_depth_equivalent_water_depth", 999999.0, "snow_depth_water_qc"))

            # AT1 Daily Present Weather Observation (page 27 NOAA)
            .withColumn("weather_obs_source_element", field("AT1", 0))
            .withColumn("weather_obs_weather_type_num", field("AT1", 1))
            .withColumn("weather_obs_weather_type_abb", field("AT1", 2))
            .withColumn("weather_obs_qc", field("AT1", 3))
            .drop("WND", "CIG", "VIS", "TMP", "DEW", "SLP", "GA1", "GE1", "GD1", "AA1", "AJ1", "AT1", "IA1", "MA1")
    )

In [0]:
# Narrow the raw weather data to the columns of interest and US-only records.
# NOTE(review): the filtering logic lives in transform_weather_data, which is defined
# outside this cell -- confirm it still applies the US-only restriction.
df_weather_filtered = transform_weather_data(df_weather, weather_columns_of_interest) 
# Print the surviving column names as a quick check of the filter result.
print(df_weather_filtered.columns)

In [0]:
# Split the packed NOAA fields into individual columns, then lower-case every
# column name so later cells can reference them with a single naming convention.
#
# Only the final (renamed) DataFrame is persisted. Persisting the intermediate
# result as well would cache the same data twice, and because the name is
# reassigned immediately the first cached handle would be lost and could never
# be unpersisted.
df_weather_split = split_weather_data_features(df_weather_filtered)
df_weather_split = df_weather_split.select([col(c).alias(c.lower()) for c in df_weather_split.columns]).persist()

In [0]:
# Register the split weather DataFrame as the temp view "weather" so it can be
# queried by name through spark.sql.
df_weather_split.createOrReplaceTempView("weather")

In [0]:
# Row count of the split weather data. count() is a Spark action, so running it
# forces the DataFrame to actually be computed.
print(df_weather_split.count())

In [0]:
# Attach each weather observation to the airport(s) linked to its station.
# The INNER JOIN keeps only rows whose weather.station matches a station_id in
# the join_airport_stations view; the REPARTITION(1000) hint asks Spark to
# repartition the query result into 1000 partitions.
# NOTE(review): join_airport_stations is a view registered outside this cell --
# presumably the airport <-> neighbouring-station mapping; confirm.
qs_join_weather_distinct_airport_stations = '''
    SELECT 
        /*+ REPARTITION(1000) */
        
        join_airport_stations.neighbor_call AS airport_icao,
        join_airport_stations.distance_to_neighbor AS airport_distance_to_weather_station, 
        join_airport_stations.rank_neighbor AS rank,        
        
        weather.*
    FROM 
        join_airport_stations
    INNER JOIN weather ON join_airport_stations.station_id = weather.station
'''

# Run the query and mark the result for caching; the following cells reuse it
# several times (shape check, temp view, parquet write, display).
df_weather_join_final = spark.sql(qs_join_weather_distinct_airport_stations).persist()

In [0]:
# (row count, column count) of the joined data. sparkShape calls count(), which
# is a Spark action and therefore triggers execution of the join.
print(sparkShape(df_weather_join_final))

In [0]:

# Register the joined airport/weather DataFrame as the temp view "weather_final"
# so it can be queried by name through spark.sql.
df_weather_join_final.createOrReplaceTempView("weather_final")

### Summary

In [0]:
# Checkpoint final dataset: distinct airport <--> station <--> weather.
# Written as Parquet to <blob_url>/weather_2020; mode("overwrite") replaces any
# output that already exists at that path.
df_weather_join_final.write.mode("overwrite").parquet(f"{blob_url}/weather_2020")

In [0]:
# Render the joined dataset in the notebook for a visual sanity check.
display(df_weather_join_final)

airport_icao,airport_distance_to_weather_station,rank,station,country,date,latitude,longitude,report_type,call_sign,fl_date,wind_directional_angle,wind_directional_qc,wind_directional_type_code,wind_directional_speed_rate,wind_directional_speed_qc,sky_ceiling_height_dimension,sky_ceiling_qc,sky_ceiling_determination_code,sky_ceiling_cavok_code,vis_distance,vis_distance_qc,vis_variability_code,vis_variability_qc,air_temperature,air_temperature_qc,dew_point_temperature,dew_point_qc,sea_level_pressure,sea_level_pressure_qc,lp_period_qty,lp_depth_dimension,lp_condition_code,lp_quality_code,ap_altimeter_setting_rate,ap_altimeter_qc,ap_station_pressure_rate,ap_station_pressure_qc,sky_coverage_code,sky_coverage_code_2,sky_coverage_qc,sky_height_dimension,sky_height_dimension_qc,sky_characteristic_code,sky_convective_cloud_attribute,sky_vertical_datum_attribute,sky_base_height_upper_range_attribute,sky_base_height_lower_range_attribute,ground_observation_code,ground_observation_qc,snow_depth_dimension,snow_depth_condition_code,snow_depth_qc,snow_depth_equivalent_water_depth,snow_depth_water_condition_code,snow_depth_water_qc,weather_obs_source_element,weather_obs_weather_type_num,weather_obs_weather_type_abb,weather_obs_qc
KMSY,15.312351544587084,2,72231553917,US,2020-05-08T04:53:00.000+0000,30.0494,-90.0288,FM-15,KNEW,2020-05-08,170.0,5,N,36.0,5,22000.0,5,9,N,16093.0,5,N,5,189.0,5,122.0,5,10182.0,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
KAVL,0.0,1,72315003812,US,2020-05-08T21:54:00.000+0000,35.4319,-82.5375,FM-15,KAVL,2020-05-08,,9,C,0.0,5,1433.0,5,M,N,16093.0,5,N,5,111.0,5,78.0,5,10094.0,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
KTYR,27.330878084360556,2,72204253978,US,2020-05-08T06:55:00.000+0000,32.74222,-95.49639,FM-15,KJDD,2020-05-08,190.0,5,N,72.0,5,22000.0,5,9,N,16093.0,5,N,5,220.0,C,170.0,C,,9,,,,,,,,,00,99,1.0,99.0,9.0,99.0,,,,,,,,,,,,,,,,
KDHN,0.0,1,72226813839,US,2020-05-08T07:53:00.000+0000,31.3167,-85.45,FM-15,KDHN,2020-05-08,,9,C,0.0,5,22000.0,5,9,N,16093.0,5,N,5,117.0,5,78.0,5,10189.0,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
KDCA,9.2703189319455,2,74594013705,US,2020-05-08T19:32:00.000+0000,38.81667,-76.86667,FM-15,KADW,2020-05-08,240.0,5,N,51.0,5,22000.0,5,9,N,16093.0,5,N,5,170.0,5,40.0,5,10054.0,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
KJAX,15.061774703960443,2,A0002363890,US,2020-05-08T19:56:00.000+0000,30.35028,-81.88306,FM-15,KNEN,2020-05-08,220.0,5,N,51.0,5,22000.0,5,9,N,,9,N,5,267.0,5,0.0,5,10226.0,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
KBHM,0.0,1,72228013876,US,2020-05-08T17:53:00.000+0000,33.56556,-86.745,FM-15,KBHM,2020-05-08,180.0,5,N,51.0,5,7620.0,5,M,N,16093.0,5,N,5,161.0,5,122.0,5,10123.0,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
KHIB,16.149680020319913,2,72747404923,US,2020-05-08T00:35:00.000+0000,47.42417,-92.4975,FM-15,KEVM,2020-05-08,340.0,5,N,57.0,5,22000.0,5,9,N,16093.0,5,N,5,60.0,5,-105.0,5,,9,,,,,,,,,04,99,5.0,99.0,9.0,99.0,,,,,,,,,,,,,,,,
KBTR,23.350849959673425,2,72096700457,US,2020-05-08T00:16:00.000+0000,30.7183,-91.4786,FM-15,KHZR,2020-05-08,140.0,5,N,31.0,5,22000.0,5,9,N,16093.0,5,N,5,230.0,5,110.0,5,,9,,,,,,,,,00,99,1.0,99.0,9.0,99.0,,,,,,,,,,,,,,,,
KATY,44.88681042290992,2,72751504982,US,2020-05-08T13:55:00.000+0000,45.30556,-96.42417,FM-15,KVVV,2020-05-08,330.0,5,N,51.0,5,22000.0,5,9,N,16093.0,5,N,5,40.0,C,-30.0,C,,9,,,,,,,,,10261,5,9860.0,5.0,,,,,,,,,,,,,,,,,,
