# Airline delays 
## Bureau of Transportation Statistics
https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236   
https://www.bts.gov/topics/airlines-and-airports/understanding-reporting-causes-flight-delays-and-cancellations

2015 - 2019

### Additional sources
This might be useful in matching station codes to airports:
1. http://dss.ucar.edu/datasets/ds353.4/inventories/station-list.html
2. https://www.world-airport-codes.com/

In [0]:
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType
from pyspark.sql import SQLContext
from pyspark.sql.functions import concat, col, hour, minute, lpad, rpad, substring, year, month, dayofmonth, lit, to_timestamp, expr,split,explode,split
from pyspark.sql.functions import isnan, when, count, col,isnull

import numpy as np
#spark = SparkSession.builder.getOrCreate()

sqlContext = SQLContext(sc)


# Weather
https://data.nodc.noaa.gov/cgi-bin/iso?id=gov.noaa.ncdc:C00532

## EDA on weather data
As a frequent flyer, we know that flight departures (and arrivals)  often get affected by weather conditions, so it makes sense to collect and process weather data corresponding to the origin and destination airports at the time of departure and arrival respectively and build features based upon this data. A weather table  has been pre-downloaded from the National Oceanic and Atmospheric Administration repository  to S3 in the form of  parquet files (thereby enabling pushdown querying and efficient joins). The weather data is for the period Jan 2015 – December 2019.

In [0]:
# ORIGINAL WEATHER DATA
weather = spark.read.option("header", "true")\
                    .parquet(f"dbfs:/mnt/mids-w261/datasets_final_project/weather_data/*.parquet")

# Adding specific date columns I need for JOIN
weather = weather.select(year(col("DATE")).alias("YEAR"), month(col("DATE")).alias("MONTH"), dayofmonth(col("DATE")).alias("DAY_OF_MONTH"), concat(rpad(lpad(hour(col("DATE")), 2, '0'), 4, '0'), lit('-'), lpad(hour(col("DATE")), 2, '0'), lit('59')).alias('HOUR_BLOCK'), *weather)

# Filter out all weather observations that don't align between station begin and end

display(weather)

YEAR,MONTH,DAY_OF_MONTH,HOUR_BLOCK,STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,WND,CIG,VIS,TMP,DEW,SLP,AW1,GA1,GA2,GA3,GA4,GE1,GF1,KA1,KA2,MA1,MD1,MW1,MW2,OC1,OD1,OD2,REM,EQD,AW2,AX4,GD1,AW5,GN1,AJ1,AW3,MK1,KA4,GG3,AN1,RH1,AU5,HL1,OB1,AT8,AW7,AZ1,CH1,RH3,GK1,IB1,AX1,CT1,AK1,CN2,OE1,MW5,AO1,KA3,AA3,CR1,CF2,KB2,GM1,AT5,AY2,MW6,MG1,AH6,AU2,GD2,AW4,MF1,AA1,AH2,AH3,OE3,AT6,AL2,AL3,AX5,IB2,AI3,CV3,WA1,GH1,KF1,CU2,CT3,SA1,AU1,KD2,AI5,GO1,GD3,CG3,AI1,AL1,AW6,MW4,AX6,CV1,ME1,KC2,CN1,UA1,GD5,UG2,AT3,AT4,GJ1,MV1,GA5,CT2,CG2,ED1,AE1,CO1,KE1,KB1,AI4,MW3,KG2,AA2,AX2,AY1,RH2,OE2,CU3,MH1,AM1,AU4,GA6,KG1,AU3,AT7,KD1,GL1,IA1,GG2,OD3,UG1,CB1,AI6,CI1,CV2,AZ2,AD1,AH1,WD1,AA4,KC1,IA2,CF3,AI2,AT1,GD4,AX3,AH4,KB3,CU1,CN4,AT2,CG1,CF1,GG1,MV2,CW1,GG4,AB1,AH5,CN3
2016,1,1,0000-0059,7650099999,2016-01-01T00:00:00.000+0000,4,43.435555,5.213611,22.55,"PROVENCE, FR",FM-12,99999,V020,"190,1,N,0015,1","99999,9,9,N",7000199,1011,901,102551,,"99,9,+02250,1,99,9",,,,"9,AGL ,+99999,+99999",08991999999022501999999,,,999999102161,"8,1,004,1,+999,9",611.0,,,39900261999.0,,SYN09807650 04857 81903 10101 20090 30216 40255 58004 69901 761// 333 4/000 69907 90710 91105 555 69905=,,,,,,,99991999999999.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3000021.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6000021.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2016,1,1,0000-0059,7650099999,2016-01-01T00:00:00.000+0000,4,43.435555,5.213611,22.55,"PROVENCE, FR",FM-15,99999,V020,"190,1,N,0015,1","22000,1,9,N",9000199,1001,901,999999,611.0,,,,,,00991999999999999999999,,,102501999999,,,,,,,MET057METAR LFML 010000Z AUTO 19003KT 9000 -RA NSC 10/09 Q1025=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2016,1,1,0000-0059,7650099999,2016-01-01T00:30:00.000+0000,4,43.435555,5.213611,22.55,"PROVENCE, FR",FM-15,99999,V020,"250,1,N,0010,1","99999,9,9,N",8000199,99999,99999,999999,,,,,,,,,,102501999999,,,,,,,MET056METAR LFML 010030Z AUTO 25002KT 8000 ///TCU 10/09 Q1025=,Q019 2ATOD,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2016,1,1,0100-0159,7650099999,2016-01-01T01:00:00.000+0000,4,43.435555,5.213611,22.55,"PROVENCE, FR",FM-12,99999,V020,"999,9,C,0000,1","99999,9,9,N",4900199,991,941,102511,,"99,9,+02250,1,99,9",,,,"9,AGL ,+99999,+99999",08991999999022501999999,,,999999102121,"8,1,006,1,+999,9",101.0,,,39900151999.0,,SYN07607650 24849 80000 10099 20094 30212 40251 58006 710// 333 69925 90710 91103=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1000231.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2016,1,1,0100-0159,7650099999,2016-01-01T01:00:00.000+0000,4,43.435555,5.213611,22.55,"PROVENCE, FR",FM-15,99999,V020,"999,9,C,0000,1","22000,1,9,N",7000199,1001,901,999999,,,,,,,00991999999999999999999,,,102501999999,,,,,,,MET053METAR LFML 010100Z AUTO 00000KT 7000 NSC 10/09 Q1025=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2016,1,1,0100-0159,7650099999,2016-01-01T01:30:00.000+0000,4,43.435555,5.213611,22.55,"PROVENCE, FR",FM-15,99999,V020,"999,9,C,0000,1","22000,1,9,N",9000199,1001,901,999999,,,,,,,00991999999999999999999,,,102501999999,,,,,,,MET053METAR LFML 010130Z AUTO 00000KT 9000 NSC 10/09 Q1025=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2016,1,1,0200-0259,7650099999,2016-01-01T02:00:00.000+0000,4,43.435555,5.213611,22.55,"PROVENCE, FR",FM-12,99999,V020,"999,9,C,0000,1","99999,9,9,N",11000199,1001,881,102541,,"99,9,+01750,1,99,9",,,,"9,AGL ,+99999,+99999",08991999999017501999999,,,999999102151,"5,1,005,1,+999,9",1.0,,,39900101999.0,,SYN07607650 24761 80000 10100 20088 30215 40254 55005 700// 333 60005 90710 91102=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1000091.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2016,1,1,0200-0259,7650099999,2016-01-01T02:00:00.000+0000,4,43.435555,5.213611,22.55,"PROVENCE, FR",FM-15,99999,V020,"999,9,C,0000,1","99999,9,9,Y",999999999,1001,901,999999,,,,,,,,,,102501999999,,,,,,,MET050METAR LFML 010200Z AUTO 00000KT CAVOK 10/09 Q1025=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2016,1,1,0200-0259,7650099999,2016-01-01T02:30:00.000+0000,4,43.435555,5.213611,22.55,"PROVENCE, FR",FM-15,99999,V020,"320,1,N,0015,1","22000,1,9,N",8000199,1001,901,999999,,,,,,,00991999999999999999999,,,102501999999,,,,,,,MET053METAR LFML 010230Z AUTO 32003KT 8000 NSC 10/09 Q1025=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2016,1,1,0300-0359,7650099999,2016-01-01T03:00:00.000+0000,4,43.435555,5.213611,22.55,"PROVENCE, FR",FM-12,99999,V020,"040,1,N,0010,1","01500,1,9,N",6000199,991,931,102531,,"07,1,+01500,1,06,1",,,,"9,AGL ,+99999,+99999",07991071999015001999999,,,999999102141,"6,1,002,1,+999,9",,,,39900211999.0,,SYN09207650 22756 70402 10099 20093 30214 40253 56002 875// 333 69927 87650 90710 91104 555 60005=,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3000231.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [0]:
weather.groupby('YEAR').count().display()

YEAR,count
2015,117057365
2017,127044487
2016,126167653
2018,127793669
2019,132841262


In [0]:
#stations is a csv file that contains the details of all the weather stations and the associated 'lat' and longitude
stations = spark.read.option("header", "true").csv("dbfs:/mnt/mids-w261/DEMO8/gsod/stations.csv.gz")

cleaned_weather = weather.join(stations, [concat(col("usaf"), col("wban")) == weather.STATION, col('country') == "US"]).filter(weather.DATE.between(to_timestamp(stations.begin, 'yyyyMMdd'), to_timestamp(stations.end, 'yyyyMMdd') + expr("INTERVAL 24 hours"))).select(*weather,stations.country)

cleaned_weather.display()

YEAR,MONTH,DAY_OF_MONTH,HOUR_BLOCK,STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,WND,CIG,VIS,TMP,DEW,SLP,AW1,GA1,GA2,GA3,GA4,GE1,GF1,KA1,KA2,MA1,MD1,MW1,MW2,OC1,OD1,OD2,REM,EQD,AW2,AX4,GD1,AW5,GN1,AJ1,AW3,MK1,KA4,GG3,AN1,RH1,AU5,HL1,OB1,AT8,AW7,AZ1,CH1,RH3,GK1,IB1,AX1,CT1,AK1,CN2,OE1,MW5,AO1,KA3,AA3,CR1,CF2,KB2,GM1,AT5,AY2,MW6,MG1,AH6,AU2,GD2,AW4,MF1,AA1,AH2,AH3,OE3,AT6,AL2,AL3,AX5,IB2,AI3,CV3,WA1,GH1,KF1,CU2,CT3,SA1,AU1,KD2,AI5,GO1,GD3,CG3,AI1,AL1,AW6,MW4,AX6,CV1,ME1,KC2,CN1,UA1,GD5,UG2,AT3,AT4,GJ1,MV1,GA5,CT2,CG2,ED1,AE1,CO1,KE1,KB1,AI4,MW3,KG2,AA2,AX2,AY1,RH2,OE2,CU3,MH1,AM1,AU4,GA6,KG1,AU3,AT7,KD1,GL1,IA1,GG2,OD3,UG1,CB1,AI6,CI1,CV2,AZ2,AD1,AH1,WD1,AA4,KC1,IA2,CF3,AI2,AT1,GD4,AX3,AH4,KB3,CU1,CN4,AT2,CG1,CF1,GG1,MV2,CW1,GG4,AB1,AH5,CN3,country
2016,1,1,0000-0059,70305226652,2016-01-01T00:13:00.000+0000,4.0,61.53639,-160.34139,16.8,"KALSKAG AIRPORT, AK US",FM-16,99999,V020,"230,1,N,0036,1","00091,1,9,N",999999999,1,-101,999999,,,,,,"9,AGL ,+99999,+99999",99999999999001521999999,,,,,,,,,,MET079SPECI PALG 010013Z AUTO 23007KT OVC005 00/M01 RMK AO2 PWINO TSNO CIG 003V009 $=,,,,"4,99,1,+00152,1,1",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,US
2016,1,1,0000-0059,70305226652,2016-01-01T00:27:00.000+0000,4.0,61.53639,-160.34139,16.8,"KALSKAG AIRPORT, AK US",FM-16,99999,V020,"999,9,V,0026,1","00122,1,9,N",999999999,1,-101,999999,,,,,,"9,AGL ,+99999,+99999",99999999999002131999999,,,,,,,,,,MET079SPECI PALG 010027Z AUTO VRB05KT OVC007 00/M01 RMK AO2 PWINO TSNO CIG 004V009 $=,,,,"4,99,1,+00213,1,1",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,US
2016,1,1,0000-0059,70305226652,2016-01-01T00:56:00.000+0000,4.0,61.53639,-160.34139,16.8,"KALSKAG AIRPORT, AK US",FM-15,99999,V020,"220,1,N,0036,1","00183,1,9,N",999999999,1,-51,999999,,,,,,"9,AGL ,+99999,+99999",99999999999002741999999,,,,,,,,,,MET102METAR PALG 010056Z AUTO 22007KT BKN009 OVC014 00/M01 RMK AO2 PWINO TSNO T00001005 CIG 006V013 SLPNO $=,,,,"3,99,1,+00274,1,1",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"4,99,1,+00427,1,9",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,US
2016,1,1,0100-0159,70305226652,2016-01-01T01:19:00.000+0000,4.0,61.53639,-160.34139,16.8,"KALSKAG AIRPORT, AK US",FM-16,99999,V020,"230,1,N,0021,1","00274,1,9,N",999999999,101,-101,999999,,,,,,"9,AGL ,+99999,+99999",99999999999003351999999,,,,,,,,,,MET079SPECI PALG 010119Z AUTO 23004KT OVC011 01/M01 RMK AO2 PWINO TSNO CIG 009V014 $=,,,,"4,99,1,+00335,1,1",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,US
2016,1,1,0100-0159,70305226652,2016-01-01T01:56:00.000+0000,4.0,61.53639,-160.34139,16.8,"KALSKAG AIRPORT, AK US",FM-15,99999,V020,"210,1,N,0015,1","00427,1,9,N",999999999,51,-111,999999,,,,,,"9,AGL ,+99999,+99999",99999999999002741999999,,,,,,,,,,MET097METAR PALG 010156Z AUTO 21003KT FEW009 BKN014 OVC023 01/M01 RMK AO2 PWINO TSNO T00051011 SLPNO $=,,,,"1,99,1,+00274,1,9",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"3,99,1,+00427,1,9",,,,,,,,,,,,,,,,,,,,,,,,"4,99,1,+00701,1,9",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,US
2016,1,1,0200-0259,70305226652,2016-01-01T02:08:00.000+0000,4.0,61.53639,-160.34139,16.8,"KALSKAG AIRPORT, AK US",FM-16,99999,V020,"999,9,C,0000,1","00610,1,9,N",999999999,101,-101,999999,,,,,,"9,AGL ,+99999,+99999",99999999999004271999999,,,,,,,,,,MET081SPECI PALG 010208Z AUTO 00000KT FEW014 BKN020 OVC026 01/M01 RMK AO2 PWINO TSNO $=,,,,"1,99,1,+00427,1,9",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"3,99,1,+00610,1,9",,,,,,,,,,,,,,,,,,,,,,,,"4,99,1,+00792,1,9",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,US
2016,1,1,0200-0259,70305226652,2016-01-01T02:56:00.000+0000,4.0,61.53639,-160.34139,16.8,"KALSKAG AIRPORT, AK US",FM-15,99999,V020,"110,1,N,0015,1","00549,1,9,N",999999999,1,-51,999999,,,,,,"9,AGL ,+99999,+99999",99999999999000911999999,,,,,,,,,,MET113METAR PALG 010256Z AUTO 11003KT FEW003 BKN018 OVC025 00/M01 RMK AO2 PWINO TSNO 5//// T00001005 BKN V SCT SLPNO $=,,,,"1,99,1,+00091,1,9",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"3,99,1,+00549,1,9",,,,,,,,,,,,,,,,,,,,,,,,"4,99,1,+00762,1,9",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,US
2016,1,1,0300-0359,70305226652,2016-01-01T03:56:00.000+0000,4.0,61.53639,-160.34139,16.8,"KALSKAG AIRPORT, AK US",FM-15,99999,V020,"999,9,V,0015,1","00853,1,9,N",999999999,51,-111,999999,,,,,,"9,AGL ,+99999,+99999",99999999999008531999999,,,,,,,,,,MET083METAR PALG 010356Z AUTO VRB03KT OVC028 01/M01 RMK AO2 PWINO TSNO T00051011 SLPNO $=,,,,"4,99,1,+00853,1,9",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,US
2016,1,1,0400-0459,70305226652,2016-01-01T04:56:00.000+0000,4.0,61.53639,-160.34139,16.8,"KALSKAG AIRPORT, AK US",FM-15,99999,V020,"999,9,V,0021,1","00945,1,9,N",999999999,51,-161,999999,,,,,,"9,AGL ,+99999,+99999",99999999999009451999999,,,,,,,,,,MET083METAR PALG 010456Z AUTO VRB04KT OVC031 01/M02 RMK AO2 PWINO TSNO T00051016 SLPNO $=,,,,"4,99,1,+00945,1,9",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,US
2016,1,1,0500-0559,70305226652,2016-01-01T05:56:00.000+0000,4.0,61.53639,-160.34139,16.8,"KALSKAG AIRPORT, AK US",FM-15,99999,V020,"999,9,C,0000,1","01067,1,9,N",999999999,111,-221,999999,,,,,,"9,AGL ,+99999,+99999",99999999999010671999999,"060,M,+0011,1","060,N,+0000,1",,,,,,,,MET101METAR PALG 010556Z AUTO 00000KT BKN035 01/M02 RMK AO2 PWINO TSNO 5//// T00111022 10011 20000 SLPNO $=,,,,"3,99,1,+01067,1,9",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,US


In [0]:
# we'll remove the rows from 2019 in this dataframe and treat 2019 separately as our test data
cleaned_weather = cleaned_weather.filter(cleaned_weather["YEAR"]!=2019)

In [0]:
cleaned_weather[['YEAR']].distinct().show()

In [0]:
weather.registerTempTable('weather')
stations.registerTempTable('stations')

cleaned_weather_19 = spark.sql("""
SELECT weather.*, stations.country
FROM weather, stations
WHERE weather.YEAR==2019 and concat(stations.usaf, stations.wban) == weather.STATION AND stations.country == 'US' """)
display(cleaned_weather_19)
cleaned_weather_19.count()

YEAR,MONTH,DAY_OF_MONTH,HOUR_BLOCK,STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,WND,CIG,VIS,TMP,DEW,SLP,AW1,GA1,GA2,GA3,GA4,GE1,GF1,KA1,KA2,MA1,MD1,MW1,MW2,OC1,OD1,OD2,REM,EQD,AW2,AX4,GD1,AW5,GN1,AJ1,AW3,MK1,KA4,GG3,AN1,RH1,AU5,HL1,OB1,AT8,AW7,AZ1,CH1,RH3,GK1,IB1,AX1,CT1,AK1,CN2,OE1,MW5,AO1,KA3,AA3,CR1,CF2,KB2,GM1,AT5,AY2,MW6,MG1,AH6,AU2,GD2,AW4,MF1,AA1,AH2,AH3,OE3,AT6,AL2,AL3,AX5,IB2,AI3,CV3,WA1,GH1,KF1,CU2,CT3,SA1,AU1,KD2,AI5,GO1,GD3,CG3,AI1,AL1,AW6,MW4,AX6,CV1,ME1,KC2,CN1,UA1,GD5,UG2,AT3,AT4,GJ1,MV1,GA5,CT2,CG2,ED1,AE1,CO1,KE1,KB1,AI4,MW3,KG2,AA2,AX2,AY1,RH2,OE2,CU3,MH1,AM1,AU4,GA6,KG1,AU3,AT7,KD1,GL1,IA1,GG2,OD3,UG1,CB1,AI6,CI1,CV2,AZ2,AD1,AH1,WD1,AA4,KC1,IA2,CF3,AI2,AT1,GD4,AX3,AH4,KB3,CU1,CN4,AT2,CG1,CF1,GG1,MV2,CW1,GG4,AB1,AH5,CN3,country
2019,1,1,0000-0059,99999994081,2019-01-01T00:00:00.000+0000,,45.516,-103.3017,878.7,"BUFFALO 13 ESE, SD US",CRN05,99999,V020,"999,9,R,0027,1","99999,9,9,N",999999999,-1521,99999,999999,,,,,,,,"010,M,-0147,1","010,N,-0154,1",,,,,,,,,,,,,,,,,,,,,,,,0600050109999000100109999990,,,,"05,-0153,1,0,0670,1,0",,,,,-15210.0,,"-0135,1,0,-0160,1,0,00,1,0",,,5000091.0,,,262210.0,81910.0,,,,,,,,,,,,1000091.0,,,,,,,,-15210000310.0,,"-0153,1,0,9999,9,0,-0147,1,0,9999,9,0",,0000010000001000000100000010,-1511.0,999990000210.0,-15210.0,,,,,,,198610.0,,,,,,"-0154,1,0,9999,9,0,-0147,1,0,9999,9,0",,,1.291001251001161e+16,,,,,,,,,-15210.0,198910.0,,,"99,-07",,,,,,,,,,,999990000210.0,,,,,,,,,,,,,,"05,+00000,1,0",,"-0155,1,0,-0148,1,0,00002,1,0,00018,1,0","-0154,1,0,9999,9,0,-0147,1,0,9999,9,0",,,,,,,,79210.0,,,,,,,999990000210.0,,,198010.0,81310.0,,,11660102913010.0,,,,149981019736010.0,US
2019,1,1,0000-0059,99999994081,2019-01-01T00:05:00.000+0000,,45.516,-103.3017,878.7,"BUFFALO 13 ESE, SD US",CRN05,99999,V020,9999999999,"99999,9,9,N",999999999,-1531,99999,999999,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"05,-0153,1,0,0669,1,0",,,,,-15310.0,,,,,5000091.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-15210.0,,,,,,,198610.0,,,,,,,,,,,,,,,,,,-15310.0,198910.0,,,"99,-07",,,,,,,,,,,,,,,,,,,,,,,,,"05,+00000,1,0",,,,,,,,,,,,,,,,,,,,,198010.0,,,,11650102913010.0,,,,,US
2019,1,1,0000-0059,99999994081,2019-01-01T00:10:00.000+0000,,45.516,-103.3017,878.7,"BUFFALO 13 ESE, SD US",CRN05,99999,V020,9999999999,"99999,9,9,N",999999999,-1531,99999,999999,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"05,-0154,1,0,0674,1,0",,,,,-15310.0,,,,,5000091.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-15310.0,,,,,,,198610.0,,,,,,,,,,,,,,,,,,-15310.0,198910.0,,,"99,-07",,,,,,,,,,,,,,,,,,,,,,,,,"05,+00000,1,0",,,,,,,,,,,,,,,,,,,,,197910.0,,,,11650102912010.0,,,,,US
2019,1,1,0000-0059,99999994081,2019-01-01T00:15:00.000+0000,,45.516,-103.3017,878.7,"BUFFALO 13 ESE, SD US",CRN05,99999,V020,9999999999,"99999,9,9,N",999999999,-1541,99999,999999,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"05,-0154,1,0,0674,1,0",,,,,-15410.0,,,,,5000091.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-15310.0,,,,,,,198610.0,,,,,,,,,,,,,,,,,,-15410.0,198910.0,,,"99,-07",,,,,,,,,,,,,,,,,,,,,,,,,"05,+00000,1,0",,,,,,,,,,,,,,,,,,,,,197910.0,,,,11640102912010.0,,,,,US
2019,1,1,0000-0059,99999994081,2019-01-01T00:20:00.000+0000,,45.516,-103.3017,878.7,"BUFFALO 13 ESE, SD US",CRN05,99999,V020,9999999999,"99999,9,9,N",999999999,-1541,99999,999999,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"05,-0155,1,0,0671,1,0",,,,,-15410.0,,,,,5000091.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-15410.0,,,,,,,198610.0,,,,,,,,,,,,,,,,,,-15510.0,198910.0,,,"99,-07",,,,,,,,,,,,,,,,,,,,,,,,,"05,+00000,1,0",,,,,,,,,,,,,,,,,,,,,197910.0,,,,360102911010.0,,,,,US
2019,1,1,0000-0059,99999994081,2019-01-01T00:25:00.000+0000,,45.516,-103.3017,878.7,"BUFFALO 13 ESE, SD US",CRN05,99999,V020,9999999999,"99999,9,9,N",999999999,-1551,99999,999999,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"05,-0156,1,0,0669,1,0",,,,,-15510.0,,,,,5000091.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-15410.0,,,,,,,198610.0,,,,,,,,,,,,,,,,,,-15510.0,198910.0,,,"99,-07",,,,,,,,,,,,,,,,,,,,,,,,,"05,+00000,1,0",,,,,,,,,,,,,,,,,,,,,197910.0,,,,452102912010.0,,,,,US
2019,1,1,0000-0059,99999994081,2019-01-01T00:30:00.000+0000,,45.516,-103.3017,878.7,"BUFFALO 13 ESE, SD US",CRN05,99999,V020,9999999999,"99999,9,9,N",999999999,-1551,99999,999999,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"05,-0156,1,0,0669,1,0",,,,,-15510.0,,,,,5000091.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-15410.0,,,,,,,198610.0,,,,,,,,,,,,,,,,,,-15510.0,198910.0,,,"99,-07",,,,,,,,,,,,,,,,,,,,,,,,,"05,+00000,1,0",,,,,,,,,,,,,,,,,,,,,197910.0,,,,11630102912010.0,,,,,US
2019,1,1,0000-0059,99999994081,2019-01-01T00:35:00.000+0000,,45.516,-103.3017,878.7,"BUFFALO 13 ESE, SD US",CRN05,99999,V020,9999999999,"99999,9,9,N",999999999,-1561,99999,999999,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"05,-0157,1,0,0675,1,0",,,,,-15610.0,,,,,5000091.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-15510.0,,,,,,,198610.0,,,,,,,,,,,,,,,,,,-15610.0,198910.0,,,"99,-07",,,,,,,,,,,,,,,,,,,,,,,,,"05,+00000,1,0",,,,,,,,,,,,,,,,,,,,,197910.0,,,,11620102912010.0,,,,,US
2019,1,1,0000-0059,99999994081,2019-01-01T00:40:00.000+0000,,45.516,-103.3017,878.7,"BUFFALO 13 ESE, SD US",CRN05,99999,V020,9999999999,"99999,9,9,N",999999999,-1571,99999,999999,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"05,-0158,1,0,0685,1,0",,,,,-15710.0,,,,,5000091.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-15710.0,,,,,,,198610.0,,,,,,,,,,,,,,,,,,-15710.0,198910.0,,,"99,-07",,,,,,,,,,,,,,,,,,,,,,,,,"05,+00000,1,0",,,,,,,,,,,,,,,,,,,,,197910.0,,,,11620102912010.0,,,,,US
2019,1,1,0000-0059,99999994081,2019-01-01T00:45:00.000+0000,,45.516,-103.3017,878.7,"BUFFALO 13 ESE, SD US",CRN05,99999,V020,9999999999,"99999,9,9,N",999999999,-1581,99999,999999,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"05,-0159,1,0,0695,1,0",,,,,-15810.0,,,,,5000091.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-15810.0,,,,,,,198610.0,,,,,,,,,,,,,,,,,,-15810.0,198910.0,,,"99,-07",,,,,,,,,,,,,,,,,,,,,,,,,"05,+00000,1,0",,,,,,,,,,,,,,,,,,,,,197910.0,,,,11620102912010.0,,,,,US


In [0]:
# The  original weather table has around 230 M records
cleaned_weather.count()

Weather plays a crucial component in scheduling and on-time prediction of the flights.In this section we analyze the data collected at various stations and would try to identify their pattern and distribution.

Goal - We have the following high-level goals in mind:
- Understand the distribution of various variables.
- Understand the pattern with a goal to identify outliers and possible errors in reading(s).

Data :
Weather data is collected from various weather sensors placed at different stations. Each station has a corresponding number and readings from that station.
Let us start by taking an overall statistical summary of the data.

## Preface
The Integrated Surface Dataset (ISD) is composed of worldwide surface weather observations from over 35,000 stations parameters included are: air quality, atmospheric pressure, atmospheric temperature/dew point, atmospheric winds, clouds, precipitation, ocean waves, tides and more. 

We can broadly classify the elements of the dataset into two main categories:
- Mandatory data =>The mandatory data section contains meteorological information on the basic elements such as winds, visibility, and temperature.
These are the most commonly reported parameters and are available most of the time.
- Additional Data Section => These additional data contain information of significance and/or which are received with varying degrees of frequency.

For the purpose of this study :
- We will limit the weather observations to the stations in the US only.
- We'll focus on mandatory data section as the additional data is missing for most of the records.
- We'll ignore the 'remarks section as ,as these are a set of characeters in plain language that do not provide much insight into decision making.

### Analysis of Mandatory data variables.

#### W1.0-Station
This field represents a unique station id and is formed from the concatenation of weather station id and WBAN id. We'll use this identifier to get the weather details at a particular location.

#### W2.0 - Date
This field indicates a timestamp when the reading was taken corresponding to a station.

#### W3.0 Source 
This is categorical data attribute.The flag of a GEOPHYSICAL-POINT-OBSERVATION showing the source or combination of sources used in creating the
observation.
We analyze the distribution of this column below.

### W4.0 Latitude and Longitude information.
This is latitude and longitude co-ordinate of a geo-location.

###W5.0 Report- type
The code that denotes the type of geophysical surface observation.

### W6.0 Elevation
The elevation of a GEOPHYSICAL-POINT-OBSERVATION relative to Mean Sea Level (MSL).

### W7.0 Call_sign
 The identifier that represents the call letters assigned to a FIXED-WEATHER-STATION.
   - Missing values are denoted by 99999

#### W8.0 Quality Control 
 The name of the quality control process applied to a weather observation.
   - V01 = No A or M Quality Control applied
   - V02 = Automated Quality Control
   - V03 = subjected to Quality Control

### Weather observations- readings from various weather stations
The section below describes the actuel readings that are made from the sensors.

#### W9.0 Wind
The wind measurement is a composite field in our observation and consists of the following attributes.

#### W9.1 WIND-OBSERVATION direction angle
The angle(in angular degrees), measured in a clockwise direction, between true north and the direction from which the wind is blowing.

### W9.2 WIND-OBSERVATION direction quality code
The code that denotes a quality status of a reported WIND-OBSERVATION direction angle.
 0 = Passed gross limits check
 1 = Passed all quality control checks
 2 = Suspect
 3 = Erroneous
 4 = Passed gross limits check, data originate from an NCEI data source
 5 = Passed all quality control checks, data originate from an NCEI data source
 6 = Suspect, data originate from an NCEI data source
 7 = Erroneous, data originate from an NCEI data source
 9 = Passed gross limits check if element is present
 
#### W9.2 Filters
   - We'll not use the following values in our analysis:
     - 2 = Suspect
     - 3 = Erroneous
     
### W9.3 WIND-OBSERVATION type code
The code that denotes the character of the WIND-OBSERVATION.


### W9.4 WIND-OBSERVATION speed rate
 The rate of horizontal travel of air past a fixed point.

**NOTE: If a value of 9 appears with a wind speed of 0000, this indicates calm winds.

### W9.5 WIND-OBSERVATION speed quality code
 The code that denotes a quality status of a reported WIND-OBSERVATION speed rate.
- 0 = Passed gross limits check
- 1 = Passed all quality control checks
- 2 = Suspect
- 3 = Erroneous
- 4 = Passed gross limits check, data originate from an NCEI data source
- 5 = Passed all quality control checks, data originate from an NCEI data source
- 6 = Suspect, data originate from an NCEI data source
- 7 = Erroneous, data originate from an NCEI data source
- 9 = Passed gross limits check if element is present
 
#### W9.5 WIND-OBSERVATION speed quality code filters
We'll only retain the observations with the following values in our analysis
- 0 = Passed gross limits check
- 1 = Passed all quality control checks
- 4 = Passed gross limits check, data originate from an NCEI data source
- 5 = Passed all quality control checks, data originate from an NCEI data source
- 9 = Passed gross limits check if element is present

### 10.Sky condition observations
The section below describes the high level description of the sky-condition observations.

### W10.1 SKY-CONDITION-OBSERVATION ceiling height dimension
The height above ground level (AGL) of the lowest cloud or obscuring phenomena layer aloft with 5/8 or more summation total sky
cover, which may be predominantly opaque, or the vertical visibility into a surface-based obstruction.

### W10.2 SKY-CONDTION-OBSERVATION ceiling quality code
 The code that denotes a quality status of a reported ceiling height dimension.
 - 0 = Passed gross limits check
 - 1 = Passed all quality control checks
 - 2 = Suspect
 - 3 = Erroneous
 - 4 = Passed gross limits check, data originate from an NCEI data source
 - 5 = Passed all quality control checks, data originate from an NCEI data source
 - 6 = Suspect, data originate from an NCEI data source
 - 7 = Erroneous, data originate from an NCEI data source
 - 9 = Passed gross limits check if element is present
 
#### W10.2 SKY-CONDTION-OBSERVATION ceiling quality code filters
 - 0 = Passed gross limits check
 - 1 = Passed all quality control checks
 - 4 = Passed gross limits check, data originate from an NCEI data source
 - 5 = Passed all quality control checks, data originate from an NCEI data source
 - 9 = Passed gross limits check if element is present
 
### W10.3 SKY-CONDTION-OBSERVATION ceiling quality code
The code that denotes the method used to determine the ceiling.
 - A = Aircraft
 - B = Balloon
 - C = Statistically derived
 - D = Persistent cirriform ceiling (pre-1950 data)
 - E = Estimated
 - M = Measured
 - P = Precipitation ceiling (pre-1950 data)
 - R = Radar
 - S = ASOS augmented
 - U = Unknown ceiling (pre-1950 data)
 - V = Variable ceiling (pre-1950 data)
 - W = Obscured
 - 9 = Missing

#### W10.3 SKY-CONDTION-OBSERVATION ceiling quality code filter
We'll only use the following attributes in our analysis.
 - A = Aircraft
 - B = Balloon
 - C = Statistically derived
 - E = Estimated
 - M = Measured
 - R = Radar
 - S = ASOS augmented
 - W = Obscured
 
 

### W10.4 SKY-CONDITION-OBSERVATION CAVOK code
 The code that represents whether the 'Ceiling and Visibility Okay' (CAVOK) condition has been reported.
- N = No
- Y = Yes
- 9 = Missing

#### W10.4 SKY-CONDITION-OBSERVATION CAVOK code filter
We'll only use the following in our analysis:
- N= No
- Y = Yes

### Visibility observations
The section below provides a brief overview of the explanatory variables that indicate visibility

### W11.1 VISIBILITY-OBSERVATION distance dimension
The horizontal distance at which an object can be seen and identified.

### W11.2 VISIBILITY-OBSERVATION distance quality code
The horizontal distance at which an object can be seen and identified.
The code that denotes a quality status of a reported distance of a visibility observation.
 - 0 = Passed gross limits check
 - 1 = Passed all quality control checks
 - 4 = Passed gross limits check, data originate from an NCEI data source
 - 5 = Passed all quality control checks, data originate from an NCEI data source
 - 6 = Suspect, data originate from an NCEI data source
 - 7 = Erroneous, data originate from an NCEI data source
 - 9 = Passed gross limits check if element is present

#### W11.1 VISIBILITY-OBSERVATION distance quality code filter
We'll only retain the observations with the following attributes 
 - 0 = Passed gross limits check
 - 1 = Passed all quality control checks
 - 4 = Passed gross limits check, data originate from an NCEI data source
 - 5 = Passed all quality control checks, data originate from an NCEI data source
 - 9 = Passed gross limits check if element is present


### W11.3  VISIBILITY-OBSERVATION variability code
The code that denotes whether or not the reported visibility is variable.
 - N = Not variable
 - V = Variable
 - 9 = Missing

### W11.4 VISIBILITY-OBSERVATION quality variability code
The code that denotes a quality status of a reported VISIBILITY-OBSERVATION variability code.
 - 0 = Passed gross limits check
 - 1 = Passed all quality control checks
 - 2 = Suspect
 - 3 = Erroneous
 - 4 = Passed gross limits check, data originate from an NCEI data source
 - 5 = Passed all quality control checks, data originate from an NCEI data source
 - 6 = Suspect, data originate from an NCEI data source
 - 7 = Erroneous, data originate from an NCEI data source
 - 9 = Passed gross limits check if element is present

#### W11.4  VISIBILITY-OBSERVATION quality variability code filter
We'll only retain the following values in our analysis:
 - 0 = Passed gross limits check
 - 1 = Passed all quality control checks
 - 4 = Passed gross limits check, data originate from an NCEI data source
 - 5 = Passed all quality control checks, data originate from an NCEI data source
 - 9 = Passed gross limits check if element is present

### Air Temperature attributes.
The section below provides a geenral overview of the data attributes for the observations on air temperature.

### W12.1 Air temperature
 The temperature of the air.
 MIN: -0932 MAX: +0618 UNITS: Degrees Celsius
 SCALING FACTOR: 10
### W12.1 Air Temperature filter
Missing values are indicated by +9999 and will be ignored

### W12.2 Air temperature quality code
 The code that denotes a quality status of an AIR-TEMPERATURE-OBSERVATION.
   - 0 = Passed gross limits check
   - 1 = Passed all quality control checks
   - 2 = Suspect
   - 3 = Erroneous
   - 4 = Passed gross limits check, data originate from an NCEI data source
   - 5 = Passed all quality control checks, data originate from an NCEI data source
   - 6 = Suspect, data originate from an NCEI data source
   - 7 = Erroneous, data originate from an NCEI data source
   - 9 = Passed gross limits check if element is present
   - A = Data value flagged as suspect, but accepted as a good value
   - C = Temperature and dew point received from Automated Weather Observing System (AWOS) 
   - I = Data value not originally in data, but inserted by validator
   - M = Manual changes made to value based on information provided by NWS or FAA
   - P = Data value not originally flagged as suspect, but replaced by validator
   - R = Data value replaced with value computed by NCEI software
   - U = Data value replaced with edited value 
#### W12.2 Air temperature quality code filter
We'll only retain the following values
   - 0 = Passed gross limits check
   - 1 = Passed all quality control checks
   - 4 = Passed gross limits check, data originate from an NCEI data source
   - 5 = Passed all quality control checks, data originate from an NCEI data source
   - 9 = Passed gross limits check if element is present
   - A = Data value flagged as suspect, but accepted as a good value
   - C = Temperature and dew point received from Automated Weather Observing System (AWOS) 
   - I = Data value not originally in data, but inserted by validator
   - M = Manual changes made to value based on information provided by NWS or FAA
   - P = Data value not originally flagged as suspect, but replaced by validator
   - R = Data value replaced with value computed by NCEI software
   - U = Data value replaced with edited value

### Dew Point observations
The sectio below describes the key attributes of the dew-point data collected from weather stations.

### W13.1 Dew Point temperature
The temperature in ,Degrees Celsius,to which a given parcel of air must be cooled at constant pressure and water vapor content in order for saturation to occur.
MIN: -0982 MAX: +0368 UNITS: 

### W13.2 Dew point quality code
 The code that denotes a quality status of the reported dew point temperature.
 - 0 = Passed gross limits check
 - 1 = Passed all quality control checks
 - 2 = Suspect
 - 3 = Erroneous
 - 4 = Passed gross limits check, data originate from an NCEI data source
 - 5 = Passed all quality control checks, data originate from an NCEI data source
 - 6 = Suspect, data originate from an NCEI data source
 - 7 = Erroneous, data originate from an NCEI data source
 - 9 = Passed gross limits check if element is present
 - A = Data value flagged as suspect, but accepted as a good value
 - C = Temperature and dew point received from Automated Weather Observing System (AWOS) 
 - I = Data value not originally in data, but inserted by validator
 - M = Manual changes made to value based on information provided by NWS or FAA
 - P = Data value not originally flagged as suspect, but replaced by validator
 - R = Data value replaced with value computed by NCEI software
 - U = Data value replaced with edited value
#### W13.2 Dew point quality code filter
 We'll retain the following values for our analysis
 - 0 = Passed gross limits check
 - 1 = Passed all quality control checks
 - 4 = Passed gross limits check, data originate from an NCEI data source
 - 5 = Passed all quality control checks, data originate from an NCEI data source
 - 9 = Passed gross limits check if element is present
 - A = Data value flagged as suspect, but accepted as a good value
 - C = Temperature and dew point received from Automated Weather Observing System (AWOS) 
 - I = Data value not originally in data, but inserted by validator
 - M = Manual changes made to value based on information provided by NWS or FAA
 - P = Data value not originally flagged as suspect, but replaced by validator
 - R = Data value replaced with value computed by NCEI software
 - U = Data value replaced with edited value

### Atmospheric Pressure
The section below describes the various data observations on atmospheric pressure.

#### W14.1 Sea level pressure 
The air pressure relative to Mean Sea Level (MSL).
#### W14.1 Sea level pressure filter
Missing values are denoted by 99999 and wo;; be ignored

#### W14.1 Sea level pressure quality code
 The code that denotes a quality status of the sea level pressure of an observation
 - 0 = Passed gross limits check
 - 1 = Passed all quality control checks
 - 2 = Suspect
 - 3 = Erroneous
 - 4 = Passed gross limits check, data originate from an NCEI data source
 - 5 = Passed all quality control checks, data originate from an NCEI data source
 - 6 = Suspect, data originate from an NCEI data source
 - 7 = Erroneous, data originate from an NCEI data sou
 - 9 = Passed gross limits check if element is present
 
#### W14.2 Sea level pressure quality code filter
 We'll only retain the following observations.
 - 0 = Passed gross limits check
 - 1 = Passed all quality control checks
 - 4 = Passed gross limits check, data originate from an NCEI data source
 - 5 = Passed all quality control checks, data originate from an NCEI data source
 - 9 = Passed gross limits check if element is present

### Additional Data section
As mentioned in the section above the 'additional data' corresponds to the attributes which are observed in addition to the mandatory observations. We present our analysis below on this additional data.

In [0]:
all_cols = cleaned_weather.columns

  
  

In [0]:

# df.orderBy(["age","city"],ascending=[0,1]).collect()
#[cleaned_weather.groupBy("AW1").count().orderBy(["count","AW1"],ascending = False).collect()][0]

In [0]:
# vals = []
# for col in (range(20,len(all_cols))):
#   print(cleaned_weather[col])
#   vals.append(cleaned_weather.groupBy(cleaned_weather[col]).count().orderBy(["count",cleaned_weather[col]],ascending = False).take(1))
#   #print(vals.append(cleaned_weather.groupBy(cleaned_weather[col]).count().orderBy(["count",cleaned_weather[col]],ascending = False).take(1)))


In [0]:
# for i in vals:
#   print(f'\n {i}')
  
  

### Dropping columns with NA values
As we have seen in the previous section the columns that represent 'Additional Data' in the weather table has a lot of NA values. 
We'll remove these columns and work with the columns that indicate 'Mandatory weather variables'

In [0]:
cols_list = cleaned_weather.columns

In [0]:
# getting an index of all the columns
for i in range(0,len(cols_list)):
  print(f'i = {i} and col = {cols_list[i]}')
  
  

In [0]:
# Now lets drop the columns
temp_df = cleaned_weather
temp_df_new = cleaned_weather

cleaned_weather_trimmed = cleaned_weather

for i in range(20,len(cols_list)):
  print(f'ith column = {cols_list[i]}')
  temp_df_new = temp_df.drop(cols_list[i])
  cleaned_weather_trimmed = temp_df_new 
  temp_df = temp_df_new
  
#clean_weather_trimmed = temp_df_new
  
  
  
  

### Column split
Now that we've removed the unwanted columns we will focus on individual weather attibutes(WND etc.).
These observations related to the weather are grouped in a single column and we want to split it into multiple columns.

In the section that follows , we aim to wplit each combined column into multiple ones to continue our analysis.

#### Split 1- The Wind attributes.
The wind column has been combined to indicate the following attribute values of the measurement related to the wind:
- W1=>WIND-OBSERVATION direction angle,the angle, measured in a clockwise direction, between true north and the direction from which the wind is blowing.
 MIN: 001 MAX: 360 UNITS: Angular Degrees
 SCALING FACTOR: 1
 999 = Missing. If type code (below) = V, then 999 indicates variable wind direction.
 
- W2=> WIND-OBSERVATION direction quality code.The code that denotes a quality status of a reported WIND-OBSERVATION direction angle.

- W3=> WIND-OBSERVATION type code.The code that denotes the character of the WIND-OBSERVATION.
  - '9' indicates that the data is missing.

- W4=> WIND-OBSERVATION speed rate. The rate of horizontal travel of air past a fixed point.
 UNITS: meters per second
 SCALING FACTOR: 10.
 - 9999 indicates missing values
 
 - W5 =>WIND-OBSERVATION speed quality code. The code that denotes a quality status of a reported WIND-OBSERVATION speed rate.
 
 We'll now create additional columns in our "trimmed_data_frame" to represent these values as separate columns.
 
 - WND_Angle = WIND-OBSERVATION direction angle
 - WND_Qlty = WIND-OBSERVATION direction quality code
 - WND_Obs = WIND-OBSERVATION type code
 - WND_Speed = WIND-OBSERVATION speed rate
 - WND_Speed_Qlty = WIND-OBSERVATION speed quality code

In [0]:
#Let us now try to append these columns to the data frame
#df_concat = df_1.union(df_2)
from functools import reduce



#clean_weather_trimmed_split = clean_weather_trimmed.union(df_split)

In [0]:
# The code below splits the comma separated column values into individual columns.
#- WND_Angle = WIND-OBSERVATION direction angle
 #- WND_Qlty = WIND-OBSERVATION direction quality code
 #- WND_Obs = WIND-OBSERVATION type code
 #- WND_Speed = WIND-OBSERVATION speed rate
 #- WND_Speed_Qlty = WIND-OBSERVATION speed quality code
#weather_split = weather_split.withColumn('WIND_DIR', split_col.getItem(0).cast(IntegerType()))

split_col = f.split(cleaned_weather['WND'], ',')
cleaned_weather = cleaned_weather.withColumn('WND_Angle', split_col.getItem(0).cast(IntegerType()))
cleaned_weather = cleaned_weather.withColumn('WND_Qlty', split_col.getItem(1))
cleaned_weather = cleaned_weather.withColumn('WND_Obs', split_col.getItem(2))
cleaned_weather = cleaned_weather.withColumn('WND_Speed', split_col.getItem(3).cast(IntegerType()))
cleaned_weather = cleaned_weather.withColumn('WND_Speed_Qlty', split_col.getItem(4))
#df = df.withColumn('NAME2', split_col.getItem(1))

In [0]:
split_col = f.split(cleaned_weather['WND'], ',')
cleaned_weather_19 = cleaned_weather_19.withColumn('WND_Angle', split_col.getItem(0).cast(IntegerType()))
cleaned_weather_19 = cleaned_weather_19.withColumn('WND_Qlty', split_col.getItem(1))
cleaned_weather_19 = cleaned_weather_19.withColumn('WND_Obs', split_col.getItem(2))
cleaned_weather_19 = cleaned_weather_19.withColumn('WND_Speed', split_col.getItem(3).cast(IntegerType()))
cleaned_weather_19 = cleaned_weather_19.withColumn('WND_Speed_Qlty', split_col.getItem(4))

#### Split 2- The Sky Conditions

This column provides more information about the conditions of the sky.
We'll split the data into the following columns.

In [0]:
# We'll now split the CGI obervations into muliple columns

split_col = f.split(cleaned_weather['CIG'], ',')
cleaned_weather = cleaned_weather.withColumn('CIG_Height', split_col.getItem(0).cast(IntegerType()))
cleaned_weather = cleaned_weather.withColumn('CIG_Qlty', split_col.getItem(1))
cleaned_weather = cleaned_weather.withColumn('CIG_Ceiling', split_col.getItem(2))
cleaned_weather = cleaned_weather.withColumn('CIG_CAVOK', split_col.getItem(3))

#df = df.withColumn('NAME2', split_col.getItem(1))

In [0]:
split_col = f.split(cleaned_weather['CIG'], ',')
cleaned_weather_19 = cleaned_weather_19.withColumn('CIG_Height', split_col.getItem(0).cast(IntegerType()))
cleaned_weather_19 = cleaned_weather_19.withColumn('CIG_Qlty', split_col.getItem(1))
cleaned_weather_19 = cleaned_weather_19.withColumn('CIG_Ceiling', split_col.getItem(2))
cleaned_weather_19 = cleaned_weather_19.withColumn('CIG_CAVOK', split_col.getItem(3))

#### Split 3- The Visibility Conditions

This column provides more information about the visibility conditions.

In [0]:
#We will now split the column into muliple cols

split_col = f.split(cleaned_weather['VIS'], ',')
cleaned_weather = cleaned_weather.withColumn('VIS_Dis', split_col.getItem(0).cast(IntegerType()))
cleaned_weather = cleaned_weather.withColumn('VIS_Qlty', split_col.getItem(1))
cleaned_weather = cleaned_weather.withColumn('VIS_Var', split_col.getItem(2))
cleaned_weather = cleaned_weather.withColumn('VIS_Var_Qlty', split_col.getItem(3))



In [0]:
split_col = f.split(cleaned_weather['VIS'], ',')
cleaned_weather_19 = cleaned_weather_19.withColumn('VIS_Dis', split_col.getItem(0).cast(IntegerType()))
cleaned_weather_19 = cleaned_weather_19.withColumn('VIS_Qlty', split_col.getItem(1))
cleaned_weather_19 = cleaned_weather_19.withColumn('VIS_Var', split_col.getItem(2))
cleaned_weather_19 = cleaned_weather_19.withColumn('VIS_Var_Qlty', split_col.getItem(3))

#### Split 4- The Temperature Observations

This column provides more information about the temperature conditions.

In [0]:

split_col = f.split(cleaned_weather['TMP'], ',')
cleaned_weather = cleaned_weather.withColumn('TMP_Degree', split_col.getItem(0).cast(IntegerType()))
cleaned_weather = cleaned_weather.withColumn('TMP_Qlty', split_col.getItem(1))


In [0]:
split_col = f.split(cleaned_weather['TMP'], ',')
cleaned_weather_19 = cleaned_weather_19.withColumn('TMP_Degree', split_col.getItem(0).cast(IntegerType()))
cleaned_weather_19 = cleaned_weather_19.withColumn('TMP_Qlty', split_col.getItem(1))

#### Split 5- The Dew Observations

These attributes indicate the Dew observations of the air

In [0]:
#SPlitting the DEW column


split_col = f.split(cleaned_weather['DEW'], ',')
cleaned_weather = cleaned_weather.withColumn('DEW_Degree', split_col.getItem(0).cast(IntegerType()))
cleaned_weather = cleaned_weather.withColumn('DEW_Qlty', split_col.getItem(1))

In [0]:
split_col = f.split(cleaned_weather['DEW'], ',')
cleaned_weather_19 = cleaned_weather_19.withColumn('DEW_Degree', split_col.getItem(0).cast(IntegerType()))
cleaned_weather_19 = cleaned_weather_19.withColumn('DEW_Qlty', split_col.getItem(1))

#### Split 6- The Pressure Observations

These attributes indicate the atmospheric pressure measurements.

In [0]:
split_col = f.split(cleaned_weather['SLP'], ',')
cleaned_weather = cleaned_weather.withColumn('SLP_Pressure', split_col.getItem(0).cast(IntegerType()))
cleaned_weather = cleaned_weather.withColumn('SLP_Qlty', split_col.getItem(1))

In [0]:
split_col = f.split(cleaned_weather['SLP'], ',')
cleaned_weather_19 = cleaned_weather_19.withColumn('SLP_Pressure', split_col.getItem(0).cast(IntegerType()))
cleaned_weather_19 = cleaned_weather_19.withColumn('SLP_Qlty', split_col.getItem(1))

##### Now our dataframe has all the columns that are split into individual values.WE'll now get rid of the aggregated columns - WND, SLP,VIS and TMP.

#### Let's now get rid of the redundant columns

In [0]:
cleaned_weather_final = cleaned_weather.drop("WND","CIG","VIS","TMP","DEW","SLP")
cleaned_weather_19_final = cleaned_weather_19.drop("WND","CIG","VIS","TMP","DEW","SLP")

In [0]:
#ANalyzing the cols

for i in range(10,len(cleaned_weather_final.columns)):
  print(cleaned_weather_final[i])
  #print(cleaned_weather_final.groupBy(cleaned_weather_final[i]).count().orderBy(["count",cleaned_weather_final[i]],ascending = False))
  
        
  

#for col in (range(20,len(all_cols))):
#   print(cleaned_weather[col])
#   vals.append(cleaned_weather.groupBy(cleaned_weather[col]).count().orderBy(["count",cleaned_weather[col]],ascending = False).take(1))
#   #print(vals.append(cleaned_weather.groupBy(cleaned_weather[col]).count().orderBy(["count",cleaned_weather[col]],ascending = False).take(1)))

## Analysis of Weather Attributes.
Now that we have finalized our data frame , let's analyze the Individueal features.

#### Section 1 - Analysis of "Wind"

###### 1.1 Wind Angle -  The angle, measured in a clockwise direction, between true north and the direction from which the wind is blowing.

- Type = Quantitative
- Range = 1 - 360
- Missing  = 999

In [0]:
#Lets look at the statistics of WND_Angle
cleaned_weather_final[['WND_Angle']].describe().show()


In [0]:
#Analysis of Missing values

WND_Angle_NA =cleaned_weather_final.filter(cleaned_weather_final["WND_Angle"] == 999)

In [0]:
WND_Angle_present =cleaned_weather_final.filter(cleaned_weather_final["WND_Angle"] != 999)

###### The plot below shows the average angle of WIND

In [0]:
WND_Angle_present[['YEAR','WND_Angle']].display()

YEAR,WND_Angle
2016,230
2016,220
2016,230
2016,210
2016,110
2016,90
2016,90
2016,90
2016,100
2016,90


###### Missing values of WND_ANGLE over the years

In [0]:
WND_Angle_NA[['YEAR','WND_Angle']].display()

YEAR,WND_Angle
2016,999
2016,999
2016,999
2016,999
2016,999
2016,999
2016,999
2016,999
2016,999
2016,999


##### Observation:
- The number of missing values for WND_ANGLE are roughly the same each year.

In [0]:
all_cols = cleaned_weather_final.columns

#### 1.2 Wind Qlty -The code that denotes a quality status of a reported WIND-OBSERVATION direction angle.
- Type = Qualitative
- Range = 0-9
- 0 = Passed gross limits check
- 1 = Passed all quality control checks
- 2 = Suspect
- 3 = Erroneous
- 4 = Passed gross limits check, data originate from an NCEI data source
- 5 = Passed all quality control checks, data originate from an NCEI data source
- 6 = Suspect, data originate from an NCEI data source
- 7 = Erroneous, data originate from an NCEI data source
- 9 = Passed gross limits check if element is present

In [0]:
#cleaned_weather_final.groupBy(cleaned_weather_final['WND_Qlty']).count().orderBy(["count",cleaned_weather_final['WND_Qlty']],ascending = False)

cleaned_weather_final[['WND_Qlty','YEAR']].groupBy("WND_Qlty","YEAR").count().display()

WND_Qlty,YEAR,count
1,2018,6178703
9,2018,25513781
5,2018,25511728
9,2017,24122839
5,2017,25613925
1,2017,4116496
U,2018,125
9,2016,24621467
1,2016,3772786
5,2016,26188116


###### Observation:
Wind Quality:
-Our domain for this variable is 0-9 and  'Quality status' = 'U' , 'A' , 'P' are unknown values that we should drop.

###### 1.3 Wind_Obs- The code that denotes the character of the WIND-OBSERVATION.

Type = Qualitative
- Missing Values = 9
- A = Abridged Beaufort
- B = Beaufort
- C = Calm
- H = 5-Minute Average Speed
- N = Normal
- R = 60-Minute Average Speed
- Q = Squall
- T = 180 Minute Average Speed
- V = Variable

In [0]:
cleaned_weather_final[['WND_Obs']].groupBy("WND_Obs").count().display()

WND_Obs,count
V,5602149
C,33342921
N,122394772
9,60135747
R,4899686
H,3658606


##### Observation:
Wind_Obs:
-Almost 30% of the the values are unknown and should be analyzed further/dropped.

#### 1.4 Wind_Speed
- (Scaling factor = 10)
- Units = m/sec
- Type = Quantitative
- Scaling = 10
- Missing Values = 99999

In [0]:
WND_Speed_NA = cleaned_weather_final.filter(cleaned_weather_final["WND_Speed"] != 99999)

### The Bar chart below shows average Wind speed over the years

In [0]:
WND_Speed_NA[['WND_Speed','YEAR']].display()

WND_Speed,YEAR
36,2016
26,2016
36,2016
21,2016
15,2016
0,2016
15,2016
15,2016
21,2016
0,2016


In [0]:
cleaned_weather_final[['WND_Speed']].describe().show()

In [0]:
#Analysis of missing values

WND_NA = cleaned_weather_final.filter(cleaned_weather_final["WND_Speed"] == 9999)

### count of Missing values for WND_SPEED over the years

In [0]:
WND_NA[['WND_Speed','YEAR']].display()

##### Observation:
Wind_Speed:
- Almost 6M records have missing values.
- The number is same across the years.

###### 1.4 Wind_Speed_Qlty  The code that denotes a quality status of a reported WIND-OBSERVATION speed rate.

Type = Qualitative

- 0 = Passed gross limits check
- 1 = Passed all quality control checks
- 2 = Suspect
- 3 = Erroneous
- 4 = Passed gross limits check, data originate from an NCEI data source
- 5 = Passed all quality control checks, data originate from an NCEI data source
- 6 = Suspect, data originate from an NCEI data source
- 7 = Erroneous, data originate from an NCEI data source
- 9 = Passed gross limits check if element is present

In [0]:
cleaned_weather_final[['WND_Speed_Qlty','YEAR']].groupBy("WND_Speed_Qlty",'YEAR').count().display()

WND_Speed_Qlty,YEAR,count
9,2016,14811047
9,2019,2469988
5,2016,34050200
9,2017,14358821
9,2018,15112765
5,2017,33361515
6,2016,7491
6,2019,1091
6,2017,6667
5,2018,33243163


##### Observation:
Wind Speed Quality:
-Our domain for this variable is 0-9 and  'Quality status' = 'U' , 'A' , 'P' , 'I' are unknown values that we should drop.

#### Section 2 - Analysis of "SKY Conditions"

###### 2.1 CIG_Height.
The height above ground level (AGL) of the lowest cloud or obscuring phenomena layer , which may be predominantly opaque, or the vertical visibility into a surface-based obstruction.
- Type = Quantitative
- Range = 0-22000
- Missing  = 99999

In [0]:
cleaned_weather_final[['CIG_Height']].describe().show()

In [0]:
# Lets Check the number of missing values
cleaned_weather_final.filter(cleaned_weather_final["CIG_Height"] == 99999).count()

In [0]:
CIG_Height_present = cleaned_weather_final.filter(cleaned_weather_final["CIG_Height"] != 99999)

### The bar chart below displays the average of lowest obscuring phenomena over the years

In [0]:
CIG_Height_present[['CIG_Height','YEAR']].display()

CIG_Height,YEAR
91,2016
122,2016
183,2016
274,2016
427,2016
610,2016
549,2016
853,2016
945,2016
1067,2016


###### Observation:
CIG_Height:
- We have around 75878410 records that are missing.
- The vertical visibility does not change much on an average.

###### 2.2 CIG_Qlty
The code that denotes a quality status of a reported ceiling height dimension.

- Type = Qualitative
- 0 = Passed gross limits check
- 1 = Passed all quality control checks
- 2 = Suspect
- 3 = Erroneous
- 4 = Passed gross limits check, data originate from an NCEI data source
- 5 = Passed all quality control checks, data originate from an NCEI data source
- 6 = Suspect, data originate from an NCEI data source
- 7 = Erroneous, data originate from an NCEI data source
- 9 = Passed gross limits check if element is present

In [0]:
cleaned_weather_final[['CIG_Qlty','YEAR']].groupBy("CIG_Qlty","YEAR").count().display()

CIG_Qlty,YEAR,count
1,2018,4311387
7,2018,107996
9,2018,19664789
5,2018,33109520
6,2018,10709
7,2015,93932
1,2015,2429997
6,2015,6103
9,2015,17644213
5,2015,34495669


###### Observation:
CIG_Height:
- This seems to be an important factor as the value is of 'reasonable' quality for all the records

###### 2.3 CIG_Ceiling 
 The code that denotes the method used to determine the ceiling.

- Type = Qualitative
- Missing = 9
- Measured = 'M'
- Obscures = 'W'

In [0]:
cleaned_weather_final[['CIG_Ceiling']].groupBy("CIG_Ceiling").count().display()

CIG_Ceiling,count
M,53676401
C,69328
W,1360256
9,165204206


##### Observation:
CIG_Ceiling:
- Almost 171M values are missing and should be dropped.

#### 2.4 CIG_CAVOK 
 The code that represents whether the 'Ceiling and Visibility Okay' (CAVOK) condition has been reported.

- Type = Qualitative
- Missing = 9

In [0]:
cleaned_weather_final[['CIG_CAVOK']].groupBy("CIG_CAVOK").count().display()

CIG_CAVOK,count
N,217790362
9,2519829


##### Observation:
CIG_CAVOK:
- Intesresting to note that there are no values for which CAVOK is 'Y'

#### Section 3 - Analysis of "Visibility Conditions"

###### 3.1 VIS_Dis
 The horizontal distance at which an object can be seen and identified.
- Units = Meters
- Type = Quantitative
- Range = 0-160000
- Missing  = 999999

In [0]:
# Lets take a count of missing values
cleaned_weather_final.filter(cleaned_weather_final["VIS_Dis"] == 999999).count()

##### Observation:
VIS_DIS:
- 69770868 missing values

In [0]:
VIS_DIS_present = cleaned_weather_final.filter(cleaned_weather_final["VIS_DIS"] != 999999)

###### Plot- Avg Visibility over years

In [0]:
VIS_DIS_present[['VIS_DIS','YEAR']].display()

VIS_DIS,YEAR
16093,2016
16093,2016
16093,2016
16093,2016
16093,2016
16093,2016
16093,2016
16093,2016
16093,2016
16093,2016


#### 3.2 VIS_Qlty
 The code that denotes a quality status of a reported distance of a visibility observation.

- Type = Qualitative
- Range = 0-9
- 0 = Passed gross limits check
-  1 = Passed all quality control checks
- 2 = Suspect
- 3 = Erroneous
- 4 Passed gross limits check, data originate from an NCEI data source
- 5 = Passed all quality control checks, data originate from an NCEI data source
- 6 = Suspect, data originate from an NCEI data source-  7 = Erroneous, data originate from an NCEI data source
- 9 = Passed gross limits check if element is present

In [0]:
cleaned_weather_final[['VIS_Qlty']].groupBy("VIS_Qlty").count().display()

VIS_Qlty,count
7,1096651
5,140361452
6,155321
A,59423
9,71721490
1,16619889
P,19653
I,2


##### Observation:
VIS_QLTY:
- The values 'I','P','A' are outside our domain and should be dropped

###### 3.3 VIS_Var
The code that denotes whether or not the reported visibility is variable.


- Type = Qualitative
- Range = N,V,9

In [0]:
cleaned_weather_final[['VIS_Var','YEAR']].groupBy("VIS_Var",'YEAR').count().display()

VIS_Var,YEAR,count
9,2016,20122624
N,2016,34284852
V,2016,175019
N,2017,33554127
N,2018,33477701
V,2017,196136
9,2017,20103118
9,2018,23546096
V,2018,180604
V,2015,182940


#### 3.4 VIS_Var_Qlty
- Type = Qualitative
- Range = 0-9

In [0]:
cleaned_weather_final[['VIS_Var_Qlty']].groupBy("VIS_Var_Qlty").count().show()

##### Observation:
VIS_QLTY:
- The value A' is outside our domain and should be dropped
- We shuld also further analyze the data values which are missing

#### Section 4 - Analysis of "Temperature"

###### 4.1 TMP_Degree
The temperature of the air in Degree Celcius

- Type = Quantitaive
- Scaling = 10
- Missing  =9999

In [0]:
cleaned_weather_final.filter(cleaned_weather_final["TMP_Degree"] == 9999).count()

In [0]:
TMP_Degree_NA = cleaned_weather_final.filter(cleaned_weather_final["TMP_Degree"] == 9999)

###### Count of Missing values for TMP over the Years

In [0]:
TMP_Degree_NA[['TMP_Degree','YEAR']].display()

TMP_Degree,YEAR
9999,2016
9999,2016
9999,2016
9999,2016
9999,2016
9999,2016
9999,2016
9999,2016
9999,2016
9999,2016


In [0]:
TMP_Degree_Not_NA = cleaned_weather_final.filter(cleaned_weather_final["TMP_Degree"] != 9999)

###### Avg TMP over the years

In [0]:
TMP_Degree_Not_NA[['TMP_Degree','YEAR']].display()

TMP_Degree,YEAR
0,2016
0,2016
0,2016
10,2016
5,2016
10,2016
0,2016
5,2016
5,2016
11,2016


###### Observations :
- The average temp for 2018 is marginally less(for those recs that have a value)

###### 4.2 TMP_Qlty
The code that denotes a quality status of an AIR-TEMPERATURE-OBSERVATION.

- Type = Qualitative
- Missing  = 9999

In [0]:
cleaned_weather_final[['TMP_Qlty']].groupBy("TMP_Qlty").count().show()

#### Section 5.1 - Dew_Degree
- Quantitative
- MIN: -0982 MAX: +0368
- UNITS: Degrees Celsius
- SCALING FACTOR: 10
- 9999 = Missing.

In [0]:
#Let us find out the missing values
cleaned_weather_final.filter(cleaned_weather_final["DEW_Degree"] == 9999).count()

#### Section 5.2 - Dew_Qlty
- MIN: -0982 MAX: +0368
- UNITS: Degrees Celsius
- SCALING FACTOR: 10
- 9999 = Missing.

In [0]:
cleaned_weather_final[['DEW_Qlty']].groupBy("DEW_Qlty").count().show()

#### Section 6 - Analysis of "Atmospheric pressure"

#### Section 6.1 - SLP_Pressure
The air pressure relative to Mean Sea Level (MSL).
- MIN: 08600 
- MAX: 10900
- UNITS: Hectopascals
- SCALING FACTOR: 10
- 99999 = Missing.

In [0]:
# Let;s analyze the missing values
cleaned_weather_final.filter(cleaned_weather_final["SLP_Pressure"] == 99999).count()

In [0]:
cleaned_weather_final[['SLP_Qlty']].groupBy("SLP_Qlty").count().show()

#### Next, Lets drop all the missing values represented by 9's in the dataset.

In [0]:
cleaned_weather_final.columns


In [0]:
#Let's remove missing values for WND_Angle
##df.filter(df["age"]>24).show()
cleaned_weather_final_no_missing_vals = cleaned_weather_final.filter((cleaned_weather_final['WND_Angle'] != 999) | (cleaned_weather_final['WND_Obs'] != '9') | (cleaned_weather_final['WND_Speed'] != 9999) | (cleaned_weather_final['CIG_Height'] != 99999)|(cleaned_weather_final['CIG_CAVOK'] != '9') | (cleaned_weather_final['VIS_Dis'] != 999999)|(cleaned_weather_final['VIS_Var'] != '9') |(cleaned_weather_final['TMP_Degree'] != 9999) | (cleaned_weather_final['DEW_Degree'] != 9999) |(cleaned_weather_final['SLP_Pressure'] != 99999)|(cleaned_weather_final['REPORT_TYPE'] == 'FM-15'))


In [0]:
cleaned_weather_19_final.columns

In [0]:
cleaned_weather_final_no_missing_vals_19 = cleaned_weather_19_final.filter((cleaned_weather_19_final['WND_Angle'] != 999) | (cleaned_weather_19_final['WND_Obs'] != '9') | (cleaned_weather_19_final['WND_Speed'] != 9999) | (cleaned_weather_19_final['CIG_Height'] != 99999)|(cleaned_weather_19_final['CIG_CAVOK'] != '9') | (cleaned_weather_19_final['VIS_Dis'] != 999999)|(cleaned_weather_19_final['VIS_Var'] != '9') |(cleaned_weather_19_final['TMP_Degree'] != 9999) | (cleaned_weather_19_final['DEW_Degree'] != 9999) |(cleaned_weather_19_final['SLP_Pressure'] != 99999)|(cleaned_weather_19_final['REPORT_TYPE'] == 'FM-15'))


In [0]:
cleaned_weather_final_no_missing_vals.count()

In [0]:
cleaned_weather_final_no_missing_vals.count()

In [0]:
cleaned_weather_final_no_missing_vals_19.count()

In [0]:
weather_final = cleaned_weather_final_no_missing_vals
weather_final_19 = cleaned_weather_final_no_missing_vals_19

In [0]:
#weather_final= cleaned_weather_final_no_missing_vals.drop("CIG_Ceiling")

In [0]:
weather_final= cleaned_weather_final_no_missing_vals.drop("CIG_Ceiling")

In [0]:
weather_final.count()

In [0]:
weather_final.write.parquet("dbfs:/mnt/mids-w261/team20SSDK/strategy/weather_all_columns")

In [0]:
weather_final_19.write.parquet("dbfs:/mnt/mids-w261/team20SSDK/strategy/weather_19_all_columns")

In [0]:
missing_vals_train_val = cleaned_weather_final.filter((cleaned_weather_final['WND_Angle'] == 999) | (cleaned_weather_final['WND_Obs'] == '9') | (cleaned_weather_final['WND_Speed'] == 9999) | (cleaned_weather_final['CIG_Height'] == 99999)|(cleaned_weather_final['CIG_CAVOK'] == '9') | (cleaned_weather_final['VIS_Dis'] == 999999)|(cleaned_weather_final['VIS_Var'] == '9') |(cleaned_weather_final['TMP_Degree'] == 9999) | (cleaned_weather_final['DEW_Degree'] == 9999) |(cleaned_weather_final['SLP_Pressure'] == 99999)|(cleaned_weather_final['REPORT_TYPE'] == 'FM-15'))

In [0]:
missing_vals_19 = cleaned_weather_19_final.filter((cleaned_weather_19_final['WND_Angle'] == 999) | (cleaned_weather_19_final['WND_Obs'] == '9') | (cleaned_weather_19_final['WND_Speed'] == 9999) | (cleaned_weather_19_final['CIG_Height'] == 99999)|(cleaned_weather_19_final['CIG_CAVOK'] == '9') | (cleaned_weather_19_final['VIS_Dis'] == 999999)|(cleaned_weather_19_final['VIS_Var'] == '9') |(cleaned_weather_19_final['TMP_Degree'] == 9999) | (cleaned_weather_19_final['DEW_Degree'] == 9999) |(cleaned_weather_19_final['SLP_Pressure'] == 99999)|(cleaned_weather_19_final['REPORT_TYPE'] == 'FM-15'))

In [0]:
missing_vals_train_val[['WND_Angle','WND_Obs','WND_Speed','CIG_Height','VIS_Dis','TMP_Degree','DEW_Degree','SLP_Pressure','YEAR']].display()

WND_Angle,WND_Obs,WND_Speed,CIG_Height,VIS_Dis,TMP_Degree,DEW_Degree,SLP_Pressure,YEAR
230,N,36,91,999999,0,-10,99999,2016
999,V,26,122,999999,0,-10,99999,2016
220,N,36,183,999999,0,-5,99999,2016
230,N,21,274,999999,10,-10,99999,2016
210,N,15,427,999999,5,-11,99999,2016
999,C,0,610,999999,10,-10,99999,2016
110,N,15,549,999999,0,-5,99999,2016
999,V,15,853,999999,5,-11,99999,2016
999,V,21,945,999999,5,-16,99999,2016
999,C,0,1067,999999,11,-22,99999,2016


###### Observations:
- Intersting to note that 2018 has only missing values in WND_SPEED and SLP_Pressure

In [0]:
missing_vals_19[['WND_Angle','WND_Obs','WND_Speed','CIG_Height','VIS_Dis','TMP_Degree','DEW_Degree','SLP_Pressure','YEAR']].display()

WND_Angle,WND_Obs,WND_Speed,CIG_Height,VIS_Dis,TMP_Degree,DEW_Degree,SLP_Pressure,YEAR
999,R,27,99999,999999,-152,9999,99999,2019
999,9,9999,99999,999999,-153,9999,99999,2019
999,9,9999,99999,999999,-153,9999,99999,2019
999,9,9999,99999,999999,-154,9999,99999,2019
999,9,9999,99999,999999,-154,9999,99999,2019
999,9,9999,99999,999999,-155,9999,99999,2019
999,9,9999,99999,999999,-155,9999,99999,2019
999,9,9999,99999,999999,-156,9999,99999,2019
999,9,9999,99999,999999,-157,9999,99999,2019
999,9,9999,99999,999999,-158,9999,99999,2019


In [0]:
display(dbutils.fs.ls("dbfs:/mnt/mids-w261/team20SSDK/"))