# Flight Delay Project Data : EDA of `WEATHER` and `STATIONS`
## Bureau of Transportation Statistics
https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236   
https://www.bts.gov/topics/airlines-and-airports/understanding-reporting-causes-flight-delays-and-cancellations

Calendar Years: 2015 - 2019

### Additional sources
These might be useful in matching station codes to airports:
1. http://dss.ucar.edu/datasets/ds353.4/inventories/station-list.html
2. https://www.world-airport-codes.com/


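Reference links:
- NOAA ISD format document (weather record layout): https://www.ncei.noaa.gov/data/global-hourly/doc/isd-format-document.pdf
- BTS TranStats download page: https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236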

In this notebook, we evaluate the `WEATHER` and `STATIONS` data only.

# Imports & Helper Functions

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt
import databricks.koalas as ks
from pyspark.sql import SQLContext
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression

def _format_float(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Use the three-decimal formatter for every float pandas displays.
pd.set_option('display.float_format', _format_float)

In [0]:
############################################################################
# Plotting palette
############################################################################
#
# Named hex colours based on the U.C. Berkeley brand palette. The charts in
# this notebook look colours up by name, e.g. berkeley_palette['pacific'].
#
# reference           : https://brand.berkeley.edu/colors/
# secondary reference : https://alumni.berkeley.edu/brand/color-palette
#
# Keys are inserted in the order listed below.
#---------------------------------------------------------------------------

berkeley_palette = dict(
    berkeley_blue='#003262',
    california_gold='#fdb515',
    founders_rock='#3b7ea1',
    medalist='#c4820e',
    bay_fog='#ddd5c7',
    lawrence='#00b0da',
    sather_gate='#b9d3b6',
    pacific='#46535e',
    soybean='#859438',
    south_hall='#6c3302',
    wellman_tile='#D9661F',
    rose_garden='#ee1f60',
    golden_gate='#ed4e33',
    lap_lane='#00a598',
    ion='#cfdd45',
    stone_pine='#584f29',
    grey='#eeeeee',
    web_grey='#888888',
    # alum only colors
    metallic_gold='#BC9B6A',
    california_purple='#5C3160',
)

# `Weather` Data Analysis

In [0]:
# Load the raw weather table into a Koalas DataFrame and report its shape and sparsity.
df = ks.sql("select * from weather_tbl").to_koalas()

# info() writes its summary itself and returns None, so it is called bare;
# wrapping it in print() would add a stray "None" line to the cell output.
df.info()

### Raw data profile:
 - 630,904,436 rows
 - 177 columns
   - `3 x float64`
   - `1 x int16`
   - `172 x object`

### Missing data analysis

In [0]:
# Per-column null counts for the weather data, bucketed into coarse categories.
# NOTE(review): isnull() only counts true nulls; empty strings or sentinel codes
# in the object columns would not be counted here -- confirm this is intended.
missing = df.isnull().sum(axis=0).reset_index()
missing.columns = ['column', 'total_missing']
missing['missing_pct'] = missing['total_missing'] / df.shape[0] * 100
missing = missing.sort_values(by=['total_missing', 'column']).reset_index(drop=True)

# Every column starts in the default bucket and is re-labelled below if it has gaps.
missing['category'] = 'No missing data'

# Half-open bands (lower, upper] of missing percentage and the label for each.
for lower_pct, upper_pct, band_label in [(0.0, 40.0, 'Up to 40% missing data'),
                                         (40.0, 80.0, 'Up to 80% missing data')]:
  in_band = (missing['missing_pct'] > lower_pct) & (missing['missing_pct'] <= upper_pct)
  missing.loc[in_band, 'category'] = band_label
missing.loc[missing['missing_pct'] > 80.0, 'category'] = 'Over 80% missing data'

# Number of columns that fall into each bucket.
cats = missing.groupby('category')['column'].count().to_frame()
cats.columns = ['Category Count']

(
  cats.style
  .bar(color=berkeley_palette['founders_rock'], align='mid')
  .set_caption('Distribution of missing data per column in Weather')
)

Unnamed: 0_level_0,Category Count
category,Unnamed: 1_level_1
No missing data,175
Up to 40% missing data,2


In [0]:
# Bar chart of the weather columns that have at least one missing value.
# x = column names, y = percentage missing, h = absolute missing count (shown on hover).
missing_subset = missing[missing.missing_pct > 0.0].copy().sort_values(by='missing_pct')
x, y, h = missing_subset['column'].to_numpy(),  missing_subset.missing_pct.to_numpy(), missing_subset.total_missing.to_numpy()

# Columns that are more than 80% missing get the darker colour. The loop variable
# is named `pct` so it does not shadow the `x` array above, and the already
# materialised `y` array is reused instead of calling to_numpy() again.
marker_colors = [berkeley_palette['pacific'] if pct > 80.0 else berkeley_palette['lawrence'] for pct in y]

axis_color = berkeley_palette['berkeley_blue']

fig = go.Figure(data=[go.Bar(x=x, y=y, hovertext=h)])
fig.update_traces(marker_color=marker_colors, marker_line_color=axis_color,
                  marker_line_width=1.5, opacity=0.7)
# Axis title fonts are set through `title.font`; the older `titlefont` shortcut is
# deprecated and is rejected by newer plotly releases.
fig.update_layout(title_text='[Weather]: Columns w/ Missing Values', title_font_size = 24,
                  height=450, width=800,
                  yaxis=dict(title=dict(text='% Missing Values', font=dict(size=16)),
                             tickfont_size=14, color=axis_color),
                  xaxis=dict(title=dict(text='Column Names', font=dict(size=16)),
                             color=axis_color),
                  plot_bgcolor='rgba(0,0,0,0)')
fig.show()

In [0]:
# Columns selected for the raw-data preview below.
good_cols = [
    'STATION', 'DATE', 'SOURCE', 'LATITUDE', 'LONGITUDE', 'ELEVATION',
    'NAME', 'REPORT_TYPE', 'CALL_SIGN', 'QUALITY_CONTROL',
    'WND', 'CIG', 'VIS', 'TMP', 'DEW', 'SLP', 'MA1', 'REM',
]

# Cell styling applied to the preview table.
preview_cell_style = {
    'background-color': berkeley_palette['pacific'],
    'color': berkeley_palette['california_gold'],
    'border-color': berkeley_palette['berkeley_blue'],
}

# Show the first five rows of the selected columns as a styled table.
(
    df[good_cols]
    .head(5)
    .style
    .set_properties(**preview_cell_style)
    .set_caption('Raw Weather Data : First 5 Rows (Only Columns with Data)')
)

Unnamed: 0,STATION,DATE,SOURCE,LATITUDE,LONGITUDE,ELEVATION,NAME,REPORT_TYPE,CALL_SIGN,QUALITY_CONTROL,WND,CIG,VIS,TMP,DEW,SLP,MA1,REM
0,47739099999,2015-01-01 00:00:00,4,34.8,138.183333,135.0,"SHIZUOKA AIRPORT, JA",FM-15,99999,V020,"260,1,N,0098,1","99999,9,9,N",9999199,401,-301,999999,100401999999,MET069METAR RJNS 010000Z 26019KT 9999 FEW020 04/M03 Q1004 RMK 1CU020 A2967=
1,47739099999,2015-01-01 00:03:00,4,34.8,138.183333,135.0,"SHIZUOKA AIRPORT, JA",FM-16,99999,V020,"260,1,N,0093,1","99999,9,9,N",9999199,401,-301,999999,100401999999,MET072SPECI RJNS 010003Z 26018G28KT 9999 FEW020 04/M03 Q1004 RMK 1CU020 A2967=
2,47739099999,2015-01-01 01:00:00,4,34.8,138.183333,135.0,"SHIZUOKA AIRPORT, JA",FM-15,99999,V020,"260,1,N,0108,1","99999,9,9,N",9999199,401,-301,999999,100401999999,MET072METAR RJNS 010100Z 26021G32KT 9999 FEW020 04/M03 Q1004 RMK 1CU020 A2967=
3,47739099999,2015-01-01 02:00:00,4,34.8,138.183333,135.0,"SHIZUOKA AIRPORT, JA",FM-15,99999,V020,"260,1,N,0118,1","99999,9,9,N",9999199,501,-601,999999,100301999999,MET069METAR RJNS 010200Z 26023KT 9999 FEW020 05/M06 Q1003 RMK 2CU020 A2964=
4,47739099999,2015-01-01 02:02:00,4,34.8,138.183333,135.0,"SHIZUOKA AIRPORT, JA",FM-16,99999,V020,"260,1,N,0118,1","99999,9,9,N",9999199,501,-501,999999,100301999999,MET072SPECI RJNS 010202Z 26023G33KT 9999 FEW020 05/M05 Q1003 RMK 2CU020 A2964=


In [0]:
# Count the distinct (longitude, latitude) pairs recorded per station and keep
# only the stations that appear with more than one pair.
# NOTE: this rebinds `df`; the raw weather frame loaded earlier is no longer
# reachable through that name after this cell runs.
df = ks.sql(
    "with cte as (select distinct station, longitude, latitude from weather_tbl where station is not null) "
    "select station, count(*) as total from cte group by station having total > 1 order by 2 desc"
).to_koalas()

x, y = df.station.to_numpy(),  df.total.to_numpy()

axis_color = berkeley_palette['berkeley_blue']

# The x axis is declared categorical below, so station codes that consist only
# of digits are drawn as discrete labels rather than positioned as numbers.
# That replaces appending '-' to every code, which leaked into the tick labels.
fig = go.Figure(data=[go.Bar(x=x, y=y, hovertext=y)])
fig.update_traces(marker_color=berkeley_palette['pacific'], marker_line_color=axis_color,
                  marker_line_width=1.5, opacity=0.7)
# Axis title fonts are set through `title.font`; the older `titlefont` shortcut is
# deprecated and is rejected by newer plotly releases.
fig.update_layout(title_text='[Weather]: Stations w/ Multiple Lat/Long Locations', title_font_size = 24,
                  height=650, width=1800,
                  yaxis=dict(title=dict(text='Total (Unique) Lat/Long Locations', font=dict(size=16)),
                             tickfont_size=14, color=axis_color),
                  xaxis=dict(type='category',
                             title=dict(text='Station Code', font=dict(size=16)),
                             color=axis_color, tickangle=90),
                  plot_bgcolor='rgba(0,0,0,0)')
fig.show()

In [0]:
# Remove the `df` name so the frame it refers to is no longer held by this
# notebook's namespace; the STATIONS section below binds `df` again.
del df

# `STATIONS` Data Analysis

In [0]:
# Load the raw stations table into a Koalas DataFrame and report its shape and sparsity.
df = ks.sql("select * from stations_tbl").to_koalas()

# info() writes its summary itself and returns None, so it is called bare;
# wrapping it in print() would add a stray "None" line to the cell output.
df.info()

### Raw data profile:
 - 29,771 rows
 - 11 columns
   - `11 x object`

In [0]:
# Print how many distinct values each stations column holds.
for column_name in df.columns:
  distinct_values = df[column_name].unique()
  print(f"{column_name} - unique values: {len(distinct_values)}")

In [0]:
# Per-column null counts for the stations data, bucketed into coarse categories.
# NOTE(review): isnull() only counts true nulls; empty strings in the object
# columns would not be counted here -- confirm this is intended.
missing = df.isnull().sum(axis=0).reset_index()
missing.columns = ['column', 'total_missing']
missing['missing_pct'] = missing['total_missing'] / df.shape[0] * 100
missing = missing.sort_values(by=['total_missing', 'column']).reset_index(drop=True)

# Every column starts in the default bucket and is re-labelled below if it has gaps.
missing['category'] = 'No missing data'

# Half-open bands (lower, upper] of missing percentage and the label for each.
for lower_pct, upper_pct, band_label in [(0.0, 40.0, 'Up to 40% missing data'),
                                         (40.0, 80.0, 'Up to 80% missing data')]:
  in_band = (missing['missing_pct'] > lower_pct) & (missing['missing_pct'] <= upper_pct)
  missing.loc[in_band, 'category'] = band_label
missing.loc[missing['missing_pct'] > 80.0, 'category'] = 'Over 80% missing data'

# Number of columns that fall into each bucket.
cats = missing.groupby('category')['column'].count().to_frame()
cats.columns = ['Category Count']

(
  cats.style
  .bar(color=berkeley_palette['founders_rock'], align='mid')
  .set_caption('Distribution of missing data per column in Stations')
)

Unnamed: 0_level_0,Category Count
category,Unnamed: 1_level_1
Up to 80% missing data,2
No missing data,4
Up to 40% missing data,5


In [0]:
# Bar chart of the stations columns that have at least one missing value.
# x = column names, y = percentage missing, h = absolute missing count (shown on hover).
missing_subset = missing[missing.missing_pct > 0.0].copy().sort_values(by='missing_pct')
x, y, h = missing_subset['column'].to_numpy(),  missing_subset.missing_pct.to_numpy(), missing_subset.total_missing.to_numpy()

# Columns that are more than 60% missing get the darker colour. The loop variable
# is named `pct` so it does not shadow the `x` array above, and the already
# materialised `y` array is reused instead of calling to_numpy() again.
# NOTE(review): the highlight threshold here is 60%, while the category buckets
# computed for this table split at 40% and 80% -- confirm 60% is intended.
marker_colors = [berkeley_palette['pacific'] if pct > 60.0 else berkeley_palette['lawrence'] for pct in y]

axis_color = berkeley_palette['berkeley_blue']

fig = go.Figure(data=[go.Bar(x=x, y=y, hovertext=h)])
fig.update_traces(marker_color=marker_colors, marker_line_color=axis_color,
                  marker_line_width=1.5, opacity=0.7)
# Axis title fonts are set through `title.font`; the older `titlefont` shortcut is
# deprecated and is rejected by newer plotly releases.
fig.update_layout(title_text='[Stations]: Columns w/ Missing Values', title_font_size = 24,
                  height=450, width=800,
                  yaxis=dict(title=dict(text='% Missing Values', font=dict(size=16)),
                             tickfont_size=14, color=axis_color),
                  xaxis=dict(title=dict(text='Column Names', font=dict(size=16)),
                             color=axis_color),
                  plot_bgcolor='rgba(0,0,0,0)')
fig.show()

In [0]:
# Distribution of station elevation: probability-normalised histogram with a
# violin plot in the margin.
# NOTE(review): the raw data profile lists every STATIONS column as object dtype;
# confirm `elev` is numeric-like before trusting the binning.
stations_pdf = df.to_pandas()

histogram_options = dict(
    x="elev",
    marginal="violin",
    hover_data=df.columns,
    color_discrete_sequence=[berkeley_palette['berkeley_blue']],
    nbins=100,
    opacity=0.7,
    histnorm='probability',
)
fig = px.histogram(stations_pdf, **histogram_options)

fig.update_layout(
    title_text='Station Elevation Distribution',
    height=500,
    width=1000,
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(title="Elevation"),
    yaxis=dict(title='Probability'),
)
fig.show()