# Flight Delay Project Data : EDA of `FLIGHTS`
## Bureau of Transportation Statistics
https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236   
https://www.bts.gov/topics/airlines-and-airports/understanding-reporting-causes-flight-delays-and-cancellations

Calendar Years: 2015 - 2019

### Additional sources
These might be useful in matching station codes to airports:
1. http://dss.ucar.edu/datasets/ds353.4/inventories/station-list.html
2. https://www.world-airport-codes.com/


https://www.ncei.noaa.gov/data/global-hourly/doc/isd-format-document.pdf
https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236

In this notebook, we evaluate the `FLIGHTS` data only.

# Imports & Helper Functions

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt
import databricks.koalas as ks
from pyspark.sql import SQLContext
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression

def _format_float(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Use the three-decimal rendering for every float pandas displays.
pd.set_option('display.float_format', _format_float)

In [0]:
############################################################################
# Plotting Utilities, Constants, Methods for W209 arXiv project
############################################################################

#---------------------------------------------------------------------------
## Plotting Palette
#
# Named hex colors from the U.C. Berkeley brand palette. Plots and styled
# tables in this notebook look colors up by name from this mapping.
# reference : https://brand.berkeley.edu/colors/
# secondary reference : https://alumni.berkeley.edu/brand/color-palette
#---------------------------------------------------------------------------

berkeley_palette = dict(
    berkeley_blue='#003262',
    california_gold='#fdb515',
    founders_rock='#3b7ea1',
    medalist='#c4820e',
    bay_fog='#ddd5c7',
    lawrence='#00b0da',
    sather_gate='#b9d3b6',
    pacific='#46535e',
    soybean='#859438',
    south_hall='#6c3302',
    wellman_tile='#D9661F',
    rose_garden='#ee1f60',
    golden_gate='#ed4e33',
    lap_lane='#00a598',
    ion='#cfdd45',
    stone_pine='#584f29',
    grey='#eeeeee',
    web_grey='#888888',
    # alum only colors
    metallic_gold='#BC9B6A',
    california_purple='#5C3160',
)

# <font color="Red">Flights</font> Data Analysis

In [0]:
# Load the raw flights table into a Koalas DataFrame and print a summary of
# its columns (dtypes and non-null counts) to gauge shape and sparsity.
df = ks.sql("select * from us_delay_flights_tbl").to_koalas()
# info() writes its report straight to stdout and returns None, so it is
# called on its own; wrapping it in print() appends a stray "None" line.
df.info()

### Raw data profile:
 - 31,746,841 rows
 - 109 columns
   - `32 x float32`
   - `39 x int32`
   - `38 x object`

### Missing data analysis

In [0]:
# Build one row per column holding the count and percentage of missing values.
missing = df.isnull().sum(axis=0).reset_index()
missing.columns = ['column', 'total_missing']
missing['missing_pct'] = missing.total_missing / df.shape[0] * 100
missing = missing.sort_values(by=['total_missing','column']).reset_index(drop = True)

# Label each column with a sparsity bucket. Every row starts in the
# "no missing data" bucket and is relabelled when its percentage falls
# inside one of the (lower, upper] ranges below.
missing['category'] = ['No missing data'] * missing.shape[0]
for lower, upper, label in [(0.0, 40.0, 'Up to 40% missing data'),
                            (40.0, 80.0, 'Up to 80% missing data')]:
    in_bucket = (missing['missing_pct'] > lower) & (missing['missing_pct'] <= upper)
    missing.loc[in_bucket, 'category'] = label
missing.loc[missing['missing_pct'] > 80.0, 'category'] = 'Over 80% missing data'

# Number of columns per bucket, rendered with an in-cell bar.
cats = missing.groupby('category').column.count().to_frame()
cats.columns = ['Category Count']

(cats.style
     .bar(color = berkeley_palette['founders_rock'], align = 'mid')
     .set_caption('Distribution of missing data per column in Flights'))

Unnamed: 0_level_0,Category Count
category,Unnamed: 1_level_1
No missing data,37
Up to 40% missing data,19
Over 80% missing data,53


In [0]:
# Keep only columns with at least one missing value, least-missing first.
missing_subset = missing[missing.missing_pct > 0.0].copy().sort_values(by='missing_pct')

# x: column names, y: % missing (bar height), h: absolute missing count (hover text).
x = missing_subset['column'].to_numpy()
y = missing_subset.missing_pct.to_numpy()
h = missing_subset.total_missing.to_numpy()

# Columns that are more than 80% empty are drawn in a darker color.
marker_colors = [
    berkeley_palette['pacific'] if pct > 80.0 else berkeley_palette['lawrence']
    for pct in y
]

axis_color = berkeley_palette['berkeley_blue']
fig = go.Figure(data=[go.Bar(x=x, y=y, hovertext=h)])
fig.update_traces(
    marker_color=marker_colors,
    marker_line_color=axis_color,
    marker_line_width=1.5,
    opacity=0.7,
)
fig.update_layout(
    title_text='[Flights]: Columns w/ Missing Values',
    title_font_size=24,
    height=550,
    width=1400,
    yaxis=dict(title='% Missing Values', titlefont_size=16, tickfont_size=14, color=axis_color),
    xaxis=dict(title='Column Names', titlefont_size=16, color=axis_color),
    plot_bgcolor='rgba(0,0,0,0)',
)
fig.show()

In [0]:
# Roughly 1.5% of the rows have no DEP_DELAY (the dependent variable) and are therefore unusable.
gold_cells = {
    'background-color': berkeley_palette['california_gold'],
    'color': berkeley_palette['berkeley_blue'],
    'border-color': 'white',
}
(missing[(missing.column == 'DEP_DELAY')].style
    .hide_columns(['category'])
    .set_properties(**gold_cells)
    .set_caption('Missing Delay Data'))

Unnamed: 0,column,total_missing,missing_pct
42,DEP_DELAY,477296,1.503444


In [0]:
# Count flights per cancellation status, restricted to rows whose recorded
# departure is no more than 15 minutes early.
df = ks.sql(
    "SELECT IF(CANCELLED = 1.000, 'Cancelled', 'Not Cancelled') as CANCELLED_STATUS, "
    "COUNT(*) as TOTAL_COUNT "
    "FROM us_delay_flights_tbl "
    "where DEP_DELAY >= -15 "
    "GROUP BY CANCELLED"
).to_koalas()
(df.style
   .bar(color = berkeley_palette['founders_rock'], align = 'mid')
   .set_caption('Distribution of Cancelled Flights'))

Unnamed: 0,CANCELLED_STATUS,TOTAL_COUNT
0,Not Cancelled,31092983
1,Cancelled,17336


In [0]:
# Drop the frames built for the missing-data and cancellation checks; the
# following cells load fresh query results.
del missing, missing_subset, cats, df

### Dependent variable analysis

In [0]:
# Summary statistics of DEP_DELAY over every row that reports a value.
delay_df = ks.sql(
    "select ORIGIN, DEP_DELAY "
    "FROM us_delay_flights_tbl "
    "WHERE DEP_DELAY is not NULL"
)
summary_cells = {
    'background-color': berkeley_palette['founders_rock'],
    'color': 'white',
    'border-color': berkeley_palette['berkeley_blue'],
}
(delay_df.describe().T.style
    .set_properties(**summary_cells)
    .set_caption('Distribution of DEP_DELAY'))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DEP_DELAY,31269545.0,9.855286,43.505203,-234.0,-5.0,-2.0,7.0,2755.0


In [0]:
# The full-distribution frame is no longer needed once its summary is shown.
del delay_df

#### The extreme values seem suspect
There is just no way flights are departing hours early: the minimum `DEP_DELAY` of -234 minutes would mean leaving almost four hours ahead of schedule.

In [0]:
# Summary statistics for flights that left early or at most 15 minutes late.
df = ks.sql(
    "select ORIGIN, DEP_DELAY "
    "FROM us_delay_flights_tbl "
    "WHERE DEP_DELAY <= 15"
)
summary_cells = {
    'background-color': berkeley_palette['founders_rock'],
    'color': 'white',
    'border-color': berkeley_palette['berkeley_blue'],
}
(df.describe().T.style
    .set_properties(**summary_cells)
    .set_caption('Distribution of On-Time & Early Flights'))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DEP_DELAY,25771920.0,-2.374013,5.752273,-234.0,-6.0,-3.0,0.0,15.0


In [0]:
# Tally how many flights share each departure offset (minutes relative to schedule).
vc = df.DEP_DELAY.value_counts().to_dataframe().reset_index()
vc.columns = ['departure_delay', 'departure_count']

x = vc.departure_delay.to_numpy()
y = vc.departure_count.to_numpy()


def _offset_color(minutes):
    """Return the bar color for a departure offset: late, exactly on time, or early."""
    if minutes > 0.0:
        return berkeley_palette['medalist']
    if minutes == 0.0:
        return berkeley_palette['ion']
    return berkeley_palette['lawrence']


marker_colors = [_offset_color(minutes) for minutes in x]

# One tick every 10 minutes across the plotted range.
ticks = list(range(-235, 16, 10))
axis_color = berkeley_palette['berkeley_blue']

fig = go.Figure(data=[go.Bar(x=x, y=y, hovertext=y)])
fig.update_traces(
    marker_color=marker_colors,
    marker_line_color=axis_color,
    marker_line_width=1.5,
    opacity=0.7,
)
fig.update_layout(
    title_text='Early and On-Time Departures',
    title_font_size=24,
    height=400,
    width=1200,
    yaxis=dict(title='Total Flights', titlefont_size=16, tickfont_size=14, color=axis_color),
    xaxis=dict(title='Minutes from Scheduled Departure', titlefont_size=16, color=axis_color,
               tickmode="array", tickvals=ticks, ticktext=ticks),
    plot_bgcolor='rgba(0,0,0,0)',
)
fig.show()

In [0]:
# Same tally as the previous chart, limited to departures no more than 15 minutes early.
vc = df[df.DEP_DELAY >= -15].DEP_DELAY.value_counts().to_dataframe().reset_index()
vc.columns = ['departure_delay', 'departure_count']

x = vc.departure_delay.to_numpy()
y = vc.departure_count.to_numpy()

# Late bars, the on-time bar and early bars each get their own color.
late_color = berkeley_palette['medalist']
on_time_color = berkeley_palette['ion']
early_color = berkeley_palette['lawrence']
marker_colors = [
    late_color if minutes > 0.0 else on_time_color if minutes == 0.0 else early_color
    for minutes in x
]

# One tick per minute from -15 to +15.
ticks = list(range(-15, 16, 1))
axis_color = berkeley_palette['berkeley_blue']

fig = go.Figure(data=[go.Bar(x=x, y=y, hovertext=y)])
fig.update_traces(
    marker_color=marker_colors,
    marker_line_color=axis_color,
    marker_line_width=1.5,
    opacity=0.7,
)
fig.update_layout(
    title_text='Early and On-Time Flights',
    title_font_size=24,
    height=400,
    width=800,
    yaxis=dict(title='Total Flights', titlefont_size=16, tickfont_size=14, color=axis_color),
    xaxis=dict(title='Minutes from Schedule Departure', titlefont_size=16, color=axis_color,
               tickmode="array", tickvals=ticks, ticktext=ticks),
    plot_bgcolor='rgba(0,0,0,0)',
)
fig.show()

In [0]:
# Summary statistics for flights that departed more than 15 minutes late.
df = ks.sql(
    "select ORIGIN, DEP_DELAY "
    "FROM us_delay_flights_tbl "
    "WHERE DEP_DELAY > 15"
)
summary_cells = {
    'background-color': berkeley_palette['founders_rock'],
    'color': 'white',
    'border-color': berkeley_palette['berkeley_blue'],
}
(df.describe().T.style
    .set_properties(**summary_cells)
    .set_caption('Distribution of Delayed Flights'))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DEP_DELAY,5497625.0,67.184133,81.379149,16.0,25.0,42.0,79.0,2755.0


In [0]:
# Tally how many delayed flights share each delay value (in minutes).
vc = df.DEP_DELAY.value_counts().to_dataframe().reset_index()
vc.columns = ['departure_delay', 'departure_count']

x, y = vc.departure_delay.to_numpy(),  vc.departure_count.to_numpy()
# Delays above 278 minutes are drawn in a contrasting color.
# NOTE(review): the 278-minute cut-off is not derived anywhere in this notebook — confirm its origin.
marker_colors = [berkeley_palette['rose_garden'] if delay > 278.0 else berkeley_palette['lawrence'] for delay in x]

# Tick positions and their labels must be the same sequence. The labels were
# previously built from range(16, 16, 200), which is empty, so none of the
# tick positions received a label.
tick_positions = list(range(16, 2755, 200))

fig = go.Figure(data=[go.Bar(x=x, y=y, hovertext=y)])
fig.update_traces(marker_color=marker_colors, marker_line_color=marker_colors,
                  marker_line_width=1.5, opacity=0.7)
fig.update_layout(title_text='Delayed Departures', title_font_size = 24,
                  height=400, width=1200,
                  yaxis=dict(title='Total Flights', titlefont_size=16, tickfont_size=14, color=berkeley_palette['berkeley_blue']),
                  xaxis=dict(title='Minutes from Scheduled Departure', titlefont_size=16, color=berkeley_palette['berkeley_blue'],
                            tickmode="array", tickvals=tick_positions,
                            ticktext=tick_positions),
                  plot_bgcolor='rgba(0,0,0,0)')
fig.show()

In [0]:
# Zoom in on delays of at most 300 minutes, reusing the tally from the previous cell.
vc = vc[(vc.departure_delay <= 300)]
x = vc.departure_delay.to_numpy()
y = vc.departure_count.to_numpy()

# One tick every 15 minutes, starting at the first delayed minute.
ticks = list(range(16, 300, 15))
axis_color = berkeley_palette['berkeley_blue']

fig = go.Figure(data=[go.Bar(x=x, y=y, hovertext=y)])
fig.update_traces(
    marker_color=berkeley_palette['lawrence'],
    marker_line_color=axis_color,
    marker_line_width=1.5,
    opacity=0.7,
)
fig.update_layout(
    title_text='Delayed Flights (16m - 300m)',
    title_font_size=24,
    height=400,
    width=800,
    yaxis=dict(title='Total Flights', titlefont_size=16, tickfont_size=14, color=axis_color),
    xaxis=dict(title='Minutes from Schedule Departure', titlefont_size=16, color=axis_color,
               tickmode="array", tickvals=ticks, ticktext=ticks),
    plot_bgcolor='rgba(0,0,0,0)',
)
fig.show()

In [0]:
# The delay tally is not used past the charts above.
del vc

#### Dependent Variable + Airport

In [0]:
# Per-origin totals: flight count, mean departure delay and number of late
# (more than 15 minutes) departures, busiest airport first.
df = ks.sql(
    "SELECT ORIGIN, COUNT(*) as TOTAL, AVG(DEP_DELAY) as MEAN_DELAY, "
    "SUM(IF(DEP_DELAY > 15, 1, 0)) AS LATE_FLIGHTS "
    "FROM us_delay_flights_tbl "
    "WHERE DEP_DELAY > -16 "
    "GROUP BY ORIGIN ORDER BY 2 DESC"
)
total_flights = int(df.TOTAL.to_numpy().sum())

# Derived shares: late / on-time percentages within each airport, and each
# airport's share of all flights.
df['PCT_LATE_FLIGHTS'] = (df['LATE_FLIGHTS'] / df['TOTAL']) * 100.0
df['ON_TIME_FLIGHTS'] = df['TOTAL'] - df['LATE_FLIGHTS']
df['PCT_ON_TIME_FLIGHTS'] = (df['ON_TIME_FLIGHTS'] / df['TOTAL']) * 100.0
df['PCT_FLIGHTS'] = (df['TOTAL'] / total_flights) * 100.0

table_cells = {
    'background-color': berkeley_palette['founders_rock'],
    'color': 'white',
    'border-color': berkeley_palette['berkeley_blue'],
}
(df.head(10).style
    .set_properties(**table_cells)
    .set_caption('Top 10 Airports by Flight Count'))


Unnamed: 0,ORIGIN,TOTAL,MEAN_DELAY,LATE_FLIGHTS,PCT_LATE_FLIGHTS,ON_TIME_FLIGHTS,PCT_ON_TIME_FLIGHTS,PCT_FLIGHTS
0,ATL,1897468,9.23049,314879,16.594694,1582589,83.405306,6.09916
1,ORD,1463177,12.995183,307586,21.02179,1155591,78.97821,4.703189
2,DFW,1197891,11.704491,241340,20.147075,956551,79.852925,3.850462
3,DEN,1137830,10.734406,220606,19.388309,917224,80.611691,3.657404
4,LAX,1068953,10.630683,213265,19.95083,855688,80.04917,3.436008
5,SFO,840478,12.278024,176637,21.016255,663841,78.983745,2.701605
6,PHX,815542,8.355637,140300,17.203283,675242,82.796717,2.621452
7,CLT,789188,9.622294,138575,17.559187,650613,82.440813,2.53674
8,IAH,767608,9.842562,130519,17.00334,637089,82.99666,2.467374
9,LAS,764245,10.429264,150739,19.723911,613506,80.276089,2.456564


In [0]:
# For every airport, find the most recent flight date on which it appears as
# either an origin or a destination, then keep the airports whose last flight
# predates 2019.
last_flight_sql = """
with cte as (
SELECT Airport, MAX(LastFlightDate) as LastFlightDate FROM (
  SELECT ORIGIN as Airport, max(FL_DATE) as LastFlightDate
  FROM us_delay_flights_tbl 
  WHERE DEP_DELAY >= -15 and CANCELLED = 0.000 and OP_CARRIER NOT IN ('US', 'VX')
  GROUP BY ORIGIN
  UNION 
  SELECT DEST as Airport, max(FL_DATE) as LastFlightDate
  FROM us_delay_flights_tbl 
  WHERE DEP_DELAY >= -15 and CANCELLED = 0.000 and OP_CARRIER NOT IN ('US', 'VX')
  GROUP BY DEST
) GROUP BY Airport)
select Airport, LastFlightDate from cte where LastFlightDate < "2019-01-01" order by 2 asc"""

df = ks.sql(last_flight_sql).to_koalas()

table_cells = {
    'background-color': berkeley_palette['medalist'],
    'color': 'white',
    'border-color': berkeley_palette['berkeley_blue'],
}
(df.set_index('Airport').style
    .set_properties(**table_cells)
    .set_caption('Airports w/o Traffic in 2019'))

Unnamed: 0_level_0,LastFlightDate
Airport,Unnamed: 1_level_1
CEC,2015-04-06
CLD,2015-04-06
ILG,2015-04-29
DIK,2015-11-30
EFD,2016-08-14
ENV,2016-11-28
TKI,2017-01-06
UST,2017-08-12
FNL,2017-11-20
YNG,2018-01-04


In [0]:
# Rebuild the per-origin totals (df was overwritten by the previous cell).
df = ks.sql(
    "SELECT ORIGIN, COUNT(*) as TOTAL, AVG(DEP_DELAY) as MEAN_DELAY, "
    "SUM(IF(DEP_DELAY > 15, 1, 0)) AS LATE_FLIGHTS "
    "FROM us_delay_flights_tbl "
    "WHERE DEP_DELAY > -16 "
    "GROUP BY ORIGIN ORDER BY 2 DESC"
)
total_flights = int(df.TOTAL.to_numpy().sum())
df['PCT_LATE_FLIGHTS'] = (df['LATE_FLIGHTS'] / df['TOTAL']) * 100.0
df['ON_TIME_FLIGHTS'] = df['TOTAL'] - df['LATE_FLIGHTS']
df['PCT_ON_TIME_FLIGHTS'] = (df['ON_TIME_FLIGHTS'] / df['TOTAL']) * 100.0
df['PCT_FLIGHTS'] = (df['TOTAL'] / total_flights) * 100.0

# Plot only airports with more than 100 flights.
vc = df[df.TOTAL > 100][['ORIGIN', 'TOTAL', 'PCT_FLIGHTS']].copy()

# x: airport code, y: flight count, h: share of all flights (hover text).
x = vc.ORIGIN.to_numpy()
y = vc.TOTAL.to_numpy()
h = vc.PCT_FLIGHTS.to_numpy()

# Airports carrying less than 3% of all flights use the secondary color.
marker_colors = [
    berkeley_palette['medalist'] if share < 3.0 else berkeley_palette['lawrence']
    for share in h
]

axis_color = berkeley_palette['berkeley_blue']
fig = go.Figure(data=[go.Bar(x=x, y=y, hovertext=h)])
fig.update_traces(
    marker_color=marker_colors,
    marker_line_color=axis_color,
    marker_line_width=1.5,
    opacity=0.7,
)
fig.update_layout(
    title_text='Flight Count by Airport',
    title_font_size=24,
    height=600,
    width=1500,
    yaxis=dict(title='Flight Count', titlefont_size=16, tickfont_size=14, color=axis_color),
    xaxis=dict(title='Airport Code', titlefont_size=16, color=axis_color),
    plot_bgcolor='rgba(0,0,0,0)',
)
fig.show()
del vc

#### Relationship between dependent variable and airport on-time ratio

In [0]:
# Eliminate the few airports with 1 or only a couple of flights in the dataset.
df = df[(df.TOTAL > 100)]

# Pull each plotted column out once and reuse the arrays below.
on_time_pct = df.PCT_ON_TIME_FLIGHTS.to_numpy()
mean_delay = df.MEAN_DELAY.to_numpy()

# Pearson correlation between mean delay and on-time share; the p-value is not used.
c, _ = pearsonr(mean_delay, on_time_pct)

# scikit-learn expects a 2-D feature matrix, so reshape to a single column.
x = on_time_pct.reshape(-1, 1)
reg = LinearRegression().fit(x, mean_delay)
fitline = reg.predict(x)

# One marker per airport; marker size grows with the airport's share of flights.
fig = go.Figure(data=[go.Scatter(name = 'Airports',
  x=on_time_pct, y=mean_delay,
  mode='markers', text=df.ORIGIN.to_numpy(), marker=dict(size=df.PCT_FLIGHTS.to_numpy() + 20))])

fig.add_trace(go.Scatter(name='Regression Line', x=on_time_pct, y=fitline, mode='lines', marker=dict(color=berkeley_palette['golden_gate'])))

# The points are airports (the frame is grouped by ORIGIN), so the subtitle
# says "By Airport". Plotly text needs <br> for a line break; "\n" is not
# rendered as one.
tick_values = list(range(0, 105, 5))
fig.update_layout(title_text='Average Flight Delay vs. % On-Time Ratio<br>By Airport', title_font_size = 24,
                  height=600, width=800,
                  yaxis=dict(title='Average Flight Delay (min)', titlefont_size=16, tickfont_size=14, color=berkeley_palette['berkeley_blue']),
                  xaxis=dict(title='% of On-Time Flights', titlefont_size=16, color=berkeley_palette['berkeley_blue'],
                            tickmode="array", tickvals = tick_values,
                            ticktext = tick_values),
                  plot_bgcolor='rgba(0,0,0,0)')

# Correlation coefficient shown as a boxed label at a fixed data position.
fig.add_annotation(x = 87, y = 33, text = "Corr Coeff: {:.3f}".format(c), showarrow=False, 
                   font=dict(family="Courier New, monospace", size=16, color='white'), bordercolor=berkeley_palette['pacific'], bgcolor=berkeley_palette['berkeley_blue'], opacity=0.6, borderwidth=2, borderpad=4)

fig.show()

#### Examination of Flights over Time

In [0]:
# Monthly count of departures more than 15 minutes late, after dropping
# cancelled flights, two carriers and a fixed list of airports.
df = ks.sql(
    "SELECT YEAR(FL_DATE) as YEAR, MONTH, SUM(IF(DEP_DELAY > 15, 1, 0)) as DELAYED_TOTAL "
    "from us_delay_flights_tbl "
    "WHERE DEP_DELAY >= -15 and CANCELLED = 0.000 and OP_CARRIER NOT IN ('US', 'VX') "
    "and ORIGIN NOT IN ('CEC', 'CLD', 'ILG', 'DIK', 'EFD', 'ENV', 'TKI', 'UST', 'FNL', 'YNG', 'IFP', 'FLO') "
    "and DEST NOT IN ('CEC', 'CLD', 'ILG', 'DIK', 'EFD', 'ENV', 'TKI', 'UST', 'FNL', 'YNG', 'IFP', 'FLO') "
    "GROUP BY YEAR(FL_DATE), MONTH ORDER BY 1 ASC, 2 ASC"
)

# "YEAR-MONTH" label for the x axis; rows are kept in chronological order.
df["period"] = df.YEAR.astype(str) + "-" + df.MONTH.astype(str)
df = df.sort_values(by=['YEAR','MONTH'])

axis_color = berkeley_palette['berkeley_blue']
delayed_area = go.Scatter(
    x=df.period.to_numpy(),
    y=df.DELAYED_TOTAL.to_numpy(),
    fill='tozeroy',
    name='All Airports',
    line_color=berkeley_palette['founders_rock'],
)
fig = go.Figure(data=[delayed_area])
fig.update_layout(
    title_text="Total Delayed Flights per Month (All Airports)",
    titlefont_size=24,
    height=300,
    width=1000,
    yaxis=dict(title="Monthly Delayed Flights", titlefont_size=16, tickfont_size=14, color=axis_color),
    xaxis=dict(title="Year-Month", titlefont_size=16, color=axis_color),
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.show()
del df

In [0]:

# Monthly count of departures more than 15 minutes late at six origin airports.
df = ks.sql("SELECT YEAR(FL_DATE) as YEAR, MONTH, SUM(IF(DEP_DELAY > 15, 1, 0)) as DELAYED_TOTAL, ORIGIN from us_delay_flights_tbl WHERE DEP_DELAY >= -15 AND CANCELLED = 0.000 AND ORIGIN IN ('ATL', 'ORD', 'DFW', 'DEN', 'LAX', 'SFO') GROUP BY YEAR(FL_DATE), MONTH, ORIGIN ORDER BY 1 ASC, 2 ASC, 4 ASC")
df["period"] = df.YEAR.astype(str) + "-" + df.MONTH.astype(str)
# The per-airport color lookup frame that used to be merged in here is gone:
# its 'color' column was never read by the plot, which takes its colors from
# color_discrete_sequence below.
df = df.sort_values(by=['YEAR','MONTH','ORIGIN'])

# One area facet per airport, two facets per row. Colors are taken from the
# sequence in the order the ORIGIN values first appear in the data.
fig = px.area(data_frame = df.to_pandas(), x="period", y="DELAYED_TOTAL", facet_col="ORIGIN", facet_col_wrap=2, 
              color="ORIGIN", color_discrete_sequence = [berkeley_palette['berkeley_blue'], berkeley_palette['golden_gate'], berkeley_palette['california_gold'],
                           berkeley_palette['lawrence'], berkeley_palette['lap_lane'], berkeley_palette['pacific']], 
              title='Top 6 Origin Airport Delayed Flights per Month',
              labels={'DELAYED_TOTAL':'Delayed Flights', 'ORIGIN':'Origin Airport','period':'Year-Month'})
fig.update_layout(titlefont_size=24, height=600, width=1400, plot_bgcolor='rgba(0,0,0,0)', titlefont_color=berkeley_palette['berkeley_blue'],
                  yaxis=dict(titlefont_size=16, titlefont_color=berkeley_palette['berkeley_blue']),
                  xaxis=dict(titlefont_size=16, titlefont_color=berkeley_palette['berkeley_blue'])
                 )
fig.show()
del df

#### Airline-level Exploration
We should exclude US Airways and Virgin America as they are no longer operating in the U.S. and thus prediction for them is unnecessary.

In [0]:
def highlight_lowdate(s):
  """Build per-cell CSS that flags every value of `s` other than its maximum.

  Returns a list with one entry per element of `s`: a 'medalist'
  background-color declaration where the element differs from `s.max()`,
  and an empty string where it equals the maximum.
  """
  latest = s.max()
  not_latest = ~(s == latest)
  flagged_css = " ".join(['background-color:', berkeley_palette['medalist']])
  return [flagged_css if flagged else '' for flagged in not_latest]

# Carrier code -> airline name, kept as pairs so each code sits next to its name.
_carrier_names = [
    ('9E', 'Endeavor Air'), ('AA', 'American Airlines'), ('AS', 'Alaska Airlines'),
    ('B6', 'jetBlue'), ('DL', 'Delta Air Lines'), ('EV', 'ExpressJet'),
    ('F9', 'Frontier Airlines'), ('G4', 'Allegiant Air'), ('HA', 'Hawaiian Airlines'),
    ('MQ', 'Envoy Air'), ('NK', 'Spirit Airlines'), ('OH', 'PSA Airlines'),
    ('OO', 'SkyWest Airlines'), ('UA', 'United Airlines'), ('US', 'US Airways'),
    ('VX', 'Virgin America'), ('WN', 'Southwest Airlines'), ('YV', 'Mesa Airlines'),
    ('YX', 'Republic Airways'),
]
airline_codes = {
    'OP_CARRIER': [code for code, _ in _carrier_names],
    'Airline': [name for _, name in _carrier_names],
}

# Most recent flight date recorded for each carrier.
df = ks.sql(
    "select OP_CARRIER, MAX(FL_DATE) as LastFlightDate "
    "from us_delay_flights_tbl "
    "where DEP_DELAY >= -15 and CANCELLED = 0.000 "
    "GROUP BY OP_CARRIER order by 1 asc"
).to_koalas()

# Attach airline names, order by last flight date and index by (code, name).
named = df.merge(ks.DataFrame(airline_codes), how='inner', on='OP_CARRIER')
df = named.sort_values('LastFlightDate').set_index(['OP_CARRIER','Airline'])[['LastFlightDate']]

table_cells = {
    'background-color': berkeley_palette['founders_rock'],
    'color': 'white',
    'border-color': berkeley_palette['berkeley_blue'],
}
# highlight_lowdate marks every carrier whose last flight is not the latest date.
(df.style
   .set_properties(**table_cells)
   .set_caption('Latest Flight by Airline')
   .apply(highlight_lowdate))


Unnamed: 0_level_0,Unnamed: 1_level_0,LastFlightDate
OP_CARRIER,Airline,Unnamed: 2_level_1
US,US Airways,2015-06-30
VX,Virgin America,2018-03-31
9E,Endeavor Air,2019-12-31
AA,American Airlines,2019-12-31
AS,Alaska Airlines,2019-12-31
B6,jetBlue,2019-12-31
DL,Delta Air Lines,2019-12-31
EV,ExpressJet,2019-12-31
F9,Frontier Airlines,2019-12-31
G4,Allegiant Air,2019-12-31


In [0]:
# Carrier code -> airline name lookup (two parallel lists, aligned by position).
airline_codes = {'OP_CARRIER':['9E','AA','AS','B6','DL','EV','F9','G4','HA','MQ','NK','OH','OO','UA','US','VX','WN','YV','YX'],
                'Airline':['Endeavor Air', 'American Airlines', 'Alaska Airlines', 'jetBlue', 'Delta Air Lines', 'ExpressJet', 'Frontier Airlines',
                          'Allegiant Air', 'Hawaiian Airlines', 'Envoy Air', 'Spirit Airlines', 'PSA Airlines', 'SkyWest Airlines', 'United Airlines',
                          'US Airways', 'Virgin America', 'Southwest Airlines', 'Mesa Airlines', 'Republic Airways']}

# One shared row filter, so the industry-wide average and the per-airline
# averages describe exactly the same set of flights. The industry average was
# previously computed without the ORIGIN/DEST exclusions, which made the
# reference line and the bars refer to different populations.
excluded_airports = "('CEC', 'CLD', 'ILG', 'DIK', 'EFD', 'ENV', 'TKI', 'UST', 'FNL', 'YNG', 'IFP', 'FLO')"
flight_filter = (
    "DEP_DELAY >= -15 and CANCELLED = 0.000 and OP_CARRIER NOT IN ('US', 'VX') "
    f"and ORIGIN NOT IN {excluded_airports} and DEST NOT IN {excluded_airports}"
)

# Industry-wide mean departure delay (a single-row result).
df = ks.sql(f"SELECT AVG(DEP_DELAY) as MEAN_DELAY FROM us_delay_flights_tbl WHERE {flight_filter}").to_koalas()
avg_delay = float(df.MEAN_DELAY[0])

# Mean departure delay per carrier, labelled with the airline name and sorted ascending.
df = ks.sql(f"SELECT OP_CARRIER, AVG(DEP_DELAY) as MEAN_DELAY FROM us_delay_flights_tbl WHERE {flight_filter} GROUP BY OP_CARRIER ORDER BY 1 ASC").to_koalas()
df = df.merge(ks.DataFrame(airline_codes), how='inner', on='OP_CARRIER').set_index('Airline')[['MEAN_DELAY']].sort_values(by='MEAN_DELAY')

x, y = (df.index.to_numpy(), df.MEAN_DELAY.to_numpy())
# Airlines at or below the industry average are blue; those above it are red.
marker_colors = [berkeley_palette['berkeley_blue'] if delay <= avg_delay else berkeley_palette['rose_garden'] for delay in y]

fig = go.Figure(data=[go.Bar(x=x, y=y, hovertext=y)])
# Dotted horizontal reference line at the industry average, spanning the full plot width.
fig.add_shape(type="line", line_color=berkeley_palette['pacific'], line_width = 3, opacity=1, line_dash='dot',
             x0=0, x1=1, xref='paper', y0=avg_delay, y1=avg_delay, yref='y')

fig.update_traces(marker_color=marker_colors, marker_line_color=berkeley_palette['pacific'],
                  marker_line_width=1.5, opacity=0.7)
# The label for the reference line is anchored at the "Delta Air Lines" bar.
fig.add_annotation(text="Average Delayed Time (min)", x="Delta Air Lines", y=avg_delay, arrowhead=2, showarrow=True, 
                   yshift=10, xshift=30, arrowsize=2, font=dict(size=12, color=berkeley_palette['berkeley_blue']))

fig.update_layout(title_text='Average Flight Delay in Minutes by Airline vs. Industry Average', title_font_size = 24,
                  height=500, width=1000,
                  yaxis=dict(title='Average Delay (min.)', titlefont_size=16, tickfont_size=14, color=berkeley_palette['berkeley_blue']),
                  xaxis=dict(title='Airline', titlefont_size=16, color=berkeley_palette['berkeley_blue'],
                            tickmode="array", tickvals=list(range(0,len(x),1)),
                            ticktext=list(x)),
                  plot_bgcolor='rgba(0,0,0,0)')
fig.show()



In [0]:
# Carrier code -> airline name lookup (two parallel lists, aligned by position).
airline_codes = {
    'OP_CARRIER': ['9E','AA','AS','B6','DL','EV','F9','G4','HA','MQ','NK','OH','OO','UA','US','VX','WN','YV','YX'],
    'Airline': ['Endeavor Air', 'American Airlines', 'Alaska Airlines', 'jetBlue', 'Delta Air Lines',
                'ExpressJet', 'Frontier Airlines', 'Allegiant Air', 'Hawaiian Airlines', 'Envoy Air',
                'Spirit Airlines', 'PSA Airlines', 'SkyWest Airlines', 'United Airlines', 'US Airways',
                'Virgin America', 'Southwest Airlines', 'Mesa Airlines', 'Republic Airways'],
}

# Per-carrier flight count, mean delay and late-flight count.
df = ks.sql(
    "SELECT OP_CARRIER, COUNT(*) as TOTAL, AVG(DEP_DELAY) as MEAN_DELAY, SUM(IF(DEP_DELAY > 15, 1, 0)) AS LATE_FLIGHTS "
    "FROM us_delay_flights_tbl "
    "WHERE DEP_DELAY >= -15 and CANCELLED = 0.000 and OP_CARRIER NOT IN ('US', 'VX') "
    "and ORIGIN NOT IN ('CEC', 'CLD', 'ILG', 'DIK', 'EFD', 'ENV', 'TKI', 'UST', 'FNL', 'YNG', 'IFP', 'FLO') "
    "and DEST NOT IN ('CEC', 'CLD', 'ILG', 'DIK', 'EFD', 'ENV', 'TKI', 'UST', 'FNL', 'YNG', 'IFP', 'FLO') "
    "GROUP BY OP_CARRIER"
)
df = df.merge(ks.DataFrame(airline_codes), how='inner', on='OP_CARRIER').sort_values(by='Airline').set_index('Airline')

# Derived shares, then order airlines from largest to smallest share of flights.
total_flights = int(df.TOTAL.to_numpy().sum())
df['PCT_LATE_FLIGHTS'] = (df['LATE_FLIGHTS'] / df['TOTAL']) * 100.0
df['ON_TIME_FLIGHTS'] = df['TOTAL'] - df['LATE_FLIGHTS']
df['PCT_ON_TIME_FLIGHTS'] = (df['ON_TIME_FLIGHTS'] / df['TOTAL']) * 100.0
df['PCT_FLIGHTS'] = (df['TOTAL'] / total_flights) * 100.0
df = df.sort_values(by='PCT_FLIGHTS', ascending=False)

# Variable-width stacked bars: each airline's bar is as wide as its share of
# all flights and is split vertically into on-time and delayed percentages.
labels = df.index.to_numpy()
widths = df.PCT_FLIGHTS.to_numpy()
color = [berkeley_palette['pacific'], berkeley_palette['rose_garden']]
data = {
  "On-Time":df.PCT_ON_TIME_FLIGHTS.to_numpy(),
  "Delayed":df.PCT_LATE_FLIGHTS.to_numpy()
}

# Left edge of every bar: the running total of the widths before it.
bar_starts = np.cumsum(widths) - widths

fig = go.Figure()
for i, key in enumerate(data):
    heights = data[key]
    fig.add_trace(go.Bar(
        name=key,
        y=heights,
        x=bar_starts,
        width=widths,
        offset=0,
        customdata=np.transpose([labels, widths*heights]),
        text=heights,
        texttemplate="%{text:.2f}%",
        textposition="inside",
        textangle=0,
        textfont_color="white",
        hovertemplate="<br>".join([
            "label: %{customdata[0]}",
            "width: %{width}",
            "height: %{y}",
            "area: %{customdata[1]}",
        ]),
        marker_color=color[i],
        marker_line_color=berkeley_palette['berkeley_blue'],
        opacity=0.7,
    ))

# Center each airline label under its bar.
fig.update_xaxes(tickvals=np.cumsum(widths)-widths/2, ticktext=labels)

fig.update_xaxes(range=[0,100])
fig.update_yaxes(range=[0,100])

axis_color = berkeley_palette['berkeley_blue']
fig.update_layout(
    title_text="Ratio of On-Time vs Delayed Flight by Airline (Area = % of Flights by Airline)",
    title_font_size=24,
    title_font_color=axis_color,
    yaxis=dict(title='% of Airline Flights', titlefont_size=16, tickfont_size=14, color=axis_color),
    xaxis=dict(title='Airline', titlefont_size=16, color=axis_color),
    barmode="stack",
    uniformtext=dict(mode="hide", minsize=10),
    width=1500,
    height=500,
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.show()
del df

#### Evaluation of Flight Geography

In [0]:
# Flight-count buckets, one map trace (and one legend entry) per bucket.
limits = [(1,100),(100,1000),(1000,10000),(10000,100000),(100000,1000000), (1000000,10000000)]
colors = [berkeley_palette['pacific'],berkeley_palette['founders_rock'],berkeley_palette['rose_garden'],berkeley_palette['wellman_tile'],berkeley_palette['california_gold'],berkeley_palette['berkeley_blue']]
scale = 1000  # marker area = total_flights / scale

fig = go.Figure()

for i, (lower, upper) in enumerate(limits):
    # Adjacent buckets share a boundary value. SQL BETWEEN is inclusive on
    # both ends, so an airport sitting exactly on a boundary would be drawn in
    # two traces. Buckets are therefore half-open [lower, upper); only the
    # last one also includes its upper bound so the overall range is unchanged.
    upper_op = "<=" if i == len(limits) - 1 else "<"
    df_sub = spark.sql(
        "Select a.*, at.latitude, at.longitude "
        "from airport_delay_analysis a join airport_tbl at on a.ORIGIN=at.IATA "
        f"where total_flights >= {lower} and total_flights {upper_op} {upper}"
    ).toPandas()
    fig.add_trace(go.Scattergeo(
        locationmode = 'USA-states',
        lon = df_sub['longitude'],
        lat = df_sub['latitude'],
        text = df_sub['origin'] + '<br>Delayed Flights ' + (df_sub['num_delay']).astype(str) + '<br>Flights ' + (df_sub['total_flights']).astype(str),
        marker = dict(
            size = df_sub['total_flights']/scale,
            color = colors[i],
            line_color=berkeley_palette['south_hall'],
            line_width=0.5,
            sizemode = 'area'
        ),
        name = '{0:,} - {1:,} flights'.format(lower, upper)))

fig.update_layout(
        title_text = 'Flight Volume by Origin Location (2015-2019)',
        title_font_size = 24,
        height=800, width=1000,
        showlegend = True,
        geo = dict(
            scope = 'usa',
            landcolor = berkeley_palette['bay_fog'])
    )

fig.show()

In [0]:
# PageRank buckets, one map trace (and one legend entry) per bucket.
limits = [(0,1),(1,2),(2,5),(5,10),(10,15), (15,25)]
colors = [berkeley_palette['pacific'],berkeley_palette['founders_rock'],berkeley_palette['soybean'],berkeley_palette['lawrence'],berkeley_palette['california_gold'],berkeley_palette['berkeley_blue']]
scale = 100  # marker area = pagerank * scale
fig = go.Figure()

for i, (lower, upper) in enumerate(limits):
    # Adjacent buckets share a boundary value. SQL BETWEEN is inclusive on
    # both ends, so an airport sitting exactly on a boundary would be drawn in
    # two traces. Buckets are therefore half-open [lower, upper); only the
    # last one also includes its upper bound so the overall range is unchanged.
    upper_op = "<=" if i == len(limits) - 1 else "<"
    df_sub = spark.sql(
        "Select a.*, at.latitude, at.longitude "
        "from airport_pagerank a join airport_tbl at on a.id=at.IATA "
        f"where pagerank >= {lower} and pagerank {upper_op} {upper}"
    ).toPandas()
    fig.add_trace(go.Scattergeo(
        locationmode = 'USA-states',
        lon = df_sub['longitude'],
        lat = df_sub['latitude'],
        text = df_sub['id'] + '<br>Page Rank ' + (df_sub['pagerank']).astype(str),
        marker = dict(
            size = df_sub['pagerank'] * scale,
            color = colors[i],
            line_color=berkeley_palette['berkeley_blue'],
            line_width=0.5,
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lower, upper)))

fig.update_layout(
        title_text = 'PageRank by Origin Airport',
        title_font_size = 24,
        showlegend = True,
        height = 800, width = 1000,
        geo = dict(
            scope = 'usa',
            landcolor = berkeley_palette['bay_fog'],
        )
    )

fig.show()

In [0]:
import plotly.graph_objects as go

import pandas as pd

# Bubble map of the per-airport departure delay ratio, one trace per bucket.
#
# Buckets are treated as half-open ranges [low, high); only the last bucket
# also includes its upper bound. The previous per-bucket SQL `BETWEEN` was
# inclusive at both ends, so an airport whose ratio fell exactly on a shared
# boundary was drawn in two adjacent traces.
# NOTE: `limits` must stay ascending and contiguous for the single range
# query below to cover exactly the union of the buckets.
limits = [(0,0.05),(0.05,0.1),(0.1,0.15),(0.15,0.2),(0.2,0.25)]
colors = [berkeley_palette['lap_lane'],berkeley_palette['lawrence'],berkeley_palette['founders_rock'],berkeley_palette['pacific'],berkeley_palette['berkeley_blue']]
scale = 500

# Run the join once for the whole ratio range instead of once per bucket;
# the bucketing itself is done on the collected pandas frame.
delay_ratio_pdf = spark.sql(f"Select a.*, at.latitude, at.longitude from airport_delay_analysis a join airport_tbl at on a.ORIGIN=at.IATA where ratio between {limits[0][0]} and {limits[-1][1]}").toPandas()

fig = go.Figure()

for i in range(len(limits)):
    lim = limits[i]
    ratio = delay_ratio_pdf['ratio']
    # The top bucket keeps its upper bound so the overall maximum is not dropped.
    below_upper = (ratio <= lim[1]) if i == len(limits) - 1 else (ratio < lim[1])
    df_sub = delay_ratio_pdf[(ratio >= lim[0]) & below_upper]
    fig.add_trace(go.Scattergeo(
        locationmode = 'USA-states',
        lon = df_sub['longitude'],
        lat = df_sub['latitude'],
        # Hover text: origin airport code and its delay ratio.
        text = df_sub['origin'] + '<br>Ratio ' + (df_sub['ratio']).astype(str),
        marker = dict(
            # Marker area (sizemode='area') grows with the delay ratio.
            size = df_sub['ratio'] * scale,
            color = colors[i],
            line_color='rgb(40,40,40)',
            line_width=0.5,
            sizemode = 'area'
        ),
        name = '{0} - {1}'.format(lim[0],lim[1])))

fig.update_layout(
        title_text = 'Airport Departure Delay Ratio',
        title_font_size = 24,
        showlegend = True,
        height = 800, width = 1000,
        geo = dict(
            scope = 'usa',
            landcolor = berkeley_palette['bay_fog'],
        )
    )

fig.show()

In [0]:
# Bar chart of the average departure delay per route (ORIGIN-DEST), sorted
# from the most delayed route to the least, coloured by delay band.
delayDF = spark.sql("select concat(ORIGIN, '-', DEST) as OD, avg_dep_delay from OD_Group order by avg_dep_delay desc").toPandas()

x, y = delayDF.OD.to_numpy(),  delayDF.avg_dep_delay.to_numpy()

# (exclusive upper bound in minutes, colour); the first band whose bound
# exceeds the delay wins, anything at or above the last bound falls through
# to berkeley_blue.
delay_bands = [(5, berkeley_palette['lawrence']),
               (10, berkeley_palette['founders_rock']),
               (15, berkeley_palette['pacific'])]
marker_colors = [next((band_color for upper, band_color in delay_bands if d < upper),
                      berkeley_palette['berkeley_blue'])
                 for d in y]

fig = go.Figure(data=[go.Bar(x=x, y=y, hovertext=y)])
fig.update_traces(marker_color=marker_colors, marker_line_color=marker_colors,
                  marker_line_width=1.5, opacity=0.7)

# Band call-outs. The x/y anchors are hand-placed for the current contents of
# OD_Group and need re-tuning if the number of routes changes materially.
band_callouts = [(700, 17, "Departure Delay > 15 minutes"),
                 (2300, 14, "Departure Delay between 10 and 15 minutes"),
                 (4500, 10, "Departure Delay between 5 and 10 minutes"),
                 (6300, 5, "Departure Delay < 5 minutes")]
for callout_x, callout_y, callout_text in band_callouts:
    fig.add_annotation(x=callout_x, y=callout_y,
                text=callout_text,
                showarrow=True,
                arrowhead=1)

# Axis title fonts are set through `title.font`; the flat `titlefont`
# attribute is deprecated in Plotly.
fig.update_layout(title_text='Average Departure Delay by Route', title_font_size = 24,
                  height=500, width=1400,
                  yaxis=dict(title=dict(text='Average Departure Delay (min)', font=dict(size=16)),
                             tickfont=dict(size=14), color=berkeley_palette['berkeley_blue']),
                  xaxis=dict(title=dict(text='Route (Origin + Destination)', font=dict(size=16)),
                             color=berkeley_palette['berkeley_blue']),
                  plot_bgcolor='rgba(0,0,0,0)')
fig.show()

In [0]:
# Horizontal stacked bars: one bar per calendar year, split into the share of
# on-time flights and the share of flights whose departure delay exceeded
# 15 minutes. The query itself filters out cancelled flights, departures more
# than 15 minutes early, and a fixed list of carriers and airports.
df = ks.sql("""
  with cte as (
    SELECT YEAR, SUM(IF(DEP_DELAY > 15, 1, 0)) as TOTAL_DELAYS, COUNT(*) as TOTAL_FLIGHTS
    FROM us_delay_flights_tbl 
    WHERE DEP_DELAY >= -15 and CANCELLED = 0.000 
    and OP_CARRIER NOT IN ('US', 'VX') 
    and ORIGIN NOT IN ('CEC', 'CLD', 'ILG', 'DIK', 'EFD', 'ENV', 'TKI', 'UST', 'FNL', 'YNG', 'IFP', 'FLO') 
    and DEST NOT IN ('CEC', 'CLD', 'ILG', 'DIK', 'EFD', 'ENV', 'TKI', 'UST', 'FNL', 'YNG', 'IFP', 'FLO') 
    GROUP BY YEAR
  )

  select YEAR, (1 - (TOTAL_DELAYS / TOTAL_FLIGHTS)) as OntimeRatio, (TOTAL_DELAYS / TOTAL_FLIGHTS) as DelayRatio
  from cte
  order by YEAR""").to_koalas()

top_labels = ['On-Time Flights', 'Delayed Flights']
colors = [berkeley_palette['pacific'], berkeley_palette['rose_garden']]

# Collect the (small) result to the driver once and slice it locally, instead
# of converting the Koalas frame to numpy separately for each slice.
yearly_ratios = df.to_numpy()
x_data = yearly_ratios[:,1:3]   # columns: OntimeRatio, DelayRatio
y_data = yearly_ratios[:,0]     # column: YEAR

fig = go.Figure()

# One single-segment trace per (ratio column, year); barmode='stack' places the
# segments of the same year side by side. `shape[1]` is used rather than
# len(x_data[0]) so an empty result does not raise.
for i in range(x_data.shape[1]):
    for xd, yd in zip(x_data, y_data):
        fig.add_trace(go.Bar(
            x=[xd[i]], y=[yd],
            orientation='h',
            marker=dict(
                color=colors[i],
                line=dict(color='rgb(248, 248, 249)', width=1)
            )
        ))

fig.update_layout(
    xaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
        domain=[0.15, 1]
    ),
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=False,
        zeroline=False,
    ),
    barmode='stack',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    margin=dict(l=120, r=10, t=140, b=80),
    showlegend=False,
)

# Axis ticks are hidden above, so every label on the chart is an annotation.
header_font = dict(family='Arial', size=16, color=berkeley_palette['berkeley_blue'])
segment_font = dict(family='Arial', size=14, color='rgb(248, 248, 255)')

annotations = []

for yd, xd in zip(y_data, x_data):
    # Year label, right-aligned in the margin left of the bar.
    annotations.append(dict(xref='paper', yref='y',
                            x=0.14, y=yd,
                            xanchor='right',
                            text=str(int(yd)),
                            font=header_font,
                            showarrow=False, align='right'))
    # `space` is the running left edge of the current segment within the bar.
    space = 0
    for i in range(len(xd)):
        segment_mid = space + (xd[i] / 2)
        # Percentage label centred inside the segment.
        annotations.append(dict(xref='x', yref='y',
                                x=segment_mid, y=yd,
                                text = "{:.2%}".format(xd[i]),
                                font=segment_font,
                                showarrow=False))
        # Column header above the plot, aligned to the segments of the last year.
        if yd == y_data[-1]:
            annotations.append(dict(xref='x', yref='paper',
                                    x=segment_mid, y=1.1,
                                    text=top_labels[i],
                                    font=header_font,
                                    showarrow=False))
        space += xd[i]

# The axis title font is set through `title.font`; the flat `titlefont`
# attribute is deprecated in Plotly.
fig.update_layout(annotations=annotations, title_text='Ratio of On-Time and Delayed Flights by Year', title_font_size = 24,
                  height=500, width=1400,
                  yaxis=dict(title=dict(text='Calendar Year', standoff=60, font=dict(size=20)),
                             tickfont=dict(size=14), color=berkeley_palette['berkeley_blue']),
                  plot_bgcolor='rgba(0,0,0,0)')

fig.show()

In [0]:
%sql

-- Invalidate Spark's cached entries for flights_all_v5 so the queries in the
-- following cells read the table's current contents.
REFRESH TABLE flights_all_v5

In [0]:
# For every tail number: of its flights with DEP_DEL15 == 1, the fraction whose
# DEP_DEL15_PREV is also 1 (FRAC_SAME), sorted from highest to lowest.
# Because the two sub-queries are inner-joined, tail numbers with no such
# repeat-delay rows do not appear at all.
delayDF = spark.sql('''SELECT t1.TAIL_NUM, COUNT/COUNT_TOTAL FRAC_SAME FROM
(SELECT TAIL_NUM,COUNT(*) COUNT 
FROM flights_all_v5 
WHERE DEP_DEL15 == DEP_DEL15_PREV AND DEP_DEL15 == 1 
GROUP BY TAIL_NUM) t1
JOIN
(SELECT TAIL_NUM,COUNT(*) COUNT_TOTAL
FROM flights_all_v5 
WHERE DEP_DEL15 == 1 
GROUP BY TAIL_NUM) t2
ON t1.TAIL_NUM = t2.TAIL_NUM
ORDER BY FRAC_SAME DESC''').toPandas()

x, y = delayDF.TAIL_NUM.to_numpy(),  delayDF.FRAC_SAME.to_numpy()

# (exclusive upper bound of the ratio, colour); the first band whose bound
# exceeds the ratio wins, anything at or above the last bound falls through
# to berkeley_blue.
ratio_bands = [(0.3, berkeley_palette['lawrence']),
               (0.4, berkeley_palette['founders_rock']),
               (0.5, berkeley_palette['pacific'])]
marker_colors = [next((band_color for upper, band_color in ratio_bands if d < upper),
                      berkeley_palette['berkeley_blue'])
                 for d in y]

fig = go.Figure(data=[go.Bar(x=x, y=y, hovertext=y)])
fig.update_traces(marker_color=marker_colors, marker_line_color=marker_colors,
                  marker_line_width=1.5, opacity=0.7)

# Band call-outs. The x/y anchors are hand-placed for the current contents of
# flights_all_v5 and need re-tuning if the number of tail numbers changes.
band_callouts = [(450, 0.55, "Ratio prev. flight<br> delays > 0.5"),
                 (2300, 0.45, "Ratio prev. flight<br> delays > 0.4"),
                 (6000, 0.4, "Ratio prev. flight<br> delays > 0.3"),
                 (7500, 0.3, "Ratio prev. flight<br> delays < 0.3")]
for callout_x, callout_y, callout_text in band_callouts:
    fig.add_annotation(x=callout_x, y=callout_y,
                text=callout_text,
                showarrow=True,
                arrowhead=1)

# Axis title fonts are set through `title.font`; the flat `titlefont`
# attribute is deprecated in Plotly.
fig.update_layout(title_text='Delayed Flights with Previous Flight Also Delayed, by Tail Number', title_font_size = 24,
                  height=500, width=1400,
                  yaxis=dict(title=dict(text='Ratio of Delayed Previous Flight', font=dict(size=16)),
                             tickfont=dict(size=14), color=berkeley_palette['berkeley_blue']),
                  xaxis=dict(title=dict(text='Tail Number', font=dict(size=16)),
                             color=berkeley_palette['berkeley_blue']),
                  plot_bgcolor='rgba(0,0,0,0)')
fig.show()

In [0]:
# Mean of DEP_DEL15 per ORIGIN, computed over the first 20,000,000 rows of
# flights_all_v5 when ordered by FL_DATE and CRS_DEP_TIME, collected to pandas.
# NOTE(review): DEP_DEL15 is presumably a 0/1 delay flag, which would make
# avg_del the share of delayed departures - confirm against the table schema.
pct_table_dep_time = (
    spark
    .sql('''SELECT ORIGIN, AVG(DEP_DEL15) avg_del FROM (SELECT * FROM flights_all_v5 ORDER BY FL_DATE, CRS_DEP_TIME LIMIT 20000000) GROUP BY ORIGIN''')
    .toPandas()
)

In [0]:
# Filled area chart of avg_del per origin airport, highest value first.
# Re-sorting under the same name is idempotent, so the cell is safe to re-run.
pct_table_dep_time = pct_table_dep_time.sort_values('avg_del', ascending = False)
fig = go.Figure(data=[go.Scatter(x=pct_table_dep_time.ORIGIN, y=pct_table_dep_time.avg_del, fill='tozeroy', name='All Airports', line_color=berkeley_palette['founders_rock'])])

# Title fonts use `title_font_size` / `title.font` like the other figures in
# this notebook; the flat `titlefont` attribute is deprecated in Plotly.
fig.update_layout(title_text="Total Delayed Flights by Origin", title_font_size=24, height=300, width=1000,
                  yaxis=dict(title=dict(text="Ratio of Delayed Flights", font=dict(size=16)),
                             tickfont=dict(size=12), color=berkeley_palette['berkeley_blue']),
                  xaxis=dict(title=dict(text="", font=dict(size=16)),
                             color=berkeley_palette['berkeley_blue']),
                  plot_bgcolor='rgba(0,0,0,0)')

fig.show()