# Flight Delays: Web App

In this notebook, we'll prepare data for use in our Dash web app.

### Business Goals

Here we want to achieve the following:

1. Prepare data for visualizations by date
2. Prepare data for visualizations by airline

In [2]:
import pandas as pd
import glob
import os
import requests
import json
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import sqlite3 as db
import datetime
from pytz import timezone
import pytz

from datetime import datetime, timedelta

pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

import pandas as pd
import glob
import os
import requests
import json
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix,\
precision_recall_fscore_support, f1_score, plot_confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV,\
cross_validate, cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
import random
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import xgboost as xgb

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay

from keras import regularizers
from keras import models
from keras import layers
import lightgbm
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.utils import class_weight
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight

import keras
from keras.models import Sequential
from keras.layers import Dense

import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)
from scipy import stats
import pickle

## Importing data

In [18]:
# Read the prepared graphing dataset from disk into a DataFrame.
carrier_data = pd.read_csv(
    filepath_or_buffer='data/prepared/data_for_graphing.csv',
)

In [19]:
# Airport codes to keep; rows are retained when their ORIGIN is one of these.
relevant_airports = ['ATL', 'DFW', 'DEN', 'ORD', 'LAX', 'CLT', 'LAS', 'PHX',
                     'MCO', 'SEA', 'MIA', 'IAH', 'JFK', 'FLL', 'EWR', 'SFO', 'MSP', 'DTW',
                     'BOS', 'SLC', 'PHL', 'BWI', 'TPA', 'SAN', 'MDW', 'LGA', 'BNA', 'IAD',
                     'DAL', 'DCA', 'PDX', 'AUS', 'HOU', 'HNL', 'STL', 'RSW', 'SMF', 'MSY',
                     'SJU', 'RDU', 'OAK', 'MCI', 'CLE', 'IND', 'SAT', 'SNA', 'PIT', 'CVG',
                     'CMH', 'PBI', 'JAX', 'MKE', 'ONT', 'ANC', 'BDL', 'OGG', 'OMA', 'MEM',
                     'BOI', 'RNO', 'CHS', 'OKC']

# Regex alternation of the codes. The filter below no longer uses it, but the
# name stays defined in case another cell references `airport_filter`.
airport_filter = '|'.join(relevant_airports)

# Exact membership test instead of `str.contains(airport_filter)`:
#   * `str.contains` performs a regex *substring* search, so any ORIGIN value
#     merely containing one of the codes would be kept;
#   * a missing ORIGIN makes `str.contains` yield NaN in the mask, which
#     boolean indexing rejects, whereas `isin` simply returns False.
# `.copy()` gives an independent frame, so the column assignments made in the
# following cells do not trigger SettingWithCopyWarning.
carrier_data = carrier_data[carrier_data['ORIGIN'].isin(relevant_airports)].copy()

In [20]:
# Route key of the form "<ORIGIN>-<DEST>", stored on every row.
route_key = carrier_data['ORIGIN'] + '-' + carrier_data['DEST']
carrier_data['airport-lookup-key'] = route_key

# One row per distinct route key (the first occurrence is the one retained).
airport_lookup = carrier_data.drop_duplicates(subset='airport-lookup-key', keep='first')

In [14]:
# Restrict the lookup table to the columns listed here, in this order.
lookup_columns = [
    'ORIGIN',
    'DEST',
    'airport-lookup-key',
    'origin-elevation',
    'dest-elevation',
    'DISTANCE',
    'dest-lat-long',
    'origin-lat-long',
    'origin-tz',
    'dest-tz',
]
airport_lookup = airport_lookup[lookup_columns]

In [18]:
# Persist the lookup table as CSV; the DataFrame index is not written.
airport_lookup.to_csv(
    path_or_buf='data/prepared/airport_lookup.csv',
    index=False,
)

## Grouping Data by Day

To group by day, we'll need to create a datetime field to group by first.

In [21]:
# Label each row by whether ARR_DELAY_NEW exceeds 60.
# Note: a missing ARR_DELAY_NEW compares as False under NumPy-backed float
# columns, so such rows end up labelled 'No Severe Delays'.
is_severe = carrier_data['ARR_DELAY_NEW'] > 60
carrier_data['severe_delay'] = is_severe.map(
    {True: 'Severe Delays', False: 'No Severe Delays'}
)

In [22]:
# Parse FL_DATE, then keep only the calendar date (plain `date` objects).
flight_timestamps = pd.to_datetime(carrier_data['FL_DATE'])
carrier_data['FL_DATE'] = flight_timestamps.dt.date

In [23]:
# Count rows for every observed combination of the keys below; with
# as_index=False the count is returned in a regular column named 'size'.
group_keys = [
    'FL_DATE',
    'ORIGIN',
    'holiday',
    'DAY_OF_WEEK',
    'takeoff-time-of-day',
    'severe_delay',
]
grouped_data = carrier_data.groupby(group_keys, as_index=False).size()

Let's preview our data.

In [24]:
# Show the first five grouped rows as the cell's output.
grouped_data.head(5)

Unnamed: 0,FL_DATE,ORIGIN,holiday,DAY_OF_WEEK,takeoff-time-of-day,severe_delay,size
0,2021-06-01,ANC,Not a Holiday,Tuesday,Early Afternoon,No Severe Delays,8
1,2021-06-01,ANC,Not a Holiday,Tuesday,Early Afternoon,Severe Delays,1
2,2021-06-01,ANC,Not a Holiday,Tuesday,Early Evening,No Severe Delays,4
3,2021-06-01,ANC,Not a Holiday,Tuesday,Early Morning,No Severe Delays,11
4,2021-06-01,ANC,Not a Holiday,Tuesday,Late Afternoon,No Severe Delays,4


We'll want the user to be able to graph the share of severely delayed flights for each origin airport along 3 dimensions:
1. Delays by date
2. Delays by holiday
3. Delays by day of week & takeoff time of day

So we pivot our data and save out one file for each of these 3 views.

In [25]:
# Delays by airport
df_by_airport = pd.pivot_table(grouped_data, values='size', index=['FL_DATE', 'ORIGIN'],
                    columns=['severe_delay'], aggfunc=np.sum, fill_value=0)
df_by_airport = df_by_airport.reset_index()
df_by_airport['percent-delayed'] = df_by_airport['Severe Delays'] / (df_by_airport['No Severe Delays'] + df_by_airport['Severe Delays'])

# Delays by holiday
df_by_holiday = pd.pivot_table(grouped_data, values='size', index=['holiday', 'ORIGIN'],
                    columns=['severe_delay'], aggfunc=np.sum, fill_value=0)
df_by_holiday = df_by_holiday.reset_index()
df_by_holiday['percent-delayed'] = df_by_holiday['Severe Delays'] / (df_by_holiday['No Severe Delays'] + df_by_holiday['Severe Delays'])

# Delays by holiday
df_by_timeofday_weekday = pd.pivot_table(grouped_data, values='size', index=['DAY_OF_WEEK', 'takeoff-time-of-day','ORIGIN'],
                    columns=['severe_delay'], aggfunc=np.sum, fill_value=0)
df_by_timeofday_weekday = df_by_timeofday_weekday.reset_index()
df_by_timeofday_weekday['percent-delayed'] =df_by_timeofday_weekday['Severe Delays'] / (df_by_timeofday_weekday['No Severe Delays'] + df_by_timeofday_weekday['Severe Delays'])

Now let's save these out to files.

In [27]:
# Write each pivoted table to its CSV file (index column omitted), in the
# same order as before: airport, holiday, time-of-day/weekday.
for frame, target in (
    (df_by_airport, 'data/prepared/delays-by-airport.csv'),
    (df_by_holiday, 'data/prepared/delays-by-holiday.csv'),
    (df_by_timeofday_weekday, 'data/prepared/df_by_timeofday_weekday.csv'),
):
    frame.to_csv(target, index=False)