# Flight Delays: Web App

Here we will prepare data for visualizations in the web app. Because we have over 14,000 records, it will be useful to prep and save some of the data in a smaller form by stripping it down to the bare essentials we're trying to visualize.

In [1]:
import pandas as pd
import glob
import os
import requests
import json
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import sqlite3 as db
import datetime
from pytz import timezone
import pytz

from datetime import datetime, timedelta

pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

import pandas as pd
import glob
import os
import requests
import json
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix,\
precision_recall_fscore_support, f1_score, plot_confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV,\
cross_validate, cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
import random
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import xgboost as xgb

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay

from keras import regularizers
from keras import models
from keras import layers
import lightgbm
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.utils import class_weight
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight

import keras
from keras.models import Sequential
from keras.layers import Dense

import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)
from scipy import stats
import pickle

### Business Goals

Here we want to achieve the following:

1. Prepare data for visualizations by date
2. Prepare data for visualizations by airline

## Importing the Data
Like before, we bring the cleaned data in.

In [2]:
# Load the cleaned flight data from the prepared-data folder.
df = pd.read_csv(filepath_or_buffer='data/prepared/cleaned_data.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## Creating a flight lookup database

In [3]:
# Some flights happen multiple times per day, so we build a key that combines
# the departure time (CRS_DEP_TIME) with the flight number to identify each
# exact flight.
departure_time_text = df['CRS_DEP_TIME'].astype(str)
df['exact-flight'] = departure_time_text + df['flight-number']

In [4]:
# Keep only the first record seen for each exact-flight key.
unique_flight_records = df.drop_duplicates(subset=['exact-flight'], keep='first')

In [5]:
# Columns that are not needed in the flight lookup table: date parts, delay
# measurements, congestion and weather features, and their join keys.
cols_to_drop = [
    'YEAR',
    'MONTH',
    'DAY_OF_MONTH',
    'DAY_OF_WEEK',
    'TAIL_NUM',
    'DEP_DELAY',
    'DEP_DELAY_NEW',
    'ARR_DELAY_NEW',
    'FL_DATE_LOCAL',
    'FL_ARR_DATE_LOCAL',
    'ARR_DAY_OF_WEEK',
    'takeoff-congestion-key',
    'arrival-congestion-key',
    'congestion-key',
    'avg-takeoff-congestion',
    'avg-arrival-congestion',
    'dest-congestion-key',
    'dest-avg-takeoff-congestion',
    'dest-avg-arrival-congestion',
    'weather-key',
    'dest-weather-key',
    'lat-long',
    'maxtemp',
    'mintemp',
    'avgtemp',
    'totalprecip',
    'avgvis',
    'maxwind',
    'avghumidity',
    'dest-maxtemp',
    'dest-mintemp',
    'dest-avgtemp',
    'dest-totalprecip',
    'dest-avgvis',
    'dest-maxwind',
    'dest-avghumidity',
    'days-from-specific-holiday',
]

# Rebind to the result instead of dropping in place: the frame was derived
# from `df`, and mutating it in place raises pandas' SettingWithCopyWarning.
unique_flight_records = unique_flight_records.drop(columns=cols_to_drop)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [6]:
# Write the flight lookup table to disk without the index column.
unique_flight_records.to_csv(
    path_or_buf='data/prepared/unique_flight_number_data.csv',
    index=False,
)

In [2]:
# Load the data set prepared for graphing; this replaces the earlier `df`.
df = pd.read_csv(filepath_or_buffer='data/prepared/data_for_graphing.csv')

In [3]:
# Airport codes whose departures are kept for the web app.
relevant_airports = ['ATL', 'DFW', 'DEN', 'ORD', 'LAX', 'CLT', 'LAS', 'PHX',
                     'MCO', 'SEA', 'MIA', 'IAH', 'JFK', 'FLL', 'EWR', 'SFO', 'MSP', 'DTW',
                     'BOS', 'SLC', 'PHL', 'BWI', 'TPA', 'SAN', 'MDW', 'LGA', 'BNA', 'IAD',
                     'DAL', 'DCA', 'PDX', 'AUS', 'HOU', 'HNL', 'STL', 'RSW', 'SMF', 'MSY',
                     'SJU', 'RDU', 'OAK', 'MCI', 'CLE', 'IND', 'SAT', 'SNA', 'PIT', 'CVG',
                     'CMH', 'PBI', 'JAX', 'MKE', 'ONT', 'ANC', 'BDL', 'OGG', 'OMA', 'MEM',
                     'BOI', 'RNO', 'CHS', 'OKC']

# Regex alternation of the codes; retained so any cell that still references
# `airport_filter` keeps working.
airport_filter = '|'.join(relevant_airports)

# Use exact membership rather than a regex substring search: `isin` only
# matches whole codes, treats missing ORIGIN values as "not relevant" instead
# of producing a NaN mask, and avoids regex matching altogether.
# `.copy()` makes the filtered frame independent of the original so that the
# column assignments in later cells do not write to a slice.
df = df[df['ORIGIN'].isin(relevant_airports)].copy()

## Grouping Data by Day

To group by day, we'll need to create a datetime field to group by first.

In [4]:
# Label each flight by whether its ARR_DELAY_NEW value exceeds 60.
is_severe = df['ARR_DELAY_NEW'].gt(60)
delay_labels = {True: 'Severe Delays', False: 'No Severe Delays'}
df['severe_delay'] = is_severe.map(delay_labels)

Next, we parse the rounded local flight timestamp and extract its hour as a zero-padded string, so that flights can be grouped by hour of the day.

In [5]:
# Parse the rounded timestamp once, store it back, and derive a two-character
# hour string from it.
rounded_timestamps = pd.to_datetime(df['FL_DATE_LOCAL_ROUNDED'])
df['FL_DATE_LOCAL_ROUNDED'] = rounded_timestamps
hour_text = rounded_timestamps.dt.hour.astype(str)
df['rounded-hour'] = hour_text.str.zfill(2)

In [6]:
# Reduce the local flight timestamp to a plain calendar date.
df['FL_DATE_LOCAL'] = pd.to_datetime(df['FL_DATE_LOCAL']).dt.date

In [7]:
# Count flights for every combination of date, hour, origin airport, holiday
# flag and delay label; the count lands in a `size` column.
grouping_columns = ['FL_DATE_LOCAL', 'rounded-hour', 'ORIGIN', 'holiday', 'severe_delay']
grouped_data = df.groupby(grouping_columns, as_index=False).size()

Let's preview our data

In [26]:
# Show the first five grouped rows.
grouped_data.head(n=5)

Unnamed: 0,FL_DATE_LOCAL,rounded-hour,ORIGIN,severe_delay,size
0,2021-06-01,0,ANC,No Severe Delays,2
1,2021-06-01,0,DEN,No Severe Delays,1
2,2021-06-01,0,FLL,No Severe Delays,1
3,2021-06-01,0,LAS,No Severe Delays,13
4,2021-06-01,0,LAX,No Severe Delays,10


We'll want the user to be able to graph data on 4 levels:
1. All delays
2. Delays by airport
3. Delays by airline
4. Delays by airline & airport

So we work to pivot our data and save out files for these 4 levels.

In [30]:
def _delay_rates_by(counts, group_column):
    """Pivot flight counts into per-group severe-delay totals and rates.

    Args:
        counts: DataFrame holding flight counts in a ``size`` column, together
            with ``ORIGIN``, ``severe_delay`` and ``group_column`` columns.
        group_column: Name of the column to break the counts down by,
            alongside ``ORIGIN``.

    Returns:
        DataFrame with one row per (``group_column``, ``ORIGIN``) pair, one
        count column per delay label, and a ``percent-delayed`` column holding
        the fraction of flights labelled ``'Severe Delays'``.
    """
    pivoted = pd.pivot_table(
        counts,
        values='size',
        index=[group_column, 'ORIGIN'],
        columns=['severe_delay'],
        # The string form avoids the deprecated practice of passing np.sum.
        aggfunc='sum',
        fill_value=0,
    ).reset_index()

    # If one delay label never occurs in the data, the pivot has no column for
    # it; add it as zeros so the rate calculation below cannot raise KeyError.
    for label in ('No Severe Delays', 'Severe Delays'):
        if label not in pivoted.columns:
            pivoted[label] = 0

    total_flights = pivoted['No Severe Delays'] + pivoted['Severe Delays']
    pivoted['percent-delayed'] = pivoted['Severe Delays'] / total_flights
    return pivoted


# Delays by airport (one row per date and origin airport)
df_by_airport = _delay_rates_by(grouped_data, 'FL_DATE_LOCAL')

# Delays by hour
df_by_hour = _delay_rates_by(grouped_data, 'rounded-hour')
# The hour was stored as a zero-padded string for grouping; convert it back to
# a number for plotting.
df_by_hour['rounded-hour'] = pd.to_numeric(df_by_hour['rounded-hour'])

# Delays by holiday
df_by_holiday = _delay_rates_by(grouped_data, 'holiday')

Now let's save these out to files.

In [43]:
# Write each summary table to its own CSV file, without the index column.
delay_tables = {
    'data/prepared/delays-by-airport.csv': df_by_airport,
    'data/prepared/delays-by-hour.csv': df_by_hour,
    'data/prepared/delays-by-holiday.csv': df_by_holiday,
}
for output_path, table in delay_tables.items():
    table.to_csv(output_path, index=False)

Next, we may want to see what a graph in our web app might look like when it simply plots severe delays at all airports and for all airlines.

Now let's look at delays at a specific airport.

In [21]:
# Daily share of severely delayed flights at a single airport.
# Filter once and hand the filtered frame to plotly so that x and y are
# columns of the same frame. The date column in `df_by_airport` is named
# 'FL_DATE_LOCAL'.
daily_airport = 'JFK'
daily_airport_delays = df_by_airport.loc[df_by_airport['ORIGIN'] == daily_airport]

fig = px.line(daily_airport_delays,
              x='FL_DATE_LOCAL',
              y='percent-delayed',
              labels={
                     "FL_DATE_LOCAL": "Date",
                     "percent-delayed": "Severe Delays"},
              title=f"Daily Severe Airport Delays at {daily_airport} Airport")

# Show plot, with a range slider under the x axis
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

In [38]:
# Share of severely delayed flights per hour of the day at a single airport.
# The airport code drives both the row filter and the title so the two cannot
# disagree, and the filtered frame is passed to plotly so that x and y are
# columns of the same frame.
hourly_airport = 'ATL'
hourly_airport_delays = df_by_hour.loc[df_by_hour['ORIGIN'] == hourly_airport]

px.bar(hourly_airport_delays,
       x='rounded-hour',
       y='percent-delayed',
       labels={"rounded-hour": "Hour of the Day",
               "percent-delayed": "Severe Delays"},
       title=f"Severe Delays by Hour of the Day at {hourly_airport} Airport")