# Flight Data Clean Up

## Load source data into a single dataframe to explore

In [2]:
# Import Dependencies
import pandas as pd
from pathlib import Path
import glob
import zipfile
import os
import shutil

In [3]:
# Source archives: one zip per quarter of 2023
zips = ['Resources/BTS/BTS_Q1_2023.zip','Resources/BTS/BTS_Q2_2023.zip',
       'Resources/BTS/BTS_Q3_2023.zip','Resources/BTS/BTS_Q4_2023.zip']

# Directory to extract to; created if it does not already exist
extraction_path = 'Resources/BTS/extracted_zips'
os.makedirs(extraction_path, exist_ok=True)

# Extract every archive into the shared directory.
# The loop variable is deliberately not named `zip`: that would shadow the
# builtin zip() for every cell that runs afterwards in this kernel.
for source_zip in zips:
    with zipfile.ZipFile(source_zip, 'r') as zip_ref:
        zip_ref.extractall(extraction_path)

    print(f"Extracted {source_zip} to {extraction_path}")

Extracted Resources/BTS/BTS_Q1_2023.zip to Resources/BTS/extracted_zips
Extracted Resources/BTS/BTS_Q2_2023.zip to Resources/BTS/extracted_zips
Extracted Resources/BTS/BTS_Q3_2023.zip to Resources/BTS/extracted_zips
Extracted Resources/BTS/BTS_Q4_2023.zip to Resources/BTS/extracted_zips


In [4]:
# Read all extracted CSVs
# glob collects every CSV in a directory: https://docs.python.org/3/library/glob.html
# Its result order depends on the file system, so sort the paths to keep the
# row order of the combined frame reproducible between runs and machines.
csv_files = sorted(glob.glob(f"{extraction_path}/*.csv"))

# Fail with a clear message instead of pd.concat's "No objects to concatenate"
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {extraction_path}")

# Empty list for storing DataFrames
dataframes = []

# Iterate through the CSVs and store each one in the DataFrames list
for csv_file in csv_files:
    # low_memory=False reads each file in one pass rather than in chunks
    df = pd.read_csv(csv_file, low_memory=False)
    dataframes.append(df)

# Concatenate into a single DataFrame with a fresh 0..n-1 index
unfiltered_flight_info = pd.concat(dataframes, ignore_index = True)
pd.set_option('display.max_columns', None)
unfiltered_flight_info.head()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Marketing_Airline_Network,Operated_or_Branded_Code_Share_Partners,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,Flight_Number_Marketing_Airline,Originally_Scheduled_Code_Share_Airline,DOT_ID_Originally_Scheduled_Code_Share_Airline,IATA_Code_Originally_Scheduled_Code_Share_Airline,Flight_Num_Originally_Scheduled_Code_Share_Airline,Operating_Airline,DOT_ID_Operating_Airline,IATA_Code_Operating_Airline,Tail_Number,Flight_Number_Operating_Airline,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,Origin,OriginCityName,OriginState,OriginStateFips,OriginStateName,OriginWac,DestAirportID,DestAirportSeqID,DestCityMarketID,Dest,DestCityName,DestState,DestStateFips,DestStateName,DestWac,CRSDepTime,DepTime,DepDelay,DepDelayMinutes,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDelayMinutes,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,Cancelled,CancellationCode,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,FirstDepTime,TotalAddGTime,LongestAddGTime,DivAirportLandings,DivReachedDest,DivActualElapsedTime,DivArrDelay,DivDistance,Div1Airport,Div1AirportID,Div1AirportSeqID,Div1WheelsOn,Div1TotalGTime,Div1LongestGTime,Div1WheelsOff,Div1TailNum,Div2Airport,Div2AirportID,Div2AirportSeqID,Div2WheelsOn,Div2TotalGTime,Div2LongestGTime,Div2WheelsOff,Div2TailNum,Div3Airport,Div3AirportID,Div3AirportSeqID,Div3WheelsOn,Div3TotalGTime,Div3LongestGTime,Div3WheelsOff,Div3TailNum,Div4Airport,Div4AirportID,Div4AirportSeqID,Div4WheelsOn,Div4TotalGTime,Div4LongestGTime,Div4WheelsOff,Div4TailNum,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum,Duplicate,Unnamed: 119
0,2023,1,1,22,7,2023-01-22,B6,B6,20409,B6,1447,,,,,B6,20409,B6,N636JB,1447,12197,1219702,31703,HPN,"White Plains, NY",NY,36,New York,22,15304,1530402,33195,TPA,"Tampa, FL",FL,12,Florida,33,800,756.0,-4.0,0.0,0.0,-1.0,0800-0859,12.0,808.0,1053.0,5.0,1102,1058.0,-4.0,0.0,0.0,-1.0,1100-1159,0.0,,0.0,182.0,182.0,165.0,1.0,1032.0,5,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,
1,2023,1,1,22,7,2023-01-22,B6,B6,20409,B6,1451,,,,,B6,20409,B6,N3112J,1451,10721,1072102,30721,BOS,"Boston, MA",MA,25,Massachusetts,13,13204,1320402,31454,MCO,"Orlando, FL",FL,12,Florida,33,1955,2048.0,53.0,53.0,1.0,3.0,1900-1959,16.0,2104.0,6.0,10.0,2322,16.0,54.0,54.0,1.0,3.0,2300-2359,0.0,,0.0,207.0,208.0,182.0,1.0,1121.0,5,39.0,0.0,1.0,0.0,14.0,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,
2,2023,1,1,22,7,2023-01-22,B6,B6,20409,B6,1453,,,,,B6,20409,B6,N507JT,1453,11697,1169706,32467,FLL,"Fort Lauderdale, FL",FL,12,Florida,33,14843,1484306,34819,SJU,"San Juan, PR",PR,72,Puerto Rico,3,1727,1734.0,7.0,7.0,0.0,0.0,1700-1759,25.0,1759.0,2110.0,7.0,2054,2117.0,23.0,23.0,1.0,1.0,2000-2059,0.0,,0.0,147.0,163.0,131.0,1.0,1046.0,5,7.0,0.0,16.0,0.0,0.0,1713.0,6.0,6.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,
3,2023,1,1,22,7,2023-01-22,B6,B6,20409,B6,1454,,,,,B6,20409,B6,N956JT,1454,14843,1484306,34819,SJU,"San Juan, PR",PR,72,Puerto Rico,3,11697,1169706,32467,FLL,"Fort Lauderdale, FL",FL,12,Florida,33,1140,1147.0,7.0,7.0,0.0,0.0,1100-1159,15.0,1202.0,1323.0,5.0,1328,1328.0,0.0,0.0,0.0,0.0,1300-1359,0.0,,0.0,168.0,161.0,141.0,1.0,1046.0,5,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,
4,2023,1,1,22,7,2023-01-22,B6,B6,20409,B6,1455,,,,,B6,20409,B6,N249JB,1455,10721,1072102,30721,BOS,"Boston, MA",MA,25,Massachusetts,13,11278,1127805,30852,DCA,"Washington, DC",VA,51,Virginia,38,700,652.0,-8.0,0.0,0.0,-1.0,0700-0759,16.0,708.0,831.0,4.0,852,835.0,-17.0,0.0,0.0,-2.0,0800-0859,0.0,,0.0,112.0,103.0,83.0,1.0,399.0,2,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,


In [15]:
# Show every column when a frame is displayed
pd.set_option('display.max_columns', None)

# Columns carried forward from the raw data into the working flights frame.
# NOTE(review): 'Operating_Airline ' is spelled with a trailing space; presumably
# that matches the header in the source CSVs - confirm before normalising it.
flight_columns = [
    'FlightDate',
    'IATA_Code_Marketing_Airline',
    'Operated_or_Branded_Code_Share_Partners',
    'Operating_Airline ',
    'Flight_Number_Marketing_Airline',
    'Origin',
    'Dest',
    'DepDelay',
    'DepDelayMinutes',
    'DepDel15',
    'ArrDelay',
    'ArrDelayMinutes',
    'Cancelled',
    'Diverted',
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay',
]

flights_df = pd.DataFrame(unfiltered_flight_info[flight_columns])
flights_df.head()

Unnamed: 0,FlightDate,IATA_Code_Marketing_Airline,Operated_or_Branded_Code_Share_Partners,Operating_Airline,Flight_Number_Marketing_Airline,Origin,Dest,DepDelay,DepDelayMinutes,DepDel15,ArrDelay,ArrDelayMinutes,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2023-01-22,B6,B6,B6,1447,HPN,TPA,-4.0,0.0,0.0,-4.0,0.0,0.0,0.0,,,,,
1,2023-01-22,B6,B6,B6,1451,BOS,MCO,53.0,53.0,1.0,54.0,54.0,0.0,0.0,39.0,0.0,1.0,0.0,14.0
2,2023-01-22,B6,B6,B6,1453,FLL,SJU,7.0,7.0,0.0,23.0,23.0,0.0,0.0,7.0,0.0,16.0,0.0,0.0
3,2023-01-22,B6,B6,B6,1454,SJU,FLL,7.0,7.0,0.0,0.0,0.0,0.0,0.0,,,,,
4,2023-01-22,B6,B6,B6,1455,BOS,DCA,-8.0,0.0,0.0,-17.0,0.0,0.0,0.0,,,,,


In [16]:
# Remove the extraction directory together with everything inside it;
# an OSError is reported rather than raised so the notebook keeps running.
try:
    shutil.rmtree(extraction_path)
except OSError as e:
    print(f"Error: {e.strerror}")
else:
    print(f"Directory {extraction_path} and all its contents have been deleted.")

Directory Resources/BTS/extracted_zips and all its contents have been deleted.


## Explore and filter flights and airports to matching data

In [17]:
# Origin airport code and city for every flight, then the count of distinct origin codes
origin_columns = ['Origin', 'OriginCityName']
origins_df = unfiltered_flight_info[origin_columns]
origins_df.Origin.nunique()

359

In [18]:
# Destination airport code and city for every flight, then the count of distinct destination codes
destination_columns = ['Dest', 'DestCityName']
destinations_df = unfiltered_flight_info[destination_columns]
destinations_df.Dest.nunique()

359

In [19]:
# Total number of unique airports in the data:
# compare the origin airports to the destination airports.
# ori_match  - the row's origin also appears somewhere as a destination
# dest_match - the row's destination also appears somewhere as an origin
ori_match = origins_df['Origin'].isin(destinations_df['Dest'])
dest_match = destinations_df['Dest'].isin(origins_df['Origin'])

# A row is kept only when BOTH of its airports are listed in both columns,
# so the two masks are combined with AND (an OR would also keep rows where
# just one of the two airports has a counterpart).
ori_dest_match = ori_match & dest_match

# The masks come from unfiltered_flight_info; flights_df is a column subset of
# it with the same index, so the boolean mask lines up row for row.
flight_airports = flights_df[ori_dest_match]
flight_airports['Origin'].nunique()
# confirmed that the same 359 airports have travel data

359

In [20]:
# Load the cleaned US airport list and flag the airports whose IATA code
# shows up as an origin in the flight data
us_airports = pd.read_csv('Resources/us-airports-cleaned.csv')
iata_match = us_airports['iata_code'].isin(flight_airports['Origin'])

# Keep only the flagged airports; reset_index() moves the old index into an 'index' column
us_airports_filtered = us_airports.loc[iata_match].reset_index()
us_airports_filtered.iata_code.nunique()

351

In [43]:
# Find the airports that appear in the flight data but not in our airport list.
# iata_match is True for flights whose origin IS in the airport list.
iata_match = flight_airports['Origin'].isin(us_airports_filtered['iata_code'])

# Flights departing from an unlisted airport, and the distinct codes involved
us_airports_missing = flight_airports.loc[~iata_match]
exclude_airports = us_airports_missing['Origin'].unique()
exclude_airports

array(['SJU', 'STT', 'BQN', 'PSE', 'STX', 'PPG', 'GUM', 'SPN'],
      dtype=object)

Research (Google):
SJU, BQN, PSE = Puerto Rico
STT, STX = U.S. Virgin Islands
PPG = American Samoa (unincorporated territory)
GUM = Guam
SPN = Northern Mariana Islands

In [22]:
# Exclude every flight that touches one of the 8 airports missing from our
# airport list. A flight is dropped when the missing airport is its origin
# OR its destination; filtering on Origin alone would leave flights such as
# FLL -> SJU in the data with a destination that has no airport record.
# exclude_airports holds the distinct codes, so isin() checks a handful of
# values rather than the full Origin column of us_airports_missing.
origin_excluded = flights_df['Origin'].isin(exclude_airports)
dest_excluded = flights_df['Dest'].isin(exclude_airports)
exclusions = origin_excluded | dest_excluded

filtered_flights = flights_df[~exclusions]
filtered_flights['Origin'].nunique()

351

In [23]:
# Keep only the airports that still have flights after the exclusions above
airport_mask = us_airports['iata_code'].isin(filtered_flights['Origin'])

# Wrap the selection in a new DataFrame object (replaces the earlier us_airports_filtered)
us_airports_filtered = pd.DataFrame(us_airports.loc[airport_mask])
us_airports_filtered.iata_code.nunique()

351

In [24]:
# How many of the remaining airports fall into each airport type
airport_type_counts = us_airports_filtered['type'].value_counts()
airport_type_counts

type
medium_airport    276
large_airport      65
small_airport      10
Name: count, dtype: int64

In [25]:
# Write both filtered data sets to CSV.
# index=True is the pandas default, spelled out here: the DataFrame index is
# written as an unnamed leading column in each file.
filtered_flights.to_csv('Resources/filtered_flights.csv', index=True)
us_airports_filtered.to_csv('Resources/us_airports_filtered.csv', index=True)

In [26]:
# Zip filtered_flights.csv and delete the CSV to allow GitHub upload
csv_path = 'Resources/filtered_flights.csv'
zip_path = 'Resources/filtered_flights.zip'
archive_name = os.path.basename(csv_path)

# Store the CSV in the archive under its bare file name (no directory part)
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(csv_path, archive_name)

# Only delete the CSV once the archive is known to hold all of it
if os.path.exists(zip_path):
    # Compare the uncompressed size recorded in the archive with the CSV on
    # disk. This replaces an arbitrary "zip is bigger than 50 bytes" check,
    # which said nothing about whether the CSV was actually archived.
    with zipfile.ZipFile(zip_path, 'r') as zipf:
        archived_size = zipf.getinfo(archive_name).file_size
    csv_size = os.path.getsize(csv_path)

    if archived_size > 0 and archived_size == csv_size:
        os.remove(csv_path)
        print(f"The file {csv_path} has been zipped, is not empty, and has been deleted.")
    else:
        print(f"The file {zip_path} is empty or incomplete; {csv_path} was not deleted.")
else:
    print("Error: The zip file was not created successfully.")

The file Resources/filtered_flights.csv has been zipped, is not empty, and has been deleted.


## Identify Airlines

In [27]:
# Distinct marketing-airline IATA codes, most frequent first
# (value_counts sorts by descending count)
marketing_airline_counts = filtered_flights['IATA_Code_Marketing_Airline'].value_counts()
airline_IATA_codes = marketing_airline_counts.index.tolist()
airline_IATA_codes

['AA', 'DL', 'WN', 'UA', 'AS', 'B6', 'NK', 'F9', 'G4', 'HA']

In [28]:
# Airline name for each IATA code
# Source: https://www.iata.org/en/publications/directories/code-search/
# The names are keyed by code rather than kept in a second positional list:
# airline_IATA_codes is ordered by flight frequency, so a positional list
# would silently mislabel airlines whenever that ordering changes.
airline_name_by_code = {
    'AA': 'American Airlines',
    'DL': 'Delta Airlines',
    'WN': 'Southwest Airlines',
    'UA': 'United Airlines',
    'AS': 'Alaska Airlines',
    'B6': 'JetBlue',
    'NK': 'Spirit Airlines',
    'F9': 'Frontier Airlines',
    'G4': 'Allegiant Air',
    'HA': 'Hawaiian Airlines',
}

# Names in the same order as airline_IATA_codes; a code missing from the
# lookup raises KeyError so an unknown airline is noticed, not skipped.
airline_names = [airline_name_by_code[code] for code in airline_IATA_codes]

# Build dictionary of IATA codes and airline names to turn into a data frame
airline_data = {'airline_IATA_code':airline_IATA_codes,
               'airline_name':airline_names}

In [29]:
# Turn the code/name dictionary into a two-column DataFrame (one row per airline)
airline_df = pd.DataFrame.from_dict(airline_data)
airline_df

Unnamed: 0,airline_IATA_code,airline_name
0,AA,American Airlines
1,DL,Delta Airlines
2,WN,Southwest Airlines
3,UA,United Airlines
4,AS,Alaska Airlines
5,B6,JetBlue
6,NK,Spirit Airlines
7,F9,Frontier Airlines
8,G4,Allegiant Air
9,HA,Hawaiian Airlines


In [31]:
# Save the airline lookup table; index=True is the pandas default, spelled out here
airline_df.to_csv('Resources/airlines.csv', index=True)

## Get city of airport from the flight data and insert into airport information for SQL

In [55]:
# Peek at the first airport row to see which columns are available
us_airports_filtered.iloc[:1]

Unnamed: 0.1,Unnamed: 0,iata_code,type,name,latitude_deg,longitude_deg,elevation_ft,region_name,local_region
0,0,LAX,large_airport,Los Angeles International Airport,33.942501,-118.407997,125.0,California,CA


In [57]:
# Reduce the destination rows to one per distinct (airport code, city name) pair
dest_key_columns = ['Dest', 'DestCityName']
dest_filtered = destinations_df.drop_duplicates(subset=dest_key_columns)

# Non-null count per column
dest_filtered.count()

Dest            359
DestCityName    359
dtype: int64

In [61]:
# Attach the city name from the flight data to each airport record.
# Inner join: only airports whose iata_code matches a destination code are kept.
airport_cities = pd.merge(
    us_airports_filtered,
    dest_filtered,
    how='inner',
    left_on='iata_code',
    right_on='Dest',
)

# Preview only - the re-indexed frame is displayed, not assigned back
airport_cities.reset_index(drop=True).head()

Unnamed: 0.1,Unnamed: 0,iata_code,type,name,latitude_deg,longitude_deg,elevation_ft,region_name,local_region,Dest,DestCityName
0,0,LAX,large_airport,Los Angeles International Airport,33.942501,-118.407997,125.0,California,CA,LAX,"Los Angeles, CA"
1,1,ORD,large_airport,Chicago O'Hare International Airport,41.9786,-87.9048,680.0,Illinois,IL,ORD,"Chicago, IL"
2,2,JFK,large_airport,John F Kennedy International Airport,40.639447,-73.779317,13.0,New York,NY,JFK,"New York, NY"
3,3,ATL,large_airport,Hartsfield Jackson Atlanta International Airport,33.6367,-84.428101,1026.0,Georgia,GA,ATL,"Atlanta, GA"
4,4,SFO,large_airport,San Francisco International Airport,37.618999,-122.375,13.0,California,CA,SFO,"San Francisco, CA"


## Database Prep - dtypes, astype, rename