# Flight Data Clean Up

## Load source data into a single dataframe to explore

In [1]:
# Import Dependencies
import pandas as pd
from pathlib import Path
import glob
import zipfile
import os
import shutil

In [2]:
# Quarterly BTS archives that make up the 2023 source data
zips = ['Resources/BTS/BTS_Q1_2023.zip','Resources/BTS/BTS_Q2_2023.zip',
       'Resources/BTS/BTS_Q3_2023.zip','Resources/BTS/BTS_Q4_2023.zip']

# Directory to extract to; create it if it does not exist yet
extraction_path = 'Resources/BTS/extracted_zips'
os.makedirs(extraction_path, exist_ok=True)

# Extract every archive into the shared extraction directory.
# The loop variable is named `source_zip` rather than `zip` so the builtin
# zip() is not shadowed for the rest of the notebook session.
for source_zip in zips:
    with zipfile.ZipFile(source_zip, 'r') as zip_ref:
        zip_ref.extractall(extraction_path)

    print(f"Extracted {source_zip} to {extraction_path}")

Extracted Resources/BTS/BTS_Q1_2023.zip to Resources/BTS/extracted_zips
Extracted Resources/BTS/BTS_Q2_2023.zip to Resources/BTS/extracted_zips
Extracted Resources/BTS/BTS_Q3_2023.zip to Resources/BTS/extracted_zips
Extracted Resources/BTS/BTS_Q4_2023.zip to Resources/BTS/extracted_zips


In [None]:
# Read all extracted CSVs
# glob collects every CSV in a directory: https://docs.python.org/3/library/glob.html
# glob makes no ordering guarantee, so sort the paths to get a reproducible
# row order (and therefore a reproducible index) in the combined DataFrame
csv_files = sorted(glob.glob(f"{extraction_path}/*.csv"))

# Fail early with a clear message; pd.concat on an empty list only reports
# "No objects to concatenate"
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {extraction_path}")

# Read each CSV into its own DataFrame
dataframes = [pd.read_csv(csv_file, low_memory=False) for csv_file in csv_files]

# Concatenate into a single DataFrame with a fresh 0..n-1 index
unfiltered_flight_info = pd.concat(dataframes, ignore_index=True)
unfiltered_flight_info.head()

In [None]:
# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Columns kept for the analysis.
# Note: 'Operating_Airline ' carries a trailing space in the source header.
flight_columns = [
    'FlightDate',
    'IATA_Code_Marketing_Airline',
    'Operated_or_Branded_Code_Share_Partners',
    'Operating_Airline ',
    'Flight_Number_Marketing_Airline',
    'Origin',
    'Dest',
    'DepDelay',
    'DepDelayMinutes',
    'DepDel15',
    'ArrDelay',
    'ArrDelayMinutes',
    'Cancelled',
    'Diverted',
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay',
]
flights_df = pd.DataFrame(unfiltered_flight_info[flight_columns])
flights_df.head()

In [None]:
# Remove the extraction directory together with everything inside it
try:
    shutil.rmtree(extraction_path)
except OSError as err:
    # Report the OS-level reason instead of stopping the notebook
    print(f"Error: {err.strerror}")
else:
    print(f"Directory {extraction_path} and all its contents have been deleted.")

## Explore and filter flights and airports to matching data

In [None]:
# Count the distinct origin airport codes in the data
origin_columns = ['Origin', 'OriginCityName']
origins_df = unfiltered_flight_info[origin_columns]
origins_df['Origin'].nunique()

In [None]:
# Count the distinct destination airport codes in the data
destination_columns = ['Dest', 'DestCityName']
destinations_df = unfiltered_flight_info[destination_columns]
destinations_df['Dest'].nunique()

In [None]:
# total number of unique airports in data
# compare the origin airports to destination airports:
#   ori_match  - the row's origin also appears somewhere as a destination
#   dest_match - the row's destination also appears somewhere as an origin
ori_match = origins_df['Origin'].isin(destinations_df['Dest'])
dest_match = destinations_df['Dest'].isin(origins_df['Origin'])
# Both conditions must hold. Combining with `|` kept a row as soon as either
# side matched, so an airport present in only one column could still pass.
ori_dest_match = ori_match & dest_match

# only keep rows for airports that are listed in both columns
flight_airports = flights_df[ori_dest_match]
flight_airports['Origin'].nunique()
# confirmed that the same 359 airports have travel data

In [None]:
# Load the cleaned US airport list and keep the airports whose IATA code
# shows up as an origin in the flight data
us_airports = pd.read_csv('Resources/us-airports-cleaned.csv')
flight_origin_codes = flight_airports['Origin']
iata_match = us_airports['iata_code'].isin(flight_origin_codes)

us_airports_filtered = us_airports.loc[iata_match]
us_airports_filtered['iata_code'].nunique()

In [None]:
# List the origin airports in the flight data that the airport list lacks
listed_codes = us_airports_filtered['iata_code']
iata_match = flight_airports['Origin'].isin(listed_codes)

us_airports_missing = flight_airports.loc[~iata_match]
us_airports_missing['Origin'].unique()

Research (Google):
SJU, BQN, PSE = Puerto Rico
STT, STX = U.S. Virgin Islands
PPG = American Samoa (unincorporated territory)
GUM = Guam
SPN = Northern Mariana Islands

In [None]:
# Exclude flights that involve the airports missing from the airport list.
# Use the unique codes so isin() is not handed one (duplicated) value per row.
missing_codes = us_airports_missing['Origin'].unique()

# A flight is excluded when either end is a missing airport; testing Origin
# alone left flights *to* those airports in the export, with a Dest that has
# no match in the airport list.
exclusions = (flights_df['Origin'].isin(missing_codes)
              | flights_df['Dest'].isin(missing_codes))

filtered_flights = flights_df[~exclusions]
filtered_flights['Origin'].nunique()

In [None]:
# Keep only the airports that still appear as an origin in the filtered flights
remaining_origins = filtered_flights['Origin']
airport_mask = us_airports['iata_code'].isin(remaining_origins)

us_airports_filtered = us_airports.loc[airport_mask]
us_airports_filtered['iata_code'].nunique()

In [None]:
# Write both filtered data sets to CSV (airports first, then flights)
exports = (
    (us_airports_filtered, 'Resources/us_airports_filtered.csv'),
    (filtered_flights, 'Resources/filtered_flights.csv'),
)
for frame, destination in exports:
    frame.to_csv(destination)

In [None]:
# Zip filtered_flights.csv and delete the CSV to allow GitHub upload
csv_path = 'Resources/filtered_flights.csv'
zip_path = 'Resources/filtered_flights.zip'
member_name = os.path.basename(csv_path)

with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(csv_path, member_name)

# Verify the archive before deleting the only uncompressed copy
if os.path.exists(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zipf:
        # A byte-size threshold cannot tell a valid archive from a corrupt
        # one, so check the contents instead:
        #   - the CSV is actually listed in the archive
        #   - testzip() returns None, i.e. no member fails its CRC check
        #   - the stored uncompressed size equals the CSV's size on disk
        archive_ok = (
            member_name in zipf.namelist()
            and zipf.testzip() is None
            and zipf.getinfo(member_name).file_size == os.path.getsize(csv_path)
        )

    if archive_ok:
        os.remove(csv_path)
        print(f"The file {csv_path} has been zipped, is not empty, and has been deleted.")
    else:
        print(f"The file {zip_path} failed verification; {csv_path} was not deleted.")
else:
    print("Error: The zip file was not created successfully.")

## Identify Airlines



In [None]:
# # Read in CSV and display a few rows
# pd.set_option('display.max_columns', None)
# flight_csv = pd.read_csv('Resources/On_Time_Marketing_Carrier_On_Time_Performance_(Beginning_January_2018)_2023_1.csv', low_memory=False)
# flight_csv.head()

In [None]:
# # Print every column name with its position (untruncated) because selecting 'Operating_Airline' raises an error
# print({col: index for index, col in enumerate(flight_csv.columns)})
# # Discovered: the 'Operating_Airline ' column name has a trailing space

In [None]:
# # Select columns and create new dataframe 
# jan_flights_df = pd.DataFrame(flight_csv[['FlightDate','IATA_Code_Marketing_Airline', 'Operated_or_Branded_Code_Share_Partners',
#                                       'Operating_Airline ','Flight_Number_Marketing_Airline','Tail_Number',
#                                       'Origin','Dest','OriginCityName','DestCityName','CRSDepTime','DepTime',
#                                       'DepDelay','DepDelayMinutes','DepDel15','ArrDelay','ArrDelayMinutes','Cancelled',
#                                      'Diverted','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay']])
# jan_flights_df.head()

In [None]:
# # Check Airlines included
# print(jan_flights_df['IATA_Code_Marketing_Airline'].value_counts())
# print(jan_flights_df['Operating_Airline '].value_counts())

In [None]:
# #exporting to csv
# jan_flights_df.to_csv('Resources/jan_flights_df.csv')

In [None]:
# # Define the path to the ZIP file and the specific CSV file name inside the ZIP
# source_zip = 'path/to/your/data.zip'
# csv = 'your_file.csv'  # Name of the CSV file within the ZIP archive

# # Use pandas.read_csv with the compression parameter
# df = pd.read_csv(zip_file_path, compression='zip', encoding='utf-8', header=0, sep=',', quotechar='"', error_bad_lines=False, engine='python', filepath_or_buffer=f"zip://{zip_file_path}!{csv_file_name}")

# # Now you can work with the DataFrame `df` as usual
# print(df.head())

In [None]:
# # Read in next CSV, select columns
# column_selection = ['FlightDate','IATA_Code_Marketing_Airline', 'Operated_or_Branded_Code_Share_Partners',
#                                       'Operating_Airline ','Flight_Number_Marketing_Airline','Tail_Number',
#                                       'Origin','Dest','OriginCityName','DestCityName','CRSDepTime','DepTime',
#                                       'DepDelay','DepDelayMinutes','DepDel15','ArrDelay','ArrDelayMinutes','Cancelled',
#                                      'Diverted','CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay']
# # more efficient way to work through multiple CSV's: https://docs.python.org/3/library/glob.html
# csv_files = glob.glob('Resources/bts/*.csv')

# # Empty list for DataFrames
# dataframes = []

# # Iterate through each CSV file and append to list
# for file in csv_files:
#     df = pd.read_csv(file, usecols=column_selection, low_memory = False)
#     dataframes.append(df)

# # Concatenate DataFrames from list to single df
# combined_flight_info = pd.concat(dataframes, ignore_index=True)
# combined_flight_info.tail()

In [None]:
# # Export to CSV
# combined_flight_info.to_csv('Resources/combined_flight_info.csv')


In [None]:
# # Zip the CSV and delete to allow github upload
# csv_path = 'Resources/combined_flight_info.csv'
# zip_path = 'Resources/combined_flight_info.zip'

# with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
#     zipf.write(csv_path, os.path.basename(csv_path))

# # Confirm zip exists prior to deleting csv
# if os.path.exists(zip_path):
#     os.remove(csv_path)
#     print(f"The file {csv_path} has been zipped and deleted.")
# else:
#     print("Error: The zip file was not created successfully.")

In [None]:
# # import dependencies
# import zipfile
# import tempfile
# import shutil
# import glob
# import os

In [None]:
# # Read in source CSVs, select columns, concat info

# # List of all source zips
# zips = ['Resources/BTS/BTS_Q1_2023.zip','Resources/BTS/BTS_Q2_2023.zip',
#        'Resources/BTS/BTS_Q3_2023.zip','Resources/BTS/BTS_Q4_2023.zip']
# # Prepare to select columns
# column_selection = ['FlightDate','IATA_Code_Marketing_Airline','Operating_Airline ',
#                     'Flight_Number_Marketing_Airline','Tail_Number','Origin','Dest',
#                     'CRSDepTime','DepTime','DepDelay','DepDelayMinutes','DepDel15','ArrDelay',
#                     'ArrDelayMinutes','Cancelled', 'Diverted','CarrierDelay',
#                     'WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay']
# # Empty list for DataFrames
# dataframes = []

# # Loop through zips and extract to a temp directory
# for zip in zips:
#     with tempfile.TemporaryDirectory() as temp_dir:
#         with zipfile.ZipFile(zip, 'r') as zip_ref:
#             zip_ref.extractall(temp_dir)
        
#         # Use glob to find all CSV files in the temporary directory
#         # https://docs.python.org/3/library/glob.html
#         csv_files = glob.glob(os.path.join(temp_dir, '*.csv'))
        
#         # Iterate through CSV files
#         for csv_file in csv_files:
#             df = pd.read_csv(csv_file, usecols=column_selection, low_memory = False)
#             dataframes.append(df)
        
#         # combine the dataframes
#         combined_flight_info = pd.concat(dataframes, ignore_index=True)
#         # export to new csv
#         combined_flight_info.to_csv('Resources/combined_flight_info.csv')
# # Okay to close block and let temp_dir auto_delete
# if os.path.exists('Resources/combined_flight_info.csv'):
#     print("combined_flight_info.csv has been created")
# else: 
#     print("Something went wrong. Please try again.")

In [None]:
# # Test run: same pipeline, but without selecting columns first.
# # Read in source CSVs (all columns) and concat info

# # List of all source zips
# zips = ['Resources/BTS/BTS_Q1_2023.zip','Resources/BTS/BTS_Q2_2023.zip',
#        'Resources/BTS/BTS_Q3_2023.zip','Resources/BTS/BTS_Q4_2023.zip']
# # Empty list for DataFrames
# dataframes = []

# # Loop through zips and extract to a temp directory
# for zip in zips:
#     with tempfile.TemporaryDirectory() as temp_dir:
#         with zipfile.ZipFile(zip, 'r') as zip_ref:
#             zip_ref.extractall(temp_dir)
        
#         # Use glob to find all CSV files in the temporary directory
#         # https://docs.python.org/3/library/glob.html
#         csv_files = glob.glob(os.path.join(temp_dir, '*.csv'))
        
#         # Iterate through CSV files
#         for csv_file in csv_files:
#             df = pd.read_csv(csv_file, low_memory = False)
#             dataframes.append(df)
        
#         # combine the dataframes
#         combined_flight_info = pd.concat(dataframes, ignore_index=True)
#         # export to new csv
#         combined_flight_info.to_csv('Resources/combined_flight_info.csv')
# # Okay to close block and let temp_dir auto_delete
# if os.path.exists('Resources/combined_flight_info_unflitered.csv'):
#     print("combined_flight_info_unfiltered.csv has been created")
# else: 
#     print("Something went wrong. Please try again.")

In [None]:
# # Zip the CSV and delete to allow github upload
# csv_path = 'Resources/combined_flight_info.csv'
# zip_path = 'Resources/combined_flight_info.zip'

# with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
#     zipf.write(csv_path, os.path.basename(csv_path))

# # Confirm zip exists prior to deleting csv
# if os.path.exists(zip_path):
#     os.remove(csv_path)
#     print(f"The file {csv_path} has been zipped and deleted.")
# else:
#     print("Error: Zip failed to create.")
