### Read in the flight data, keep only flights between the selected airports, and consolidate the monthly CSV files into a single yearly file

In [28]:
import os
import pandas as pd

# Directory containing the extracted CSV files; every file in it whose name
# ends in '.csv' is read by the loading loop further down.
# NOTE(review): absolute, machine-specific path — adjust before running on
# another machine.
extracted_data_directory = '/mnt/c/Users/Admin/Downloads/2022-flight-data'



In [29]:
# IATA codes of the preselected airports. A flight is kept only when BOTH its
# origin and its destination appear in this list; edit the list to change the
# selection.
selected_airports = ['DTW','LAS','PHL','DEN','CLT','SEA','MCO','BOS','FLL','IAD','IAH','SFO','EWR','MIA','JFK', 'LAX', 'ORD', 'ATL']
# Filtered per-file DataFrames, concatenated in a later cell. The list is
# re-created on every run of this cell, so re-running does not duplicate rows.
data_frames = []

# os.listdir() returns entries in arbitrary order; sorting the names makes the
# row order of the consolidated data reproducible from run to run.
for filename in sorted(os.listdir(extracted_data_directory)):
    # Only CSV files are of interest; ignore anything else in the directory.
    if not filename.endswith('.csv'):
        continue
    file_path = os.path.join(extracted_data_directory, filename)

    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)

    # Print column names to inspect the headers
    print(f"Columns in {filename}: {df.columns}")

    # A file lacking either airport column cannot be filtered: report and skip.
    if 'ORIGIN' not in df.columns or 'DEST' not in df.columns:
        print(f"Skipping {filename} - Missing 'ORIGIN' or 'DEST' columns.")
        continue

    # Keep only the flights that both depart from and arrive at a selected airport.
    filtered_df = df[df['ORIGIN'].isin(selected_airports) & df['DEST'].isin(selected_airports)]
    data_frames.append(filtered_df)


Columns in T_ONTIME_REPORTING_apr.csv: Index(['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_DATE',
       'ORIGIN_AIRPORT_ID', 'ORIGIN', 'ORIGIN_WAC', 'DEST_AIRPORT_ID', 'DEST',
       'DEST_WAC', 'CRS_DEP_TIME', 'DEP_DELAY_NEW', 'CRS_ARR_TIME',
       'DISTANCE'],
      dtype='object')
Columns in T_ONTIME_REPORTING_aug.csv: Index(['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_DATE',
       'ORIGIN_AIRPORT_ID', 'ORIGIN', 'ORIGIN_WAC', 'DEST_AIRPORT_ID', 'DEST',
       'DEST_WAC', 'CRS_DEP_TIME', 'DEP_DELAY_NEW', 'CRS_ARR_TIME',
       'DISTANCE'],
      dtype='object')
Columns in T_ONTIME_REPORTING_dec.csv: Index(['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_DATE',
       'ORIGIN_AIRPORT_ID', 'ORIGIN', 'ORIGIN_WAC', 'DEST_AIRPORT_ID', 'DEST',
       'DEST_WAC', 'CRS_DEP_TIME', 'DEP_DELAY_NEW', 'CRS_ARR_TIME',
       'DISTANCE'],
      dtype='object')
Columns in T_ONTIME_REPORTING_feb.csv: Index(['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MON

In [30]:
# Concatenate all per-file DataFrames into one DataFrame.
# ignore_index=True discards each file's own row labels and renumbers the
# combined rows from 0.
# NOTE(review): assumes data_frames is non-empty, i.e. at least one CSV had
# both 'ORIGIN' and 'DEST' columns — confirm before relying on this cell.
filtered_data = pd.concat(data_frames, ignore_index=True)

In [31]:

# Write the consolidated, airport-filtered data to a single CSV file.
# index=False leaves the DataFrame's row index out of the file.
filtered_data.to_csv('/mnt/c/Users/Admin/Downloads/2022_processed_flight_data.csv', index=False)

### Concatenate the 2021 and 2022 data and check for missing values

In [32]:
# Locations of the per-year processed files.
# NOTE(review): the 2022 file is written earlier in this notebook; the 2021
# file is presumably produced the same way from the 2021 data — verify.
file_2022_path = '/mnt/c/Users/Admin/Downloads/2022_processed_flight_data.csv'
file_2021_path = '/mnt/c/Users/Admin/Downloads/2021_processed_flight_data.csv'

# Load both years, 2022 first and then 2021.
df_2022_filtered, df_2021_filtered = (
    pd.read_csv(csv_path) for csv_path in (file_2022_path, file_2021_path)
)

# Stack the two years (2022 rows followed by 2021 rows) and renumber the rows
# from 0 so the combined frame has one continuous index.
concatenated_df = pd.concat(
    [df_2022_filtered, df_2021_filtered],
    ignore_index=True,
)


In [34]:
# Report the number of missing values in each column of the combined data.
# The null counts for all columns are computed in one pass, then printed one
# column per line.
for column_name, missing_count in concatenated_df.isnull().sum().items():
    print(column_name, ": ", missing_count)

YEAR :  0
QUARTER :  0
MONTH :  0
DAY_OF_MONTH :  0
DAY_OF_WEEK :  0
FL_DATE :  0
ORIGIN_AIRPORT_ID :  0
ORIGIN :  0
ORIGIN_WAC :  0
DEST_AIRPORT_ID :  0
DEST :  0
DEST_WAC :  0
CRS_DEP_TIME :  0
DEP_DELAY_NEW :  37751
CRS_ARR_TIME :  0
DISTANCE :  0


In [35]:
# Drop every row whose DEP_DELAY_NEW value is missing (the only column with
# missing values in the check above). dropna returns a new DataFrame, so
# concatenated_df itself is left unchanged.
df_cleaned = concatenated_df.dropna(subset=['DEP_DELAY_NEW'])

In [37]:
# Re-check the cleaned data: print the number of missing values per column
# to confirm that none remain after the rows were dropped.
for column_name, missing_count in df_cleaned.isnull().sum().items():
    print(column_name, ": ", missing_count)

YEAR :  0
QUARTER :  0
MONTH :  0
DAY_OF_MONTH :  0
DAY_OF_WEEK :  0
FL_DATE :  0
ORIGIN_AIRPORT_ID :  0
ORIGIN :  0
ORIGIN_WAC :  0
DEST_AIRPORT_ID :  0
DEST :  0
DEST_WAC :  0
CRS_DEP_TIME :  0
DEP_DELAY_NEW :  0
CRS_ARR_TIME :  0
DISTANCE :  0


In [40]:
# Display the cleaned DataFrame as the cell's output for a visual check
# (the notebook shows only the first and last rows of a large frame).
df_cleaned

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_WAC,DEST_AIRPORT_ID,DEST,DEST_WAC,CRS_DEP_TIME,DEP_DELAY_NEW,CRS_ARR_TIME,DISTANCE
0,2022,2,4,1,5,4/1/2022 12:00:00 AM,10397,ATL,34,10721,BOS,13,600,2.0,833,946.0
1,2022,2,4,1,5,4/1/2022 12:00:00 AM,10397,ATL,34,10721,BOS,13,845,6.0,1121,946.0
2,2022,2,4,1,5,4/1/2022 12:00:00 AM,10397,ATL,34,10721,BOS,13,959,2.0,1237,946.0
3,2022,2,4,1,5,4/1/2022 12:00:00 AM,10397,ATL,34,10721,BOS,13,1045,34.0,1323,946.0
4,2022,2,4,1,5,4/1/2022 12:00:00 AM,10397,ATL,34,10721,BOS,13,1135,0.0,1400,946.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1992528,2021,3,9,30,4,9/30/2021 12:00:00 AM,14771,SFO,91,14747,SEA,93,1630,0.0,1850,679.0
1992529,2021,3,9,30,4,9/30/2021 12:00:00 AM,14771,SFO,91,14747,SEA,93,1800,2.0,2025,679.0
1992530,2021,3,9,30,4,9/30/2021 12:00:00 AM,14771,SFO,91,14747,SEA,93,1910,0.0,2131,679.0
1992531,2021,3,9,30,4,9/30/2021 12:00:00 AM,14771,SFO,91,14747,SEA,93,2004,0.0,2224,679.0


### FL_DATE, ORIGIN and DEST are kept unchanged because they will be useful for combining the flight data with the weather data

In [39]:
# Save the cleaned, combined 2021 + 2022 data to a single CSV file.
# index=False leaves the DataFrame's row index out of the file.
df_cleaned.to_csv('/mnt/c/Users/Admin/Downloads/processed_flight_data_2022_and_2021.csv', index=False)