# **Pre-Processing for Machine Learning**

## Import necessary libraries

In [7]:
import pandas as pd
import numpy as np

## Load the raw data

Requires a route flights dataframe with METAR data. <br><br>
Note: If the file does not contain METAR data, dropping certain columns may result in an error.

In [None]:
# Location of the raw route-flights CSV; replace the placeholder with your own file
file_path = 'yourcsv.csv'

# Load the CSV and give the frame a fresh positional index in one step.
# `drop=True` discards the previous index instead of keeping it as an extra column.
df = pd.read_csv(file_path).reset_index(drop=True)


## Data cleaning

Drop unnecessary rows

In [None]:
# Keep only flights that are neither blocked (no data) nor position-only (reduced data)
has_full_data = df['blocked'].eq(False) & df['position_only'].eq(False)

# Columns that must be populated for a row to be usable:
#   - the ICAO identifying columns (flight, operator, origin, destination)
#   - 'filed_ete' and 'aircraft_type', which cannot be approximated (and are <1% of the data)
#   - 'scheduled_out'
required_columns = [
    'ident_icao', 'operator_icao', 'origin.code_icao', 'destination.code_icao',
    'filed_ete', 'aircraft_type',
    'scheduled_out',
]

df = df[has_full_data].dropna(subset=required_columns)


## Feature engineering

#### **Adding short-haul vs long-haul flights** <br>

Add a `flight_type` column that distinguishes short-haul from long-haul flights, based on the filed travel time (`filed_ete`).<br>
Definition:<br>
Short-haul flights <= 3h travel time <br>
Long-haul flights > 3h travel time

In [10]:
# Flights whose filed travel time is at most 3 hours (expressed in seconds) are short-haul
short_haul_limit_seconds = 3 * 60 * 60
is_short_haul = df['filed_ete'] <= short_haul_limit_seconds
df['flight_type'] = np.where(is_short_haul, 'Short-haul', 'Long-haul')

#### **Adding manufacturer information**

In [11]:
# Lookup table from the aircraft type code found in 'aircraft_type' to its manufacturer.
# Codes that are not listed here end up as NaN in the 'manufacturer' column.
aircraft_manufacturer = {
    'A319': 'Airbus',
    'A320': 'Airbus',
    'A20N': 'Airbus',
    'B738': 'Boeing',
    'B38M': 'Boeing',
    'A321': 'Airbus',
    'A21N': 'Airbus',
    'BCS3': 'Airbus',
    'E295': 'Embraer',
    'A359': 'Airbus',
    'B77W': 'Boeing',
    'B772': 'Boeing',
    'A332': 'Airbus',
    'A333': 'Airbus',
    'B788': 'Boeing',
    'B752': 'Boeing',
    'B763': 'Boeing',
    'B753': 'Boeing',
    'B39M': 'Boeing',
    'A330': 'Airbus',
    'B739': 'Boeing',
    'B737': 'Boeing',
    'B789': 'Boeing',
    'B78X': 'Boeing',
    'A35K': 'Airbus',
    'BCS1': 'Airbus',
    'E190': 'Embraer',
    'AT72': 'ATR',
    'A318': 'Airbus',
    'B773': 'Boeing',
    'E75L': 'Embraer',
    'E170': 'Embraer',
    '737': 'Boeing',
    'A339': 'Airbus',
    'CRJ9': 'Bombardier',
    'A388': 'Airbus',
    'B733': 'Boeing',
    'B77L': 'Boeing',
    'E290': 'Embraer',
    'B744': 'Boeing',
    'B764': 'Boeing',
    '777': 'Boeing',
    'B732': 'Boeing',
    '3M3': 'McDonnell Douglas',  # NOTE(review): non-standard code, manufacturer could not be verified
    '787': 'Boeing',
    'CRJX': 'Bombardier',
    'B736': 'Boeing',
    '73M': 'Boeing',
    '32S': 'Airbus',
    'B748': 'Boeing',
    '31A': 'McDonnell Douglas',  # NOTE(review): non-standard code, manufacturer could not be verified
    'A337': 'Airbus',
    'DH8D': 'De Havilland Canada',
    'A20': 'Airbus',
    'B712': 'Boeing',
    'AJ27': 'COMAC',  # ICAO designator of the COMAC ARJ21-700 (was wrongly mapped to 'Dassault')
    'CRJ': 'Bombardier',
    'B734': 'Boeing',
    'ATR': 'ATR',
    '35L': 'Airbus',
    'E75S': 'Embraer',
    'C206': 'Cessna',
    '35H': 'Airbus',
    'E195': 'Embraer',
    'B735': 'Boeing',
}

# Translate every aircraft type code into its manufacturer name
df['manufacturer'] = df['aircraft_type'].map(aircraft_manufacturer)

#### **Adding departure date and time information:**

The code converts the 'scheduled_out' column to datetime format and creates several new columns from it. Specifically, it classifies the departure hour into a time of day (morning, afternoon, evening, night) and extracts the departure month, the weekday name, and the ISO week number. The new columns are:<br>
departure_month<br>
week_no<br>
departure_weekday<br>
departure_time_of_day<br>

In [12]:
# Parse 'scheduled_out' into pandas datetimes so that the `.dt` accessor can be used on it
df = df.assign(scheduled_out=pd.to_datetime(df['scheduled_out']))

# Create the 'time_of_day' column based on the hour of the day
def classify_time_of_day(hour):
    """Return the part of the day that an hour of the day (0-23) falls into.

    The buckets are half-open intervals:
    05-12 -> 'morning', 12-17 -> 'afternoon', 17-21 -> 'evening'.
    Every other value (including hours before 5, from 21 onwards, and NaN)
    is classified as 'night'.
    """
    day_parts = (
        (5, 12, 'morning'),
        (12, 17, 'afternoon'),
        (17, 21, 'evening'),
    )
    for first_hour, end_hour, label in day_parts:
        if first_hour <= hour < end_hour:
            return label
    return 'night'

# All departure features below are derived from the 'scheduled_out' datetime column
scheduled_departure = df['scheduled_out'].dt

# Bucket the departure hour into morning / afternoon / evening / night
df['departure_time_of_day'] = scheduled_departure.hour.map(classify_time_of_day)

# Calendar features: month number, full weekday name and ISO week number
df['departure_month'] = scheduled_departure.month
df['departure_weekday'] = scheduled_departure.strftime('%A')
df['week_no'] = scheduled_departure.isocalendar().week

#### **Add region** <br>
Assign a region to each airport via the IATA code

In [14]:
# Lookup table from airport IATA code to world region.
# Airports that are not listed end up as NaN in the region columns.
#
# NOTE(review): the following entries look doubtful but were left unchanged; please confirm:
#   - 'BLB', 'QSC', 'NUM' and 'DZA' may not lie in the region assigned to them
#   - region labels are not fully uniform ('Australia' vs 'Oceania'; 'Central America' and
#     'Caribbean' are each used once while comparable airports are labelled 'North America')
regions = {
    'LGA': 'North America',
    'BNA': 'North America',
    'CLT': 'North America',
    'ORD': 'North America',
    'CDG': 'Europe',
    'NRT': 'Asia Pacific',
    'YUL': 'North America',
    'ICN': 'Asia Pacific',
    'CPH': 'Europe',
    'DXB': 'Middle East',
    'LIS': 'Europe',
    'TPE': 'Asia Pacific',
    'BOG': 'South America',
    'JFK': 'North America',
    'JED': 'Middle East',
    'MCO': 'North America',
    'PHL': 'North America',
    'SCL': 'South America',
    'LIM': 'South America',
    'KUL': 'Asia Pacific',
    'SIN': 'Asia Pacific',
    'CAI': 'Africa',
    'HKG': 'Asia Pacific',
    'OSL': 'Europe',
    'ARN': 'Europe',
    'YVR': 'North America',
    'BER': 'Europe',
    'GDL': 'North America',
    'LAX': 'North America',
    'CUN': 'North America',
    'KEF': 'Europe',
    'LHR': 'Europe',
    'BKK': 'Asia Pacific',
    'PMI': 'Europe',
    'SJU': 'North America',
    'GRU': 'South America',
    'AMS': 'Europe',
    'SFO': 'North America',
    'ATL': 'North America',
    'MIA': 'North America',
    'DUB': 'Europe',
    'FUK': 'Asia Pacific',
    'MSY': 'North America',
    'FCO': 'Europe',
    'DEL': 'Asia Pacific',
    'ADD': 'Africa',
    'CGK': 'Asia Pacific',
    'DFW': 'North America',
    'RUN': 'Africa',
    'ORY': 'Europe',
    'CZM': 'North America',
    'LIR': 'North America',
    'PTY': 'North America',
    'IST': 'Europe',
    'BOH': 'Europe',
    'HEL': 'Europe',
    'DAL': 'North America',
    'SAT': 'North America',
    'TLV': 'Middle East',
    'DMU': 'Asia Pacific',  # Dimapur, India (was wrongly 'South America')
    'BOM': 'Asia Pacific',
    'CVT': 'Europe',
    'MAD': 'Europe',
    'MDW': 'North America',
    'YQB': 'North America',
    'BGO': 'Europe',
    'LYS': 'Europe',
    'ZAG': 'Europe',
    'COS': 'North America',
    'PIT': 'North America',
    'YYZ': 'North America',
    'FUE': 'Europe',
    'HOU': 'North America',
    'LUX': 'Europe',
    'DTW': 'North America',
    'HYC': 'Europe',  # High Wycombe, United Kingdom (was wrongly 'North America')
    'TFS': 'Europe',
    'DUS': 'Europe',
    'PEK': 'Asia Pacific',
    'TYS': 'North America',
    'BUR': 'North America',
    'DEN': 'North America',
    'SLC': 'North America',
    'RNO': 'North America',
    'OAK': 'North America',
    'BWI': 'North America',
    'TPA': 'North America',
    'LAS': 'North America',
    'RSW': 'North America',
    'PBI': 'North America',
    'STL': 'North America',
    'RTM': 'Europe',
    'RDU': 'North America',
    'BBP': 'Europe',  # Bembridge, United Kingdom (was wrongly 'North America')
    'CVG': 'North America',
    'MED': 'Middle East',
    'HAM': 'Europe',
    'LPA': 'Europe',
    'FRA': 'Europe',
    'FLL': 'North America',
    'BOD': 'Europe',
    'AUS': 'North America',
    'MKE': 'North America',
    'DOH': 'Middle East',
    'DMK': 'Asia Pacific',
    'ZRH': 'Europe',
    'SMF': 'North America',
    'SNA': 'North America',
    'BHD': 'Europe',
    'MSP': 'North America',
    'FAI': 'North America',
    'TUN': 'Africa',
    'STT': 'North America',
    'HRG': 'Africa',
    'LGW': 'Europe',
    'RHO': 'Europe',
    'IXJ': 'Asia Pacific',
    'MDZ': 'South America',
    'OLB': 'Europe',
    'KGS': 'Europe',
    'ANF': 'South America',
    'SAN': 'North America',
    'DSM': 'North America',
    'IAH': 'North America',
    'SEA': 'North America',
    'MTJ': 'North America',
    'ACY': 'North America',
    'BZE': 'North America',
    'HPN': 'North America',
    'LEJ': 'Europe',
    'EWR': 'North America',
    'DWC': 'Middle East',
    'SRQ': 'North America',
    'CMH': 'North America',
    'LEY': 'Europe',
    'GUA': 'Central America',
    'SHJ': 'Middle East',
    'IND': 'North America',
    'ALG': 'Africa',
    'AEP': 'South America',
    'BGR': 'North America',
    'ABQ': 'North America',
    'AUH': 'Middle East',
    'NGO': 'Asia Pacific',
    'OXF': 'Europe',
    'ASW': 'Africa',
    'ONT': 'North America',
    'YYT': 'North America',
    'NAS': 'North America',
    'SYD': 'Australia',
    'HMO': 'North America',
    'SCE': 'North America',
    'NUM': 'South America',
    'GYE': 'South America',
    'EZE': 'South America',
    'SWF': 'North America',
    'BTH': 'Asia Pacific',
    'KIX': 'Asia Pacific',
    'GNV': 'North America',
    'BWN': 'Asia Pacific',
    'BQN': 'North America',
    'GRR': 'North America',
    'KRK': 'Europe',
    'HNL': 'North America',
    'PHX': 'North America',
    'DPS': 'Asia Pacific',
    'KHH': 'Asia Pacific',
    'TAS': 'Asia Pacific',
    'MUC': 'Europe',
    'MCI': 'North America',
    'PVR': 'North America',
    'MNL': 'Asia Pacific',
    'MAN': 'Europe',
    'XRY': 'Europe',
    'COR': 'South America',
    'BLB': 'Europe',
    'HBE': 'Africa',
    'SUF': 'Europe',
    'AMD': 'Asia Pacific',
    'CFU': 'Europe',
    'INT': 'North America',
    'PDX': 'North America',
    'MRU': 'Africa',
    'DZA': 'Europe',
    'MLE': 'Asia Pacific',
    'VLC': 'Europe',
    'GOT': 'Europe',
    'NKM': 'Asia Pacific',
    'TLL': 'Europe',
    'HRL': 'North America',
    'SAP': 'North America',
    'CLE': 'North America',
    'PWM': 'North America',
    'OKC': 'North America',
    'AGP': 'Europe',
    'KAO': 'Europe',  # Kuusamo, Finland (was wrongly 'Asia Pacific')
    'BQH': 'Europe',
    'QSC': 'Europe',
    'RAK': 'Africa',
    'BLL': 'Europe',
    'AMA': 'North America',
    'GOI': 'Asia Pacific',
    'BHM': 'North America',
    'BAH': 'Middle East',
    'CMN': 'Africa',
    'TNG': 'Africa',
    'FLR': 'Europe',
    'VGO': 'Europe',
    'STR': 'Europe',
    'VAA': 'Europe',  # Vaasa, Finland (was wrongly 'Asia Pacific')
    'ESH': 'Europe',
    'DIL': 'Asia Pacific',
    'HER': 'Europe',
    'VNO': 'Europe',
    'SDQ': 'Caribbean',
    'VIE': 'Europe',
    'AAL': 'Europe',
    'RUH': 'Middle East',
    'SSH': 'Africa',
    'TNR': 'Africa',
    'BOS': 'North America',
    'GAU': 'Asia Pacific',
    'BRC': 'South America',
    'RVN': 'Europe',  # Rovaniemi, Finland (was wrongly 'Asia Pacific')
    'DCA': 'North America',
    'ORF': 'North America',
    'ROC': 'North America',
    'OUL': 'Europe',
    'XNA': 'North America',
    'NOU': 'Oceania',
    'LDY': 'Europe',  # City of Derry, United Kingdom (was wrongly 'North America')
    'BHX': 'Europe',
    'JAX': 'North America',
    'PUS': 'Asia Pacific',
    'ORK': 'Europe',
    'ELP': 'North America',
    'OMA': 'North America',
    'TUS': 'North America',
    'MLU': 'North America',
    'CGN': 'Europe',
    'LXR': 'Africa',
    'MXL': 'North America',
    'SNN': 'Europe'
}

# Look up the region of both ends of the route via their IATA codes
df['origin_region'] = df['origin.code_iata'].map(regions)
df['destination_region'] = df['destination.code_iata'].map(regions)

#### **Add subregion**<br>
Assign a subregion to each airport via the IATA code

In [15]:
# Lookup table from airport IATA code to sub-region (a finer split of the world regions;
# only North America and Europe are subdivided).
# Airports that are not listed end up as NaN in the sub-region columns.
#
# NOTE(review): the following entries look doubtful but were left unchanged; please confirm:
#   - 'BLB', 'QSC', 'NUM' and 'DZA' may not lie in the region assigned to them
#   - the East/West Coast split is not applied uniformly, e.g. 'DFW' vs 'DAL' and
#     'HOU' vs 'IAH' serve the same cities but differ, and 'CUN', 'MXL', 'ABQ', 'ELP',
#     'TUS' may sit on the unexpected side
sub_regions = {
    'LGA': 'North America - East Coast',
    'BNA': 'North America - East Coast',
    'CLT': 'North America - East Coast',
    'ORD': 'North America - East Coast',
    'CDG': 'Europe - Continental Europe',
    'NRT': 'Asia Pacific',
    'YUL': 'North America - East Coast',
    'ICN': 'Asia Pacific',
    'CPH': 'Europe - Skandinavia',
    'DXB': 'Middle East',
    'LIS': 'Europe - Continental Europe',
    'TPE': 'Asia Pacific',
    'BOG': 'South America',
    'JFK': 'North America - East Coast',
    'JED': 'Middle East',
    'MCO': 'North America - East Coast',
    'PHL': 'North America - East Coast',
    'SCL': 'South America',
    'LIM': 'South America',
    'KUL': 'Asia Pacific',
    'SIN': 'Asia Pacific',
    'CAI': 'Africa',
    'HKG': 'Asia Pacific',
    'OSL': 'Europe - Skandinavia',
    'ARN': 'Europe - Skandinavia',
    'YVR': 'North America - West Coast',
    'BER': 'Europe - Continental Europe',
    'GDL': 'North America - West Coast',
    'LAX': 'North America - West Coast',
    'CUN': 'North America - West Coast',
    'KEF': 'Europe - Skandinavia',
    'LHR': 'Europe - Continental Europe',
    'BKK': 'Asia Pacific',
    'PMI': 'Europe - Continental Europe',
    'SJU': 'North America - East Coast',
    'GRU': 'South America',
    'AMS': 'Europe - Continental Europe',
    'SFO': 'North America - West Coast',
    'ATL': 'North America - East Coast',
    'MIA': 'North America - East Coast',
    'DUB': 'Europe - Continental Europe',
    'FUK': 'Asia Pacific',
    'MSY': 'North America - East Coast',
    'FCO': 'Europe - Continental Europe',
    'DEL': 'Asia Pacific',
    'ADD': 'Africa',
    'CGK': 'Asia Pacific',
    'DFW': 'North America - West Coast',
    'RUN': 'Africa',
    'ORY': 'Europe - Continental Europe',
    'CZM': 'North America - West Coast',
    'LIR': 'North America - West Coast',
    'PTY': 'North America - West Coast',
    'IST': 'Europe - Continental Europe',
    'BOH': 'Europe - Continental Europe',
    'HEL': 'Europe - Skandinavia',
    'DAL': 'North America - East Coast',
    'SAT': 'North America - West Coast',
    'TLV': 'Middle East',
    'DMU': 'Asia Pacific',  # Dimapur, India (was wrongly 'South America')
    'BOM': 'Asia Pacific',
    'CVT': 'Europe - Continental Europe',
    'MAD': 'Europe - Continental Europe',
    'MDW': 'North America - East Coast',
    'YQB': 'North America - East Coast',
    'BGO': 'Europe - Skandinavia',
    'LYS': 'Europe - Continental Europe',
    'ZAG': 'Europe - Continental Europe',
    'COS': 'North America - West Coast',
    'PIT': 'North America - East Coast',
    'YYZ': 'North America - East Coast',
    'FUE': 'Europe - Continental Europe',
    'HOU': 'North America - West Coast',
    'LUX': 'Europe - Continental Europe',
    'DTW': 'North America - East Coast',
    'HYC': 'Europe - Continental Europe',  # High Wycombe, United Kingdom (was wrongly 'North America - East Coast')
    'TFS': 'Europe - Continental Europe',
    'DUS': 'Europe - Continental Europe',
    'PEK': 'Asia Pacific',
    'TYS': 'North America - East Coast',
    'BUR': 'North America - West Coast',
    'DEN': 'North America - West Coast',
    'SLC': 'North America - West Coast',
    'RNO': 'North America - West Coast',
    'OAK': 'North America - West Coast',
    'BWI': 'North America - East Coast',
    'TPA': 'North America - East Coast',
    'LAS': 'North America - West Coast',
    'RSW': 'North America - East Coast',
    'PBI': 'North America - East Coast',
    'STL': 'North America - West Coast',
    'RTM': 'Europe - Continental Europe',
    'RDU': 'North America - East Coast',
    'BBP': 'Europe - Continental Europe',  # Bembridge, United Kingdom (was wrongly 'North America - East Coast')
    'CVG': 'North America - East Coast',
    'MED': 'Middle East',
    'HAM': 'Europe - Skandinavia',
    'LPA': 'Europe - Continental Europe',
    'FRA': 'Europe - Continental Europe',
    'FLL': 'North America - East Coast',
    'BOD': 'Europe - Continental Europe',
    'AUS': 'North America - West Coast',
    'MKE': 'North America - East Coast',
    'DOH': 'Middle East',
    'DMK': 'Asia Pacific',
    'ZRH': 'Europe - Continental Europe',
    'SMF': 'North America - West Coast',
    'SNA': 'North America - West Coast',
    'BHD': 'Europe - Continental Europe',
    'MSP': 'North America - East Coast',
    'FAI': 'North America - West Coast',
    'TUN': 'Africa',
    'STT': 'North America - East Coast',
    'HRG': 'Africa',
    'LGW': 'Europe - Continental Europe',
    'RHO': 'Europe - Continental Europe',
    'IXJ': 'Asia Pacific',
    'MDZ': 'South America',
    'OLB': 'Europe - Continental Europe',
    'KGS': 'Europe - Continental Europe',
    'ANF': 'South America',
    'SAN': 'North America - West Coast',
    'DSM': 'North America - East Coast',
    'IAH': 'North America - East Coast',
    'SEA': 'North America - West Coast',
    'MTJ': 'North America - West Coast',
    'ACY': 'North America - East Coast',
    'BZE': 'North America - East Coast',
    'HPN': 'North America - East Coast',
    'LEJ': 'Europe - Continental Europe',
    'EWR': 'North America - East Coast',
    'DWC': 'Middle East',
    'SRQ': 'North America - East Coast',
    'CMH': 'North America - East Coast',
    'LEY': 'Europe - Continental Europe',
    'GUA': 'Central America',
    'SHJ': 'Middle East',
    'IND': 'North America - East Coast',
    'ALG': 'Africa',
    'AEP': 'South America',
    'BGR': 'North America - East Coast',
    'ABQ': 'North America - East Coast',
    'AUH': 'Middle East',
    'NGO': 'Asia Pacific',
    'OXF': 'Europe - Continental Europe',
    'ASW': 'Africa',
    'ONT': 'North America - West Coast',
    'YYT': 'North America - East Coast',
    'NAS': 'North America - East Coast',
    'SYD': 'Australia',
    'HMO': 'North America - West Coast',
    'SCE': 'North America - East Coast',
    'NUM': 'South America',
    'GYE': 'South America',
    'EZE': 'South America',
    'SWF': 'North America - East Coast',
    'BTH': 'Asia Pacific',
    'KIX': 'Asia Pacific',
    'GNV': 'North America - East Coast',
    'BWN': 'Asia Pacific',
    'BQN': 'North America - East Coast',
    'GRR': 'North America - East Coast',
    'KRK': 'Europe - Continental Europe',
    'HNL': 'North America - West Coast',
    'PHX': 'North America - West Coast',
    'DPS': 'Asia Pacific',
    'KHH': 'Asia Pacific',
    'TAS': 'Asia Pacific',
    'MUC': 'Europe - Continental Europe',
    'MCI': 'North America - East Coast',
    'PVR': 'North America - West Coast',
    'MNL': 'Asia Pacific',
    'MAN': 'Europe - Continental Europe',
    'XRY': 'Europe - Continental Europe',
    'COR': 'South America',
    'BLB': 'Europe - Continental Europe',
    'HBE': 'Africa',
    'SUF': 'Europe - Continental Europe',
    'AMD': 'Asia Pacific',
    'CFU': 'Europe - Continental Europe',
    'INT': 'North America - East Coast',
    'PDX': 'North America - West Coast',
    'MRU': 'Africa',
    'DZA': 'Europe - Continental Europe',
    'MLE': 'Asia Pacific',
    'VLC': 'Europe - Continental Europe',
    'GOT': 'Europe - Skandinavia',
    'NKM': 'Asia Pacific',
    'TLL': 'Europe - Skandinavia',
    'HRL': 'North America - East Coast',
    'SAP': 'North America - East Coast',
    'CLE': 'North America - East Coast',
    'PWM': 'North America - East Coast',
    'OKC': 'North America - East Coast',
    'AGP': 'Europe - Continental Europe',
    'KAO': 'Europe - Skandinavia',  # Kuusamo, Finland (was wrongly 'Asia Pacific')
    'BQH': 'Europe - Continental Europe',
    'QSC': 'Europe - Continental Europe',
    'RAK': 'Africa',
    'BLL': 'Europe - Continental Europe',
    'AMA': 'North America - East Coast',
    'GOI': 'Asia Pacific',
    'BHM': 'North America - East Coast',
    'BAH': 'Middle East',
    'CMN': 'Africa',
    'TNG': 'Africa',
    'FLR': 'Europe - Continental Europe',
    'VGO': 'Europe - Continental Europe',
    'STR': 'Europe - Continental Europe',
    'VAA': 'Europe - Skandinavia',  # Vaasa, Finland (was wrongly 'Asia Pacific')
    'ESH': 'Europe - Continental Europe',
    'DIL': 'Asia Pacific',
    'HER': 'Europe - Continental Europe',
    'VNO': 'Europe - Continental Europe',
    'SDQ': 'Caribbean',
    'VIE': 'Europe - Continental Europe',
    'AAL': 'Europe - Skandinavia',
    'RUH': 'Middle East',
    'SSH': 'Africa',
    'TNR': 'Africa',
    'BOS': 'North America - East Coast',
    'GAU': 'Asia Pacific',
    'BRC': 'South America',
    'RVN': 'Europe - Skandinavia',  # Rovaniemi, Finland (was wrongly 'Asia Pacific')
    'DCA': 'North America - East Coast',
    'ORF': 'North America - East Coast',
    'ROC': 'North America - East Coast',
    'OUL': 'Europe - Skandinavia',
    'XNA': 'North America - East Coast',
    'NOU': 'Oceania',
    'LDY': 'Europe - Continental Europe',  # City of Derry, United Kingdom (was wrongly 'North America - East Coast')
    'BHX': 'Europe - Continental Europe',
    'JAX': 'North America - East Coast',
    'PUS': 'Asia Pacific',
    'ORK': 'Europe - Continental Europe',
    'ELP': 'North America - East Coast',
    'OMA': 'North America - East Coast',
    'TUS': 'North America - East Coast',
    'MLU': 'North America - East Coast',
    'CGN': 'Europe - Continental Europe',
    'LXR': 'Africa',
    'MXL': 'North America - East Coast',
    'SNN': 'Europe - Continental Europe'
}

# Look up the sub-region of both ends of the route via their IATA codes
df['origin_sub_region'] = df['origin.code_iata'].map(sub_regions)
df['destination_sub_region'] = df['destination.code_iata'].map(sub_regions)

#### **Add Route Code**<br>
Add a route_code for each flight based on the origin.code_icao and the destination.code_icao.

In [16]:
# Build the route identifier as "<origin ICAO>-<destination ICAO>"
df['route_code'] = df['origin.code_icao'].str.cat(df['destination.code_icao'], sep='-')

#### **Delay Binary Creation and Column Dropping**<br>
Based on the columns departure_delay and arrival_delay, create a binary classification of whether a flight is delayed (1, i.e. more than 15 minutes late) or on time (0).

In [None]:
# Delays are recorded in seconds; a flight counts as delayed when it is more than 15 minutes late
delay_threshold_seconds = 15 * 60

# Vectorised comparison instead of a per-element Python lambda.
# 1 = delayed, 0 = not delayed; a missing delay compares as False and therefore becomes 0.
df['departure_delay_binary_FA'] = (df['departure_delay'] > delay_threshold_seconds).astype('int64')
df['arrival_delay_binary_FA'] = (df['arrival_delay'] > delay_threshold_seconds).astype('int64')

In [20]:
# Remove the raw delay columns that fed the binary delay targets, together with the
# diverted/cancelled flags and all scheduled/estimated/actual out/off/on/in timestamps
delay_source_columns = [
    'departure_delay', 'arrival_delay',
    'diverted', 'cancelled',
    'scheduled_out', 'estimated_out', 'actual_out',
    'scheduled_off', 'estimated_off', 'actual_off',
    'scheduled_on', 'estimated_on', 'actual_on',
    'scheduled_in', 'estimated_in', 'actual_in',
]
df = df.drop(columns=delay_source_columns)

#### **One-Hot Encoding Aggregate of Weather Codes into Binary and Multi-Class (Summation)**<br>
The code extracts all column names in the DataFrame that start with "wx_code" and end with "departure", and then creates two new columns. The first, 'wx_binary_departure', is set to 1 if any of the selected columns have a value of 1, and 0 otherwise, while the second, 'wx_sum_departure', contains the sum of the values in those columns for each row.

In [24]:
# Collect all column names that start with "wx_code" and end in "departure"
wx_departure_columns = [
    col for col in df.columns
    if col.startswith('wx_code') and col.endswith('departure')
]
wx_departure_values = df[wx_departure_columns]

# 'wx_binary_departure' is 1 when any of the departure weather-code columns is set, else 0.
# A vectorised row-wise `any` replaces the per-row Python lambda.
df['wx_binary_departure'] = wx_departure_values.any(axis=1).astype('int64')

# 'wx_sum_departure' is the row-wise sum over the departure weather-code columns
df['wx_sum_departure'] = wx_departure_values.sum(axis=1)

#### **LIFR (Low Instrument Flight Rules)** <br>
Create a new column 'LIFR_binary_departure' that is 1 if the 'flight_rules_departure' column is 4 (LIFR) and 0 otherwise

Conditions:
- Ceiling < 500 ft AGL
- Visibility < 1 SM
- Meaning: Extremely poor weather conditions requiring full instrument navigation and significant restrictions on flight operations.

In [25]:
# Flag departures under LIFR: 1 when 'flight_rules_departure' equals 4 (LIFR), otherwise 0
is_lifr_departure = df['flight_rules_departure'] == 4
df['LIFR_binary_departure'] = is_lifr_departure.astype(int)

#### **Create Altitude 1 with Cat 4 or 5 Clouds Binary** <br>
Create a new binary column 'low_cloud_ceiling_departure' which has value 1 if 'clouds_layer_1_altitude_category_departure' is 4 or 5, and 0 otherwise

In [26]:
# Flag a low cloud ceiling at departure: 1 when the altitude category of cloud layer 1 is 4 or 5, otherwise 0
low_ceiling_categories = [4, 5]
has_low_ceiling = df['clouds_layer_1_altitude_category_departure'].isin(low_ceiling_categories)
df['low_cloud_ceiling_departure'] = has_low_ceiling.astype(int)

## Redundant and self-evidently useless feature dropping <br>

**Drop the fields that are immediately apparent as not useful**

In [None]:
# Columns that are dropped because they are redundant or carry no value for the model,
# each with the reason for its removal.
# NOTE(review): 'METAR_arrival' is dropped here but no 'METAR_departure' counterpart is listed;
# confirm whether such a column exists and should be dropped as well.
redundant_columns = [
    'Unnamed: 0.1',  # Generic index from previous operations
    'Unnamed: 0',  # Generic index from previous operations
    'station_arrival',  # METAR matching field, no longer useful
    'station_departure',  # METAR matching field, no longer useful
    'METAR_departure_time_delta',  # METAR matching field, no longer useful
    'time.dt_departure',  # METAR matching field, no longer useful
    'METAR_arrival',  # METAR matching field, no longer useful
    'METAR_arrival_time_delta',  # METAR matching field, no longer useful
    'time.dt_arrival',  # METAR matching field, no longer useful
    'ident',  # The route itself will serve as an identifier
    'ident_icao',  # The route itself will serve as an identifier
    'ident_iata',  # The route itself will serve as an identifier
    'actual_runway_off',  # Aleatory field, useless for our case
    'actual_runway_on',  # Aleatory field, useless for our case
    'fa_flight_id',  # FlightAware identifier, not useful
    'operator',  # Generic identifier, we are keeping the ICAO identifier as it is unique
    'operator_iata',  # ICAO identifier is more useful
    'flight_number',  # Flight number is not useful for our case
    'registration',  # Registration is not useful for our case
    'atc_ident',  # ATC identifier is not useful for our case
    'inbound_fa_flight_id',  # Would be useful for a more complex analysis accounting for link to previous flight delay, but not for our case
    'codeshares',  # Codeshares are not useful for our case
    'codeshares_iata',  # Codeshares are not useful for our case
    'blocked',  # We have already filtered out blocked flights
    'position_only',  # We have already filtered out position only flights
    'foresight_predictions_available',  # Not useful for our case
    'progress_percent',  # Not useful for our case
    'status',  # Not useful for our case
    'route_distance',  # Filed_ete accomplishes the same purpose
    'filed_altitude',  # Too many missing values, otherwise an interesting feature
    'filed_airspeed',  # Too many junk values with no consistent imputation interpretation
    'route',  # Too many missing values, otherwise an interesting feature
    'baggage_claim',  # Not useful for our case
    'seats_cabin_business',  # Too many missing values and not useful for our case
    'seats_cabin_coach',  # Too many missing values and not useful for our case
    'seats_cabin_first',  # Too many missing values and not useful for our case
    'gate_origin',  # Aleatory, not useful for our case
    'gate_destination',  # Aleatory, not useful for our case
    'terminal_origin',  # Aleatory, not useful for our case
    'terminal_destination',  # Aleatory, not useful for our case
    'type',  # Our data was filtered for passenger flights at the query level
    'origin.code',  # We are keeping the ICAO identifier as it is unique
    'origin.code_iata',  # ICAO identifier is more useful
    'origin.code_lid',  # ICAO identifier is more useful
    'origin.timezone',  # Not useful for our case
    'origin.name',  # Not useful for our case
    'origin.city',  # Not useful for our case
    'origin.airport_info_url',  # Not useful for our case
    'destination.code',  # We are keeping the ICAO identifier as it is unique
    'destination.code_iata',  # ICAO identifier is more useful
    'destination.code_lid',  # ICAO identifier is more useful
    'destination.timezone',  # Not useful for our case
    'destination.name',  # Not useful for our case
    'destination.city',  # Not useful for our case
    'destination.airport_info_url',  # Not useful for our case
    'destination',  # We are keeping the ICAO identifier as it is unique
]

# `errors='ignore'` skips columns that are not present (e.g. a file without METAR data or
# without the 'Unnamed' index columns) instead of raising a KeyError, and makes the cell
# safe to re-run.
df = df.drop(columns=redundant_columns, errors='ignore')

### Save Dataframe

In [27]:
# Write the pre-processed frame to CSV (path is relative to the working directory);
# `index=False` keeps the row index out of the file
output_path = '../df_preprocessed.csv'
df.to_csv(output_path, index=False)