#  Import Libraries

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load Dataset

In [2]:
# Directory that holds the input CSV. Set the FLIGHT_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original hard-coded location is used, so existing setups keep working.
data_dir = os.environ.get(
    'FLIGHT_DATA_DIR',
    'C:/Users/aleen/ML_project_flight_delay_prediction/data',
)
file_path = os.path.join(data_dir, 'flights_sample_100k.csv')
df = pd.read_csv(file_path)

print(f"Shape before cleaning: {df.shape}")
df.head()

Shape before cleaning: (100000, 32)


Unnamed: 0,FL_DATE,AIRLINE,AIRLINE_DOT,AIRLINE_CODE,DOT_CODE,FL_NUMBER,ORIGIN,ORIGIN_CITY,DEST,DEST_CITY,...,DIVERTED,CRS_ELAPSED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,DELAY_DUE_CARRIER,DELAY_DUE_WEATHER,DELAY_DUE_NAS,DELAY_DUE_SECURITY,DELAY_DUE_LATE_AIRCRAFT
0,2019-03-01,Allegiant Air,Allegiant Air: G4,G4,20368,1668,PGD,"Punta Gorda, FL",SPI,"Springfield, IL",...,0.0,160.0,138.0,122.0,994.0,,,,,
1,2021-02-16,American Airlines Inc.,American Airlines Inc.: AA,AA,19805,2437,DFW,"Dallas/Fort Worth, TX",LAX,"Los Angeles, CA",...,0.0,211.0,,,1235.0,,,,,
2,2022-04-12,PSA Airlines Inc.,PSA Airlines Inc.: OH,OH,20397,5560,EWN,"New Bern/Morehead/Beaufort, NC",CLT,"Charlotte, NC",...,0.0,79.0,78.0,51.0,221.0,,,,,
3,2021-10-13,Southwest Airlines Co.,Southwest Airlines Co.: WN,WN,19393,1944,ABQ,"Albuquerque, NM",DEN,"Denver, CO",...,0.0,80.0,71.0,49.0,349.0,10.0,0.0,0.0,0.0,6.0
4,2022-06-05,Southwest Airlines Co.,Southwest Airlines Co.: WN,WN,19393,3081,PIT,"Pittsburgh, PA",STL,"St. Louis, MO",...,0.0,105.0,100.0,82.0,554.0,,,,,


# Drop Cancelled and Diverted Flights

In [3]:
# Keep only the rows where both the CANCELLED and DIVERTED flags are 0.
# The explicit copy detaches the result from the original frame.
not_cancelled = df['CANCELLED'].eq(0)
not_diverted = df['DIVERTED'].eq(0)
df = df.loc[not_cancelled & not_diverted].copy()
print(f"Shape after removing cancelled and diverted flights: {df.shape}")

Shape after removing cancelled and diverted flights: (97148, 32)


# Fill Missing Values in Delay Component Columns

In [4]:
# Per-cause delay columns; a missing value is replaced with 0.
delay_cols = [
    'DELAY_DUE_CARRIER',
    'DELAY_DUE_WEATHER',
    'DELAY_DUE_NAS',
    'DELAY_DUE_SECURITY',
    'DELAY_DUE_LATE_AIRCRAFT',
]
filled_delays = df[delay_cols].fillna(value=0)
df[delay_cols] = filled_delays

# Outlier Detection and Removal (IQR Method)

In [5]:
def remove_outliers_iqr(dataframe, columns=None, factor=1.5):
    """Drop rows that fall outside the IQR fences of the selected columns.

    For each screened column in turn, rows outside
    ``[Q1 - factor * IQR, Q3 + factor * IQR]`` are removed. Columns are
    processed sequentially, so the quartiles of a later column are computed
    on the rows that survived the earlier columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame. It is copied and never modified.
    columns : str or iterable of str, optional
        Columns to screen. Defaults to every numeric column of ``dataframe``.
    factor : float, default 1.5
        Multiplier applied to the IQR to place the fences.

    Returns
    -------
    pandas.DataFrame
        The rows lying inside the fences of every screened column.

    Raises
    ------
    ValueError
        If ``factor`` is negative.

    Notes
    -----
    * Rows holding NaN in a screened column are dropped, because NaN fails
      the range comparison.
    * When a column's IQR is 0 (for example a mostly-zero or 0/1 flag
      column) both fences equal that quartile, so only rows equal to it
      survive. Pass ``columns`` to keep such columns, or the prediction
      target, out of the screening.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    df_clean = dataframe.copy()

    if columns is None:
        screened = df_clean.select_dtypes(include='number').columns
    elif isinstance(columns, str):
        # A bare string would otherwise be iterated character by character.
        screened = [columns]
    else:
        screened = list(columns)

    for col in screened:
        q1, q3 = df_clean[col].quantile([0.25, 0.75])
        spread = q3 - q1
        lower = q1 - factor * spread
        upper = q3 + factor * spread
        # between() is inclusive on both ends and evaluates to False for NaN.
        df_clean = df_clean[df_clean[col].between(lower, upper)]

    return df_clean

# Report the frame size on either side of the IQR screening.
shape_before = df.shape
print(f"Shape before outlier removal: {shape_before}")
df = remove_outliers_iqr(df)
shape_after = df.shape
print(f"Shape after outlier removal: {shape_after}")

Shape before outlier removal: (97148, 32)
Shape after outlier removal: (66212, 32)


# Feature Engineering: Time Columns and Date

In [6]:
# The scheduled times are treated as HHMM-encoded numbers (905 -> hour 9), so
# integer division by 100 yields the hour directly. Unlike the string
# round-trip (astype(str).zfill(4)[:2]) this also works when the column is
# float: "905.0" would not be padded and its first two characters are "90".
df['CRS_DEP_HOUR'] = (df['CRS_DEP_TIME'] // 100).astype(int)
df['CRS_ARR_HOUR'] = (df['CRS_ARR_TIME'] // 100).astype(int)

# errors='coerce' turns unparseable dates into NaT, which in turn produces a
# missing DAY_OF_WEEK for that row instead of raising.
df['FL_DATE'] = pd.to_datetime(df['FL_DATE'], errors='coerce')
df['DAY_OF_WEEK'] = df['FL_DATE'].dt.dayofweek  # Monday=0 ... Sunday=6

# Drop original time/date columns
df = df.drop(columns=['CRS_DEP_TIME', 'CRS_ARR_TIME', 'FL_DATE'])

# One-Hot Encoding 

In [7]:
# Expand the categorical columns into indicator columns; drop_first=True
# omits the first level of each column.
categorical_cols = ['AIRLINE', 'ORIGIN', 'DEST']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Define Features and Target

In [8]:
# Columns left out of the feature matrix; errors='ignore' skips any name that
# is not present in the frame.
target_col = 'ARR_DELAY'
excluded_cols = [
    'ARR_DELAY', 'ELAPSED_TIME', 'ARR_TIME', 'DEP_TIME',
    'CANCELLED', 'DIVERTED', 'DOT_CODE',
]

X = df.drop(columns=excluded_cols, errors='ignore')
y = df[target_col]

print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

Shape of X: (66212, 781)
Shape of y: (66212,)


# Define Preprocessing Pipeline

In [9]:
# Columns that are standardised by the preprocessing step.
numeric_features = [
    'CRS_DEP_HOUR',
    'CRS_ARR_HOUR',
    'TAXI_OUT',
    'TAXI_IN',
    'AIR_TIME',
    'DISTANCE',
    'DELAY_DUE_CARRIER',
    'DELAY_DUE_WEATHER',
    'DELAY_DUE_NAS',
    'DELAY_DUE_SECURITY',
    'DELAY_DUE_LATE_AIRCRAFT',
    'DAY_OF_WEEK',
]

# NOTE(review): ColumnTransformer defaults to remainder='drop', so every column
# of X that is not listed above (including the one-hot indicator columns) is
# left out of the transformed output. Confirm that this is intended.
numeric_scaler = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_scaler, numeric_features)],
)