In [41]:
import pandas as pd
import numpy as np

In [42]:
# Load dataset
df = pd.read_csv("airline_delay.csv")

In [43]:
# Check dataset shape
df.shape

(171666, 21)

In [44]:
# Check missing values
df.isnull().sum()

year                     0
month                    0
carrier                  0
carrier_name             0
airport                  0
airport_name             0
arr_flights            240
arr_del15              443
carrier_ct             240
weather_ct             240
nas_ct                 240
security_ct            240
late_aircraft_ct       240
arr_cancelled          240
arr_diverted           240
arr_delay              240
carrier_delay          240
weather_delay          240
nas_delay              240
security_delay         240
late_aircraft_delay    240
dtype: int64

In [45]:
# Separate numerical and categorical columns
num_cols = df.select_dtypes(include="number").columns
cat_cols = df.select_dtypes(include="object").columns

# Handle missing values
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

In [46]:
# Remove duplicate rows
df = df.drop_duplicates()

df.shape

(171666, 21)

In [47]:
# One-hot encode categorical features
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

df_encoded.shape

(171666, 871)

In [48]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_encoded[num_cols] = scaler.fit_transform(df_encoded[num_cols])

In [49]:
df_encoded.columns

Index(['year', 'month', 'arr_flights', 'arr_del15', 'carrier_ct', 'weather_ct',
       'nas_ct', 'security_ct', 'late_aircraft_ct', 'arr_cancelled',
       ...
       'airport_name_Williston, ND: Sloulin Field International',
       'airport_name_Williston, ND: Williston Basin International',
       'airport_name_Wilmington, DE: New Castle',
       'airport_name_Wilmington, NC: Wilmington International',
       'airport_name_Worcester, MA: Worcester Regional',
       'airport_name_Wrangell, AK: Wrangell Airport',
       'airport_name_Yakima, WA: Yakima Air Terminal/McAllister Field',
       'airport_name_Yakutat, AK: Yakutat Airport',
       'airport_name_Youngstown/Warren, OH: Youngstown-Warren Regional',
       'airport_name_Yuma, AZ: Yuma MCAS/Yuma International'],
      dtype='object', length=871)

In [51]:
from sklearn.feature_selection import SelectKBest, f_classif

# Define features and target
X = df_encoded.drop("arr_del15", axis=1)
y = df_encoded["arr_del15"]

# Select top 10 most relevant features
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)

X_selected.shape

(171666, 10)

In [52]:
selected_features = X.columns[selector.get_support()]
selected_features

Index(['arr_flights', 'carrier_ct', 'weather_ct', 'nas_ct', 'late_aircraft_ct',
       'arr_diverted', 'arr_delay', 'carrier_delay', 'nas_delay',
       'late_aircraft_delay'],
      dtype='object')