In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder

In [2]:
# Location of the flight-with-weather table inside the Kaggle input mount.
DATA_PATH = Path("/kaggle/input/mfddmulti-modal-flight-delay-dataset/Aeolus/Flight_Tab/flight_with_weather_2024.csv")

# Read the whole CSV into memory; low_memory=True lets pandas parse the file
# in internal chunks, which lowers peak memory while parsing.
df = pd.read_csv(filepath_or_buffer=DATA_PATH, low_memory=True)
print(f"Raw shape: {df.shape}")


Raw shape: (6284841, 34)


In [3]:
# Binary target: 1 when the departure delay is strictly greater than 15, else 0.
# A missing DEP_DELAY compares as False and therefore becomes 0.
# NOTE(review): if missing DEP_DELAY marks cancelled/diverted flights, they are
# being labelled "not delayed" here -- confirm that this is intended.
df["Delayed"] = df["DEP_DELAY"].gt(15).astype(int)

# Parse the flight date (unparseable values become NaT) and derive calendar
# parts from it. `weekday` is 0 for Monday through 6 for Sunday.
df["FL_DATE"] = pd.to_datetime(df["FL_DATE"], errors="coerce")
for new_col, date_part in (
    ("FL_MONTH", "month"),
    ("FL_DAY", "day"),
    ("FL_WEEKDAY", "weekday"),
):
    df[new_col] = getattr(df["FL_DATE"].dt, date_part)


In [4]:
def parse_time_column(df, col_name):
    """
    Parse a timestamp column and add its hour and minute as new columns.

    The column `col_name` is converted with `pd.to_datetime` (values that
    cannot be parsed become NaT) and overwritten in place. Two columns,
    `<col_name>_hour` and `<col_name>_minute`, are then added to the frame.
    No seconds column is created.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated in place.
    col_name : str
        Name of the timestamp column (e.g. values like 2024-01-01 12:52:00).

    Returns
    -------
    pandas.DataFrame
        The same `df` object. If `col_name` is not a column of `df`,
        the frame is returned untouched.
    """
    if col_name in df.columns:
        parsed = pd.to_datetime(df[col_name], errors="coerce")
        df[col_name] = parsed
        # Hour first, then minute, so the new columns keep their original order.
        for part in ("hour", "minute"):
            df[f"{col_name}_{part}"] = getattr(parsed.dt, part)

    return df

In [5]:
# Expand each timestamp column into its `_hour` / `_minute` companions.
# Columns that are absent from the frame are skipped by parse_time_column.
for time_col in ("CRS_DEP_TIME", "DEP_TIME", "WHEELS_OFF"):
    df = parse_time_column(df, time_col)

In [6]:
# Candidate model inputs; any name that is not a column of `df` is skipped.
# NOTE(review): DEP_TIME_hour, TAXI_OUT and WHEELS_OFF_hour look like values
# that are only known once the flight has actually left. If `Delayed` is meant
# to be predicted before departure, these may leak the target -- confirm.
feature_cols = [
    name
    for name in (
        "OP_CARRIER",
        "OP_CARRIER_FL_NUM",
        "ORIGIN",
        "DEST",
        "FL_MONTH",
        "FL_DAY",
        "FL_WEEKDAY",
        "CRS_DEP_TIME_hour",
        "DEP_TIME_hour",
        "TAXI_OUT",
        "WHEELS_OFF_hour",
    )
    if name in df.columns
]

print(f"Using features: {feature_cols}")

# Split the features into categorical columns and everything else (numeric).
cat_cols = [name for name in ("OP_CARRIER", "ORIGIN", "DEST") if name in df.columns]
num_cols = [name for name in feature_cols if name not in cat_cols]

# Numeric gaps are filled with the column median.
for num_name in num_cols:
    col_median = df[num_name].median()
    df[num_name] = df[num_name].fillna(col_median)

# Categorical gaps are filled with the most frequent value; mode() returns the
# modes in sorted order, so ties resolve to the first of them.
for cat_name in cat_cols:
    most_common = df[cat_name].mode()[0]
    df[cat_name] = df[cat_name].fillna(most_common)

Using features: ['OP_CARRIER', 'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST', 'FL_MONTH', 'FL_DAY', 'FL_WEEKDAY', 'CRS_DEP_TIME_hour', 'DEP_TIME_hour', 'TAXI_OUT', 'WHEELS_OFF_hour']


In [7]:
# Fitted encoder per categorical column, kept so the integer codes can be
# mapped back to the original labels via `inverse_transform`.
le_dict = {}

# Replace each categorical column with integer codes, in place.
# NOTE: re-running this cell without reloading `df` would fit the encoders on
# the already-encoded integers and lose the original label mapping.
for cat_name in cat_cols:
    encoder = LabelEncoder().fit(df[cat_name])
    df[cat_name] = encoder.transform(df[cat_name])
    le_dict[cat_name] = encoder

In [8]:
# Feature matrix (the selected feature columns) and the binary target vector.
X = df.loc[:, feature_cols]
y = df["Delayed"]

print(f"Final X shape: {X.shape}")
# Class balance of the target: number of rows per label.
print("Final y distribution:", y.value_counts(), sep="\n")

Final X shape: (6284841, 11)
Final y distribution:
Delayed
0    5032470
1    1252371
Name: count, dtype: int64


In [9]:
# Destination of the cleaned table inside the Kaggle working directory.
cleaned_path = Path("/kaggle/working/cleaned_flight_data.csv")

# Put the features and the target side by side (target last) and write them
# out without the row index.
out_df = pd.concat(objs=(X, y), axis="columns")
out_df.to_csv(cleaned_path, index=False)
print(f"Saved cleaned dataset to: {cleaned_path}")

Saved cleaned dataset to: /kaggle/working/cleaned_flight_data.csv
