# Preprocessing and Feature Engineering

In this notebook, we will clean the dataset, create meaningful features, and prepare the data for modeling. 

We will:
- Convert time features to usable formats
- Encode categorical variables
- Handle missing values
- Create new features to capture important information
- Prepare the dataset for machine learning

In [19]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

sns.set_style('whitegrid')

# Load the cleaned dataset
data_path = "../data/processed/flights_clean.csv"
df = pd.read_csv(data_path)

# Quick look
df.head()

Unnamed: 0,is_delay,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,Origin,OriginState,...,DestState,CRSDepTime,Cancelled,Diverted,Distance,DistanceGroup,ArrDelay,ArrDelayMinutes,AirTime,CRSDepHour
0,1,2014,1,1,1,3,2014-01-01,UA,LAX,CA,...,IL,900,0.0,0.0,1744.0,7,43.0,43.0,218.0,9
1,0,2014,1,1,1,3,2014-01-01,AA,IAH,TX,...,TX,1750,0.0,0.0,224.0,1,2.0,2.0,50.0,17
2,1,2014,1,1,1,3,2014-01-01,AA,LAX,CA,...,IL,1240,0.0,0.0,1744.0,7,26.0,26.0,220.0,12
3,1,2014,1,1,1,3,2014-01-01,AA,DFW,TX,...,CA,1905,0.0,0.0,1235.0,5,159.0,159.0,169.0,19
4,0,2014,1,1,1,3,2014-01-01,AA,DFW,TX,...,NC,1115,0.0,0.0,936.0,4,-13.0,0.0,108.0,11


# Check data types and missing values

In [20]:
print("Data types:\n", df.dtypes)
print("\nMissing values:\n", df.isnull().sum())

Data types:
 is_delay               int64
Year                   int64
Quarter                int64
Month                  int64
DayofMonth             int64
DayOfWeek              int64
FlightDate            object
Reporting_Airline     object
Origin                object
OriginState           object
Dest                  object
DestState             object
CRSDepTime             int64
Cancelled            float64
Diverted             float64
Distance             float64
DistanceGroup          int64
ArrDelay             float64
ArrDelayMinutes      float64
AirTime              float64
CRSDepHour             int64
dtype: object

Missing values:
 is_delay             0
Year                 0
Quarter              0
Month                0
DayofMonth           0
DayOfWeek            0
FlightDate           0
Reporting_Airline    0
Origin               0
OriginState          0
Dest                 0
DestState            0
CRSDepTime           0
Cancelled            0
Diverted             0
D

# Convert columns to appropriate data types

In [21]:
df['FlightDate'] = pd.to_datetime(df['FlightDate'])
df['ScheduledHour'] = df['CRSDepTime'].astype(str).str.zfill(4).str[:2].astype(int)

df[['FlightDate', 'ScheduledHour']].head()

Unnamed: 0,FlightDate,ScheduledHour
0,2014-01-01,9
1,2014-01-01,17
2,2014-01-01,12
3,2014-01-01,19
4,2014-01-01,11


# Handle missing values in important columns

In [22]:
# Fill missing ArrDelayMinutes and AirTime with median or zero
df['ArrDelayMinutes'] = df['ArrDelayMinutes'].fillna(0)
df['AirTime'] = df['AirTime'].fillna(df['AirTime'].median())

# Verify no more missing values in these columns
print(df[['ArrDelayMinutes', 'AirTime']].isnull().sum())

ArrDelayMinutes    0
AirTime            0
dtype: int64


# Encode categorical variables with one-hot encoding

In [23]:
categorical_cols = ['Reporting_Airline', 'Origin', 'Dest']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print(f"Shape before encoding: {df.shape}")
print(f"Shape after encoding: {df_encoded.shape}")

Shape before encoding: (1635590, 22)
Shape after encoding: (1635590, 39)


# Feature selection: Drop columns not used for modeling

In [24]:
drop_cols = ['ArrDelay']  # Keep 'is_delay' as target
df_features = df_encoded.drop(columns=drop_cols, errors='ignore')

# Prepare train/test split

In [25]:
from sklearn.model_selection import train_test_split

target_col = 'is_delay'
X = df_features.drop(columns=[target_col])
y = df_features[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train set shape: {X_train.shape}, Test set shape: {X_test.shape}")

Train set shape: (1308472, 37), Test set shape: (327118, 37)


# Concatenate features and target for saving

In [26]:
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Save to CSV

In [27]:
os.makedirs("../data/processed", exist_ok=True)
train_df.to_csv("../data/processed/train.csv", index=False)
test_df.to_csv("../data/processed/test.csv", index=False)

print("✅ train.csv and test.csv saved in data/processed/")

✅ train.csv and test.csv saved in data/processed/
