# Import libraries and load raw data

In [12]:
import pandas as pd
import numpy as np
import os

# Read the raw flight-delay CSV and preview its first rows.
raw_csv_path = '../data/raw/archive/flight_delay_predict.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,is_delay,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,Origin,OriginState,Dest,DestState,CRSDepTime,Cancelled,Diverted,Distance,DistanceGroup,ArrDelay,ArrDelayMinutes,AirTime
0,1.0,2014,1,1,1,3,2014-01-01,UA,LAX,CA,ORD,IL,900,0.0,0.0,1744.0,7,43.0,43.0,218.0
1,0.0,2014,1,1,1,3,2014-01-01,AA,IAH,TX,DFW,TX,1750,0.0,0.0,224.0,1,2.0,2.0,50.0
2,1.0,2014,1,1,1,3,2014-01-01,AA,LAX,CA,ORD,IL,1240,0.0,0.0,1744.0,7,26.0,26.0,220.0
3,1.0,2014,1,1,1,3,2014-01-01,AA,DFW,TX,LAX,CA,1905,0.0,0.0,1235.0,5,159.0,159.0,169.0
4,0.0,2014,1,1,1,3,2014-01-01,AA,DFW,TX,CLT,NC,1115,0.0,0.0,936.0,4,-13.0,0.0,108.0


In [13]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

(1635590, 20)

# Select relevant columns

In [15]:
# Columns retained for the cleaned dataset: label, calendar fields,
# carrier/route identifiers, schedule, status flags, distance and delay metrics.
columns_to_keep = [
    'is_delay',
    'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate',
    'Reporting_Airline',
    'Origin', 'OriginState', 'Dest', 'DestState',
    'CRSDepTime',
    'Cancelled', 'Diverted',
    'Distance', 'DistanceGroup',
    'ArrDelay', 'ArrDelayMinutes', 'AirTime',
]

# Label-based selection; raises KeyError if any listed column is missing.
df = df.loc[:, columns_to_keep]
n_columns = df.shape[1]
print(f"🧾 Columns reduced to: {n_columns}")

🧾 Columns reduced to: 20


# Remove cancelled and diverted flights (keep only rows where both flags are 0)

In [17]:
# Keep only rows whose Cancelled and Diverted flags are both 0.
# Rows where either flag is missing compare as False and are dropped too.
keep_mask = (df['Cancelled'] == 0.0) & (df['Diverted'] == 0.0)

# .copy() makes the filtered result an independent frame, so the column
# assignments in the following cells modify it directly instead of raising
# SettingWithCopyWarning on a slice of the previous frame.
df = df.loc[keep_mask].copy()


# Convert 'FlightDate' to datetime

In [18]:
# Parse the FlightDate column into datetime64 values; the column keeps its
# position because assign() replaces an existing column in place.
flight_dates = pd.to_datetime(df['FlightDate'])
df = df.assign(FlightDate=flight_dates)

# Convert CRSDepTime to hour of the day

In [19]:
# CRSDepTime is read as an HHMM number (e.g. 900 -> hour 9, 1750 -> hour 17).
# Integer division by 100 extracts the hour directly. The previous string
# slicing (zero-pad to 4 chars, take the first 2) silently produced wrong hours
# when the column was float-typed, because "900.0" is already longer than four
# characters and its first two characters are "90".
# astype(int) still raises if the column contains missing values, matching the
# original behaviour of failing loudly rather than inventing an hour.
df['CRSDepHour'] = df['CRSDepTime'].astype(int) // 100

# Fill nulls in the delay columns with 0 and in AirTime with its median (note: ArrDelay itself can be negative, e.g. -13 or -5)

In [20]:
# Per-column replacement values for missing entries:
#   - both arrival-delay columns -> 0
#   - AirTime -> median of the observed AirTime values
null_replacements = {
    'ArrDelayMinutes': 0,
    'ArrDelay': 0,
    'AirTime': df['AirTime'].median(),
}
df = df.fillna(value=null_replacements)

# Ensure 'is_delay' is binary integer

In [21]:
# Cast the is_delay label (loaded as 0.0 / 1.0 floats) to integers.
# Raises if the column still contains missing values.
df = df.astype({'is_delay': int})

# Reset index

In [22]:
# Renumber rows 0..n-1 after filtering; drop=True discards the old index
# instead of keeping it as a column.
df = df.reset_index(drop=True)

# Save cleaned dataset

In [23]:
# Write the cleaned frame to the processed-data folder, creating the folder
# first if it does not exist yet. The index is not written to the file.
processed_dir = "../data/processed"
clean_csv_path = "../data/processed/flights_clean.csv"

os.makedirs(processed_dir, exist_ok=True)
df.to_csv(clean_csv_path, index=False)
print("📁 Cleaned data saved to: data/processed/flights_clean.csv")

📁 Cleaned data saved to: data/processed/flights_clean.csv
