In [None]:
# ✈️ 1. IMPORTS & SETTINGS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot defaults: seaborn "whitegrid" theme first, then the
# default figure size (set afterwards so the theme call cannot overwrite it).
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

# 📦 2. LOAD DATA
# The path is relative to the working directory the notebook is run from.
DATA_FILE = "Airline_Delay_Cause.csv"
df = pd.read_csv(DATA_FILE)



In [None]:
# 🔍 3. CLEANING
# Rebinds `df` to a cleaned copy instead of mutating it in place, so this cell
# can be re-run without errors:
#   1. drop every row that has at least one missing value,
#   2. normalise column names (trim, lower-case, spaces -> underscores),
#   3. drop the two descriptive name columns,
#   4. renumber the index from 0.
# errors="ignore" is what makes step 3 safe on a second run, when the two
# columns have already been removed; without it the drop raises KeyError.
df = (
    df.dropna()
    .rename(columns=lambda col: str(col).strip().lower().replace(" ", "_"))
    .drop(columns=["carrier_name", "airport_name"], errors="ignore")
    .reset_index(drop=True)
)

In [None]:
# 📊 4. BASIC STATS
# Three quick looks at the frame: its dimensions, summary statistics for every
# column regardless of dtype, and the number of missing values per column.
n_rows, n_cols = df.shape
print("Shape:", (n_rows, n_cols))

summary_stats = df.describe(include="all")
print(summary_stats)

missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# 📈 5. DISTRIBUTION OF ARRIVAL DELAYS
# Rows whose arr_delay exceeds the cutoff are treated as extreme outliers and
# left out of the plot. The cutoff is defined once so the filter and the title
# cannot drift apart.
MAX_ARR_DELAY = 1000  # minutes
filtered_df = df[df["arr_delay"] <= MAX_ARR_DELAY]

fig, ax = plt.subplots()
sns.histplot(filtered_df["arr_delay"], bins=50, kde=True, ax=ax)
ax.set_title(f"Distribution of Arrival Delays (<= {MAX_ARR_DELAY} min)")
ax.set_xlabel("Arrival Delay (minutes)")
# The frame has an `arr_flights` count column, so a row is not a single flight;
# the histogram bars count rows (records), not flights.
ax.set_ylabel("Number of Records")
plt.show()


In [None]:
# 📉 6. PROPORTION OF FLIGHTS DELAYED ≥15 MIN
# Per-row ratio of arr_del15 to arr_flights (a proportion, not a percentage).
# A row with zero arrivals has no defined rate: masking the denominator makes
# the result NaN there, instead of the inf that plain division by zero would
# store in the frame.
arr_flights = df["arr_flights"]
df["delayed_15+"] = df["arr_del15"] / arr_flights.where(arr_flights > 0)

fig, ax = plt.subplots()
# Undefined rates are left out of the histogram explicitly.
sns.histplot(df["delayed_15+"].dropna(), bins=30, kde=True, ax=ax)
ax.set_title("Proportion of Flights Delayed ≥15 Minutes")
ax.set_xlabel("Delay Rate")
plt.show()

In [None]:
# 🔍 7. AVERAGE DELAY MINUTES BY CAUSE
# The per-cause delay columns; this list is reused by the correlation cell.
cause_cols = [
    "carrier_delay",
    "weather_delay",
    "nas_delay",
    "security_delay",
    "late_aircraft_delay",
]

# Mean of each cause column over all rows, ordered from largest to smallest.
cause_means = df[cause_cols].mean()
df_cause = cause_means.sort_values(ascending=False)

In [None]:
# One horizontal bar per cause, in the order stored in `df_cause`.
fig, ax = plt.subplots()
sns.barplot(x=df_cause.to_numpy(), y=df_cause.index, ax=ax)
ax.set_title("Average Delay Duration by Cause (minutes)")
ax.set_xlabel("Avg Delay (minutes)")
ax.set_ylabel("Cause of Delay")
plt.show()

In [None]:
# 🗓️ 8. TEMPORAL TRENDS
# Combine each row's year and month into a single timestamp, with the day
# fixed to the 1st, and store it as the `date` column.
date_parts = df[["year", "month"]].copy()
date_parts["day"] = 1
df["date"] = pd.to_datetime(date_parts)

In [None]:
# Monthly Average Delay
# Mean arr_delay per `date` value, kept as a two-column frame for plotting.
monthly_delay = df.groupby("date", as_index=False)["arr_delay"].mean()

fig, ax = plt.subplots()
sns.lineplot(data=monthly_delay, x="date", y="arr_delay", ax=ax)
ax.set_title("Monthly Average Arrival Delay")
ax.set_xlabel("Month")
ax.set_ylabel("Avg Delay (min)")
# Tilt the date labels so neighbouring ticks do not overlap.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Delay by Month
# Mean arr_delay per value of the `month` column, pooled across all years.
monthly = df["arr_delay"].groupby(df["month"]).mean()

fig, ax = plt.subplots()
sns.barplot(x=monthly.index, y=monthly.to_numpy(), ax=ax)
ax.set_title("Average Delay by Month")
ax.set_xlabel("Month")
ax.set_ylabel("Avg Delay (min)")
plt.show()

In [None]:
# ✈️ 9. DELAY BY AIRPORT OR CARRIER (Top 10)
def plot_top_avg_delay(frame, group_col, title, ylabel, n=10):
    """Plot and return the `n` groups with the highest mean `arr_delay`.

    Args:
        frame: DataFrame containing `group_col` and an `arr_delay` column.
        group_col: Name of the column to group by (e.g. "airport").
        title: Title shown above the bar chart.
        ylabel: Label for the categorical (vertical) axis.
        n: How many top groups to keep.

    Returns:
        Series of mean `arr_delay` per group, sorted descending and truncated
        to the first `n` entries.
    """
    # NOTE(review): this is an unweighted mean over rows; if rows are
    # aggregates of differing size, busy groups may dominate — confirm intent.
    top = (
        frame.groupby(group_col)["arr_delay"]
        .mean()
        .sort_values(ascending=False)
        .head(n)
    )

    fig, ax = plt.subplots()
    sns.barplot(x=top.values, y=top.index, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Avg Delay (min)")
    ax.set_ylabel(ylabel)
    plt.show()
    return top


top_airports = plot_top_avg_delay(
    df, "airport", "Top 10 Airports by Average Delay", "Airport"
)
top_carriers = plot_top_avg_delay(
    df, "carrier", "Top 10 Carriers by Average Delay", "Carrier"
)

In [None]:
# 📉 10. CORRELATION ANALYSIS
# Pairwise correlation matrix over the cause columns plus arr_delay, drawn as
# an annotated heatmap with two-decimal cell labels.
corr_columns = [*cause_cols, "arr_delay"]
corr = df.loc[:, corr_columns].corr()

fig, ax = plt.subplots()
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation of Delay Causes with Total Delay")
plt.show()

In [None]:
# Closing summary, printed one statement per line.
# NOTE(review): these statements are fixed text, not derived from the values
# computed in the cells above — confirm they still hold if the data changes.
insight_lines = (
    "Key Insights:",
    "- Late aircraft and NAS delays are the top contributors to total delay.",
    "- Delays peak during summer months (June–August).",
    "- Some airports and carriers consistently experience higher delays.",
)
print("\n".join(insight_lines))