In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [None]:
# STEP 2: load the raw data, then clean and preprocess it.

# Read the CSV and discard every row whose target (arr_del15) is missing.
# dropna raises KeyError if the "arr_del15" column is absent from the file.
df = pd.read_csv("Airline_Delay_Cause.csv").dropna(subset=["arr_del15"])

# Columns judged unlikely to help prediction. Only the names actually present
# in the frame are dropped, so absent ones are skipped instead of raising.
unhelpful = ('arr_delay', 'carrier_name', 'airport_name')
drop_cols = [name for name in unhelpful if name in df.columns]
df = df.drop(columns=drop_cols)



In [None]:
# Separate the label (arr_del15) from the predictors, then one-hot encode the
# predictors; drop_first=True omits the first level of each encoded column.
# NOTE(review): a later cell rebuilds X and y after filtering the target, so
# the values computed here are overwritten before any model is trained.
y = df["arr_del15"]
X = pd.get_dummies(df.drop(columns=["arr_del15"]), drop_first=True)

In [None]:
# 1. Keep only the rows whose target is exactly 0 or 1
# NOTE(review): presumably arr_del15 is a 0/1 delay flag; if it is actually a
# count, this filter discards most rows — confirm against the dataset.
is_binary_target = df['arr_del15'].isin([0, 1])
df = df[is_binary_target]

# 2-3. Rebuild the label and the one-hot encoded predictors from the
#      filtered frame
y = df['arr_del15']
X = pd.get_dummies(df.drop(columns=['arr_del15']), drop_first=True)

# 4. Show how many rows fall in each class
print("✅ Class distribution:")
class_counts = y.value_counts()
print(class_counts)

# 5. Seeded 80/20 split, stratified on the target so both halves keep the
#    same class proportions
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# 6. Fit a 100-tree random forest on the (unscaled) training rows
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# 7. Score the held-out rows and report the results
y_pred = model.predict(X_test)

print("\n✅ Classification Report:")
print(classification_report(y_test, y_pred))

print("✅ Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits so no test-set statistics influence the scaling.
# NOTE(review): tree ensembles are generally insensitive to feature scaling —
# confirm this step is actually needed for the random forest below.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Re-create the random forest and fit it on the standardized training
# features; this replaces the estimator fitted earlier on unscaled data.
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_params)
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict on the standardized test features and print each metric in turn.
y_pred = model.predict(X_test_scaled)

# (label, metric function) pairs, listed in the order they are printed.
metric_reports = (
    ("Accuracy:", accuracy_score),
    ("\nConfusion Matrix:\n", confusion_matrix),
    ("\nClassification Report:\n", classification_report),
)
for label, metric in metric_reports:
    print(label, metric(y_test, y_pred))

In [None]:
# Pair each importance score with its encoded feature name, then draw the
# 15 largest as a horizontal bar chart.
importances = pd.Series(model.feature_importances_, index=X.columns)
top_features = importances.nlargest(15)
ax = top_features.plot.barh(figsize=(10, 7))
ax.set_title("Top 15 Feature Importances")
plt.show()

In [17]:
import joblib

# Save trained model
joblib.dump(model, "flight_delay_model.pkl")

# Save the fitted scaler as well: the model above was trained on the output
# of this StandardScaler, so raw features must go through the same fitted
# transform before being passed to model.predict at inference time.
joblib.dump(scaler, "flight_delay_scaler.pkl")

# Save the feature columns used during training, so new data can be aligned
# to the same one-hot encoded layout. Kept as the last expression so the
# cell's displayed output stays the column-file path.
joblib.dump(X.columns.tolist(), "model_columns.pkl")

['model_columns.pkl']