<a href="https://colab.research.google.com/github/sthirisha2003/Flight-Delay-Safety-Fraud-Analysis/blob/main/Flight_Delay%2C_Safety_%26_Fraud_Analysis_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===========================================
# FINAL XGBOOST PIPELINE (ACCURACY-ONLY VERSION)
# ===========================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb

# ===========================================
# Load dataset
# ===========================================
data = pd.read_csv("combined_airline_dataset_balanced.csv")

# ===========================================
# Convert datetime columns
# ===========================================
data['actual_dep'] = pd.to_datetime(data['actual_dep'], errors='coerce')
data['scheduled_dep'] = pd.to_datetime(data['scheduled_dep'], errors='coerce')

# Convert numeric-like columns
numeric_cols = ['scheduled_ops', 'airport_capacity', 'wind_speed_knots',
                'precip_mm', 'visibility_km']
for col in numeric_cols:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# ===========================================
# Feature Engineering
# ===========================================
data['departure_diff'] = (data['actual_dep'] - data['scheduled_dep']).dt.total_seconds() / 60
data['load_ratio'] = data['scheduled_ops'] / (data['airport_capacity'] + 1)
data['weather_index'] = (data['wind_speed_knots'] + data['precip_mm']) / (data['visibility_km'] + 1)

# Drop datetime columns
data = data.drop(columns=['actual_dep', 'scheduled_dep'])

# Handle NaN or infinite values
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)

# ===========================================
# Label Encoding for categorical features
# ===========================================
label_encoders = {}
for col in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

# ===========================================
# Function: Train & Evaluate (Accuracy Only)
# ===========================================
def train_and_evaluate(task_name, target_column):
    print(f"\n================ {task_name.upper()} ================")

    X = data.drop(columns=[target_column])
    y = data[target_column]

    # Remove correlated columns (to avoid data leakage)
    if target_column == "delayed_flag":
        X = X.drop(columns=[col for col in ['delay_minutes','late_by_30'] if col in X.columns])

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Handle class imbalance using SMOTE
    smote = SMOTE(random_state=42)
    X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

    # Compute class weight
    scale_pos_weight = (len(y_train) - sum(y_train)) / (sum(y_train) + 1e-6)

    # Define XGBoost model
    model = xgb.XGBClassifier(
        n_estimators=300,
        learning_rate=0.07,
        max_depth=6,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=2,
        reg_alpha=1,
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        eval_metric='logloss'
    )

    # Train model
    model.fit(X_train_bal, y_train_bal)

    # Predictions
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)

    print(f"Accuracy: {acc:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")

    return {
        'Task': task_name,
        'Accuracy': acc,
        'ROC-AUC': roc_auc
    }

# ===========================================
# Run All Tasks (Accuracy Only)
# ===========================================
results = []
results.append(train_and_evaluate("Delay Prediction", "delayed_flag"))
results.append(train_and_evaluate("Safety Prediction", "safety_incident"))
results.append(train_and_evaluate("Fraud Detection", "failure_event"))

# Save summary
results_df = pd.DataFrame(results)
results_df.to_csv("xgboost_accuracy_results_summary.csv", index=False)

print("\n✅ Accuracy summary saved to 'xgboost_accuracy_results_summary.csv'")
print(results_df)



Accuracy: 0.8405
ROC-AUC: 0.9316

Accuracy: 0.9812
ROC-AUC: 0.8902

Accuracy: 0.8772
ROC-AUC: 0.7956

✅ Accuracy summary saved to 'xgboost_accuracy_results_summary.csv'
                Task  Accuracy   ROC-AUC
0   Delay Prediction   0.84050  0.931554
1  Safety Prediction   0.98125  0.890247
2    Fraud Detection   0.87725  0.795563
