In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [3]:
# Step 2: Load Data
# The input path is relative, so it is resolved against the kernel's
# current working directory.
DATA_PATH = "cleaned_flight_data.csv"
df = pd.read_csv(DATA_PATH)

In [5]:
# Step 3: Define Features and Target
# The label column is split off into `y`; every remaining column of `df`
# becomes a candidate feature in `X`. `df` itself is left unmodified.
TARGET_COL = "Flight_Cancelled"
y = df[TARGET_COL]
X = df.drop(columns=TARGET_COL)

In [7]:
# Step 4: Identify Column Types
# These two lists decide which transformer each feature column is routed to.
# A column that ends up in neither list is not passed to any transformer, so
# the selection is kept dtype-width agnostic.

# Text-like features: plain object columns plus pandas categorical columns.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# 'number' matches every numeric dtype (any int/float width), not only
# int64/float64, so a downcast column such as int32 or float32 is not left out.
# Non-numeric dtypes such as bool are still excluded from this list.
numerical_cols = X.select_dtypes(include='number').columns.tolist()

In [9]:
# Step 5: Split Dataset
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
# NOTE(review): the split is not stratified on `y`; consider `stratify=y`
# if the two classes are imbalanced - confirm against the label distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Step 6: Preprocessor
# Numeric features are standardised; categorical features are one-hot encoded.
numeric_scaler = StandardScaler()
# handle_unknown='ignore': a category that was not seen during fit does not
# raise at transform time.
category_encoder = OneHotEncoder(handle_unknown='ignore')

column_steps = [
    ("num", numeric_scaler, numerical_cols),
    ("cat", category_encoder, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [13]:
# Step 7: Models to Compare
# Maps a display name to an unfitted estimator. Insertion order is the order
# in which the models are later iterated.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000)
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)
# You can try 'rbf' too
models["Support Vector Machine"] = SVC(kernel='linear')

In [15]:
# Step 8: Train and Evaluate Models
# Each candidate estimator is wrapped in its own Pipeline together with the
# shared `preprocessor`, fitted on the training split and scored on the test
# split. One summary dict per model is collected in `results`.
results = []
predictions_df = pd.DataFrame()

for model_name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", model)])

    # Pipeline.fit returns the pipeline itself, so fit and predict can be chained.
    y_pred = pipe.fit(X_train, y_train).predict(X_test)

    # Keep this model's test-set predictions in a column named after the model.
    # NOTE(review): the original comment says these are for a .csv export, but
    # none of the cells visible here writes `predictions_df` to disk.
    predictions_df[model_name] = y_pred

    # Score the predictions against the held-out labels; precision, recall
    # and F1 are called with their default settings.
    accuracy, precision, recall, f1 = (
        scorer(y_test, y_pred)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # Summary row for the comparison table: every metric rounded to 3 decimals.
    summary_row = {"Model": model_name}
    summary_row.update(
        (label, round(score, 3))
        for label, score in (
            ("Accuracy", accuracy),
            ("Precision", precision),
            ("Recall", recall),
            ("F1 Score", f1),
        )
    )
    results.append(summary_row)


In [17]:
# Step 9: Create Comparison Table
# One row per dict in `results`; the dict keys become the column names.
results_df = pd.DataFrame(results)
# Use a newline as the separator between the title and the table. With
# print's default separator (a single space) and a title ending in "\n",
# the table's header line is pushed one column to the right of its data rows.
print("Model Comparison Table:", results_df, sep="\n")

Model Comparison Table:
                     Model  Accuracy  Precision  Recall  F1 Score
0     Logistic Regression     0.803      0.836   0.889     0.862
1           Decision Tree     0.967      0.971   0.981     0.976
2           Random Forest     0.985      0.998   0.981     0.989
3  Support Vector Machine     0.802      0.839   0.881     0.860


In [19]:
# Step 10: Save Results to CSV
# Write the comparison table to the working directory; index=False keeps the
# DataFrame's row index out of the file.
results_df.to_csv(
    path_or_buf="classification_model_results.csv",
    index=False,
)