In [1]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import warnings
from mlflow.models.signature import infer_signature

warnings.filterwarnings("ignore")

# Set experiment name (it'll create if not exist)
mlflow.set_experiment("Delay_Classification")

# Load data
df = pd.read_csv("vehicle_with_features.csv")

# Define target label
df['is_delayed'] = df['TimeDiffToStop_min'] > 10  # Label: True if delay > 10 mins
features = ['Speed', 'DistanceToNearestStop_m', 'StoppedDuration', 'Is_Night', 'Hour', 'DayOfWeek']
X = df[features]
y = df['is_delayed'].astype(int)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models to test
models = {
    "RandomForest": RandomForestClassifier(n_estimators=100),
    "LogisticRegression": LogisticRegression(),
    "DecisionTree": DecisionTreeClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
    "NaiveBayes": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(probability=True)
}

# Run experiments
results = []

for name, model in models.items():
    with mlflow.start_run(run_name=name):
        # Train
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        # Metrics
        acc = accuracy_score(y_test, preds)
        prec = precision_score(y_test, preds)
        rec = recall_score(y_test, preds)
        f1 = f1_score(y_test, preds)

        # Log metrics and params
        mlflow.log_param("model", name)
        mlflow.log_metrics({
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1_score": f1
        })

        # Log model with signature
        signature = infer_signature(X_test, preds)
        mlflow.sklearn.log_model(model, artifact_path="model", signature=signature, registered_model_name=f"{name}_delay_model")

        # Output
        print(f"📊 {name}")
        print(f"   Accuracy:  {acc:.4f}")
        print(f"   Precision: {prec:.4f}")
        print(f"   Recall:    {rec:.4f}")
        print(f"   F1 Score:  {f1:.4f}")
        print("-" * 40)

        results.append((name, acc, prec, rec, f1))

# Summary
print("\n✅ Model Comparison Summary")
for name, acc, prec, rec, f1 in sorted(results, key=lambda x: x[4], reverse=True):
    print(f"{name:<18} F1: {f1:.4f}  | Acc: {acc:.4f}  | Prec: {prec:.4f}  | Rec: {rec:.4f}")


2025/07/24 17:49:09 INFO mlflow.tracking.fluent: Experiment with name 'Delay_Classification' does not exist. Creating a new experiment.
Successfully registered model 'RandomForest_delay_model'.
Created version '1' of model 'RandomForest_delay_model'.


📊 RandomForest
   Accuracy:  0.8811
   Precision: 0.8747
   Recall:    0.8551
   F1 Score:  0.8648
----------------------------------------


Successfully registered model 'LogisticRegression_delay_model'.
Created version '1' of model 'LogisticRegression_delay_model'.


📊 LogisticRegression
   Accuracy:  0.6713
   Precision: 0.6696
   Recall:    0.5146
   F1 Score:  0.5820
----------------------------------------


Successfully registered model 'DecisionTree_delay_model'.
Created version '1' of model 'DecisionTree_delay_model'.


📊 DecisionTree
   Accuracy:  0.8571
   Precision: 0.8440
   Recall:    0.8326
   F1 Score:  0.8382
----------------------------------------


Successfully registered model 'GradientBoosting_delay_model'.
Created version '1' of model 'GradientBoosting_delay_model'.


📊 GradientBoosting
   Accuracy:  0.8457
   Precision: 0.8701
   Recall:    0.7674
   F1 Score:  0.8155
----------------------------------------


Successfully registered model 'NaiveBayes_delay_model'.
Created version '1' of model 'NaiveBayes_delay_model'.


📊 NaiveBayes
   Accuracy:  0.7003
   Precision: 0.7339
   Recall:    0.5112
   F1 Score:  0.6026
----------------------------------------


Successfully registered model 'KNN_delay_model'.
Created version '1' of model 'KNN_delay_model'.


📊 KNN
   Accuracy:  0.6169
   Precision: 0.5808
   Recall:    0.4966
   F1 Score:  0.5354
----------------------------------------




📊 SVM
   Accuracy:  0.5624
   Precision: 0.6000
   Recall:    0.0472
   F1 Score:  0.0875
----------------------------------------

✅ Model Comparison Summary
RandomForest       F1: 0.8648  | Acc: 0.8811  | Prec: 0.8747  | Rec: 0.8551
DecisionTree       F1: 0.8382  | Acc: 0.8571  | Prec: 0.8440  | Rec: 0.8326
GradientBoosting   F1: 0.8155  | Acc: 0.8457  | Prec: 0.8701  | Rec: 0.7674
NaiveBayes         F1: 0.6026  | Acc: 0.7003  | Prec: 0.7339  | Rec: 0.5112
LogisticRegression F1: 0.5820  | Acc: 0.6713  | Prec: 0.6696  | Rec: 0.5146
KNN                F1: 0.5354  | Acc: 0.6169  | Prec: 0.5808  | Rec: 0.4966
SVM                F1: 0.0875  | Acc: 0.5624  | Prec: 0.6000  | Rec: 0.0472


Successfully registered model 'SVM_delay_model'.
Created version '1' of model 'SVM_delay_model'.
