In [None]:
!pip install mlflow scikit-learn pandas


Collecting mlflow
  Downloading mlflow-3.1.4-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.4 (from mlflow)
  Downloading mlflow_skinny-3.1.4-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.4->mlflow)
  Downloading databricks_sdk-0.60.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.4->mlflow)
  Downloading opentelemetry_api-1.35.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-sdk<3,>=1.9.0 (from mlflow-skinny==3.1.4->mlflow)
  Downloading opentele

In [2]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")

# Load data
df = pd.read_csv("vehicle_with_features.csv")

# Define target label
df['is_delayed'] = df['TimeDiffToStop_min'] > 10  # Adjust threshold if needed

# Features to use
features = ['Speed', 'DistanceToNearestStop_m', 'StoppedDuration', 'Is_Night', 'Hour', 'DayOfWeek']
X = df[features]
y = df['is_delayed'].astype(int)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models to test
models = {
    "RandomForest": RandomForestClassifier(n_estimators=100),
    "LogisticRegression": LogisticRegression(),
    "DecisionTree": DecisionTreeClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
    "NaiveBayes": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC()
}

# Run experiments
results = []

for name, model in models.items():
    with mlflow.start_run(run_name=name):
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        acc = accuracy_score(y_test, preds)
        prec = precision_score(y_test, preds)
        rec = recall_score(y_test, preds)
        f1 = f1_score(y_test, preds)

        # Logging to MLflow
        mlflow.log_param("model", name)
        mlflow.log_metrics({
            "accuracy": acc,
            "precision": prec,
            "recall": rec,
            "f1_score": f1
        })
        mlflow.sklearn.log_model(model, f"{name}_model")

        # Print output
        print(f"📊 {name}")
        print(f"   Accuracy:  {acc:.4f}")
        print(f"   Precision: {prec:.4f}")
        print(f"   Recall:    {rec:.4f}")
        print(f"   F1 Score:  {f1:.4f}")
        print("-" * 40)

        results.append((name, acc, prec, rec, f1))

# Optional: Print summary
print("\n✅ Model Comparison Summary")
for name, acc, prec, rec, f1 in sorted(results, key=lambda x: x[4], reverse=True):
    print(f"{name:<18} F1: {f1:.4f}  | Acc: {acc:.4f}  | Prec: {prec:.4f}  | Rec: {rec:.4f}")




📊 RandomForest
   Accuracy:  0.8801
   Precision: 0.8753
   Recall:    0.8517
   F1 Score:  0.8633
----------------------------------------




📊 LogisticRegression
   Accuracy:  0.6713
   Precision: 0.6696
   Recall:    0.5146
   F1 Score:  0.5820
----------------------------------------




📊 DecisionTree
   Accuracy:  0.8586
   Precision: 0.8437
   Recall:    0.8371
   F1 Score:  0.8404
----------------------------------------




📊 GradientBoosting
   Accuracy:  0.8462
   Precision: 0.8712
   Recall:    0.7674
   F1 Score:  0.8160
----------------------------------------




📊 NaiveBayes
   Accuracy:  0.7003
   Precision: 0.7339
   Recall:    0.5112
   F1 Score:  0.6026
----------------------------------------




📊 KNN
   Accuracy:  0.6169
   Precision: 0.5808
   Recall:    0.4966
   F1 Score:  0.5354
----------------------------------------




📊 SVM
   Accuracy:  0.5624
   Precision: 0.6000
   Recall:    0.0472
   F1 Score:  0.0875
----------------------------------------

✅ Model Comparison Summary
RandomForest       F1: 0.8633  | Acc: 0.8801  | Prec: 0.8753  | Rec: 0.8517
DecisionTree       F1: 0.8404  | Acc: 0.8586  | Prec: 0.8437  | Rec: 0.8371
GradientBoosting   F1: 0.8160  | Acc: 0.8462  | Prec: 0.8712  | Rec: 0.7674
NaiveBayes         F1: 0.6026  | Acc: 0.7003  | Prec: 0.7339  | Rec: 0.5112
LogisticRegression F1: 0.5820  | Acc: 0.6713  | Prec: 0.6696  | Rec: 0.5146
KNN                F1: 0.5354  | Acc: 0.6169  | Prec: 0.5808  | Rec: 0.4966
SVM                F1: 0.0875  | Acc: 0.5624  | Prec: 0.6000  | Rec: 0.0472
