In [None]:
!git clone https://github.com/siddmuns/diabetes-ml-pipeline.git
%cd diabetes-ml-pipeline
!pip install -q -r requirements.txt

In [None]:
# Cell 1 — create folders
import os

# Output directories used by the later cells; creating them is a no-op
# when they already exist.
for folder in ("artifacts", "mlruns"):
    os.makedirs(folder, exist_ok=True)

In [None]:
# Cell 2 — preprocess & split
from data_ingestion import preprocess_and_split

# The helper returns eight values: three feature splits, three target splits,
# the scaler object and the feature names.
(
    X_train,
    X_valid,
    X_test,
    y_train,
    y_valid,
    y_test,
    scaler,
    feature_names,
) = preprocess_and_split("diabetes.csv", out_dir="artifacts")

In [None]:
# Cell 3 — setup MLflow
from tune_and_train import setup_mlflow

# Keep whatever setup_mlflow returns under the name `mlflow` for later cells.
mlflow = setup_mlflow(
    local_dir="mlruns",
    experiment_name="Diabetes_Pipeline",
)

In [None]:
# Cell 4 — hyperparameter tuning (RandomizedSearchCV)
from tune_and_train import run_random_search

# No custom search space is passed here; per the original note, the function
# then falls back to its own defaults.
rs = run_random_search(
    X_train,
    y_train,
    n_iter=25,
    cv=5,
)
best_params = rs.best_params_
print(f"Best params: {best_params}")

In [None]:
# Cell 5 — retrain final & log
import mlflow
import numpy as np
from tune_and_train import retrain_final_and_log

# Stack the training and validation splits into a single training set
# for the final fit; the test split stays untouched for evaluation.
X_train_full = np.vstack((X_train, X_valid))
y_train_full = np.concatenate((y_train, y_valid))

with mlflow.start_run(run_name="Final_Model"):
    (model, test_acc, test_auc, train_time, model_path) = retrain_final_and_log(
        X_train_full,
        y_train_full,
        X_test,
        y_test,
        best_params,
        artifacts_dir="artifacts",
    )

    # Record the chosen hyperparameters and the headline test metrics on this run.
    mlflow.log_params(best_params)
    mlflow.log_metrics(
        {
            "test_accuracy": float(test_acc),
            "test_roc_auc": float(test_auc),
            "train_time_s": float(train_time),
        }
    )

    run_id = mlflow.active_run().info.run_id
    print("Final model run id:", run_id)

In [None]:
# Cell 6 — save artifacts locally
import joblib

# Persist the fitted model and the scaler side by side under artifacts/.
for artifact, target in (
    (model, "artifacts/gb_final_model.pkl"),
    (scaler, "artifacts/scaler.pkl"),
):
    joblib.dump(artifact, target)

print("Saved model and scaler to artifacts/")

In [None]:
# Cell 7 — batch inference
from batch_inference import batch_predict

# Arguments, in order: saved model, saved scaler, input CSV, predictions CSV.
out = batch_predict(
    "artifacts/gb_final_model.pkl",
    "artifacts/scaler.pkl",
    "diabetes2.csv",
    "artifacts/predictions.csv",
)
# Last expression of the cell, so the notebook displays the first rows.
out.head()

In [None]:
# Cell 8 — drift detection & visualization
from drift_detection import detect_and_log_drift
from visualize_and_evaluate import make_plots_and_log

# Run drift detection on diabetes.csv versus diabetes2.csv and print the report.
rpt = detect_and_log_drift(
    "diabetes.csv",
    "diabetes2.csv",
    out_dir="artifacts",
)
print(rpt)

# Plots for the final model on the held-out test split.
make_plots_and_log(model, X_test, y_test, feature_names)