In [None]:
!git clone https://github.com/siddmuns/diabetes-ml-pipeline.git
%cd diabetes-ml-pipeline
!pip install -q -r requirements.txt

In [None]:
# Create the output folders used by later cells; exist_ok=True makes this cell
# safe to re-run when the folders are already present.
import os

for folder in ("artifacts", "mlruns"):
    os.makedirs(folder, exist_ok=True)

In [None]:
# Run the project's preprocessing/split step on diabetes.csv.
from data_ingestion import preprocess_and_split

# The helper returns eight values; capture them first, then unpack into the
# names that the tuning, training, and evaluation cells rely on.
# NOTE(review): out_dir presumably receives preprocessing artifacts - confirm
# in data_ingestion.
split_outputs = preprocess_and_split("diabetes.csv", out_dir="artifacts")
(
    X_train, X_valid, X_test,
    y_train, y_valid, y_test,
    scaler, feature_names,
) = split_outputs

In [None]:
# Configure MLflow tracking through the project helper.
from tune_and_train import setup_mlflow

# NOTE(review): the result is bound to the name `mlflow`; presumably the helper
# returns the configured mlflow module - confirm in tune_and_train. The final
# training cell runs `import mlflow` again, which rebinds this name.
mlflow = setup_mlflow(
    experiment_name="Diabetes_Pipeline",
    local_dir="mlruns",
)

In [None]:
# Hyperparameter search via the project's Optuna wrapper, given the train and
# validation splits.
from tune_and_train import run_optuna

N_TRIALS = 20  # trial budget handed to run_optuna

study = run_optuna(X_train, y_train, X_valid, y_valid, n_trials=N_TRIALS)

# Keep the best trial's parameters; the final retraining cell consumes them.
best_params = study.best_trial.params
print("Best Optuna parameters:", best_params)

In [None]:
# Retrain the final model on train+valid, evaluate it on the test split, and
# log parameters, metrics, and the model itself to a single MLflow run.
import mlflow  # rebinds `mlflow` to the module, whatever setup_mlflow returned earlier
import numpy as np  # was missing: np.vstack / np.concatenate below raised NameError
import pandas as pd
from mlflow.models.signature import infer_signature

from tune_and_train import retrain_final_and_log

# Fold the validation split back into the training data for the final fit.
X_train_full = np.vstack([X_train, X_valid])
y_train_full = np.concatenate([y_train, y_valid])

with mlflow.start_run(run_name="Final_Model") as run:
    # The helper is called inside the active run, so anything it logs itself
    # should land in this same run (assumption - confirm in tune_and_train).
    model, test_acc, test_auc, train_time, model_path = retrain_final_and_log(
        X_train_full, y_train_full, X_test, y_test, best_params, artifacts_dir="artifacts"
    )
    mlflow.log_params(best_params)
    # Cast to plain floats before logging the metrics.
    mlflow.log_metrics({
        "test_accuracy": float(test_acc),
        "test_roc_auc": float(test_auc),
        "train_time_s": float(train_time)
    })

    # Five-row sample stored alongside the model as its input example.
    example_input = pd.DataFrame(X_train_full[:5], columns=[f"feature_{i}" for i in range(X_train_full.shape[1])])
    # NOTE(review): the signature is inferred from the raw array, while the
    # input example is a DataFrame with synthetic feature_<i> column names.
    # Confirm this is the intended serving schema (feature_names from the
    # preprocessing cell is also available).
    signature = infer_signature(X_train_full, model.predict(X_train_full))
    mlflow.sklearn.log_model(model, name="final_model", input_example=example_input, signature=signature)

    # run_id is reused by the model-registry cell.
    run_id = run.info.run_id
    print("Final model logged, run_id:", run_id)

In [None]:
# Optional step: register the logged model in the MLflow Model Registry.
from tune_and_train import register_model_mlflow
from mlflow import MlflowClient

# run_id is the id captured from the "Final_Model" run in the training cell.
client = MlflowClient()
register_model_mlflow(
    run_id,
    client,
    model_name="Diabetes_GB_Model",
)

In [None]:
# Keep local joblib copies of the fitted model and the scaler; the batch
# prediction cell loads these two files by path.
import joblib

for backup_obj, backup_path in (
    (model, "artifacts/gb_final_model.pkl"),
    (scaler, "artifacts/scaler.pkl"),
):
    joblib.dump(backup_obj, backup_path)
print("Saved gb_final_model.pkl and scaler.pkl")

In [None]:
# Batch inference: pass the saved model and scaler paths, the input CSV, and
# the predictions output path to the project helper.
from batch_inference import batch_predict

out = batch_predict(
    "artifacts/gb_final_model.pkl",
    "artifacts/scaler.pkl",
    "diabetes2.csv",
    "artifacts/predictions.csv",
)
# Preview the first rows of the returned object as the cell output.
out.head()

In [None]:
# Drift check between the original dataset and the new batch file.
from drift_detection import detect_and_log_drift

# NOTE(review): out_dir presumably receives the drift report files - confirm
# in drift_detection.
rpt = detect_and_log_drift(
    "diabetes.csv",
    "diabetes2.csv",
    out_dir="artifacts",
)
print(rpt)

In [None]:
# Evaluation plots for the final model on the held-out test split.
from visualize_and_evaluate import make_plots_and_log

# NOTE(review): run_name suggests the helper logs to its own MLflow run named
# "Final_Plots" - confirm in visualize_and_evaluate.
make_plots_and_log(
    model,
    X_test,
    y_test,
    feature_names,
    run_name="Final_Plots",
)