In [None]:
!git clone https://github.com/siddmuns/diabetes-ml-pipeline.git
%cd diabetes-ml-pipeline
!pip install -q -r requirements.txt

In [None]:
# Sanity check: list the current working directory to confirm that the files
# the following cells rely on are actually present in session storage.
!ls -l
# If a file is missing on Colab, upload it manually with:
#   from google.colab import files; files.upload()

In [None]:
import os

# Create the output directories up front; exist_ok=True keeps the cell
# harmless when it is re-run.
for output_dir in ("artifacts", "mlruns"):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Call the preprocessing entry point directly from the notebook instead of
# running data_ingestion as a script.
from data_ingestion import preprocess_and_split

(
    X_train, X_valid, X_test,
    y_train, y_valid, y_test,
    scaler, feature_names,
) = preprocess_and_split("diabetes.csv", out_dir="artifacts")
# NOTE(review): per the original author's note, preprocess_and_split is expected
# to have already written scaler.pkl into out_dir -- confirm in data_ingestion.


In [None]:
from pathlib import Path

import mlflow

# Derive the tracking URI from the relative "mlruns" directory so it always
# points at the directory inside the current working directory (the cloned
# repository after the `%cd` in the setup cell), i.e. the same directory that is
# created with os.makedirs("mlruns") and later passed to
# setup_mlflow(local_dir="mlruns"). A fixed absolute path such as
# "/content/mlruns" only matches when the notebook happens to run from /content.
mlflow.set_tracking_uri(Path("mlruns").resolve().as_uri())
mlflow.set_experiment("Diabetes_Pipeline")
print("MLflow tracking URI:", mlflow.get_tracking_uri())


In [None]:
# Hyperparameter search followed by the final retrain, driven from the notebook
# through the functions exposed by tune_and_train.
import numpy as np

from tune_and_train import retrain_final_and_log, run_optuna, setup_mlflow

# Ensure MLflow is set up
setup_mlflow(local_dir="mlruns", experiment_name="Diabetes_Pipeline")

# Reload the splits from disk. They were already returned by the preprocessing
# cell, but reading artifacts/splits.npz mirrors what tune_and_train does when
# it is run as a script (per the original author's note).
# The context manager closes the underlying .npz file handle once the arrays
# have been read into memory.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; this
# is acceptable only because the file is produced locally by this pipeline.
# Never point this at an untrusted file.
with np.load("artifacts/splits.npz", allow_pickle=True) as splits:
    X_train = splits["X_train"]
    X_valid = splits["X_valid"]
    X_test = splits["X_test"]
    y_train = splits["y_train"]
    y_valid = splits["y_valid"]
    y_test = splits["y_test"]

# Run the Optuna study on the train/validation splits and keep the best trial's parameters.
study = run_optuna(X_train, y_train, X_valid, y_valid, n_trials=20)
best_params = study.best_trial.params
print("Optuna best params:", best_params)

# Retrain the final model on train+valid and evaluate on the held-out test split
# (tune_and_train's main does the same when run as a script, per the original note).
X_train_full = np.vstack([X_train, X_valid])
y_train_full = np.concatenate([y_train, y_valid])
model, test_acc, test_auc, train_time, model_path = retrain_final_and_log(
    X_train_full, y_train_full, X_test, y_test, best_params, artifacts_dir="artifacts"
)
print("Final test accuracy:", test_acc, "AUC:", test_auc)


In [None]:
import mlflow
import mlflow.sklearn
from mlflow import MlflowClient
import pandas as pd

# Small sample of the training matrix, passed to MLflow as the input example
# from which the model's input schema is inferred. Columns get positional
# placeholder names ("feature_0", "feature_1", ...).
example_input = pd.DataFrame(
    X_train_full[:5],
    columns=[f"feature_{i}" for i in range(X_train_full.shape[1])]
)

with mlflow.start_run(run_name="Final_Model") as run:
    # Log the tuned hyperparameters and the held-out test metrics.
    mlflow.log_params(best_params)
    mlflow.log_metrics({
        "test_accuracy": float(test_acc),
        "test_roc_auc": float(test_auc)
    })

    # Log the fitted model under the name "final_model".
    mlflow.sklearn.log_model(
        model,
        name="final_model",
        input_example=example_input
    )

    run_id = run.info.run_id
    print("Logged final model to MLflow, run_id:", run_id)

# Built outside any try block so it is always defined before registration.
model_uri = f"runs:/{run_id}/final_model"

# Attempt to register the model. Both steps are best effort: a failure is
# reported instead of aborting the notebook.
client = MlflowClient()
try:
    client.create_registered_model("Diabetes_GB_Model")
except Exception as exc:
    # Typically raised when the cell is re-run and the name is already taken;
    # report it instead of swallowing it so real registry problems stay visible.
    print("Could not create registered model (it may already exist):", exc)

try:
    mv = client.create_model_version("Diabetes_GB_Model", model_uri, run_id)
    print("Registered model version:", mv.version)
except Exception as e:
    print("Model registration failed or not supported on local backend:", e)

In [None]:
import joblib

# Persist the fitted model and the scaler under artifacts/; these are the two
# paths handed to batch_predict in the batch-inference cell.
for artifact, destination in (
    (model, "artifacts/gb_final_model.pkl"),
    (scaler, "artifacts/scaler.pkl"),
):
    joblib.dump(artifact, destination)
print("Saved gb_final_model.pkl and scaler.pkl")


In [None]:
from batch_inference import batch_predict

# Run batch inference using the persisted model and scaler files.
# NOTE(review): argument roles are inferred from the file names (model path,
# scaler path, input CSV, output CSV) -- confirm against batch_inference.
out = batch_predict(
    "artifacts/gb_final_model.pkl",
    "artifacts/scaler.pkl",
    "diabetes2.csv",
    "artifacts/predictions.csv",
)
# Preview the first rows of the returned result.
out.head()


In [None]:
from drift_detection import detect_and_log_drift

# Run the drift check on diabetes.csv and diabetes2.csv, with artifacts/ as the
# output directory, and print whatever report object is returned.
# NOTE(review): presumably the first file is the reference and the second the
# new batch -- confirm against drift_detection.
rpt = detect_and_log_drift(
    "diabetes.csv",
    "diabetes2.csv",
    out_dir="artifacts",
)
print(rpt)


In [None]:
from visualize_and_evaluate import make_plots_and_log

# Inputs come from earlier cells: `model` from the final retrain, X_test/y_test
# from the reloaded splits, and feature_names from preprocess_and_split.
make_plots_and_log(
    model,
    X_test,
    y_test,
    feature_names,
    run_name="Final_Plots",
)
