In [None]:
%pip install model_deploy-0.0.0-py3-none-any.whl

In [None]:
%pip install -e ..
%restart_python

In [None]:
from pathlib import Path
import sys
# Make the src/ directory that sits under the parent of the current working
# directory importable. The membership check keeps the cell idempotent:
# re-running it no longer stacks duplicate entries onto sys.path.
_src_dir = str(Path.cwd().parent / 'src')
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

In [None]:

import mlflow
from loguru import logger
from pyspark.sql import SparkSession
from pyspark.dbutils import DBUtils
from importlib.metadata import version
from mlflow.models import infer_signature

import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from model_deploy.config import ProjectConfig, Tags
from model_deploy.models.basic_model import BasicModel
from pyspark.sql import SparkSession
from pyspark.dbutils import DBUtils

# Reuse the already-running Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()
# DBUtils handle bound to that session; later cells use it for
# dbutils.jobs.taskValues.set(...).
dbutils = DBUtils(spark)

In [None]:
%sql
-- Bootstrap the training-control table: each row holds an allow_training flag
-- and the timestamp at which it was written.
-- The block comment below is the same bootstrap for the mlops_dev catalog,
-- kept disabled; only the mlops_prod statements after it are executed.
/*
CREATE TABLE IF NOT EXISTS mlops_dev.model_test.training_control (
    allow_training BOOLEAN,
    updated_on TIMESTAMP
);

INSERT INTO mlops_dev.model_test.training_control
VALUES (true, current_timestamp());
*/
-- Safe to re-run: the table is only created when it does not exist yet.
CREATE TABLE IF NOT EXISTS mlops_prod.model_test.training_control (
    allow_training BOOLEAN,
    updated_on TIMESTAMP
);

-- NOTE: this INSERT is not guarded, so every run of the cell appends another
-- (true, now) row. A lookup of the form ORDER BY updated_on DESC LIMIT 1 sees
-- only the newest row.
INSERT INTO mlops_prod.model_test.training_control
VALUES (true, current_timestamp());

In [None]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# -----------------------------------------------------------
# PATCH BASICMODEL FOR SKLEARN DATASET
# -----------------------------------------------------------
def patch_basicmodel_for_sklearn(
    enforce_training_control=False,
    experiment_name="/Shared/mlops_exp",
):
    """Monkey-patch ``BasicModel`` to train on sklearn's breast-cancer dataset.

    Replaces ``BasicModel.load_data``, ``BasicModel.prepare_features`` and
    ``BasicModel.log_model`` so that the model reads an in-memory sklearn
    dataset instead of Delta tables, uses a bare ``LGBMClassifier`` as its
    pipeline, and logs parameters, metrics and the model to MLflow.

    Args:
        enforce_training_control: When True, ``load_data`` reads the newest
            ``allow_training`` value from
            ``mlops_prod.model_test.training_control`` and stops the run
            (``SystemExit(0)``) when it is not true. Defaults to False, which
            skips the lookup entirely; this matches the previous behaviour,
            where the check was commented out.
        experiment_name: MLflow experiment that ``log_model`` logs into.
            Defaults to the previously hard-coded ``"/Shared/mlops_exp"``.

    Returns:
        None. The three methods are assigned on the ``BasicModel`` class, so
        the patch affects every instance, including existing ones.
    """

    def load_data(self):
        """Populate train/test attributes from the sklearn breast-cancer dataset.

        Raises:
            RuntimeError: the control check is enabled but the control table
                has no rows.
            SystemExit: the control check is enabled and the newest flag is
                not true.
        """
        # ----------------------------------------------------
        # TRAINING CONTROL CHECK (opt-in)
        # ----------------------------------------------------
        if enforce_training_control:
            logger.info("üîç Checking training control flag in Lakehouse...")

            latest = spark.sql("""
                SELECT allow_training
                FROM mlops_prod.model_test.training_control
                ORDER BY updated_on DESC
                LIMIT 1
            """).first()

            # first() yields None for an empty table; fail with a clear
            # message instead of a TypeError from indexing None.
            if latest is None:
                raise RuntimeError(
                    "mlops_prod.model_test.training_control has no rows; "
                    "cannot decide whether training is allowed."
                )

            if not latest[0]:
                logger.warning("‚õî Training disabled via training_control table. Skipping training.")
                # Tell pipeline to skip
                dbutils.jobs.taskValues.set("training_skipped", "TRUE")
                # Same effect as sys.exit(0), without depending on `sys`.
                raise SystemExit(0)

            logger.info("‚úÖ Training allowed ‚Äî proceeding with dataset loading.")
        else:
            # Do not claim a check happened when it was skipped.
            logger.info("Training control check disabled - proceeding with dataset loading.")

        # ----------------------------------------------------
        # LOAD SKLEARN DATASET
        # ----------------------------------------------------
        logger.info("üîÑ Loading sklearn breast cancer dataset...")

        data = load_breast_cancer()
        df = pd.DataFrame(data.data, columns=data.feature_names)
        df["target"] = data.target

        # Every column except the label is a numeric feature.
        self.num_features = [col for col in df.columns if col != "target"]
        self.cat_features = []   # No categorical features

        # Fixed seed keeps the 80/20 split reproducible across runs.
        train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

        self.train_set = train_df
        self.test_set = test_df

        self.X_train = train_df[self.num_features]
        self.y_train = train_df["target"]
        self.X_test = test_df[self.num_features]
        self.y_test = test_df["target"]

        # Needed for model_improved()
        self.eval_data = test_df.copy()

        logger.info("‚úÖ sklearn dataset loaded successfully.")

    def prepare_features(self):
        """Set ``self.pipeline`` to an ``LGBMClassifier`` built from ``self.parameters``."""
        logger.info("üîÑ No preprocessing needed for sklearn dataset.")
        from lightgbm import LGBMClassifier
        self.pipeline = LGBMClassifier(**self.parameters)
        logger.info("‚úÖ Pipeline ready using LGBMClassifier.")

    def log_model(self):
        """Log parameters, test-set metrics and the fitted model to MLflow.

        Stores the run id on ``self.run_id``, the metric dict on
        ``self.metrics`` and the ``log_model`` result on ``self.model_info``.
        Expects ``self.pipeline`` to be fitted already.
        """
        logger.info("üì¶ Logging model + parameters + metrics to MLflow...")

        mlflow.set_experiment(experiment_name)

        with mlflow.start_run(run_name="basic-lgbm", tags=self.tags) as run:

            # Keep the run id on the instance and publish it as a job task value.
            self.run_id = run.info.run_id
            dbutils.jobs.taskValues.set("candidate_run_id", self.run_id)

            # --- parameters ---
            # NOTE(review): prepare_features() reads self.parameters while this
            # logs self.config.parameters - presumably the same dict; confirm.
            mlflow.log_params(self.config.parameters)

            # --- metrics ---
            y_pred = self.pipeline.predict(self.X_test)

            self.metrics = {
                "f1_score": float(f1_score(self.y_test, y_pred)),
                "accuracy": float(accuracy_score(self.y_test, y_pred)),
                "precision": float(precision_score(self.y_test, y_pred)),
                "recall": float(recall_score(self.y_test, y_pred)),
            }

            mlflow.log_metrics(self.metrics)

            # --- model artifact ---
            logger.info("üìÅ Logging sklearn LightGBM model...")
            signature = infer_signature(self.X_train, self.pipeline.predict(self.X_train))

            self.model_info = mlflow.sklearn.log_model(
                sk_model=self.pipeline,
                artifact_path="model",
                signature=signature,
                input_example=self.X_train.iloc[0:1],
            )

        logger.info(f"‚úÖ MLflow logging completed. Run ID: {self.run_id}, Metrics: {self.metrics}")

    # Install the replacements on the class itself.
    BasicModel.load_data = load_data
    BasicModel.prepare_features = prepare_features
    BasicModel.log_model = log_model

In [None]:

# -----------------------------------------------------------
# MANUAL RUN (NO ARGPARSE NEEDED)
# -----------------------------------------------------------

# Target environment, switched by hand: dev / acc / prd.
env = "dev"
config_path = "model_config_deploy.yml"

# Session handles used by the training cells below.
spark = SparkSession.builder.getOrCreate()
dbutils = DBUtils(spark)

# All three tags carry the same placeholder for a manual run.
# (job_run_id could instead be read with
#  spark.conf.get("spark.databricks.job.id", "unknown_job_id").)
_manual_tag_values = dict.fromkeys(("git_sha", "branch", "job_run_id"), "manual_run")
tags = Tags(**_manual_tag_values)

# Load the project configuration for the selected environment.
config = ProjectConfig.from_yaml(env=env, config_path=config_path)

In [None]:

# The replacement methods have to be installed on BasicModel before any stage runs.
patch_basicmodel_for_sklearn()
print(config)
basic_model = BasicModel(spark=spark, tags=tags, config=config)

# Run the stages strictly in this order; each method is looked up only when
# its turn comes.
for _stage in ("load_data", "prepare_features", "train", "log_model"):
    getattr(basic_model, _stage)()

# NOTE: ensure the model alias exists before calling model_improved(), or
# handle the case where it does not.
model_improved = basic_model.model_improved()



In [None]:

# -----------------------------------------------------------
#  REGISTER MODEL (COMMUNITY + UC-SAFE WAY)
# -----------------------------------------------------------
import mlflow
from mlflow import MlflowClient

mlflow.set_tracking_uri("databricks")
# MODEL_NAME below is a three-level (catalog.schema.model) Unity Catalog name,
# so point the registry client at Unity Catalog explicitly instead of relying
# on the environment's default registry.
mlflow.set_registry_uri("databricks-uc")

# Use the run_id stored on the model object during log_model().
run_id = basic_model.run_id
model_uri = f"runs:/{run_id}/model"

# IMPORTANT: use a NEW model name (avoid the existing model_deploy that you lack rights on).
MODEL_NAME = "workspace.default.breast_cancer_lgbm"

print(f"üì¶ Registering model_uri={model_uri} as {MODEL_NAME} ...")

# Only the registration call is guarded; `from e` keeps the original exception
# chained to the SystemExit so the root cause stays visible in the traceback.
try:
    result = mlflow.register_model(
        model_uri=model_uri,
        name=MODEL_NAME,
    )
except Exception as e:
    raise SystemExit(f"‚ùå Registration failed: {e}") from e

print("‚úÖ Model successfully REGISTERED")
print("‚úÖ Registered model:", result.name)
print("‚úÖ Registered version:", result.version)

# -----------------------------------------------------------
#  LOAD THE JUST-REGISTERED VERSION
# -----------------------------------------------------------

client = MlflowClient()

# Use the version we just created instead of hardcoding "1"
MODEL_VERSION = result.version

print("Tracking URI:", mlflow.get_tracking_uri())

# Verify model is visible in the registry.
rm = client.get_registered_model(MODEL_NAME)
print("‚úÖ Registered model found:", rm.name)

MODEL_URI = f"models:/{MODEL_NAME}/{MODEL_VERSION}"
print("‚úÖ Final Model URI:", MODEL_URI)

# Round-trip check: load the registered version back as a pyfunc model.
model = mlflow.pyfunc.load_model(MODEL_URI)
print("‚úÖ Model loaded successfully")

In [None]:
# import mlflow

# run_id = mlflow.last_active_run().info.run_id
# model_uri = f"runs:/{run_id}/model"

# mlflow.register_model(
#     model_uri=model_uri,
#     name="model_deploy"
# )

# print("‚úÖ Model successfully REGISTERED")


In [None]:
# import mlflow
# from mlflow import MlflowClient

# # ✅ Always force Databricks tracking
# mlflow.set_tracking_uri("databricks")

# client = MlflowClient()

# # ✅ Your ACTUAL registered model (confirmed by your output)
# # MODEL_NAME = "workspace.default.model_deploy"
# MODEL_NAME = "workspace.default.breast_cancer_lgbm"
# MODEL_VERSION = "1"

# print("Tracking URI:", mlflow.get_tracking_uri())

# # --------------------------------------------------------
# # ✅ 1. Verify registered model exists
# # --------------------------------------------------------
# try:
#     rm = client.get_registered_model(MODEL_NAME)
#     print("‚úÖ Registered model found:", rm.name)
# except Exception as e:
#     raise SystemExit(f"‚ùå Model not found: {e}")

# # --------------------------------------------------------
# # ✅ 2. Build UC-compatible model URI (NO FILTER, NO ALIAS)
# # --------------------------------------------------------
# MODEL_URI = f"models:/{MODEL_NAME}/{MODEL_VERSION}"
# print("‚úÖ Final Model URI:", MODEL_URI)

# # --------------------------------------------------------
# # ✅ 3. Load model for inference / deployment
# # --------------------------------------------------------
# model = mlflow.pyfunc.load_model(MODEL_URI)

# print("‚úÖ Model loaded successfully")

# # --------------------------------------------------------
# # ✅ 4. (Optional) Test prediction
# # --------------------------------------------------------
# # preds = model.predict(test_dataframe)
# # print(preds)