In [0]:
pip install snowflake

In [0]:
pip install evidently

In [0]:
pip install mlflow

In [0]:
import os
import sys
import io
import pickle
import pandas as pd
import snowflake.connector
import mlflow
from evidently import Report
from evidently.presets import DataDriftPreset, ClassificationPreset
from evidently import Dataset, DataDefinition
from evidently import BinaryClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef


In [0]:
# Read the Snowflake connection settings and the user's e-mail from the
# Databricks widgets (job/notebook parameters).

SNOWFLAKE_ACCOUNT = dbutils.widgets.get("SNOWFLAKE_ACCOUNT")
SNOWFLAKE_USER = dbutils.widgets.get("SNOWFLAKE_USER")
SNOWFLAKE_PASSWORD = dbutils.widgets.get("SNOWFLAKE_PASSWORD")
SNOWFLAKE_WAREHOUSE = dbutils.widgets.get("SNOWFLAKE_WAREHOUSE")
SNOWFLAKE_DATABASE = dbutils.widgets.get("SNOWFLAKE_DATABASE")
SNOWFLAKE_SCHEMA = dbutils.widgets.get("SNOWFLAKE_SCHEMA")
EMAIL = dbutils.widgets.get("DATABRICKS_EMAIL")


# Mirror the same values into the process environment so that code reading
# os.environ can see them.
# NOTE(review): the schema is exported under the name DATABRICKS_SCHEMA rather
# than SNOWFLAKE_SCHEMA — confirm that consumers expect this variable name.

os.environ.update({
    "SNOWFLAKE_USER": SNOWFLAKE_USER,
    "SNOWFLAKE_PASSWORD": SNOWFLAKE_PASSWORD,
    "SNOWFLAKE_ACCOUNT": SNOWFLAKE_ACCOUNT,
    "SNOWFLAKE_WAREHOUSE": SNOWFLAKE_WAREHOUSE,
    "SNOWFLAKE_DATABASE": SNOWFLAKE_DATABASE,
    "EMAIL": EMAIL,
    "DATABRICKS_SCHEMA": SNOWFLAKE_SCHEMA,
})



In [0]:
# Databricks workspace locations: everything lives under the user's CREDITCARD folder.
_CREDITCARD_ROOT = f"/Workspace/Users/{EMAIL}/CREDITCARD"
MONITORING_DIR = f"{_CREDITCARD_ROOT}/Monitoring"
RETRAIN_DECISION_DIR = f"{_CREDITCARD_ROOT}/Retraining_Decision"
CHAMPION_MODEL_PATH = f"{_CREDITCARD_ROOT}/Champion_Model/champion_model.pkl"

# MLflow setup: the tracking URI is taken from MLFLOW_TRACKING_URI when set,
# otherwise it falls back to "databricks".
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "databricks"))
mlflow.set_experiment(f"/Users/{EMAIL}/Monitoring_Experiments_V1")

In [0]:
def fetch_from_snowflake(query):
    """Run ``query`` against Snowflake and return the complete result set.

    The connection is opened with the module-level SNOWFLAKE_* settings.

    Args:
        query: SQL text to execute.

    Returns:
        The value of the cursor's ``fetch_pandas_all()`` for the executed
        query (the whole result loaded into pandas).

    Raises:
        Any exception raised while connecting, executing or fetching is
        propagated unchanged; the cursor and the connection are closed
        before it leaves this function.
    """
    conn = snowflake.connector.connect(
        user=SNOWFLAKE_USER,
        password=SNOWFLAKE_PASSWORD,
        account=SNOWFLAKE_ACCOUNT,
        warehouse=SNOWFLAKE_WAREHOUSE,
        database=SNOWFLAKE_DATABASE,
        schema=SNOWFLAKE_SCHEMA
    )
    # try/finally guarantees the connection (and cursor) are released even
    # when the query fails, so a bad query cannot leak a Snowflake session.
    try:
        cursor = conn.cursor()
        try:
            return cursor.execute(query).fetch_pandas_all()
        finally:
            cursor.close()
    finally:
        conn.close()

In [0]:
def load_champion_model():
    """Unpickle and return the champion model stored at CHAMPION_MODEL_PATH.

    Returns:
        The object deserialized from the pickle file.

    Raises:
        FileNotFoundError: if nothing exists at CHAMPION_MODEL_PATH.
    """
    model_path = CHAMPION_MODEL_PATH
    if os.path.exists(model_path):
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file; only point CHAMPION_MODEL_PATH at trusted artifacts.
        with open(model_path, "rb") as model_file:
            champion = pickle.load(model_file)
        print(f"✅ Loaded champion model from {model_path}")
        return champion
    raise FileNotFoundError(f"Champion model not found at {model_path}")

In [0]:
def calc_metrics(y_true, y_pred):
    """Score predictions against ground truth with five classification metrics.

    Args:
        y_true: true labels.
        y_pred: predicted labels.

    Returns:
        dict with the keys "Accuracy", "Precision", "Recall", "F1_Score" and
        "MatthewsCorrcoef" (in that order). Precision, recall and F1 are
        computed with ``zero_division=0``.
    """
    scores = {"Accuracy": accuracy_score(y_true, y_pred)}

    # These three scorers share the same call shape, including zero_division=0.
    zero_safe_scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1_Score", f1_score),
    )
    for label, scorer in zero_safe_scorers:
        scores[label] = scorer(y_true, y_pred, zero_division=0)

    scores["MatthewsCorrcoef"] = matthews_corrcoef(y_true, y_pred)
    return scores

In [0]:
def _retraining_decision(ref_metrics, cur_metrics, threshold=0.10,
                         monitored=("Accuracy", "Precision", "Recall", "F1_Score")):
    """Decide whether the model should be retrained.

    A monitored metric counts as degraded when it dropped by more than
    ``threshold`` in absolute terms (reference value minus current value).
    Metrics that are missing (None) on either side are skipped.

    Args:
        ref_metrics: metric name -> value for the reference data.
        cur_metrics: metric name -> value for the current data.
        threshold: maximum tolerated absolute drop (0.10 by default).
        monitored: metric names that take part in the decision.

    Returns:
        ``(decision, rationale)`` where decision is "YES" when at least one
        monitored metric degraded and "NO" otherwise.
    """
    degraded = []
    for metric in monitored:
        ref_value = ref_metrics.get(metric)
        cur_value = cur_metrics.get(metric)
        if ref_value is None or cur_value is None:
            continue
        if ref_value - cur_value > threshold:
            degraded.append(metric)

    if not degraded:
        return "NO", "All metrics within threshold."
    # threshold is rendered as a whole percentage, e.g. 0.10 -> "10%".
    return "YES", f"Threshold: {threshold:.0%} degradation. Degraded metrics: {', '.join(degraded)}"


def main():
    """Run one monitoring pass for the champion model.

    Steps, in order:
      1. Load the champion model and fetch the reference and current
         datasets from Snowflake.
      2. Coerce the feature columns to numeric and predict on both datasets.
      3. Build an Evidently data-drift + classification report and save it
         as HTML under MONITORING_DIR.
      4. Compare reference and current metrics and write the retraining
         decision to Retrain.csv under RETRAIN_DECISION_DIR.
      5. Log both files, all metrics and the decision tags to an MLflow run.
    """
    # Load model
    model = load_champion_model()

    # Fetch reference and current datasets from Snowflake.
    # NOTE(review): the reference table uses a fixed database/schema rather
    # than SNOWFLAKE_DATABASE/SNOWFLAKE_SCHEMA — confirm this is intentional.
    ref_query = "SELECT * FROM CREDITCARD_REFERENCE.PUBLIC.CREDITCARD_REFERENCE"
    cur_query = f"SELECT * FROM {SNOWFLAKE_DATABASE}.{SNOWFLAKE_SCHEMA}.CREDITCARD_BATCH_INPUTS"

    ref = fetch_from_snowflake(ref_query)
    cur = fetch_from_snowflake(cur_query)

    target = "CLASS"
    # Define features (exclude IDs, target, and prediction columns).
    # The feature list is derived from the reference frame's columns.
    exclude_cols = {'ID', 'CLASS', 'PREDICTION', 'PREDICTION_PROB'}
    feature_cols = [c for c in ref.columns if c not in exclude_cols]

    # Convert feature columns to numeric; errors='coerce' turns values that
    # cannot be parsed into NaN instead of raising.
    ref[feature_cols] = ref[feature_cols].apply(pd.to_numeric, errors='coerce')
    cur[feature_cols] = cur[feature_cols].apply(pd.to_numeric, errors='coerce')

    # Predict using the champion model
    ref["prediction"] = model.predict(ref[feature_cols])
    cur["prediction"] = model.predict(cur[feature_cols])

    # Define Evidently data definition for binary classification
    dd = DataDefinition(
        classification=[BinaryClassification(target=target, prediction_labels="prediction")],
        categorical_columns=[target, "prediction"]
    )

    ds_ref = Dataset.from_pandas(ref, data_definition=dd)
    ds_cur = Dataset.from_pandas(cur, data_definition=dd)

    # Build and run report for data drift and classification metrics
    report = Report(metrics=[DataDriftPreset(), ClassificationPreset()])
    result = report.run(reference_data=ds_ref, current_data=ds_cur)

    # Ensure output directories exist
    os.makedirs(MONITORING_DIR, exist_ok=True)
    os.makedirs(RETRAIN_DECISION_DIR, exist_ok=True)

    # Save Evidently HTML report
    report_path = os.path.join(MONITORING_DIR, "evidently_report.html")
    result.save_html(report_path)
    print(f"✅ Evidently report saved: {report_path}")

    # Calculate metrics for retraining decision
    ref_metrics = calc_metrics(ref[target], ref["prediction"])
    cur_metrics = calc_metrics(cur[target], cur["prediction"])

    # Retrain when Accuracy, Precision, Recall or F1_Score dropped by more
    # than 0.10 (absolute) relative to the reference data.
    decision, rationale = _retraining_decision(ref_metrics, cur_metrics)

    # Save retraining decision as CSV
    retrain_path = os.path.join(RETRAIN_DECISION_DIR, "Retrain.csv")
    pd.DataFrame({
        "Retraining_Decision": [decision],
        "Rationale": [rationale]
    }).to_csv(retrain_path, index=False)
    print(f"✅ Retraining decision saved: {retrain_path}")

    # Log artifacts and metrics to MLflow
    with mlflow.start_run(run_name="Monitoring_Champion"):
        mlflow.log_artifact(report_path)
        mlflow.log_artifact(retrain_path)
        for k, v in cur_metrics.items():
            mlflow.log_metric(f"Current_{k}", v)
        for k, v in ref_metrics.items():
            mlflow.log_metric(f"Reference_{k}", v)
        mlflow.set_tag("Retrain_Decision", decision)
        mlflow.set_tag("Rationale", rationale)
        mlflow.set_tag("Model_Stage", "Production")
        mlflow.set_tag("Model_Role", "Champion")

    print("✅ Monitoring run completed, artifacts logged to MLflow.")

# Entry point: run the monitoring pass only when this code executes as the
# main module (not when it is imported).
if __name__ == "__main__":
    main()