In [0]:
pip install snowflake

In [0]:
pip install mlflow

In [0]:
import json
import os
import pandas as pd
import snowflake.connector
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, matthews_corrcoef, confusion_matrix
)
import joblib
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient

In [0]:
# Step 1: Load credentials and workspace settings from environment variables
def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f"Required environment variable '{name}' is not set.")
    return value


# These values are always passed to the Snowflake connector or interpolated
# into workspace paths. Fail fast: a missing DATABRICKS_EMAIL would otherwise
# silently produce the path "/Workspace/Users/None/...".
account = _require_env('SNOWFLAKE_ACCOUNT')
user = _require_env('SNOWFLAKE_USER')
password = _require_env('SNOWFLAKE_PASSWORD')
email = _require_env('DATABRICKS_EMAIL')

# Left optional: the training query uses a fully qualified table name, and the
# connector receives whatever is set here (possibly None).
warehouse = os.getenv('SNOWFLAKE_WAREHOUSE')
database = os.getenv('SNOWFLAKE_DATABASE')  # should be 'CREDITCARD'
schema = os.getenv('SNOWFLAKE_SCHEMA')      # should be 'PUBLIC'

# Step 2: Construct target path
target_dir = f"/Workspace/Users/{email}/CREDITCARD/MODEL"

# Step 3: Ensure directory exists
os.makedirs(target_dir, exist_ok=True)

In [0]:
# Function to fetch data from original table
def fetch_data_from_snowflake(query="SELECT * FROM CREDITCARD.PUBLIC.CREDITCARD"):
    """Run ``query`` against Snowflake and return the result as a pandas DataFrame.

    The connection is opened with the module-level ``user``, ``password``,
    ``account``, ``warehouse``, ``database`` and ``schema`` values.

    Args:
        query: SQL text to execute. Defaults to selecting every row and column
            of ``CREDITCARD.PUBLIC.CREDITCARD``. The text is executed as-is, so
            only pass trusted SQL.

    Returns:
        The DataFrame produced by ``cursor.fetch_pandas_all()``.
    """
    conn = snowflake.connector.connect(
        user=user,
        password=password,
        account=account,
        warehouse=warehouse,
        database=database,
        schema=schema
    )
    # Release the cursor and the connection even when the query or the fetch
    # raises, so a failed run does not leave a session open.
    try:
        cur = conn.cursor()
        try:
            cur.execute(query)
            return cur.fetch_pandas_all()
        finally:
            cur.close()
    finally:
        conn.close()


In [0]:


def main():
    """Train, evaluate and persist a random-forest classifier on the Snowflake data.

    Loads the table through ``fetch_data_from_snowflake``, uses the ``CLASS``
    column as the target and every other column as a feature, fits a
    ``RandomForestClassifier`` on an 80/20 train/test split, prints the
    evaluation metrics and the confusion matrix, then writes ``metrics.json``
    and ``model.pkl`` into the module-level ``target_dir``.
    """
    # One seed for both the split and the forest. Without seeding the forest,
    # the saved model and the metrics compared against the thresholds would
    # change on every run.
    seed = 42

    # Step 1: Load data
    data = fetch_data_from_snowflake()
    print("‚úÖ Data loaded from Snowflake. Shape:", data.shape)

    # Step 2: Split features and target
    X = data.drop(['CLASS'], axis=1)
    y = data['CLASS']
    print("\nüéØ Features shape:", X.shape)
    print("üéØ Target shape:", y.shape)

    # Step 3: Train-test split
    # NOTE(review): if CLASS is heavily imbalanced, consider stratify=y — confirm with the data owner.
    xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.2, random_state=seed)
    print("‚úÖ Data split into train and test sets.")

    # Step 4: Train model
    rfc = RandomForestClassifier(random_state=seed)
    rfc.fit(xTrain, yTrain)
    print("‚úÖ Random Forest model trained.")

    # Step 5: Evaluate model on the held-out split
    yPred = rfc.predict(xTest)
    metrics = {
        'Accuracy': accuracy_score(yTest, yPred),
        'Precision': precision_score(yTest, yPred),
        'Recall': recall_score(yTest, yPred),
        'F1 Score': f1_score(yTest, yPred),
        'Matthews Corrcoef': matthews_corrcoef(yTest, yPred)
    }

    print("\nüìä Model Evaluation Metrics:")
    for metric, score in metrics.items():
        print(f"{metric}: {score:.4f}")

    # Confusion matrix
    print("\nüìâ Confusion Matrix:")
    print(confusion_matrix(yTest, yPred))

    # Step 6: Save metrics as JSON in the target location
    metrics_path = f"{target_dir}/metrics.json"
    with open(metrics_path, "w") as f:
        json.dump(metrics, f, indent=4)

    print(f"‚úÖ Metrics dumped to: {metrics_path}")

    # Step 7: Save the fitted model next to the metrics
    model_path = f"{target_dir}/model.pkl"
    joblib.dump(rfc, model_path)
    print(f"\n‚úÖ Model saved to: {model_path}")

    print("\nüèÅ All steps completed successfully.")

# Run the training pipeline when this cell is executed as the main module.
if __name__ == "__main__":
    main()
#runagain

In [0]:
# Configure MLflow for Databricks: runs are tracked in the Databricks workspace
# and models are registered in the Unity Catalog registry.
mlflow.set_tracking_uri(uri="databricks")
mlflow.set_registry_uri(uri="databricks-uc")

In [0]:
# Select the MLflow experiment under the user's folder (the path could also be parameterized).
# NOTE(review): the registration cell further down calls mlflow.set_experiment
# again with ".../CreditCardFraudModel" — confirm which experiment is intended.
experiment_path = "/Users/{}/CreditCardFraudDetection".format(email)
mlflow.set_experiment(experiment_path)

In [0]:
# Read back the artifacts written by the training step: the JSON metrics and
# the fitted model saved with joblib.
metrics_path = f"{target_dir}/metrics.json"
model_path = f"{target_dir}/model.pkl"

with open(metrics_path, "r") as metrics_file:
    metrics = json.load(metrics_file)

model = joblib.load(model_path)

In [0]:
# Minimum acceptable value per evaluation metric; every metric shares the same floor.
test_thresholds = dict.fromkeys(
    ('Accuracy', 'Precision', 'Recall', 'F1 Score', 'Matthews Corrcoef'),
    0.60,
)

In [0]:
# Check metrics
def tests_pass(metrics, thresholds):
    for metric, threshold in thresholds.items():
        value = metrics.get(metric)
        if value is None:
            print(f"‚ö†Ô∏è Metric '{metric}' not found.")
            return False
        if value < threshold:
            print(f"‚ùå Test failed: {metric} = {value:.4f} < threshold {threshold}")
            return False
    return True

In [0]:
# Fully qualified Unity Catalog name (catalog.schema.model), used for
# registration, version lookup and tagging alike so all three address the same
# model regardless of the session's current catalog and schema.
model_name = "workspace.default.CreditCardFraudModel"

if tests_pass(metrics, test_thresholds):
    print("‚úÖ All tests passed. Registering model with MLflow...")

    # Reload data to build the model signature and a one-row input example
    data = fetch_data_from_snowflake()
    X = data.drop(['CLASS'], axis=1)
    input_example = X.sample(1, random_state=42)  # seeded so the logged example is reproducible
    signature = infer_signature(X, model.predict(X))

    # The tracking and registry URIs are already configured in an earlier cell.
    # The experiment is built from the `email` read from DATABRICKS_EMAIL in the
    # configuration cell; there is no hard-coded fallback address, so a missing
    # variable cannot silently log runs under another user's folder.
    mlflow.set_experiment(f"/Users/{email}/CreditCardFraudModel")

    with mlflow.start_run(run_name="CreditCardFraudModel_Run") as run:
        # Log metrics
        for key, val in metrics.items():
            mlflow.log_metric(key, val)

        # Log the model and register it in Unity Catalog
        model_info = mlflow.sklearn.log_model(
            sk_model=model,
            name="model",
            registered_model_name=model_name,
            input_example=input_example,
            signature=signature
        )

        # Optional: log original files
        mlflow.log_artifact(model_path)
        mlflow.log_artifact(metrics_path)

        print("‚úÖ Model logged and registered to Unity Catalog.")
        print(f"üîó Run ID: {run.info.run_id}")

    # Tag the version created by this run as challenger / staging
    client = MlflowClient()

    # Prefer the version reported by log_model: "highest version number" can
    # belong to a different, concurrent registration. Fall back to the lookup
    # only when the attribute is unavailable.
    model_version = getattr(model_info, "registered_model_version", None)
    if model_version is None:
        versions = client.search_model_versions(f"name='{model_name}'")
        if not versions:
            raise RuntimeError(
                f"No versions found for registered model '{model_name}'; cannot tag."
            )
        model_version = max(versions, key=lambda v: int(v.version)).version
    model_version = str(model_version)

    for tag_key, tag_value in (("role", "challenger"), ("status", "staging")):
        client.set_model_version_tag(
            name=model_name,
            version=model_version,
            key=tag_key,
            value=tag_value
        )

    print(f"üè∑Ô∏è Model version {model_version} tagged as 'challenger' and 'staging'.")

else:
    print("‚ùå Model failed tests. Skipping registration.")