# Mistplay Fraud Demo – Train & Register Model

This notebook uses Feature Store to build a training set, trains a fraud classifier, tracks metrics with MLflow, and registers the model.

In [0]:
# !pip install mlflow


In [0]:
%pip install databricks-feature-engineering>=0.13.0a4 


In [0]:
%pip install "pyarrow>=16,<21"
# Restart the Python process after the installs above; this must stay the last
# statement of the cell because Python state is reset by the restart.
dbutils.library.restartPython()


In [0]:
# Sanity check: list installed packages and keep only the pyarrow entries.
!pip list | grep pyarrow

In [0]:
import mlflow
import mlflow.data
import mlflow.sklearn
import pandas as pd
from databricks.feature_store import FeatureStoreClient, FeatureLookup
from pyspark.sql import functions as F
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Catalog.schema that holds the feature tables, the training base table and
# the registered models used throughout this notebook.
DB_NAME = "ramin_serverless_aws_catalog.mistplay_fraud_demo"

# Fully qualified registered-model names (logistic regression / random forest).
MODEL_LR = f"{DB_NAME}.mistplay_fraud_model_lr"
MODEL_RF = f"{DB_NAME}.mistplay_fraud_model_rf"

In [0]:

# Feature Store client; later cells use it to build the training set
# (fs.create_training_set) and to log/register models (fs.log_model).
fs = FeatureStoreClient()

# Base table for training, read from the demo schema.
# NOTE(review): the next cell calls spark.table(...) on the same table again
# rather than reusing this variable.
training_base = spark.table(f"{DB_NAME}.training_base")

# Earlier variant kept for reference only (not executed): feature lookups
# declared without an explicit feature_names list and without the on-demand
# feature argument used in the next cell.
# lookups = [
#     FeatureLookup(table_name=f"{DB_NAME}.account_features", lookup_key="account_id"),
#     FeatureLookup(table_name=f"{DB_NAME}.device_features", lookup_key="device_id"),
# ]

# training_set = fs.create_training_set(
#     training_base,
#     feature_lookups=lookups,
#     label="is_fraud_label",
#     exclude_columns=["account_id", "device_id"],
# )


In [0]:
# When creating a training set, you can combine:
# 1. Regular feature lookups (pre-computed features)
# 2. On-demand features (computed at inference time)

from databricks.feature_engineering import FeatureLookup

# Pre-computed features pulled from the account feature table (key: account_id).
account_feature_names = [
    "country",
    "platform",
    "marketing_channel",
    "events_7d",
    "avg_session_minutes_7d",
    "vpn_rate_7d",
    "distinct_devices_7d",
    "rewards_7d",
    "reward_amount_7d",
    "reward_amount_since_last_batch",
]

# Pre-computed features pulled from the device feature table (key: device_id).
device_feature_names = [
    "device_type",
    "os_version",
    "is_emulator",
    "device_risk_score",
    "accounts_per_device_7d",
]

feature_lookups = [
    FeatureLookup(
        table_name=f"{DB_NAME}.account_features",
        lookup_key="account_id",
        feature_names=account_feature_names,
    ),
    FeatureLookup(
        table_name=f"{DB_NAME}.device_features",
        lookup_key="device_id",
        feature_names=device_feature_names,
    ),
]

# Join the lookups onto the labelled base table; the two key columns are
# excluded from the resulting training set.
# NOTE(review): `feature_function=` receives a plain function name here. The
# documented way to attach on-demand features appears to be a FeatureFunction
# entry inside `feature_lookups` -- confirm this keyword is actually honoured.
training_set = fs.create_training_set(
    df=spark.table(f"{DB_NAME}.training_base"),
    feature_lookups=feature_lookups,
    feature_function=f"{DB_NAME}.last_batch_cutoff",  # Add on-demand features here
    label="is_fraud_label",
    exclude_columns=["account_id", "device_id"],
)

# Materialise the training set as a Spark DataFrame and preview five rows.
training_df = training_set.load_df()
display(training_df.limit(5))

## Prepare features with scikit-learn (classical ML)

In [0]:

# Column roles used by the preprocessing step and by the training helpers.
label_col = "is_fraud_label"
categorical_cols = ["country", "platform", "marketing_channel", "device_type", "os_version"]

# Pull the assembled training set into pandas for scikit-learn.
# NOTE: this rebinds `training_df`, which held a Spark DataFrame in the previous cell.
training_df = training_set.load_df().toPandas()

# Everything that is neither categorical, the label, nor the batch cutoff
# column is treated as a numeric feature (original column order is kept).
non_numeric_cols = set(categorical_cols) | {label_col, "last_batch_cutoff"}
numeric_cols = [col for col in training_df.columns if col not in non_numeric_cols]

y = training_df[label_col]
X = training_df.drop(columns=label_col)

# 80/20 split, stratified on the label so both splits keep the fraud ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# One-hot encode categoricals (categories unseen at fit time are ignored) and
# standardise numeric columns; columns not listed are dropped by the transformer.
preprocess = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("numeric", StandardScaler(), numeric_cols),
    ]
)


In [0]:
# Preview only the first rows instead of rendering the whole training frame.
training_df.head()

## Track metrics and register models with MLflow

In [0]:

def train_and_log(model_name: str, estimator) -> dict:
    """Train ``estimator`` behind the shared preprocessing step and log the run.

    Reads the notebook-level ``X_train``/``X_test``/``y_train``/``y_test``
    splits, ``preprocess``, ``label_col``, ``training_set`` and ``fs``.

    Inside one MLflow run (named ``model_name``) this:
      1. logs the train and test splits as MLflow dataset inputs,
      2. fits a ``Pipeline(preprocess -> estimator)`` on the training split,
      3. logs ROC AUC and PR AUC computed on the test split,
      4. logs the fitted pipeline through ``fs.log_model`` and registers it
         under ``model_name``.

    Args:
        model_name: Run name and registered model name.
        estimator: Unfitted scikit-learn classifier exposing ``predict_proba``.

    Returns:
        Dict with keys ``model_name``, ``roc_auc`` and ``pr_auc``.
    """
    pipeline = Pipeline(steps=[("preprocess", preprocess), ("model", estimator)])

    with mlflow.start_run(run_name=model_name):
        # Features and labels come out of the same train_test_split call and
        # therefore share an index. Concatenate them as-is: resetting only the
        # label index would make axis=1 alignment pair features with the wrong
        # labels and pad the frame with NaN rows.
        train_data = pd.concat([X_train, y_train], axis=1)
        test_data = pd.concat([X_test, y_test], axis=1)

        train_dataset = mlflow.data.from_pandas(
            train_data,
            source=f"{DB_NAME}.training_base",
            name="fraud-detection-train",
            targets=label_col,
        )
        test_dataset = mlflow.data.from_pandas(
            test_data,
            source=f"{DB_NAME}.training_base",
            name="fraud-detection-test",
            targets=label_col,
        )

        # Record which data each run was trained and evaluated on.
        mlflow.log_input(train_dataset, context="training")
        mlflow.log_input(test_dataset, context="testing")

        # Train, then score the positive class on the held-out split.
        pipeline.fit(X_train, y_train)
        preds = pipeline.predict_proba(X_test)[:, 1]

        roc_auc = roc_auc_score(y_test, preds)
        pr_auc = average_precision_score(y_test, preds)

        mlflow.log_metric("roc_auc", roc_auc)
        mlflow.log_metric("pr_auc", pr_auc)

        # Log through the feature client so the training set is attached to the
        # model; pyarrow is pinned to the same range installed at the top.
        fs.log_model(
            model=pipeline,
            artifact_path="model",
            flavor=mlflow.sklearn,
            training_set=training_set,
            registered_model_name=model_name,
            extra_pip_requirements=["pyarrow>=16,<21"],
        )

    return {
        "model_name": model_name,
        "roc_auc": roc_auc,
        "pr_auc": pr_auc,
    }

# Candidate models: (registered model name, unfitted estimator).
candidates = [
    (MODEL_LR, LogisticRegression(max_iter=200, class_weight="balanced")),
    (
        MODEL_RF,
        RandomForestClassifier(
            n_estimators=200,
            max_depth=8,
            random_state=42,
            class_weight="balanced",
        ),
    ),
]

# Train/log each candidate in order and collect its metrics dict.
results = []
for candidate_name, candidate_estimator in candidates:
    results.append(train_and_log(candidate_name, candidate_estimator))


## Create another version of each model using less data

In [0]:
# Same features/label as before, split so that only 1% of the rows are used
# for training (test_size=0.99), stratified on the label.
# NOTE: this rebinds the notebook-level X_train/X_test/y_train/y_test.
y_sampled = training_df[label_col]
X_sampled = training_df.drop(columns=label_col)

X_train, X_test, y_train, y_test = train_test_split(
    X_sampled,
    y_sampled,
    test_size=0.99,
    random_state=42,
    stratify=y_sampled,
)

def train_and_log(model_name: str, estimator) -> dict:
    """Train ``estimator`` on the sampled split and log the run to MLflow.

    Reads the notebook-level ``X_train``/``X_test``/``y_train``/``y_test``
    splits, ``preprocess``, ``label_col``, ``training_set`` and ``fs``. The
    logged datasets carry the ``-sampled`` suffix in their names.

    Inside one MLflow run (named ``model_name``) this logs both splits as
    dataset inputs, fits ``Pipeline(preprocess -> estimator)``, logs ROC AUC
    and PR AUC on the test split, and logs the fitted pipeline through
    ``fs.log_model``, registering it under ``model_name``.

    Args:
        model_name: Run name and registered model name.
        estimator: Unfitted scikit-learn classifier exposing ``predict_proba``.

    Returns:
        Dict with keys ``model_name``, ``roc_auc`` and ``pr_auc``.
    """
    pipeline = Pipeline(steps=[("preprocess", preprocess), ("model", estimator)])

    with mlflow.start_run(run_name=model_name):
        # Features and labels share the index produced by train_test_split, so
        # concatenate them unchanged. Resetting only the label index would make
        # axis=1 alignment mismatch rows and pad the frame with NaNs.
        train_data = pd.concat([X_train, y_train], axis=1)
        test_data = pd.concat([X_test, y_test], axis=1)

        train_dataset = mlflow.data.from_pandas(
            train_data,
            source=f"{DB_NAME}.training_base",
            name="fraud-detection-train-sampled",
            targets=label_col,
        )
        test_dataset = mlflow.data.from_pandas(
            test_data,
            source=f"{DB_NAME}.training_base",
            name="fraud-detection-test-sampled",
            targets=label_col,
        )

        # Record which data each run was trained and evaluated on.
        mlflow.log_input(train_dataset, context="training")
        mlflow.log_input(test_dataset, context="testing")

        # Train, then score the positive class on the held-out split.
        pipeline.fit(X_train, y_train)
        preds = pipeline.predict_proba(X_test)[:, 1]

        roc_auc = roc_auc_score(y_test, preds)
        pr_auc = average_precision_score(y_test, preds)

        mlflow.log_metric("roc_auc", roc_auc)
        mlflow.log_metric("pr_auc", pr_auc)

        # Log through the feature client so the training set is attached to the
        # model; pyarrow is pinned to the same range installed at the top.
        fs.log_model(
            model=pipeline,
            artifact_path="model",
            flavor=mlflow.sklearn,
            training_set=training_set,
            registered_model_name=model_name,
            extra_pip_requirements=["pyarrow>=16,<21"],
        )

    return {
        "model_name": model_name,
        "roc_auc": roc_auc,
        "pr_auc": pr_auc,
    }

# Candidate models for the sampled run, registered under the same model names
# as the full-data run.
sampled_candidates = [
    (MODEL_LR, LogisticRegression(max_iter=200, class_weight="balanced")),
    (
        MODEL_RF,
        RandomForestClassifier(
            n_estimators=200,
            max_depth=8,
            random_state=42,
            class_weight="balanced",
        ),
    ),
]

# NOTE: `results` is reset here, so the model-selection cell at the end of the
# notebook only sees the metrics of these sampled runs.
results = []
for candidate_name, candidate_estimator in sampled_candidates:
    results.append(train_and_log(candidate_name, candidate_estimator))

In [0]:
# Intentionally empty cell (a lone backtick is not valid Python and would stop Run All).

## Online Inference

### First the feature tables must be synced to an online table

Create online tables through the UI or with the following API. See https://docs.databricks.com/en/machine-learning/feature-store/online-tables.html#create-an-online-table-using-apis





### Use the API to create online tables. Run the cell below only once to create the online store.



In [0]:
from databricks.feature_engineering import FeatureEngineeringClient

# Feature Engineering client used to manage the online store.
fe = FeatureEngineeringClient()

# Provision the online store with the smallest capacity.
# Valid capacity options: "CU_1", "CU_2", "CU_4", "CU_8".
fe.create_online_store(name="ramin-online-store-2", capacity="CU_1")

#### The cell directly below uses the Feature Engineering client.

In [0]:
# from databricks.feature_engineering import FeatureEngineeringClient

# fe = FeatureEngineeringClient()

# # OPTION A: List what exists
# stores = fe.list_online_stores()
# for s in stores:
#     print(s.name, s.state, s.capacity)

# # OPTION B: Get (and check) a specific store
# online_store = fe.get_online_store(name="my-online-store")
# print(online_store)  # should NOT be None

In [0]:
from databricks.ml_features.entities.online_store import DatabricksOnlineStore
from databricks.feature_engineering import FeatureEngineeringClient

# Client plus a handle to the online store the feature tables are published to.
fe = FeatureEngineeringClient()
online_store = fe.get_online_store(name="ramin-online-store-2")

# Publish each feature table as a snapshot; the online table is named after
# its source table with an "-online" suffix.
for feature_table in ("account_features", "device_features"):
    source_table = f"ramin_serverless_aws_catalog.mistplay_fraud_demo.{feature_table}"
    fe.publish_table(
        online_store=online_store,
        source_table_name=source_table,
        online_table_name=f"{source_table}-online",
        publish_mode="SNAPSHOT",
    )
# fe.publish_table(
#     online_store=online_store,
#     source_table_name="ramin_serverless_aws_catalog.mistplay_fraud_demo.transaction_aggregates",
#     online_table_name = "ramin_serverless_aws_catalog.mistplay_fraud_demo.transaction_aggregates-online"
# )

In [0]:
# Pick the run with the highest ROC AUC among the collected `results` and
# persist that choice to the `model_selection` table.
if not results:
    raise ValueError("No training results available; run the training cells first.")

results_df = pd.DataFrame(results)

best_row = results_df.sort_values("roc_auc", ascending=False).iloc[0]

best_model_name = str(best_row["model_name"])
best_roc_auc = float(best_row["roc_auc"])

# Use an explicit schema: a placeholder column holding only None cannot be
# type-inferred by Spark, so the timestamp column is added afterwards instead.
best_model_df = spark.createDataFrame(
    [(best_model_name, "roc_auc", best_roc_auc)],
    schema="model_name string, selection_metric string, metric_value double",
).withColumn("selected_at", F.current_timestamp())

# Replace any previous selection with the current one.
best_model_df.write.mode("overwrite").saveAsTable(f"{DB_NAME}.model_selection")

print("Registered models:")
print(f"- {MODEL_LR}")
print(f"- {MODEL_RF}")
print("Best model by ROC AUC:")
print(best_model_name)