# Mistplay Fraud Demo – Feature Pipeline

This notebook builds account-level and device-level feature tables in the Databricks Feature Store from the synthetic demo data, and prepares a base dataset (labels plus entity keys) for model training.

In [None]:
from pyspark.sql import functions as F
from databricks.feature_store import FeatureStoreClient

# Database (schema) that holds the source tables read here and the tables
# written further down.
DB_NAME = "mistplay_fraud_demo"
fs = FeatureStoreClient()

# Source tables, loaded through the notebook-provided `spark` session.
# The tuple order below must stay aligned with the names being unpacked.
accounts, account_device, devices, events, rewards = (
    spark.table(f"{DB_NAME}.accounts"),
    spark.table(f"{DB_NAME}.account_device"),
    spark.table(f"{DB_NAME}.devices"),
    spark.table(f"{DB_NAME}.events"),
    spark.table(f"{DB_NAME}.rewards"),
)

# Account-level features (7-day window)
# NOTE(review): nothing in this cell filters by time, so the "_7d" aggregates
# cover every row of `events` / `rewards` — confirm those tables are already
# restricted to the intended 7-day window.

# Per-account activity aggregates computed from the events table.
account_event_features = events.groupBy("account_id").agg(
    # Counts rows whose event_type is non-null.
    F.count(F.col("event_type")).alias("events_7d"),
    F.avg(F.col("session_minutes")).alias("avg_session_minutes_7d"),
    # assumes is_vpn is a numeric 0/1 flag so its mean is a rate — TODO confirm
    F.avg(F.col("is_vpn")).alias("vpn_rate_7d"),
    F.countDistinct(F.col("device_id")).alias("distinct_devices_7d"),
)

# Per-account reward aggregates computed from the rewards table.
account_reward_features = rewards.groupBy("account_id").agg(
    F.count(F.col("reward_type")).alias("rewards_7d"),
    F.sum(F.col("reward_amount")).alias("reward_amount_7d"),
)

# One row per row of `accounts`: static attributes plus both aggregate sets.
# The joins are left joins, so accounts with no events or no rewards come back
# with nulls in the aggregate columns; those nulls are replaced with zeros.
account_features = (
    accounts.select("account_id", "country", "platform", "marketing_channel")
    .join(account_event_features, "account_id", "left")
    .join(account_reward_features, "account_id", "left")
    .na.fill(
        {
            "events_7d": 0,
            "avg_session_minutes_7d": 0.0,
            "vpn_rate_7d": 0.0,
            "distinct_devices_7d": 0,
            "rewards_7d": 0,
            "reward_amount_7d": 0.0,
        }
    )
)

# Device-level features
# NOTE(review): the "_7d" suffix suggests a 7-day window, but no time filter
# is applied to `account_device` here — confirm the intended window.

# Number of distinct accounts linked to each device.
accounts_per_device = account_device.groupBy("device_id").agg(
    F.countDistinct(F.col("account_id")).alias("accounts_per_device_7d")
)

# One row per row of `devices`. Devices with no entry in `account_device` get
# a null count from the left join, which is replaced with 0 before the final
# column selection.
device_features = (
    devices.join(accounts_per_device, "device_id", "left")
    .na.fill({"accounts_per_device_7d": 0})
    .select(
        "device_id",
        "device_type",
        "os_version",
        "is_emulator",
        "device_risk_score",
        "accounts_per_device_7d",
    )
)

# Feature Store write helpers

def table_exists(name: str) -> bool:
    """Report whether the Feature Store lookup for ``name`` succeeds.

    Args:
        name: Feature table name passed unchanged to ``fs.get_table``.

    Returns:
        ``True`` if ``fs.get_table(name)`` returns without raising, ``False``
        if it raises any ``Exception``.
    """
    try:
        fs.get_table(name)
    except Exception:
        # NOTE(review): every failure is reported as "missing", not only a
        # table-not-found error — confirm this best-effort check is intended.
        return False
    return True


def upsert_feature_table(name: str, df, primary_keys, description: str):
    """Create the feature table ``name`` from ``df``, or rewrite it if present.

    Args:
        name: Feature table name.
        df: DataFrame holding the feature rows to store.
        primary_keys: Key column names; only used when the table is created.
        description: Table description; only used when the table is created.

    Returns:
        None.
    """
    if not table_exists(name):
        # First run: register the table and write the initial data in one call.
        fs.create_table(name=name, primary_keys=primary_keys, df=df, description=description)
        return

    # Table already registered: write ``df`` in "overwrite" mode.
    fs.write_table(name=name, df=df, mode="overwrite")

# Fully qualified names of the two feature tables written below.
account_features_name, device_features_name = (
    f"{DB_NAME}.account_features",
    f"{DB_NAME}.device_features",
)

# Account features are keyed by account_id.
upsert_feature_table(
    name=account_features_name,
    df=account_features,
    primary_keys=["account_id"],
    description="Account-level behavioral features for fraud detection",
)

# Device features are keyed by device_id.
upsert_feature_table(
    name=device_features_name,
    df=device_features,
    primary_keys=["device_id"],
    description="Device-level risk features for fraud detection",
)

# Base training dataset (labels + entity keys)
# Left join from accounts: an account with several rows in `account_device`
# appears once per matching row, and an account with none is kept with a null
# device_id.
training_base = accounts.select("account_id", "is_fraud_label").join(
    account_device.select("account_id", "device_id"),
    "account_id",
    "left",
)

# Replace any existing training_base table with the freshly built one.
training_base.write.saveAsTable(f"{DB_NAME}.training_base", mode="overwrite")

# Summary of what this run produced, one item per output line.
print(
    "Feature tables created:",
    f"- {account_features_name}",
    f"- {device_features_name}",
    sep="\n",
)
print(
    "Training base table created:",
    f"- {DB_NAME}.training_base",
    sep="\n",
)