# From Raw Transactions to Real-Time Fraud Detection with Databricks

This demo shows how Databricks supports an end-to-end machine learning pipeline for fraud detection, covering:
- Data versioning with Delta Lake
- Interactive data dashboards
- Centralized feature store
- Model training and registration
- Real-time model deployment and monitoring


In [None]:
# SECTION 1: Data Versioning & Maturity – Delta Lake
# Builds a 1000-row synthetic transaction dataset (version 1), saves it as a
# Delta table on disk, and registers that location under the name `fraud_data`.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

# Column expressions for the synthetic data, all driven by unseeded random draws.
amount_col = (F.rand() * 1000).cast("int")          # integer amount in [0, 1000)
merchant_col = (F.rand() * 5).cast("int")           # integer merchant code in [0, 5)
channel_col = F.expr("CASE WHEN rand() < 0.5 THEN 'online' ELSE 'instore' END")
fraud_col = F.when(F.rand() < 0.02, 1).otherwise(0)  # label is 1 when the draw is below 0.02

# spark.range supplies the `id` column (0..999); the rest are added on top of it.
df_v1 = (
    spark.range(1000)
    .withColumn("amount", amount_col)
    .withColumn("merchant_code", merchant_col)
    .withColumn("channel", channel_col)
    .withColumn("fraud", fraud_col)
)

# Persist version 1 to the Delta path.
v1_writer = df_v1.write.format("delta").mode("overwrite")
v1_writer.save("/tmp/fraud_data")

# (Re)create the table entry pointing at the Delta location.
for statement in (
    "DROP TABLE IF EXISTS fraud_data",
    "CREATE TABLE fraud_data USING DELTA LOCATION '/tmp/fraud_data'",
):
    spark.sql(statement)


In [None]:
# Simulate Version 2 – Improved Labeling
# Re-read the persisted version 1 data instead of reusing the lazy `df_v1` plan.
# `df_v1` is built from unseeded rand() expressions, so evaluating it again would
# regenerate different amounts, merchant codes and channels, and version 2 would
# no longer be a relabeling of the transactions that were actually saved.
df_v1_saved = spark.read.format("delta").load("/tmp/fraud_data")

# New labeling rule: fraud = 1 when the amount exceeds 900, or when a fresh
# random draw falls below 0.03; otherwise 0.
df_v2 = df_v1_saved.withColumn(
    "fraud",
    F.when((F.col("amount") > 900) | (F.rand() < 0.03), 1).otherwise(0),
)

# Overwrite the same Delta path with the relabeled data (schema overwrite enabled).
df_v2.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save("/tmp/fraud_data")


In [None]:
# SECTION 2: Visual Exploration
# Loads the data currently stored at the Delta path and renders three views of it.
df_current = spark.read.format("delta").load("/tmp/fraud_data")

# Row count per fraud label.
fraud_counts = df_current.groupBy("fraud").count()
fraud_counts.display()

# Row count per (channel, fraud) pair, ordered by channel.
channel_fraud_counts = df_current.groupBy("channel", "fraud").count().orderBy("channel")
channel_fraud_counts.display()

# Raw transaction amounts.
amounts = df_current.select("amount")
amounts.display()


In [None]:
# SECTION 3: Feature Store Integration
# Derives a `high_amount` feature and publishes the feature columns to a
# feature table keyed by `id`.
from databricks.feature_store import FeatureStoreClient

fs = FeatureStoreClient()

# Derived feature: 1 when the transaction amount exceeds 500, otherwise 0.
features_df = df_current.withColumn("high_amount", F.when(F.col("amount") > 500, 1).otherwise(0))

# The dataset already carries a unique `id` column (it originates from
# spark.range), so it is used directly as the primary key. Overwriting it with
# monotonically_increasing_id() would replace the real keys with unrelated,
# partition-dependent values, and the feature rows could no longer be joined
# back to the source transactions.
# The `fraud` label is deliberately left out of the feature table.
fs.create_table(
    name="fraud_demo_features",
    primary_keys=["id"],
    df=features_df.select("id", "amount", "merchant_code", "channel", "high_amount"),
    description="Features for fraud detection"
)


In [None]:
# SECTION 4: Train + Register Model
# Trains a random forest on the feature columns, scores it on a held-out
# split, then logs the metric and registers the model with MLflow.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn
import pandas as pd

# Collect the (small) dataset to the driver as a pandas DataFrame.
training_df = features_df.select("amount", "merchant_code", "high_amount", "fraud").toPandas()
X = training_df[["amount", "merchant_code", "high_amount"]]
y = training_df["fraud"]

# Hold out 20% of the rows for evaluation: scoring on the rows the model was
# fitted on reports an optimistic accuracy. Stratify on the label so the rare
# fraud class appears in both splits; stratification needs at least two rows
# per class, so fall back to a plain split when that is not the case.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

model = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Accuracy on the held-out rows only.
acc = accuracy_score(y_test, model.predict(X_test))

# Log the metric and register the fitted model within a single named run.
with mlflow.start_run(run_name="FraudModel-Demo"):
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(model, "model", registered_model_name="FraudDetectionModel")

print(f"Model registered. Accuracy: {acc:.3f}")


In [None]:
# SECTION 5: Simulate Monitoring
# Log the simulated (hard-coded) monitoring metrics inside an explicit run.
# Calling mlflow.log_metric with no active run makes MLflow open a new unnamed
# run implicitly and leave it active; the context manager gives the run a
# recognizable name and ends it once the metrics are recorded.
with mlflow.start_run(run_name="FraudModel-Monitoring"):
    mlflow.log_metric("precision", 0.82)
    mlflow.log_metric("recall", 0.67)
