# 🤖 Fraud Detection - Modeling with PySpark

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Create the Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName('FraudDetection')
    .getOrCreate()
)

# Load the raw CSV: the first row supplies column names and column types are inferred
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('../data/raw/fraud_data.csv')
)
# Show the resulting schema for a quick sanity check
df.printSchema()


In [None]:
# Feature Engineering
# Map the string-valued transactionType column to a numeric index column.
indexer = StringIndexer(
    inputCol='transactionType',
    outputCol='transactionTypeIndex',
)
indexer_model = indexer.fit(df)
df = indexer_model.transform(df)

# Columns that are packed into the single 'features' vector used by the model.
feature_cols = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'transactionTypeIndex',
]
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
)
df = assembler.transform(df)


In [None]:
# Train/Test split
# 70% of the rows go to training and 30% to testing; the split is seeded.
train, test = df.randomSplit([0.7, 0.3], seed=42)

# Model Training
# Random forest of 100 trees predicting isFraud from the assembled feature vector.
# The seed is pinned here too, so the seeded split above is not undermined by
# unseeded bootstrap / feature sampling inside the forest.
rf = RandomForestClassifier(
    labelCol='isFraud',
    featuresCol='features',
    numTrees=100,
    seed=42,
)
model = rf.fit(train)


In [None]:
# Evaluate
# Score the held-out split with the trained forest.
predictions = model.transform(test)

# Area under the ROC curve of the raw predictions against the isFraud label.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='isFraud',
    metricName='areaUnderROC',
)
auc = evaluator.evaluate(predictions)
print(f'AUC: {auc:.4f}')


✅ Model training complete! Save the model if needed, or deploy it with Streamlit!