# 📦 Step 1: Load the Cleaned Data

In [0]:
# Load balanced dataset from Parquet
df = spark.read.parquet("dbfs:/FileStore/tables/creditcard_balanced.parquet")

# Register it as a SQL view
df.createOrReplaceTempView("creditcard_balanced")

# Preview using SQL
df_sql = spark.sql("SELECT * FROM creditcard_balanced")
df_sql.show(5)

+-------+-------------------+------------------+-----------------+-------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+--------------------+------------------+-----------------+-------------------+------+-----+
|   Time|                 V1|                V2|               V3|                 V4|                V5|                V6|                V7|               V8|                V9|               V10|               V11|               V12|               V13|                V14|               V15|              V16|               V17|               V18|               V19|               V20|               V21|               V22|              

# 🔀 Step 2: Train/Test Split

In [0]:
train_df, test_df = df.randomSplit([0.8,0.2], seed=42)

# ⚙️ Step 3: Feature Engineering Pipeline

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

features = [col for col in df.columns if col not in ['Time', 'Class']]
assembler = VectorAssembler(inputCols=features,outputCol="features")

# 📊 Step 4: Train a Logistic Regression Model

In [0]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="Class")

pipeline = Pipeline(stages=[assembler, lr])
lr_model = pipeline.fit(train_df)

# 🌲 Step 5: Train a Random Forest (for comparison)

In [0]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol="features", labelCol="Class")
pipeline = Pipeline(stages=[assembler, rf])
rf_model = pipeline.fit(train_df)

# 📈 Step 6: Evaluate Models

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions_lr = lr_model.transform(test_df)
predictions_rf = rf_model.transform(test_df)

evaluator = BinaryClassificationEvaluator(labelCol="Class", metricName="areaUnderROC")

auc_lr = evaluator.evaluate(predictions_lr)
auc_rf = evaluator.evaluate(predictions_rf)

print(f"Logistic Regression AUC: {auc_lr: .4f}")
print(f"Random Forest AUC: {auc_rf: .4f}")


Logistic Regression AUC:  0.9848
Random Forest AUC:  0.9859


AUC (Area Under the Curve)
Logistic Regression AUC:  0.9848
Random Forest AUC:  0.9859

Random Forest is performing slightly better than Logistic Regression. 

# 📊 Step 7: Confusion Matrix

In [0]:
from pyspark.sql.functions import col

predictions_rf.select("prediction", "Class").groupBy("prediction", "Class").count().show()


+----------+-----+-----+
|prediction|Class|count|
+----------+-----+-----+
|       0.0|    0|  376|
|       1.0|    0|    1|
|       0.0|    1|   12|
|       1.0|    1|   93|
+----------+-----+-----+



True Negatives	  376	    Legit transactions predicted as legit ✅

False Positives	  1	      Legit transaction predicted as fraud ❌

False Negatives	  12	    Fraud predicted as legit ❌

True Positives	  93	    Fraud correctly predicted ✅

# 💾 Step 8: Save the Best Model

In [0]:
rf_model.save("dbfs:/FileStore/models/rf_fraud_model")