In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, pandas_udf, PandasUDFType
import pyspark.sql.functions as F
import pandas as pd
import numpy as np
from pyod.models.iforest import IForest
from pyspark.sql.types import ArrayType, DoubleType
from sklearn.metrics import (
    precision_recall_curve, 
    average_precision_score, 
    roc_auc_score, 
    confusion_matrix, 
    classification_report
)
import logging
logging.basicConfig(level=logging.INFO)

def vector_to_list(vector):
    """Convert Spark Vector to list.

    Parameters:
    - vector: object exposing ``toArray()`` (the assembled ``features`` value)

    Returns:
    - Plain Python list obtained from ``vector.toArray().tolist()``
    """
    return vector.toArray().tolist()


def print_evaluation_metrics(y_true, y_pred, scores):
    """
    Compute and print various evaluation metrics

    Parameters:
    - y_true: True labels (1 marks a fraud row, 0 a non-fraud row)
    - y_pred: Predicted labels (1 marks a row flagged by the detector)
    - scores: Anomaly scores, passed as-is to the ranking metrics
    """
    # Accept lists as well as arrays: the boolean-mask indexing below
    # only works on ndarrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    print("\n--- Fraud Detection Evaluation Metrics ---")

    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    print("\nConfusion Matrix:")
    print(cm)

    # Classification Report
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

    # Area Under Precision-Recall Curve (AUPRC)
    auprc = average_precision_score(y_true, scores)
    print(f"\nArea Under Precision-Recall Curve (AUPRC): {auprc:.4f}")

    # ROC AUC Score
    roc_auc = roc_auc_score(y_true, scores)
    print(f"ROC AUC Score: {roc_auc:.4f}")

    # Detailed Fraud Detection Performance (vectorized counts instead of
    # Python-level sum() over every element).
    fraud_mask = y_true == 1
    fraud_total = int(fraud_mask.sum())
    non_fraud_total = int((y_true == 0).sum())
    detected_fraud = int(y_pred[fraud_mask].sum())

    print("\nFraud Detection Performance:")
    print(f"Total Transactions: {len(y_true)}")
    print(f"Fraud Transactions: {fraud_total}")
    print(f"Non-Fraud Transactions: {non_fraud_total}")
    print(f"Correctly Detected Fraud: {detected_fraud} out of {fraud_total}")

    # Without any fraud rows the rate is undefined; report nan rather than
    # dividing by zero.
    detection_rate = detected_fraud / fraud_total if fraud_total else float("nan")
    print(f"Fraud Detection Rate: {detection_rate:.4f}")


# Pre-initialise so the ``finally`` clause cannot raise NameError (and hide the
# real error) when session creation itself fails.
spark = None
try:
    spark = (
        SparkSession.builder
        .appName("FraudDetection")
        .config("spark.executor.memory", "8g")
        .config("spark.driver.memory", "16g")
        .config("spark.executor.cores", "4")
        .config("spark.executor.instances", "4")
        .config("spark.default.parallelism", "16")
        .config("spark.sql.shuffle.partitions", "16")
        .config("spark.memory.fraction", "0.8")
        .config("spark.memory.storageFraction", "0.2")
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.dynamicAllocation.enabled", "true")
        .getOrCreate()
    )
    # NOTE(review): DEBUG is extremely verbose in notebook output; consider
    # lowering it once troubleshooting is done.
    spark.sparkContext.setLogLevel("DEBUG")
    print(spark.version)

    # Read the dataset
    df = spark.read.csv("./data/creditcard.csv", header=True, inferSchema=True)
    df = df.drop("Time")

    # Every remaining column except the label is a model feature.
    feature_columns = [name for name in df.columns if name != "Class"]

    # Assemble features into a vector
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    assembled_df = assembler.transform(df)

    # Convert the vector column to an array<double> column so it survives
    # the conversion to pandas as plain numeric sequences.
    vector_to_list_udf = F.udf(vector_to_list, ArrayType(DoubleType()))
    assembled_df = assembled_df.withColumn(
        "features_list", vector_to_list_udf(F.col("features"))
    )

    # Collect the data to the driver ONCE; the training subset is sliced
    # from the same arrays instead of launching a second Spark job.
    full_features = assembled_df.select("features_list", "Class").toPandas()
    full_features_array = np.array(full_features["features_list"].tolist())
    true_labels = full_features["Class"].values

    # Use only non-fraud data for training
    non_fraud_features_array = full_features_array[true_labels == 0]

    # Train PyOD model
    # 0.00172 is kept from the original as an alternative contamination
    # value (presumably the observed fraud ratio -- confirm before using).
    model = IForest(random_state=42, contamination=0.01)  # 0.00172
    model.fit(non_fraud_features_array)

    # Predict on full dataset
    predictions = model.predict(full_features_array)
    scores = model.decision_function(full_features_array)

    # Run evaluation
    print_evaluation_metrics(true_labels, predictions, scores)
finally:
    # Stop Spark Session (only if it was actually created)
    if spark is not None:
        spark.stop()

24/12/12 02:11:07 INFO SparkContext: Running Spark version 3.5.3
24/12/12 02:11:07 INFO SparkContext: OS info Linux, 5.15.167.4-microsoft-standard-WSL2, amd64
24/12/12 02:11:07 INFO SparkContext: Java version 19.0.2
24/12/12 02:11:08 INFO ResourceUtils: No custom resources configured for spark.driver.
24/12/12 02:11:08 INFO SparkContext: Submitted application: FraudDetection
24/12/12 02:11:09 INFO ResourceProfile: Default ResourceProfile created, executor resources: Map(cores -> name: cores, amount: 4, script: , vendor: , memory -> name: memory, amount: 8192, script: , vendor: , offHeap -> name: offHeap, amount: 0, script: , vendor: ), task resources: Map(cpus -> name: cpus, amount: 1.0)
24/12/12 02:11:10 INFO ResourceProfile: Limiting resource is cpus at 4 tasks per executor
24/12/12 02:11:10 INFO ResourceProfileManager: Added ResourceProfile id: 0
24/12/12 02:11:10 INFO SecurityManager: Changing view acls to: root
24/12/12 02:11:10 INFO SecurityManager: Changing modify acls to: root


3.5.3


24/12/12 02:11:12 INFO SharedState: Setting hive.metastore.warehouse.dir ('null') to the value of spark.sql.warehouse.dir.
24/12/12 02:11:12 DEBUG SharedState: Applying other initial session options to HadoopConf: spark.sql.adaptive.enabled -> true
24/12/12 02:11:12 DEBUG SharedState: Applying other initial session options to HadoopConf: spark.memory.storageFraction -> 0.2
24/12/12 02:11:12 DEBUG SharedState: Applying other initial session options to HadoopConf: spark.app.name -> FraudDetection
24/12/12 02:11:12 DEBUG SharedState: Applying other initial session options to HadoopConf: spark.driver.memory -> 16g
24/12/12 02:11:12 DEBUG SharedState: Applying other initial session options to HadoopConf: spark.executor.instances -> 4
24/12/12 02:11:12 DEBUG SharedState: Applying other initial session options to HadoopConf: spark.default.parallelism -> 16
24/12/12 02:11:12 DEBUG SharedState: Applying other initial session options to HadoopConf: spark.executor.memory -> 8g
24/12/12 02:11:12 D


--- Fraud Detection Evaluation Metrics ---

Confusion Matrix:
[[281471   2844]
 [   209    283]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    284315
           1       0.09      0.58      0.16       492

    accuracy                           0.99    284807
   macro avg       0.54      0.78      0.58    284807
weighted avg       1.00      0.99      0.99    284807


Area Under Precision-Recall Curve (AUPRC): 0.1091
ROC AUC Score: 0.9496

Fraud Detection Performance:
Total Transactions: 284807
Fraud Transactions: 492
Non-Fraud Transactions: 284315
Correctly Detected Fraud: 283 out of 492
Fraud Detection Rate: 0.5752


24/12/12 02:12:56 INFO SparkContext: SparkContext is stopping with exitCode 0.
24/12/12 02:12:56 INFO SparkUI: Stopped Spark web UI at http://10.255.255.254:4041
24/12/12 02:12:56 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
24/12/12 02:12:57 INFO MemoryStore: MemoryStore cleared
24/12/12 02:12:57 INFO BlockManager: BlockManager stopped
24/12/12 02:12:57 INFO BlockManagerMaster: BlockManagerMaster stopped
24/12/12 02:12:57 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
24/12/12 02:12:57 INFO SparkContext: Successfully stopped SparkContext


24/12/12 02:12:58 DEBUG PoolThreadCache: Freed 14 thread-local buffer(s) from thread: shuffle-server-26-1
