In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql.functions import col, pandas_udf, PandasUDFType
import pyspark.sql.functions as F
import pandas as pd
import numpy as np
from pyod.models.iforest import IForest
from pyspark.sql.types import ArrayType, DoubleType
from sklearn.metrics import (
    precision_recall_curve, 
    average_precision_score, 
    roc_auc_score, 
    confusion_matrix, 
    classification_report
)
import logging
logging.basicConfig(level=logging.INFO)
try:
    spark = SparkSession.builder.appName("FraudDetection").getOrCreate()
    spark.sparkContext.setLogLevel("DEBUG")
    print(spark.version)

    df = spark.read.csv("./data/creditcard.csv", header=True, inferSchema=True)
    df = df.drop("Time")

    # Separate fraud and non-fraud transactions
    non_fraud_df = df.filter(col("Class") == 0)
    fraud_df = df.filter(col("Class") == 1)

    feature_columns = [
        col for col in df.columns 
        if col not in ['Class']
    ]

    # Assemble features into a vector for non-fraud data
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="raw_features")
    assembled_non_fraud_df = assembler.transform(non_fraud_df)

    # Scale the features
    scaler = StandardScaler(inputCol="raw_features", outputCol="scaled_features", withStd=True, withMean=True)
    scaler_model = scaler.fit(assembled_non_fraud_df)
    scaled_non_fraud_df = scaler_model.transform(assembled_non_fraud_df)

    # Convert scaling model to full dataset
    scaled_full_df = scaler_model.transform(assembler.transform(df))

    # Convert scaled features to list
    def vector_to_list(vector):
        """Convert Spark Vector to list"""
        return vector.toArray().tolist()

    vector_to_list_udf = F.udf(vector_to_list, ArrayType(DoubleType()))

    scaled_non_fraud_df = scaled_non_fraud_df.withColumn("scaled_features_list", 
                                    vector_to_list_udf(F.col("scaled_features")))
    scaled_full_df = scaled_full_df.withColumn("scaled_features_list", 
                                    vector_to_list_udf(F.col("scaled_features")))

    # Extract numpy array of features for PyOD
    non_fraud_features = scaled_non_fraud_df.select("scaled_features_list").toPandas()
    non_fraud_features_array = np.array(non_fraud_features["scaled_features_list"].tolist())

    # Full dataset features for evaluation
    full_features = scaled_full_df.select("scaled_features_list", "Class").toPandas()
    full_features_array = np.array(full_features["scaled_features_list"].tolist())
    true_labels = full_features["Class"].values

    # Train PyOD model
    model = IForest(random_state=42, contamination=0.01)  # Set expected fraud rate
    model.fit(non_fraud_features_array)

    # Predict on full dataset
    predictions = model.predict(full_features_array)
    scores = model.decision_function(full_features_array)

    # Evaluation Metrics
    def print_evaluation_metrics(y_true, y_pred, scores):
        """
        Compute and print various evaluation metrics
        
        Parameters:
        - y_true: True labels
        - y_pred: Predicted labels
        - scores: Anomaly scores
        """
        print("\n--- Fraud Detection Evaluation Metrics ---")
        
        # Confusion Matrix
        cm = confusion_matrix(y_true, y_pred)
        print("\nConfusion Matrix:")
        print(cm)
        
        # Classification Report
        print("\nClassification Report:")
        print(classification_report(y_true, y_pred))
        
        # Area Under Precision-Recall Curve (AUPRC)
        precision, recall, _ = precision_recall_curve(y_true, scores)
        auprc = average_precision_score(y_true, scores)
        print(f"\nArea Under Precision-Recall Curve (AUPRC): {auprc:.4f}")
        
        # ROC AUC Score
        roc_auc = roc_auc_score(y_true, scores)
        print(f"ROC AUC Score: {roc_auc:.4f}")
        
        # Detailed Fraud Detection Performance
        fraud_mask = y_true == 1
        non_fraud_mask = y_true == 0
        
        print("\nFraud Detection Performance:")
        print(f"Total Transactions: {len(y_true)}")
        print(f"Fraud Transactions: {sum(fraud_mask)}")
        print(f"Non-Fraud Transactions: {sum(non_fraud_mask)}")
        
        fraud_predictions = y_pred[fraud_mask]
        print(f"Correctly Detected Fraud: {sum(fraud_predictions)} out of {sum(fraud_mask)}")
        print(f"Fraud Detection Rate: {sum(fraud_predictions) / sum(fraud_mask):.4f}")

    # Run evaluation
    print_evaluation_metrics(true_labels, predictions, scores)
finally:
    # Stop Spark Session
    spark.stop()

your 131072x1 screen size is bogus. expect trouble
24/12/11 22:41:24 WARN Utils: Your hostname, KrystalXPS resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/12/11 22:41:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/11 22:41:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/11 22:41:44 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


3.5.3


24/12/11 22:41:48 DEBUG FileSystem: Loading filesystems
24/12/11 22:41:48 DEBUG FileSystem: nullscan:// = class org.apache.hadoop.hive.ql.io.NullScanFileSystem from /home/vuphamas/code/fx/venv/lib/python3.10/site-packages/pyspark/jars/hive-exec-2.3.9-core.jar
24/12/11 22:41:48 DEBUG FileSystem: file:// = class org.apache.hadoop.fs.LocalFileSystem from /home/vuphamas/code/fx/venv/lib/python3.10/site-packages/pyspark/jars/hadoop-client-api-3.3.4.jar
24/12/11 22:41:48 DEBUG FileSystem: file:// = class org.apache.hadoop.hive.ql.io.ProxyLocalFileSystem from /home/vuphamas/code/fx/venv/lib/python3.10/site-packages/pyspark/jars/hive-exec-2.3.9-core.jar
24/12/11 22:41:48 DEBUG FileSystem: viewfs:// = class org.apache.hadoop.fs.viewfs.ViewFileSystem from /home/vuphamas/code/fx/venv/lib/python3.10/site-packages/pyspark/jars/hadoop-client-api-3.3.4.jar
24/12/11 22:41:48 DEBUG FileSystem: har:// = class org.apache.hadoop.fs.HarFileSystem from /home/vuphamas/code/fx/venv/lib/python3.10/site-package


--- Fraud Detection Evaluation Metrics ---

Confusion Matrix:
[[281471   2844]
 [   209    283]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    284315
           1       0.09      0.58      0.16       492

    accuracy                           0.99    284807
   macro avg       0.54      0.78      0.58    284807
weighted avg       1.00      0.99      0.99    284807


Area Under Precision-Recall Curve (AUPRC): 0.1091
ROC AUC Score: 0.9496

Fraud Detection Performance:
Total Transactions: 284807
Fraud Transactions: 492
Non-Fraud Transactions: 284315
Correctly Detected Fraud: 283 out of 492
Fraud Detection Rate: 0.5752


24/12/11 22:44:40 INFO SparkContext: SparkContext is stopping with exitCode 0.
24/12/11 22:44:41 INFO SparkUI: Stopped Spark web UI at http://10.255.255.254:4041
24/12/11 22:44:41 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
24/12/11 22:44:41 INFO MemoryStore: MemoryStore cleared
24/12/11 22:44:41 INFO BlockManager: BlockManager stopped
24/12/11 22:44:41 INFO BlockManagerMaster: BlockManagerMaster stopped
24/12/11 22:44:41 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
24/12/11 22:44:41 INFO SparkContext: Successfully stopped SparkContext


24/12/11 22:44:43 DEBUG PoolThreadCache: Freed 12 thread-local buffer(s) from thread: shuffle-server-7-1
