In [0]:
# Databricks notebook: Fraud Detection with Logistic Regression in PySpark

# Import necessary libraries
from pyspark.sql import SparkSession  # Create and manage a Spark session.
from pyspark.ml.feature import VectorAssembler  # Combine features into a single vector for ML.
from pyspark.ml.classification import LogisticRegression  # Train and apply logistic regression models.
from pyspark.ml.evaluation import BinaryClassificationEvaluator  # Evaluate classification model performance.

# Step 1: Load the transaction data into a PySpark DataFrame
from pyspark.sql.functions import col

# Reuse the active Spark session (Databricks provides one) or create a new one,
# so this cell does not depend on an implicit `spark` global being present.
spark = SparkSession.builder.appName("FraudDetection-LogisticRegression").getOrCreate()

# The data is read from the table/view named 'fraud_detection'.
raw_df = spark.sql("SELECT * FROM fraud_detection")

# Display the first few rows of the DataFrame to verify loading
raw_df.show()

# Cast the model inputs to double and the label to integer so later steps
# work on numeric columns. The loaded frame is kept as `raw_df`, so re-running
# this cell always starts from the same input.
data_df = (
    raw_df
    .withColumn("Transaction_Amount", col("Transaction_Amount").cast("double"))
    .withColumn("Customer_Age", col("Customer_Age").cast("double"))
    .withColumn("Transaction_Frequency", col("Transaction_Frequency").cast("double"))
    .withColumn("Account_Balance", col("Account_Balance").cast("double"))
    # Target column: Fraud (1) or Not Fraud (0)
    .withColumn("Fraudulent", col("Fraudulent").cast("integer"))
)

# Verify the schema
data_df.printSchema()


+--------------+------------------+------------+---------------------+---------------+----------+
|Transaction_ID|Transaction_Amount|Customer_Age|Transaction_Frequency|Account_Balance|Fraudulent|
+--------------+------------------+------------+---------------------+---------------+----------+
|             1|             500.0|          34|                    2|         1500.0|         0|
|             2|            1200.0|          28|                    5|         2000.0|         1|
|             3|              50.0|          40|                   15|         1800.0|         0|
|             4|            1500.0|          25|                    1|         2500.0|         1|
|             5|             200.0|          55|                    8|         3000.0|         0|
|             6|            3000.0|          31|                    1|         1000.0|         1|
|             7|             100.0|          45|                   12|         5000.0|         0|
|             8|    

In [0]:

# Step 2: Prepare the data
# Input columns the model learns from.
feature_columns = [
    "Transaction_Amount",
    "Customer_Age",
    "Transaction_Frequency",
    "Account_Balance",
]

# Pack the selected columns into one vector column named "features"; the
# logistic regression step below reads its inputs from that column.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Apply the assembler, then keep only the feature vector and the label column.
assembled_df = assembler.transform(data_df)
data_prepared = assembled_df.select("features", "Fraudulent")

# Show the prepared rows to confirm the transformation
data_prepared.show()


+--------------------+----------+
|            features|Fraudulent|
+--------------------+----------+
|[500.0,34.0,2.0,1...|         0|
|[1200.0,28.0,5.0,...|         1|
|[50.0,40.0,15.0,1...|         0|
|[1500.0,25.0,1.0,...|         1|
|[200.0,55.0,8.0,3...|         0|
|[3000.0,31.0,1.0,...|         1|
|[100.0,45.0,12.0,...|         0|
|[800.0,37.0,3.0,7...|         1|
|[10.0,60.0,20.0,6...|         0|
|[2500.0,29.0,1.0,...|         1|
|[150.0,38.0,10.0,...|         0|
|[4000.0,22.0,0.0,...|         1|
|[75.0,50.0,18.0,4...|         0|
|[600.0,33.0,4.0,8...|         1|
|[1000.0,48.0,2.0,...|         0|
|[5000.0,26.0,0.0,...|         1|
|[90.0,43.0,11.0,4...|         0|
|[700.0,36.0,3.0,7...|         1|
|[15.0,65.0,25.0,5...|         0|
|[3000.0,30.0,1.0,...|         1|
+--------------------+----------+



In [0]:

# Step 3: Split the data into training (80%) and test (20%) sets
# A fixed seed is passed so the random split uses the same seed on every run.
split_weights = [0.8, 0.2]
train_data, test_data = data_prepared.randomSplit(split_weights, seed=42)

# Step 4: Train the Logistic Regression Model
# Logistic regression is a binary classifier: it estimates the probability that
# a transaction is fraudulent.
#   labelCol    -> "Fraudulent": the ground-truth label, 1 (fraud) or 0 (not fraud).
#   featuresCol -> "features": the vector column built by the VectorAssembler.
lr = LogisticRegression(labelCol="Fraudulent", featuresCol="features")

# Fit on the training rows only; the fitted model holds the learned
# coefficients and intercept.
lr_model = lr.fit(train_data)


In [0]:

# Step 5: Report the learned parameters
print("Coefficients: ", lr_model.coefficients)  # one weight per input feature
print("Intercept: ", lr_model.intercept)  # bias term

# Step 6: Score the held-out test rows
# The transformed frame carries the model outputs shown below, including the
# predicted class ("prediction") and the per-class probabilities ("probability").
predictions = lr_model.transform(test_data)

# Show the predictions next to the actual labels
display_columns = ["features", "Fraudulent", "prediction", "probability"]
predictions.select(*display_columns).show()


Coefficients:  [-0.0005072706919656436,-1.338978536927151,0.3418714210344946,-0.00803018857917959]
Intercept:  71.71979470532187
+--------------------+----------+----------+--------------------+
|            features|Fraudulent|prediction|         probability|
+--------------------+----------+----------+--------------------+
|[50.0,40.0,15.0,1...|         0|       1.0|[1.49357031337132...|
|[150.0,38.0,10.0,...|         0|       0.0|[0.98062917130857...|
|[500.0,34.0,2.0,1...|         0|       1.0|[4.65874504382337...|
|[1200.0,28.0,5.0,...|         1|       1.0|[4.28283444903493...|
|[5000.0,26.0,0.0,...|         1|       1.0|[3.63718637815555...|
+--------------------+----------+----------+--------------------+



In [0]:

# Step 7: Evaluate the model
# Guard against an empty test set: on a very small dataset the 20% split can
# end up with no rows, and the accuracy division below would then fail with an
# unhelpful ZeroDivisionError.
total = predictions.count()
if total == 0:
    raise ValueError(
        "Test set is empty; cannot evaluate the model. "
        "Use more data or a larger test fraction."
    )

# Area under the ROC curve. The ROC (Receiver Operating Characteristic) curve
# plots the True Positive Rate (TPR) against the False Positive Rate (FPR)
# across decision thresholds.
evaluator = BinaryClassificationEvaluator(
    labelCol="Fraudulent",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

roc_auc = evaluator.evaluate(predictions)  # Calculate AUC-ROC.
print("Area Under ROC (AUC):", roc_auc)

# Accuracy: share of test rows whose predicted class equals the actual label.
correct = predictions.filter(predictions.Fraudulent == predictions.prediction).count()
accuracy = correct / total
print("Accuracy:", accuracy)

# Summary
print("Logistic Regression Model Summary:")
print(f"  Coefficients: {lr_model.coefficients}")
print(f"  Intercept: {lr_model.intercept}")
print(f"  AUC-ROC: {roc_auc}")
print(f"  Accuracy: {accuracy}")

# Interpretation of the recorded run (AUC 1.0, accuracy 0.6 on 5 test rows):
# - AUC 1.0: every fraudulent test row is scored above every non-fraudulent one,
#   i.e. the ranking is perfect on this test set.
# - Accuracy 0.6: 3 of 5 rows are classified correctly at the model's decision
#   threshold (0.5 by default). Both errors are non-fraud rows predicted as fraud
#   (false positives), so the scores rank well but the cut-off is badly placed.
# - The 20 displayed rows are evenly split between the classes, so class
#   imbalance is an unlikely explanation. NOTE(review): confirm on the full table.
# - With only 5 test rows, both metrics are very noisy; treat them as a smoke
#   test, not as a performance estimate.
# - Coefficients are on the raw feature scale (this notebook applies no scaling),
#   so their magnitudes are not directly comparable. Customer_Age (-1.34) has the
#   largest magnitude, but that alone does not make it the most important feature.
# - Intercept (71.72): the log-odds when all features are zero. Such a large
#   value likely reflects an unregularized fit on a tiny, almost separable
#   dataset - TODO confirm by refitting with regParam > 0 and/or scaled features.
#
# Summary:
# The model ranks the test rows perfectly, but the low accuracy shows the default
# threshold does not suit these scores. Evaluate precision, recall and F1 on a
# larger test set before drawing conclusions.


Area Under ROC (AUC): 1.0
Accuracy: 0.6
Logistic Regression Model Summary:
  Coefficients: [-0.0005072706919656436,-1.338978536927151,0.3418714210344946,-0.00803018857917959]
  Intercept: 71.71979470532187
  AUC-ROC: 1.0
  Accuracy: 0.6


In [0]:
# Extras:
# The ROC (Receiver Operating Characteristic) curve illustrates a classifier's performance across decision thresholds by plotting the True Positive Rate (TPR) against the False Positive Rate (FPR). The AUC (Area Under the Curve) quantifies the classifier's ability to distinguish between the classes: values closer to 1 indicate better performance, while 0.5 corresponds to random guessing.

    