# 3.1. Classification with Logistic Regression

### Import libraries and load the dataset

In [53]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql import SparkSession
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionWithSGD
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.linalg import Vectors

In [54]:
spark = SparkSession.builder.appName("RDD-Based-Implementation").getOrCreate()

### 3.1.2. MLlib RDD-Based Implementation


In [55]:
data = spark.read.parquet("../../data/creditcard_preprocessed.parquet")
data.show()

+--------------------+-----+--------------------+
|            features|Class|      scaledFeatures|
+--------------------+-----+--------------------+
|[1.38639697419213...|    0|[0.71005441038295...|
|[-2.1434575316891...|    0|[-1.0977890908419...|
|[-4.0668622711825...|    0|[-2.0828763664576...|
|[-0.9456431509172...|    0|[-0.4843187791494...|
|[-3.5900235269187...|    0|[-1.8386595514265...|
|[-3.8405843371581...|    0|[-1.9669862945538...|
|[-0.7353859070637...|    0|[-0.3766338331402...|
|[-1.4000322465173...|    0|[-0.7170378252573...|
|[-1.4539401037675...|    0|[-0.7446471698442...|
|[0.91196330496498...|    0|[0.46706937396133...|
|[-2.6686038604838...|    0|[-1.3667470255448...|
|[1.29926838042254...|    0|[0.66543079721284...|
|[-1.1892931244430...|    0|[-0.6091060814244...|
|[-0.9282650755347...|    0|[-0.4754184574530...|
|[1.15444484782558...|    0|[0.59125825503196...|
|[1.2095749964979,...|    0|[0.61949360604508...|
|[-0.4483096494488...|    0|[-0.2296054086484...|


In [56]:
# Convert scaledFeatures to pyspark.mllib.linalg.Vector
rdd_data = data.rdd.map(lambda row: LabeledPoint(row['Class'], Vectors.dense(row['scaledFeatures'].toArray())))

# Split the data into training and testing sets
train_rdd, test_rdd = rdd_data.randomSplit([0.8, 0.2], seed=42)

# Train the logistic regression model
model = LogisticRegressionWithSGD.train(train_rdd, iterations=100)

# Make predictions on the test set
predictions_and_labels = test_rdd.map(lambda lp: (float(model.predict(lp.features)), lp.label))

# Compute evaluation metrics
metrics = MulticlassMetrics(predictions_and_labels)

# Print evaluation metrics
print(f"Accuracy: {metrics.accuracy}")
print(f"Precision: {metrics.weightedPrecision}")
print(f"Recall: {metrics.weightedRecall}")
print(f"F1 Score: {metrics.weightedFMeasure()}")



Accuracy: 0.9947194959849961
Precision: 0.9981011234720676
Recall: 0.9947194959849961
F1 Score: 0.996156160582718


In [57]:
spark.stop()