In [12]:
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.sql import functions as F
from sklearn.datasets import load_breast_cancer

spark = (SparkSession.builder
        .appName("SVM Classification")
        .getOrCreate())
    

In [None]:
#  load data
data = load_breast_cancer()

X = [Vectors.dense(x) for x in data.data]
y = data.target.tolist()

spark_df = spark.createDataFrame(zip(X, y), ["features", "label"])

# spit data
train_data, test_data = spark_df.randomSplit([0.8, 0.2], seed=87)

# feature scaling
scaler = StandardScaler(
    inputCol="features", outputCol="scaledFeatures")
scaled_model = scaler.fit(train_data)

train_data_scaled = scaled_model.transform(train_data)
test_data_scaled = scaled_model.transform(test_data)

# train SVM model
svm = LinearSVC()
svm_model = svm.fit(train_data_scaled)

# predict
predictions = svm_model.transform(test_data_scaled)
predictions.show()

# evaluate
evaluator = BinaryClassificationEvaluator()
evaluator.setMetricName("areaUnderROC")
evaluator.setLabelCol("label")
evaluator.setRawPredictionCol("rawPrediction")

area_under_roc = evaluator.evaluate(predictions)
tp = predictions.filter("label = 1 AND prediction = 1").count()
fp = predictions.filter("label = 0 AND prediction = 1").count()
tn = predictions.filter("label = 0 AND prediction = 0").count()
fn = predictions.filter("label = 1 AND prediction = 0").count()
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)
confusion = [[tp, fp], [fn, tn]]

print("=" * 50)
print("Evaluation Metrics")
print(f"Area Under ROC: {area_under_roc}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Confusion Matrix:")
for row in confusion:
    print(row)
print("=" * 50)


+--------------------+-----+--------------------+--------------------+----------+
|            features|label|      scaledFeatures|       rawPrediction|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[9.029,17.33,58.7...|    1|[2.57781037893780...|[2.57729958716644...|       0.0|
|[10.95,21.35,71.9...|    0|[3.12626244870627...|[2.45225423510883...|       0.0|
|[11.76,21.6,74.72...|    1|[3.35752021888454...|[-5.0091975486869...|       1.0|
|[13.05,19.31,82.6...|    1|[3.72581963064994...|[-6.8779555688572...|       1.0|
|[13.73,22.61,93.6...|    0|[3.91996195623170...|[1.14379322170671...|       0.0|
|[14.25,21.72,93.6...|    0|[4.06842373461775...|[3.44382872329724...|       0.0|
|[16.02,23.24,102....|    0|[4.57376478797027...|[2.46876933007435...|       0.0|
|[16.13,17.88,107....|    0|[4.60517016416732...|[6.34610160728945...|       0.0|
|[16.13,20.68,108....|    0|[4.60517016416732...|[11.2152587349016...|       0.0|
|[16.65,21.38,11