## Step 7: Plot the ROC Curve
Generate the ROC curve and calculate the area under the curve (AUC).

In [7]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import matplotlib.pyplot as plt

# Evaluate the ROC curve and AUC.
# The "probability" vector column is passed as the raw-prediction column;
# the evaluator only needs a score that ranks the rows.
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="probability",
    metricName="areaUnderROC",
)
auc = evaluator.evaluate(predictions)
print(f"Area Under ROC: {auc:.2f}")

# Plotting the ROC Curve
# Build (score, label) pairs, where score is element 1 of the probability
# vector, and rank them from highest to lowest score.
# NOTE: collect() pulls every prediction row to the driver; sample
# `predictions` first if it is too large to fit in driver memory.
probabilities = (
    predictions.select("label", "probability")
    .rdd.map(lambda row: (float(row[1][1]), row[0]))
    .collect()
)
probabilities.sort(reverse=True, key=lambda x: x[0])

# Every label other than 1.0 is treated as negative, matching the counting
# loop below, so the false positive rate can never exceed 1.
positive = sum(1 for _, label in probabilities if label == 1.0)
negative = len(probabilities) - positive
if positive == 0 or negative == 0:
    # Both rates need a non-zero denominator; fail with a clear message
    # instead of a bare ZeroDivisionError.
    raise ValueError(
        "ROC curve is undefined: predictions must contain at least one "
        f"positive and one negative label (got {positive} positive, "
        f"{negative} negative)."
    )

# The curve starts at the origin: with a threshold above every score,
# nothing is predicted positive.
true_positive_rate = [0.0]
false_positive_rate = [0.0]
tp, fp = 0, 0

# Calculate the TPR and FPR by sweeping the threshold down the ranked scores.
last_index = len(probabilities) - 1
for index, (prob, label) in enumerate(probabilities):
    if label == 1.0:
        tp += 1
    else:
        fp += 1
    # Emit one point per distinct score so that rows with tied scores do not
    # produce an order-dependent staircase.
    if index == last_index or probabilities[index + 1][0] != prob:
        true_positive_rate.append(tp / positive)
        false_positive_rate.append(fp / negative)

# Plot the curve together with the chance diagonal for reference.
plt.figure(figsize=(8, 6))
plt.plot(false_positive_rate, true_positive_rate, label=f"ROC Curve (AUC = {auc:.2f})")
plt.plot([0, 1], [0, 1], "r--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
# Pass True explicitly: a bare grid() call toggles the grid and would hide it
# when the active style already enables grids.
plt.grid(True)
plt.show()