In [None]:
from pyspark.sql import SparkSession

# Evaluate an entity-resolution model by comparing its predicted duplicate
# pairs against a ground-truth list, reporting TP / FP / FN counts together
# with precision, recall and F1.


def _canonical_pair(pair):
    """Return *pair* with its two ids in ascending order.

    A duplicate pair is symmetric, so (5, 1) and (1, 5) describe the same
    match; putting both into one canonical form lets them compare equal.
    """
    return tuple(sorted(pair))


# --- Step 1: Initialize Spark ---
spark = SparkSession.builder.appName("Lab3_Task3_Evaluation").getOrCreate()
sc = spark.sparkContext

# Everything that uses the session runs inside try/finally so the session is
# stopped even when one of the Spark actions below raises.
try:
    # --- Step 2: Define True Matches (Ground Truth) ---
    # Example: (id1, id2) are the true duplicates
    true_matches = [
        (1, 5),   # alice smith = alice smith
        (3, 4),   # charlie johnson ≈ david johnson
    ]
    # Canonicalize and de-duplicate. RDD.intersection() drops duplicate
    # elements but RDD.subtract() keeps them, so without distinct() a repeated
    # pair would be counted once as a TP but several times as an FP/FN.
    true_rdd = sc.parallelize(true_matches).map(_canonical_pair).distinct()

    # --- Step 3: Define Predicted Matches (Model Output) ---
    predicted_matches = [
        (1, 5),   # correct prediction
        (1, 2),   # false positive
        (3, 4),   # correct prediction
    ]
    pred_rdd = sc.parallelize(predicted_matches).map(_canonical_pair).distinct()

    # --- Step 4: Compute True Positives (TP), False Positives (FP), False Negatives (FN) ---
    TP = pred_rdd.intersection(true_rdd).count()   # predicted and true
    FP = pred_rdd.subtract(true_rdd).count()       # predicted but not true
    FN = true_rdd.subtract(pred_rdd).count()       # true but not predicted

    # --- Step 5: Compute Metrics ---
    # Each metric falls back to 0.0 when its denominator is zero (e.g. no
    # predictions at all), keeping the result a float on every path.
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    f1_score = (
        (2 * precision * recall) / (precision + recall)
        if (precision + recall) > 0
        else 0.0
    )

    # --- Step 6: Print Results ---
    print("✅ Evaluation Metrics for Entity Resolution Model:")
    print(f"True Positives (TP): {TP}")
    print(f"False Positives (FP): {FP}")
    print(f"False Negatives (FN): {FN}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1_score:.2f}")
finally:
    # --- Step 7: Stop Spark ---
    spark.stop()
