In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, confusion_matrix, precision_recall_curve

In [None]:
# Load the manual-match sample of matching iteration 3 from the mounted path.
manual_match_csv = "/dbfs/mnt/AA/ba008/data_samples/matching/matching_iteration3/matching_iteration3_manual_match.csv"
full_set = pd.read_csv(manual_match_csv)

In [None]:
# Keep the rows ranked first (rank == 1) together with every row flagged as a
# match (match == 1), so that matches which were not ranked first still
# contribute their similarity to the threshold search.
keep_rows = (full_set["match"] == 1) | (full_set["rank"] == 1)
training_set = full_set.loc[keep_rows, ["similarity", "match"]]

# ROC curve using `similarity` as the score and `match` as the label.
fpr, tpr, thresholds = roc_curve(training_set["match"], training_set["similarity"])
roc_auc = auc(fpr, tpr)

# Choose the operating point with the smallest squared distance to the ideal
# corner (FPR = 0, TPR = 1). The position is computed once and used to index
# all three arrays, instead of searching `thresholds` again by float equality.
optimal_idx = np.argmin((0 - fpr) ** 2 + (1 - tpr) ** 2)
optimal_thr = thresholds[optimal_idx]
optimal_tpr = tpr[optimal_idx]
optimal_fpr = fpr[optimal_idx]

In [None]:
# ROC curve of the similarity score, with a dashed diagonal for reference and
# the selected operating point highlighted in red.
lw = 2
roc_label = 'ROC curve (area = %0.2f)' % roc_auc
optimum_label = f"Opt. similarity threshold: {optimal_thr}\nOpt. TPR:{round(optimal_tpr, 2)}, Opt. FPR: {round(optimal_fpr, 2)}"

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=lw, label=roc_label)
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
ax.scatter(optimal_fpr, optimal_tpr, color="red", lw=lw, label=optimum_label)

# Both rates live in [0, 1]; fix the axes to that square.
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Matching iteration 3 ROC curve')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Predict a match (1) when the similarity reaches the chosen threshold, else 0.
predicted_match = (training_set["similarity"] >= optimal_thr).astype(int)

# labels=[0, 1] keeps the matrix 2x2 even when only one class occurs in both
# the labels and the predictions; without it the matrix shrinks to 1x1 and the
# four-way unpacking raises ValueError.
tn, fp, fn, tp = confusion_matrix(
    training_set["match"], predicted_match, labels=[0, 1]
).ravel()

In [None]:
# Display the four confusion-matrix counts as the cell output.
(tn, fp, fn, tp)

In [None]:
# Accuracy = correctly classified rows / all rows, rounded to two decimals.
n_correct = tp + tn
n_total = tp + tn + fp + fn
print("Accuracy:", round(n_correct / n_total, 2))