In [None]:
!pip install python-docx

from docx import Document

In [None]:
def get_similarity_report(actual_data, predicted_data, threshold):
    """Greedily pair predicted entities with actual entities by cosine similarity.

    Predicted entities are visited in ``predicted_data`` order. Each one is
    scored against every actual entity that has not already been claimed by an
    earlier prediction, and the highest-scoring candidate is kept. The pair is
    reported as matched when its similarity reaches ``threshold``; otherwise
    the prediction is reported as unmatched together with its closest
    remaining candidate.

    Args:
        actual_data: Iterable of actual entities (iterating a dict yields its
            keys). Each entity is embedded once via ``get_entity_embedding``.
        predicted_data: Mapping of predicted entity -> predicted label.
            Entries whose label equals "unknown" (after trimming whitespace
            and lower-casing) are skipped and appear in neither report.
        threshold: Minimum cosine similarity for a pair to count as matched.

    Returns:
        A ``(report_matched, report_unmatched)`` tuple of lists of dicts.
        Matched entries have keys ``predicted_entity``, ``actual_entity`` and
        ``similarity``. Unmatched entries have keys ``predicted_entity``,
        ``most_similar_actual_entity`` (``None`` when no unclaimed actual
        entity was left to compare against) and ``similarity``. Similarities
        are built-in floats rounded to 4 decimal places.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    # Embed every actual entity once up front; the dict also de-duplicates them.
    actual_embeddings = {ent: get_entity_embedding(ent) for ent in actual_data}
    matched_actual_entities = set()
    report_matched = []
    report_unmatched = []

    for pred_ent, pred_label in predicted_data.items():
        if pred_label.strip().lower() == "unknown":
            continue

        pred_emb = get_entity_embedding(pred_ent)

        # An actual entity can be claimed by at most one prediction.
        candidates = [
            ent for ent in actual_embeddings if ent not in matched_actual_entities
        ]

        best_match = None
        best_sim = 0.0
        if candidates:
            # Score all remaining candidates in a single call instead of one
            # sklearn call per pair.
            sims = cosine_similarity(
                [pred_emb], [actual_embeddings[ent] for ent in candidates]
            )[0]
            # argmax returns the first maximum, so ties resolve to the earliest
            # candidate. Taking the maximum directly (rather than comparing
            # against a seed of 0) keeps the closest candidate even when every
            # similarity is zero or negative.
            best_idx = int(sims.argmax())
            best_match = candidates[best_idx]
            best_sim = float(sims[best_idx])

        similarity = round(best_sim, 4)
        # `is not None` rather than truthiness, so a falsy entity such as ""
        # can still be matched.
        if best_match is not None and best_sim >= threshold:
            matched_actual_entities.add(best_match)
            report_matched.append({
                "predicted_entity": pred_ent,
                "actual_entity": best_match,
                "similarity": similarity
            })
        else:
            report_unmatched.append({
                "predicted_entity": pred_ent,
                "most_similar_actual_entity": best_match,
                "similarity": similarity
            })

    return report_matched, report_unmatched


In [None]:
def _add_entity_table(doc, heading, entries, actual_key):
    """Append a level-2 heading and a 3-column 'Table Grid' table of entity pairs to ``doc``.

    ``actual_key`` names the entry field shown in the "Actual Entity" column.
    """
    doc.add_heading(heading, level=2)
    table = doc.add_table(rows=1, cols=3)
    table.style = 'Table Grid'

    header_texts = ("Predicted Entity", "Actual Entity", "Cosine Similarity")
    for cell, text in zip(table.rows[0].cells, header_texts):
        cell.text = text

    for entry in entries:
        values = (entry["predicted_entity"], entry[actual_key], entry["similarity"])
        for cell, value in zip(table.add_row().cells, values):
            # Cell text must be a string; an unmatched entry may carry None
            # when there was no candidate actual entity to report.
            cell.text = "N/A" if value is None else str(value)


def save_similarity_report_to_docx(matched_entities, unmatched_entities, filename="entity_similarity_report_3.docx"):
    """Write the matched and unmatched entity reports to a Word document.

    The document contains a title, a "Matched Entities" table and an
    "Unmatched Entities" table, each with the columns Predicted Entity,
    Actual Entity and Cosine Similarity.

    Args:
        matched_entities: Iterable of dicts with keys ``predicted_entity``,
            ``actual_entity`` and ``similarity``.
        unmatched_entities: Iterable of dicts with keys ``predicted_entity``,
            ``most_similar_actual_entity`` (may be ``None``, rendered as
            "N/A") and ``similarity``.
        filename: Path of the .docx file to write.

    Returns:
        None. The document is saved to ``filename`` and a confirmation line
        is printed.
    """
    doc = Document()
    doc.add_heading("Entity Similarity Report 2", level=1)

    _add_entity_table(doc, "✅ Matched Entities", matched_entities, "actual_entity")

    # Empty paragraph as a visual spacer between the two tables.
    doc.add_paragraph()
    _add_entity_table(
        doc, "❌ Unmatched Entities", unmatched_entities, "most_similar_actual_entity"
    )

    doc.save(filename)
    print(f"📄 Report saved to: {filename}")


In [None]:
# Build the matched / unmatched reports at a 0.80 cosine-similarity cutoff,
# then write both to the default .docx file.
matched_entities, unmatched_entities = get_similarity_report(
    actual_data=actual_data,
    predicted_data=predicted_data,
    threshold=0.80,
)
save_similarity_report_to_docx(
    matched_entities=matched_entities,
    unmatched_entities=unmatched_entities,
)

In [None]:
# Spot check: cosine similarity between a phrase and an abbreviated variant.
from sklearn.metrics.pairwise import cosine_similarity

ent1, ent2 = "lower abdominal pain", "lower abd pain"

# Embed both phrases in order (ent1 first, then ent2).
embedding1, embedding2 = (get_entity_embedding(phrase) for phrase in (ent1, ent2))

# cosine_similarity returns a 2-D array; [0][0] is the single pairwise score.
similarity_score = cosine_similarity([embedding1], [embedding2])[0][0]

print(f"Cosine Similarity between:\n  '{ent1}' and\n  '{ent2}' is: {similarity_score:.4f}")
