## Keyword extraction code experiments

In [6]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer, util

# Initialize models
# Load the sentence-embedding model once and hand that same instance to
# KeyBERT, so the "all-MiniLM-L6-v2" weights are not loaded twice and both
# keyword extraction and dictionary matching embed text with one model.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(model=embed_model)


  from .autonotebook import tqdm as notebook_tqdm


In [1]:
import pandas as pd

In [2]:
# Sample adverse-event narrative kept for ad-hoc experiments; it is not
# referenced by the other cells visible in this notebook.
event_text="""A medical device used in an endovascular aneurysm repair procedure was attempted to close a common femoral artery, but the suture material got stuck in the device. Despite unsuccessful attempts to remove it with a plunger, hemostasis was achieved through surgical suturing. The delay in the procedure was attributed to the patient's poor vessel condition, not related to the device. The device was returned for evaluation and visual inspections confirmed a needle-to-cuff miss, indicating no manufacturing nonconformities that could have contributed to the issue."""

In [3]:

# Define dictionaries
# Flat phrase lists, one per event category. Every phrase is listed exactly
# once within its list.
malfunction_terms = [
    "device malfunction", "deployment failure", "balloon leak", "balloon deflation", "deflation",
    "device damage", "sealant leakage", "handle breakage", "device rupture", "balloon burst",
    "device separation", "catheter kink", "component dislodged", "broken wire",
    "malfunction during use", "unable to deploy", "failed to lock", "delivery issue",
    "material integrity issue", "device stuck", "torn sleeve", "system error",
    "device misfire", "sealant exposed", "incomplete deployment", "kinked sheath",
    "suture failure", "defective suture", "deployment interrupted", "device not working",
    "activation failure", "broken tip", "loose seal", "locking issue",
]

injury_terms = [
    "bleeding", "control of bleeding", "inability to stop bleeding", "hemostasis failure", "manual compression",
    "hematoma", "hematoma formation", "pain", "site pain", "discomfort", "infection", "swelling",
    "pseudoaneurysm", "vessel injury", "vascular injury", "tissue damage", "necrosis", "burns",
    "wound complication", "inflammatory reaction", "complication after use", "excessive bleeding",
    "limb numbness", "ischemia", "vascular spasm", "nerve injury", "numbness", "redness", "bruising",
    "puncture site complication", "access site pain", "delayed healing", "local irritation", "localized pain",
    "post-procedure pain", "seroma", "site discomfort", "extravasation", "hypersensitivity reaction"
]

death_terms = [
    "patient died", "patient death", "death reported", "expired", "fatal event",
    "fatal outcome", "mortality", "death occurred", "patient demise", "loss of life",
    "lethal event", "fatality","death following procedure",
    "procedure-related death", "unexpected death", "sudden death"
]


In [None]:
# Category -> reference phrases. The keys are the label strings the lookup
# helpers are called with, so they must match those labels exactly
# ("Malfunction", "Injury", "Death").
DICT_TERMS = {
    "Malfunction": [
        # General device failure
        "device malfunction", "equipment malfunction", "malfunction during use",
        "device not working", "device failed", "failure during operation",
        "failure to function", "malfunction on deployment", "malfunction after use",

        # Deployment / operational issues
        "deployment failure", "incomplete deployment", "unable to deploy",
        "failed to deploy", "partial deployment", "deployment interrupted",
        "failed to lock", "locking issue", "activation failure",
        "device misfire", "delivery issue", "misalignment of device",

        # Structural / mechanical failures
        "device damage", "component breakage", "handle breakage",
        "material integrity issue", "broken tip", "torn sleeve",
        "kinked sheath", "catheter kink", "component dislodged",
        "sealant exposed", "loose seal", "sealant leakage", "sealant failure",

        # Blockages and leaks
        "balloon leak", "balloon burst", "balloon deflation",
        "unable to inflate", "leak in system", "fluid leakage",

        # Obstruction / entrapment
        "device stuck", "entrapment issue", "obstructed device",
        "retained device", "unable to remove device",

        # Electronics / software
        "system error", "software error", "firmware bug",
        "power failure", "battery failure", "low battery alert",
        "display malfunction", "false alarm", "incorrect reading"
    ],

    "Injury": [
        # Bleeding / hematoma
        "bleeding", "excessive bleeding", "control of bleeding",
        "inability to stop bleeding", "hemostasis failure", "manual compression",
        "hematoma", "hematoma formation", "site hematoma",

        # Pain & discomfort
        "pain", "site pain", "localized pain", "post-procedure pain",
        "site discomfort", "discomfort", "local irritation",

        # Infections & inflammation
        "infection", "site infection", "wound infection",
        "inflammatory reaction", "swelling", "redness",
        "bruising", "seroma", "delayed healing", "wound complication",

        # Vascular / tissue injuries
        "vessel injury", "vascular injury", "arterial injury",
        "tissue damage", "tissue necrosis", "necrosis",
        "pseudoaneurysm", "extravasation", "vascular spasm",
        "ischemia", "limb ischemia", "nerve injury", "limb numbness", "numbness",

        # Hypersensitivity / allergy
        "hypersensitivity reaction", "allergic reaction", "skin reaction",
        "contact dermatitis", "rash", "burns"
    ],

    "Death": [
        # Direct death reports
        "patient died", "patient death", "death reported",
        "death occurred", "fatal event", "fatal outcome",
        "loss of life", "mortality", "fatality", "lethal event",
        "procedure-related death", "unexpected death", "sudden death",
        "death following procedure", "fatal complication",

        # Common MAUDE phrasings
        "patient demise", "passed away", "found deceased",
        "collapsed and died", "expired", "death confirmed",
        "pronounced dead", "declared dead", "cause of death",
        "autopsy revealed", "post mortem", "post-mortem",
        "death certificate", "no signs of life", "end of life",

        # Indirect death phrasing
        "died suddenly", "died during procedure", "died after procedure"
    ]
}


In [7]:
# Encode every category's term list once with the shared embedding model so
# later lookups only have to embed the event text. Each tensor is built from
# the category's term list exactly as it is ordered in DICT_TERMS.
DICT_EMBEDS = dict(
    (label, embed_model.encode(phrases, convert_to_tensor=True))
    for label, phrases in DICT_TERMS.items()
)

In [8]:
from sentence_transformers import util
import torch

# Helper: Get top N dictionary terms for a given label & text
def top_dict_terms_for_label(text: str, label: str, top_k=10):
    """Rank the dictionary terms of ``label`` by cosine similarity to ``text``.

    Args:
        text: Event narrative to compare against the dictionary.
        label: Category key in ``DICT_TERMS``.
        top_k: Maximum number of terms to return.

    Returns:
        A list of ``{"term": str, "score": float}`` dicts in descending score
        order, with scores rounded to 3 decimals. The list is empty when
        ``text`` is not a non-blank string, ``top_k`` is not positive,
        ``label`` is not in ``DICT_TERMS``, or the label has no terms.
    """
    if not isinstance(text, str) or not text.strip():
        return []
    if top_k <= 0:
        # torch.topk rejects a negative k; asking for no terms yields no terms.
        return []
    if label not in DICT_TERMS:
        return []

    terms = DICT_TERMS[label]
    if not terms:
        return []

    # Use the precomputed embeddings only while they still line up with the
    # term list. DICT_TERMS can be rebound by a later cell without DICT_EMBEDS
    # being refreshed, which would otherwise raise KeyError or pair scores
    # with the wrong terms. This is a best-effort check on presence and length.
    term_embs = DICT_EMBEDS.get(label)
    if term_embs is None or len(term_embs) != len(terms):
        term_embs = embed_model.encode(terms, convert_to_tensor=True)

    # Embed the event text and score it against every term of the label
    text_emb = embed_model.encode(text, convert_to_tensor=True)
    sims = util.cos_sim(text_emb, term_embs)[0]

    # topk already returns the winning scores, so there is no need to index
    # back into sims one element at a time.
    top = torch.topk(sims, k=min(top_k, sims.size(0)))

    return [
        {"term": terms[idx], "score": round(float(score), 3)}
        for idx, score in zip(top.indices.tolist(), top.values.tolist())
    ]

# # Apply to dataframe
# def add_top_terms_column(df, text_col="input", label_col="predicted_label", top_k=10):
#     df["top_dict_terms"] = df.apply(
#         lambda row: top_dict_terms_for_label(row[text_col], row[label_col], top_k=top_k),
#         axis=1
#     )
#     return df

# df=pd.read_csv("essential_2021.csv")
# df=df.head(30)
# # Example usage:
# df = add_top_terms_column(df, text_col="input", label_col="predicted_label", top_k=10)

# # View first row example
# print(df[["predicted_label", "input", "top_dict_terms"]])



In [9]:
# Example: rank dictionary terms for a single sheath report
text = """A customer reported a Prostyle sheath had weakness in the tip and was unable to advance during a procedure. The physician suspected an issue with the extra hydrophilic coating on the sheath, which caused it to buckle under fluoroscopy. A backup device was used to achieve hemostasis, but the cause of the problem couldn't be determined due to lack of information. There is currently no indication that the product quality issue is related to design, manufacturing, or labeling of the device."""

# Stand-in for the classifier output; set by hand for this demo
predicted_label = "Malfunction"

# Ten closest dictionary terms under that label
top_terms = top_dict_terms_for_label(text, predicted_label, top_k=10)

# Print the label, then one "- term (score: x)" line per hit
print(f"Predicted label: {predicted_label}")
print("Top dictionary terms:")
for hit in top_terms:
    print(f"- {hit['term']} (score: {hit['score']})")

Predicted label: Malfunction
Top dictionary terms:
- kinked sheath (score: 0.377)
- broken tip (score: 0.357)
- failure during operation (score: 0.338)
- malfunction during use (score: 0.338)
- material integrity issue (score: 0.325)
- equipment malfunction (score: 0.31)
- sealant failure (score: 0.298)
- device damage (score: 0.29)
- malfunction after use (score: 0.287)
- catheter kink (score: 0.273)


In [10]:
from sentence_transformers import util
import torch

# Function to get top dictionary terms for a label
# NOTE(review): this re-defines the helper of the same name from an earlier
# cell; once this cell runs, this definition is the one in effect. Keep the
# two in sync or delete one of them.
def top_dict_terms_for_label(text: str, label: str, top_k=10):
    """Rank the dictionary terms of ``label`` by cosine similarity to ``text``.

    Args:
        text: Event narrative to compare against the dictionary.
        label: Category key in ``DICT_TERMS``.
        top_k: Maximum number of terms to return.

    Returns:
        A list of ``{"term": str, "score": float}`` dicts in descending score
        order, with scores rounded to 3 decimals. The list is empty when
        ``text`` is not a non-blank string, ``top_k`` is not positive,
        ``label`` is not in ``DICT_TERMS``, or the label has no terms.
    """
    if not isinstance(text, str) or not text.strip():
        return []
    if top_k <= 0:
        # torch.topk rejects a negative k; asking for no terms yields no terms.
        return []
    if label not in DICT_TERMS:
        return []

    terms = DICT_TERMS[label]
    if not terms:
        return []

    # Use the precomputed embeddings only while they still line up with the
    # term list. DICT_TERMS can be rebound by a later cell without DICT_EMBEDS
    # being refreshed, which would otherwise raise KeyError or pair scores
    # with the wrong terms. This is a best-effort check on presence and length.
    term_embs = DICT_EMBEDS.get(label)
    if term_embs is None or len(term_embs) != len(terms):
        term_embs = embed_model.encode(terms, convert_to_tensor=True)

    # Embed the event text and score it against every term of the label
    text_emb = embed_model.encode(text, convert_to_tensor=True)
    sims = util.cos_sim(text_emb, term_embs)[0]

    # topk already returns the winning scores, so there is no need to index
    # back into sims one element at a time.
    top = torch.topk(sims, k=min(top_k, sims.size(0)))

    return [
        {"term": terms[idx], "score": round(float(score), 3)}
        for idx, score in zip(top.indices.tolist(), top.values.tolist())
    ]

# ==== Example narrative (same report, wrapped over several lines) ====
text = """A customer reported a Prostyle sheath had weakness in the tip and was unable to advance during a procedure.
The physician suspected an issue with the extra hydrophilic coating on the sheath, which caused it to buckle under fluoroscopy.
A backup device was used to achieve hemostasis, but the cause of the problem couldn't be determined due to lack of information.
There is currently no indication that the product quality issue is related to design, manufacturing, or labeling of the device."""

# Label chosen by hand for this demo
predicted_label = "Malfunction"

# Ten closest dictionary terms under that label
top_terms = top_dict_terms_for_label(text, predicted_label, top_k=10)

# Print the label, then one "- term (score: x)" line per hit
print(f"Predicted label: {predicted_label}")
print("Top dictionary terms:")
for hit in top_terms:
    print(f"- {hit['term']} (score: {hit['score']})")


Predicted label: Malfunction
Top dictionary terms:
- kinked sheath (score: 0.377)
- broken tip (score: 0.357)
- failure during operation (score: 0.338)
- malfunction during use (score: 0.338)
- material integrity issue (score: 0.325)
- equipment malfunction (score: 0.31)
- sealant failure (score: 0.298)
- device damage (score: 0.29)
- malfunction after use (score: 0.287)
- catheter kink (score: 0.273)


## KeyBERT

In [None]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer, util
import numpy as np

# ===== Your dictionary =====
# Category label -> reference phrases that extracted keyphrases are matched against.
# NOTE: this rebinds the DICT_TERMS defined in an earlier cell and adds
# "cuff miss", "hematosis" and "occlusion". DICT_EMBEDS is not recomputed in
# this cell, so it still reflects the earlier term lists.
DICT_TERMS = {
    "Malfunction": [
        # General device failure
        "device malfunction", "equipment malfunction", "malfunction during use",
        "device not working", "device failed", "failure during operation",
        "failure to function", "malfunction on deployment", "malfunction after use",
        # Deployment / operational issues
        "deployment failure", "incomplete deployment", "unable to deploy",
        "failed to deploy", "partial deployment", "deployment interrupted",
        "failed to lock", "locking issue", "activation failure",
        "device misfire", "delivery issue", "misalignment of device",
        # Structural / mechanical failures
        "device damage", "component breakage", "handle breakage",
        "material integrity issue", "broken tip", "torn sleeve",
        "kinked sheath", "catheter kink", "component dislodged",
        "sealant exposed", "loose seal", "sealant leakage", "sealant failure",
        # Blockages and leaks
        "balloon leak", "balloon burst", "balloon deflation",
        "unable to inflate", "leak in system", "fluid leakage",
        # Obstruction / entrapment
        "device stuck", "entrapment issue", "obstructed device",
        "retained device", "unable to remove device",
        # Electronics / software (plus the added "cuff miss" phrase)
        "system error", "software error", "firmware bug",
        "power failure", "battery failure", "low battery alert",
        "display malfunction", "false alarm", "incorrect reading","cuff miss"
    ],
    "Injury": [
        # Bleeding / hematoma
        "bleeding", "excessive bleeding", "control of bleeding",
        "inability to stop bleeding", "hemostasis failure", "manual compression",
        "hematoma", "hematoma formation", "site hematoma",
        # Pain & discomfort
        "pain", "site pain", "localized pain", "post-procedure pain",
        "site discomfort", "discomfort", "local irritation",
        # Infections & inflammation
        "infection", "site infection", "wound infection",
        "inflammatory reaction", "swelling", "redness",
        "bruising", "seroma", "delayed healing", "wound complication",
        # Vascular / tissue injuries
        "vessel injury", "vascular injury", "arterial injury",
        "tissue damage", "tissue necrosis", "necrosis",
        "pseudoaneurysm", "extravasation", "vascular spasm",
        "ischemia", "limb ischemia", "nerve injury", "limb numbness", "numbness",
        # Hypersensitivity / allergy (plus the added "hematosis" and "occlusion")
        # NOTE(review): "hematosis" may be a typo (e.g. for "hemostasis" or "hematoma") — confirm
        "hypersensitivity reaction", "allergic reaction", "skin reaction",
        "contact dermatitis", "rash", "burns","hematosis","occlusion"
    ],
    "Death": [
        # Direct death reports
        "patient died", "patient death", "death reported",
        "death occurred", "fatal event", "fatal outcome",
        "loss of life", "mortality", "fatality", "lethal event",
        "procedure-related death", "unexpected death", "sudden death",
        "death following procedure", "fatal complication",
        # Common MAUDE phrasings
        "patient demise", "passed away", "found deceased",
        "collapsed and died", "expired", "death confirmed",
        "pronounced dead", "declared dead", "cause of death",
        "autopsy revealed", "post mortem", "post-mortem",
        "death certificate", "no signs of life", "end of life",
        # Indirect death phrasing
        "died suddenly", "died during procedure", "died after procedure"
    ]
}

# ===== Models =====
# One SentenceTransformer instance is shared: KeyBERT receives it as its model
# and the matching functions below call it directly to embed terms.
# kw_model here rebinds the kw_model created in the first cell.
bert_model = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(model=bert_model)

# ===== Function =====
def label_specific_keyword_match(text, label, top_k=10, threshold=0.3):
    """
    Extract keyphrases from ``text`` with KeyBERT and match each one to its
    closest term in ``DICT_TERMS[label]`` using cosine similarity.

    Args:
        text: Event narrative to extract keyphrases from.
        label: Category key in ``DICT_TERMS``; only that category's terms are compared.
        top_k: Number of keyphrases requested from KeyBERT.
        threshold: Minimum cosine similarity for a keyphrase to count as a match.

    Returns:
        A dict with
        - ``"label"``: the label passed in,
        - ``"keywords"``: the extracted keyphrases,
        - ``"matches"``: one ``{"extracted_term", "matched_dict_term", "score"}``
          dict per keyphrase whose best score reaches ``threshold``
          (score rounded to 3 decimals).
        ``"keywords"`` and ``"matches"`` are empty when ``text`` is not a
        non-blank string; ``"matches"`` is empty when nothing was extracted.

    Raises:
        ValueError: If ``label`` is not a key of ``DICT_TERMS``.
    """
    if label not in DICT_TERMS:
        raise ValueError(f"Label '{label}' not found in DICT_TERMS")

    # A missing or blank narrative has nothing to extract; answer without
    # calling the models.
    if not isinstance(text, str) or not text.strip():
        return {"label": label, "keywords": [], "matches": []}

    # Step 1: Extract top keywords from text
    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),
        stop_words='english',
        top_n=top_k
    )
    extracted_terms = [kw for kw, _ in keywords]

    # Step 2: Compare only to keywords for that label
    dict_terms = DICT_TERMS[label]
    if not extracted_terms or not dict_terms:
        # Encoding an empty list gives nothing to compare against.
        return {"label": label, "keywords": extracted_terms, "matches": []}

    dict_embeddings = bert_model.encode(dict_terms, convert_to_tensor=True, normalize_embeddings=True)
    extracted_embeddings = bert_model.encode(extracted_terms, convert_to_tensor=True, normalize_embeddings=True)

    # Move the score matrix (one row per keyphrase, one column per dictionary
    # term) to NumPy once instead of once per row.
    score_matrix = util.cos_sim(extracted_embeddings, dict_embeddings).cpu().numpy()

    # Step 3: Collect matches
    matches = []
    for term, row in zip(extracted_terms, score_matrix):
        best_idx = int(row.argmax())
        best_score = float(row[best_idx])

        if best_score >= threshold:
            matches.append({
                "extracted_term": term,
                "matched_dict_term": dict_terms[best_idx],
                "score": round(best_score, 3)
            })

    return {
        "label": label,
        "keywords": extracted_terms,
        "matches": matches
    }

# ===== Example =====
# Short synthetic narrative used to exercise the "Death" dictionary
text_example = """
A patient died during the procedure. The report mentioned sudden death
and a fatal complication post deployment.
"""

# Label chosen by hand for this demo
predicted_label = "Death"

# Ask for 8 keyphrases and keep matches scoring at least 0.3
result = label_specific_keyword_match(text_example, predicted_label, top_k=8, threshold=0.3)

# Show the raw keyphrases first, then each keyphrase with its closest term
print("Extracted Keywords:", result["keywords"])
print("\nMatches:")
for match in result["matches"]:
    print(f"- {match['extracted_term']} → {match['matched_dict_term']} (score: {match['score']})")


  from .autonotebook import tqdm as notebook_tqdm


Extracted Keywords: ['died procedure report', 'patient died procedure', 'died procedure', 'death fatal complication', 'patient died', 'fatal complication', 'fatal complication post', 'sudden death fatal']

Matches:
- died procedure report → died during procedure (score: 0.885)
- patient died procedure → died after procedure (score: 0.917)
- died procedure → died during procedure (score: 0.905)
- death fatal complication → fatal complication (score: 0.858)
- patient died → patient died (score: 1.0)
- fatal complication → fatal complication (score: 1.0)
- fatal complication post → fatal complication (score: 0.821)
- sudden death fatal → sudden death (score: 0.909)


In [22]:
# ===== Function =====
import json 
import pandas as pd

def extract_label_keywords(text, label, top_k=10, threshold=0.3):
    """Match KeyBERT keyphrases from ``text`` to the dictionary terms of ``label``.

    Args:
        text: Event narrative to extract keyphrases from.
        label: Category key in ``DICT_TERMS``.
        top_k: Number of keyphrases requested from KeyBERT.
        threshold: Minimum cosine similarity for a keyphrase to count as a match.

    Returns:
        A list of ``{"extracted_term", "matched_dict_term", "score"}`` dicts in
        descending score order, keeping only the best-scoring keyphrase for
        each dictionary term (scores rounded to 3 decimals). The list is empty
        when ``label`` is not in ``DICT_TERMS``, when ``text`` is not a
        non-blank string, or when no keyphrase reaches ``threshold``.
    """
    if label not in DICT_TERMS:
        return []

    # Rows read from a CSV can hold NaN or blank narratives; there is nothing
    # to extract from them, so answer without calling the models.
    if not isinstance(text, str) or not text.strip():
        return []

    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),
        stop_words='english',
        top_n=top_k
    )
    extracted_terms = [kw for kw, _ in keywords]

    dict_terms = DICT_TERMS[label]
    if not extracted_terms or not dict_terms:
        # Encoding an empty list gives nothing to compare against.
        return []

    dict_embeddings = bert_model.encode(dict_terms, convert_to_tensor=True, normalize_embeddings=True)
    extracted_embeddings = bert_model.encode(extracted_terms, convert_to_tensor=True, normalize_embeddings=True)

    # Move the score matrix (one row per keyphrase, one column per dictionary
    # term) to NumPy once instead of once per row.
    score_matrix = util.cos_sim(extracted_embeddings, dict_embeddings).cpu().numpy()

    matches = []
    for term, row in zip(extracted_terms, score_matrix):
        best_idx = int(row.argmax())
        best_score = float(row[best_idx])

        if best_score >= threshold:
            matches.append({
                "extracted_term": term,
                "matched_dict_term": dict_terms[best_idx],
                "score": round(best_score, 3)
            })

    # Deduplicate by matched_dict_term: walk the matches from best to worst and
    # keep the first one seen for each dictionary term. The sort is stable, so
    # ties stay in extraction order, and the dict preserves that ranking.
    best_per_term = {}
    for match in sorted(matches, key=lambda m: m["score"], reverse=True):
        best_per_term.setdefault(match["matched_dict_term"], match)

    return list(best_per_term.values())

# ===== Process CSV =====
# process_row (defined below) reads the "input", "predicted_label" and
# "EVENT_TYPE" columns from each row of this frame.
df = pd.read_csv("abbott_2024.csv")  # replace with your file path

def process_row(row):
    """Build the JSON keyword-match payload for one row.

    Args:
        row: Mapping-like row providing ``"input"`` (the narrative),
            ``"predicted_label"`` and ``"EVENT_TYPE"``.

    Returns:
        A JSON string mapping each label to the match list returned by
        ``extract_label_keywords``. The predicted label is always present;
        the event type is added when it differs from the predicted label.
    """
    text = row["input"]
    pred_label = row["predicted_label"]
    event_type = row["EVENT_TYPE"]  # match exact column name from your CSV

    # Always get matches for predicted label
    results = {pred_label: extract_label_keywords(text, pred_label)}

    # Only a different event type adds information. When the two labels agree,
    # a second extraction would repeat the same call and overwrite the same
    # key, so it is skipped even if the predicted label produced no matches.
    if event_type != pred_label:
        results[event_type] = extract_label_keywords(text, event_type)

    return json.dumps(results)

# Compute one JSON string of keyword matches per row, then attach it as a column
matched_keywords = df.apply(process_row, axis=1)
df["matched_keywords"] = matched_keywords

# ===== Save CSV =====
# Write the augmented frame without the index column and confirm on stdout
df.to_csv("abbott_2024_extended.csv", index=False)
print("✅ Saved with dual label keyword matches")

✅ Saved with dual label keyword matches
