Install and import necessary libraries

In [13]:
!pip install transformers sentence-transformers scikit-learn

import os
import re
import torch
import numpy as np
from pathlib import Path
from collections import defaultdict, Counter
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity




Load ClinicalBERT model

In [14]:
# Load ClinicalBERT
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def embed_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :]  # [CLS] token
    return embeddings[0].numpy()

Load the four files

In [15]:
model_name = "DeepSeek"
file_path = Path(f"../Results/test/{model_name}/")

zeroshot_filename = "0001_zeroshot.txt"
document_filename = "0001_document_level.txt"
sentence_filename = "0001_sentence_level.txt"
entity_filename = "0001_entity_unseen.txt"

zeroshot_doc = file_path / zeroshot_filename
document_doc = file_path / document_filename
sentence_doc = file_path / sentence_filename
entity_doc = file_path / entity_filename

file_paths = [zeroshot_doc, document_doc, sentence_doc, entity_doc]

print(file_paths)

[WindowsPath('../Results/test/DeepSeek/0001_zeroshot.txt'), WindowsPath('../Results/test/DeepSeek/0001_document_level.txt'), WindowsPath('../Results/test/DeepSeek/0001_sentence_level.txt'), WindowsPath('../Results/test/DeepSeek/0001_entity_unseen.txt')]


parse all entity-label pairs

In [16]:
all_entities = []  # List of dicts: {"entity": ..., "label": ..., "file": ..., "embedding": ...}

pattern = r'entity="(.*?)"\s+label="(.*?)"'

for idx, path in enumerate(file_paths):
    with open(path, 'r') as f:
        content = f.readlines()
        for line in content:
            match = re.search(pattern, line.strip())
            if match:
                entity, label = match.groups()
                all_entities.append({
                    "entity": entity,
                    "label": label,
                    "file": f"doc{idx+1}",
                    "embedding": embed_text(entity)
                })

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Cluster similar entities (≥ 0.92 cosine similarity)

In [17]:
clusters = []  # Each cluster is a list of indices into `all_entities`
visited = set()

for i, ent1 in enumerate(all_entities):
    if i in visited:
        continue
    cluster = [i]
    visited.add(i)
    for j in range(i + 1, len(all_entities)):
        if j in visited:
            continue
        sim = cosine_similarity(
            [ent1["embedding"]],
            [all_entities[j]["embedding"]]
        )[0][0]
        if sim >= 0.92:
            cluster.append(j)
            visited.add(j)
    clusters.append(cluster)

Reduce clusters based on document coverage and assign majority label

In [18]:
final_outputs = []

for cluster in clusters:
    files_covered = set(all_entities[i]["file"] for i in cluster)
    if len(files_covered) < 2:
        continue  # Skip entities not in at least 2 docs

    labels = [all_entities[i]["label"] for i in cluster]
    label_counts = Counter(labels)
    most_common = label_counts.most_common()

    if len(most_common) == 1 or (len(most_common) > 1 and most_common[0][1] > most_common[1][1]):
        final_label = most_common[0][0]
    else:
        final_label = "unknown"

    representative_entity = all_entities[cluster[0]]["entity"]  # You can change this to centroid logic
    final_outputs.append(f'entity="{representative_entity}" label="{final_label}"')


Write final results to output.txt

In [19]:
output_filename = "0001_self_consistency.txt"
output_file = file_path / output_filename


with open(output_file, "w") as f:
    for line in final_outputs:
        f.write(line + "\n")

print(f"Saved output to {output_file}")


Saved output to ..\Results\test\DeepSeek\0001_self_consistency.txt
