In [4]:
import os
import re

# Folder holding the annotated .txt files.
# NOTE: output_dir is the same folder as train_dir, so any .txt report written
# there will also be opened by the scan below on a later run.
train_dir = "../Data/processed/llm/entity_only/train/"
output_dir = "../Data/processed/llm/entity_only/train/"
output_file = os.path.join(output_dir, "all_entities.txt")


# Create the destination folder when it is missing
os.makedirs(output_dir, exist_ok=True)

# One set of unique entity strings per recognised label
entities_by_type = {name: set() for name in ("test", "treatment", "problem")}

# Captures (entity text, label) from annotations shaped like entity="..." label="..."
pattern = re.compile(r'entity="(.*?)"\s+label="(.*?)"')

# Walk every .txt file in the folder and bucket each annotated entity by label
for filename in os.listdir(train_dir):
    if not filename.endswith(".txt"):
        continue
    with open(os.path.join(train_dir, filename), 'r', encoding='utf-8') as file:
        for line in file:
            for found in pattern.finditer(line):
                # Labels are compared case-insensitively; unknown labels are ignored
                label = found.group(2).strip().lower()
                bucket = entities_by_type.get(label)
                if bucket is not None:
                    bucket.add(found.group(1).strip())

In [5]:
# Destination of the Python-list-style dump.
# Relies on `output_dir` and `entities_by_type`, which this cell does not define.
list_output_file = os.path.join(output_dir, "all_entities_as_list.txt")

# Labels are emitted in this fixed order, both in the file and on the console
label_order = ["problem", "test", "treatment"]

# One `label = ["a", "b", ...]` line per label, each followed by a blank line
with open(list_output_file, "w", encoding='utf-8') as f:
    for label in label_order:
        quoted = [f'"{e}"' for e in sorted(entities_by_type[label])]
        joined = ", ".join(quoted)
        f.write(f"{label} = [{joined}]\n\n")

# Console summary of how many unique entities each label has
print("Entity counts by type:")
for label in label_order:
    count = len(entities_by_type[label])
    print(f"{label}: {count} unique entities")

print(f"\nPython list-style entity data saved to: {list_output_file}")


Entity counts by type:
problem: 2567 unique entities
test: 1206 unique entities
treatment: 1582 unique entities

Python list-style entity data saved to: ../Data/processed/llm/entity_only/train/all_entities_as_list.txt


In [8]:
from collections import Counter
import os
import re

# Input and output directory (the report is written into the same folder it reads from)
train_dir = "../Data/processed/llm/entity_only/train/"
output_dir = "../Data/processed/llm/entity_only/train/"
output_file = os.path.join(output_dir, "all_entities_freq.txt")

# Make sure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Regex to extract entity and label
pattern = re.compile(r'entity="(.*?)"\s+label="(.*?)"')


def count_entities(directory, labels=("test", "treatment", "problem")):
    """Count how often each annotated entity occurs in the .txt files of a folder.

    Args:
        directory: Folder to scan; only names ending in ".txt" are read.
        labels: Labels to keep. Annotations with any other label are ignored.

    Returns:
        A dict mapping each label to a Counter of stripped entity strings.

    Files are visited in sorted name order. Counter.most_common() lists
    equal counts in first-seen order, and os.listdir() returns names in
    arbitrary order, so sorting is what makes the tie order reproducible.
    """
    counts = {label: Counter() for label in labels}
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(directory, filename)
        with open(filepath, "r", encoding="utf-8") as file:
            for line in file:
                for entity, label in pattern.findall(line):
                    # Labels are matched case-insensitively
                    label = label.strip().lower()
                    if label in counts:
                        counts[label][entity.strip()] += 1
    return counts


# Dictionary to hold frequencies per label
entity_freq_by_type = count_entities(train_dir)

# Write to output file (and echo the same lines to the console)
with open(output_file, "w", encoding="utf-8") as f:
    for label in ["test", "problem", "treatment"]:
        header = f"{label.upper()} — {len(entity_freq_by_type[label])} unique entities"
        print(f"\n{header}")
        print("-" * 40)
        f.write(header + "\n")
        f.write("-" * 40 + "\n")
        # Most frequent first; ties follow the order in which entities were first seen
        for entity, freq in entity_freq_by_type[label].most_common():
            line = f"{entity}: {freq}"
            print(line)
            f.write(line + "\n")
        f.write("\n")

print(f"\n✅ Entity frequency report saved to: {output_file}")


TEST — 1206 unique entities
----------------------------------------
blood: 60
glucose: 45
hct: 36
wbc: 34
rbc: 33
mcv: 33
creat: 32
hgb: 32
rdw: 32
pt: 31
mchc: 31
mch: 31
ptt: 27
potassium: 27
plt ct: 25
hematocrit: 25
sodium: 25
creatinine: 25
chloride: 24
bp: 23
cxr: 23
blood pressure: 23
inr: 22
ctropnt: 21
k: 20
na: 20
cl: 20
calcium: 19
urean: 18
hr: 18
inr(pt): 18
hco3: 17
platelets: 17
echo: 16
auscultation: 16
total co2: 15
ast: 15
bun: 15
ck-mb: 14
angap: 14
alt: 14
amylase: 14
rr: 13
urea n: 13
ekg: 12
anion gap: 12
respiratory rate: 12
chest x-ray: 12
cardiac catheterization: 12
plt count: 11
pulse: 11
heart rate: 10
magnesium: 10
examination: 10
monos: 10
lymphs: 10
neuts: 10
eos: 10
lipase: 10
ck: 9
catheterization: 9
blood cultures: 9
mg: 8
t: 8
phosphate: 8
alkaline phosphatase: 8
total bilirubin: 8
albumin: 8
temperature: 8
ct scan: 8
oxygen saturation: 8
weight: 8
ef: 7
phos: 7
white count: 7
bicarbonate: 7
white blood cell count: 7
vital signs: 7
vs: 7
electrocardi

Remove the entities found in test file 0001.txt from the combined entity lists

In [6]:
import re
import os

# === CONFIG ===
exclude_file = "../Data/processed/llm/entity_only/test/0001.txt"
output_dir = "../Data/processed/llm/entity_only/train/"
filtered_output_file = os.path.join(output_dir, "all_entities_but_test_0001.txt")

# === PARSE EXCLUSION FILE ===
# Collect every (entity, label) pair annotated in the exclusion file, lower-cased
exclude_pattern = re.compile(r'entity="(.*?)"\s+label="(.*?)"')
exclude_pairs = set()

with open(exclude_file, "r", encoding="utf-8") as ef:
    for line in ef:
        # findall rather than search: a line may carry several annotations,
        # and search() would only pick up the first one
        for entity, label in exclude_pattern.findall(line):
            exclude_pairs.add((entity.strip().lower(), label.strip().lower()))

# === FILTER ENTITIES BY BOTH ENTITY AND LABEL ===
# NOTE: `entities_by_type` is not defined in this cell; it must already exist
# in the kernel before this cell runs.
# An entity is dropped only when its lower-cased text AND its label both match.
filtered_entities_by_type = {
    label: {
        entity for entity in entities
        if (entity.lower(), label) not in exclude_pairs
    }
    for label, entities in entities_by_type.items()
}

# === COUNT FILTERED ENTITIES ===
filtered_counts = {label: len(entities) for label, entities in filtered_entities_by_type.items()}

# === SAVE FILTERED ENTITIES AS PYTHON-LIST STYLE FILE ===
with open(filtered_output_file, "w", encoding="utf-8") as f:
    for label in ["problem", "test", "treatment"]:
        entity_list = sorted(filtered_entities_by_type[label])
        entity_list_str = ", ".join(f'"{e}"' for e in entity_list)
        f.write(f"{label} = [{entity_list_str}]\n\n\n")  # Three newlines: two blank lines after each list

# === PRINT SUMMARY ===
print("Filtered entity counts by type:")
for label in ["test", "problem", "treatment"]:
    print(f"{label}: {filtered_counts[label]} unique entities")

print(f"\nFiltered entity list saved to: {filtered_output_file}")


Filtered entity counts by type:
test: 1160 unique entities
problem: 2540 unique entities
treatment: 1559 unique entities

Filtered entity list saved to: ../Data/processed/llm/entity_only/train/all_entities_but_test_0001.txt
