In [3]:
from pathlib import Path
import pandas as pd
import spacy
from tqdm import tqdm

# Load the spaCy pipeline used for entity extraction in this notebook.
nlp_bio = spacy.load("en_ner_bc5cdr_md")

# Read the input CSV into a DataFrame.
DATA_PATH = Path("../data/processed/cleaned_transcriptions.csv")
df = pd.read_csv(DATA_PATH)

# Choose the column holding the free text.
# Keywords are tried in priority order; for each keyword the first column
# (in DataFrame order) whose name contains it, case-insensitively, wins.
preferred_keywords = ["transcript", "text", "content", "utterance", "description"]
text_col = next(
    (
        col
        for kw in preferred_keywords
        for col in df.columns
        if kw.lower() in col.lower()
    ),
    None,
)

# Fallback: no column name matched, so take the first object-dtype column.
if text_col is None:
    text_col = next((col for col in df.columns if df[col].dtype == object), None)

# Nothing usable at all: stop here rather than run NER on the wrong data.
if text_col is None:
    raise ValueError("No text-like column found in the CSV.")

print(f"Using column: {text_col} — {len(df)} rows")

# Extract entities from every row of the detected text column.
#
# Missing cells are mapped to "" before being handed to the pipeline: a bare
# `astype(str)` would turn NaN into the literal string "nan" and the model
# would then process that as if it were real transcript text.
NER_BATCH_SIZE = 20  # number of texts spaCy buffers per batch in `pipe`
texts = ["" if pd.isna(value) else str(value) for value in df[text_col]]

# One list of entity records per input row, in row order, so the result can
# be assigned straight back onto the DataFrame.
entities_all = []
for doc in tqdm(nlp_bio.pipe(texts, batch_size=NER_BATCH_SIZE), total=len(texts)):
    entities_all.append([
        # Character offsets are relative to the text passed to the pipeline.
        {"text": ent.text, "label": ent.label_, "start": ent.start_char, "end": ent.end_char}
        for ent in doc.ents
    ])
df["entities"] = entities_all

# Write the annotated table next to the input file instead of to a hard-coded,
# user-specific absolute path, so the notebook runs on any checkout.
# NOTE(review): assumes the directory of DATA_PATH is the same
# `data/processed` folder the previous absolute path pointed to — confirm.
OUTPUT_PATH = DATA_PATH.with_name("entities_extracted_bio.csv")
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# CSV has no list type: the "entities" column is written as the string form of
# each list, so anything reading this file back has to parse that column.
df.to_csv(OUTPUT_PATH, index=False)

print(f"Saved {len(df)} rows to: {OUTPUT_PATH.resolve()}")

Using column: transcription — 4999 rows


100%|██████████████████████████████████████████████████████████████████████████████| 4999/4999 [07:13<00:00, 11.54it/s]


Saved 4999 rows to: C:\Users\praty\Downloads\Mirai\data\processed\entities_extracted_bio.csv


In [15]:
#small tests

In [11]:
import ast #top diseases/chemicals
from collections import Counter

# Collect the surface text of every DISEASE / CHEMICAL mention across all rows.
# A cell may hold a real list (column built in this session) or its string
# form (column read back from CSV); the latter is parsed with literal_eval.
tracked_labels = ("DISEASE", "CHEMICAL")
all_ents = [
    ent["text"]
    for row_ents in df["entities"]
    for ent in (row_ents if isinstance(row_ents, list) else ast.literal_eval(row_ents))
    if ent["label"] in tracked_labels
]

# Show the 20 most frequent mentions.
print(Counter(all_ents).most_common(20))

[('pain', 3285), ('bleeding', 1315), ('tumor', 756), ('infection', 734), ('hypertension', 642), ('edema', 639), ('fracture', 634), ('tenderness', 586), ('p.o', 581), ('stenosis', 578), ('chest pain', 568), ('alcohol', 566), ('shortness of breath', 557), ('fever', 553), ('swelling', 546), ('lidocaine', 515), ('Marcaine', 509), ('cough', 421), ('creatinine', 404), ('diabetes', 384)]


In [9]:
# Count how many extracted entities carry each label (Disease vs chemicals).
type_counts = Counter()
for row_ents in df["entities"]:
    # Cells are either real lists or their string form; parse the latter.
    parsed = row_ents if isinstance(row_ents, list) else ast.literal_eval(row_ents)
    type_counts.update(ent["label"] for ent in parsed)
print(type_counts)

Counter({'DISEASE': 64667, 'CHEMICAL': 23197})


In [13]:
import ast
from collections import Counter

# Flatten every entity mention (any label) into one list of surface strings.
# Cells that are already lists are used directly; string cells are parsed.
all_entities = [
    ent["text"]
    for row_ents in df["entities"]
    for ent in (row_ents if isinstance(row_ents, list) else ast.literal_eval(row_ents))
]

# Frequency table over all mentions; print the 20 most common.
entity_freq = Counter(all_entities)
print(entity_freq.most_common(20))

[('pain', 3285), ('bleeding', 1315), ('tumor', 756), ('infection', 734), ('hypertension', 642), ('edema', 639), ('fracture', 634), ('tenderness', 586), ('p.o', 581), ('stenosis', 578), ('chest pain', 568), ('alcohol', 566), ('shortness of breath', 557), ('fever', 553), ('swelling', 546), ('lidocaine', 515), ('Marcaine', 509), ('cough', 421), ('creatinine', 404), ('diabetes', 384)]
