https://htmlpreview.github.io/?https://github.com/CogStack/MedCATtutorials/blob/main/notebooks/specialised/Preprocessing_SNOMED_CT.html

In [None]:
import logging
import pickle
import re
from pathlib import Path

import spacy
from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.cdb_maker import CDBMaker
from medcat.config import Config
from medcat.utils.normalizers import BasicSpellChecker
from medcat.utils.preprocess_snomed import Snomed
from tqdm.notebook import tqdm

from discharge_summaries.schemas.mimic import Record

In [None]:
# Root logging is configured once, at INFO, before anything else runs.
logging.basicConfig(level=logging.INFO)

# Locations are resolved relative to the parent of the current working
# directory: the SNOMED CT release folder and the folder holding models.
SNOMED_PATH = Path.cwd().parent.joinpath(
    "data", "SnomedCT_InternationalRF2_PRODUCTION_20230731T120000Z"
)
MODEL_DIR = Path.cwd().parent.joinpath("models")

Preprocessing SNOMED CT for MedCAT

In [None]:
# Point MedCAT's SNOMED preprocessor at the release directory.
# NOTE(review): `sowmed` looks like a typo for `snomed`; the name is kept
# because the following cell reads it.
sowmed = Snomed(str(SNOMED_PATH))
# NOTE(review): uk_ext is switched on although SNOMED_PATH names an
# International release — confirm the UK-extension handling is intended.
sowmed.uk_ext = True

In [None]:
# Build the concept table from the SNOMED release and preview it.
df = sowmed.to_concept_df()
df.head()

In [None]:
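# SNOMED CT hierarchy names noted for reference (purpose not recorded here —
# presumably the concept types of interest; confirm before relying on it):
# Body Structure
# Clinical Finding
# Environment and Geographical Location*
# environment / location
# event
# observable entity
# organism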

In [None]:
# Frequency of each distinct value in the description_type_ids column.
df.description_type_ids.value_counts()

In [None]:
# Last 100 rows whose description_type_ids equals "finding".
df[df.description_type_ids == "finding"].tail(100)

In [None]:
# Percentage of rows whose `name` value repeats one already present in the
# table (rows minus distinct names, relative to the number of rows).
row_count = len(df)
distinct_name_count = len(df["name"].unique())
(row_count - distinct_name_count) / row_count * 100

In [None]:
# Write the concept table to a CSV file inside the SNOMED release folder,
# without the index column. The resulting path is what the CDB build cell
# passes to CDBMaker.prepare_csvs.
snomed_path_df = SNOMED_PATH / "preprocessed_snomed.csv"
df.to_csv(snomed_path_df, index=False)

In [None]:
# Number of rows in the concept table.
len(df)

Create a MedCAT CDB using SNOMED CT release files

In [None]:
# MedCAT configuration shared by the CDB maker and, later, the CAT pipeline.
config = Config()
# Name of the spaCy model the MedCAT pipeline should load.
config.general["spacy_model"] = "en_core_sci_md"
config.general["log_level"] = logging.INFO
# Builder that turns the preprocessed CSV into a concept database.
maker = CDBMaker(config)

In [None]:
# Build the concept database from the preprocessed SNOMED CSV.
# NOTE(review): full_build=True presumably fills the optional CDB tables as
# well — confirm against the CDBMaker documentation.
cdb = maker.prepare_csvs([str(snomed_path_df)], full_build=True)

In [None]:
# Spot-check the freshly built CDB: the CUIs registered for the name
# "epilepsy", then the preferred name and all names stored for one CUI.
sample_cui = "84757009"
print(cdb.name2cuis["epilepsy"])
print(cdb.cui2preferred_name[sample_cui])
print(cdb.cui2names[sample_cui])

In [None]:
# Persist the concept database inside the SNOMED release folder.
cdb.save(SNOMED_PATH / "SNOMED_cdb.dat")

In [None]:
# Show the `general` section of the configuration attached to the CDB.
print(cdb.config.general)

In [None]:
# Reload the concept database from the same file the save cell writes to,
# rebinding `cdb` to the loaded instance.
cdb = CDB.load(SNOMED_PATH / "SNOMED_cdb.dat")

In [None]:
# Raise log verbosity to DEBUG for the cells that follow.
# logging.basicConfig() does nothing once the root logger already has
# handlers (the setup cell configured it at INFO), so on its own this call
# would leave the level unchanged. The root level is therefore also set
# explicitly; basicConfig is kept so a handler exists on a fresh kernel.
logging.basicConfig(level=logging.DEBUG)
logging.getLogger().setLevel(logging.DEBUG)

In [None]:
# Assemble the MedCAT annotator from the loaded CDB and the shared config.
cat = CAT(cdb=cdb, config=config)

In [None]:
# Inspect the components of the spaCy pipeline wrapped by the annotator.
cat.pipe.spacy_nlp.pipeline

In [None]:
# Create a spell checker backed by the CDB vocabulary (no separate data
# vocabulary) and register a token normalizer that uses it on the pipeline.
# NOTE(review): CAT may already register a token normalizer when it is
# constructed — confirm this call does not add a duplicate component.
spell_checker = BasicSpellChecker(cdb_vocab=cdb.vocab, config=config, data_vocab=None)
cat.pipe.add_token_normalizer(spell_checker=spell_checker, config=config)

In [None]:
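# Disabled experiment: attach an NER component to the pipeline by hand.
# NOTE: `NER` is not imported in this notebook; import it before re-enabling.
# ner = NER(cdb, config)
# cat.pipe.add_ner(ner)

# cat.pipe.spacy_nlp.pipeline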

In [None]:
# Look up the average confidence stored in the CDB for one specific CUI.
cdb.cui2average_confidence["704647008"]

In [None]:
# Input data: the data folder and the pickled ground-truth training records.
DATA_DIR = Path.cwd().parent.joinpath("data")
GT_DATA_PATH = DATA_DIR.joinpath("train.pkl")

# Location of a packaged MedCAT model archive.
MODEL_PATH = Path.cwd().parent.joinpath(
    "models", "mc_modelpack_snomed_int_16_mar_2022_25be3857ba34bdd5.zip"
)

In [None]:
# Read the pickled ground-truth records, then turn each one into a Record by
# passing its items as keyword arguments.
# NOTE: pickle executes arbitrary code on load — only use with trusted files.
with GT_DATA_PATH.open("rb") as pickle_file:
    raw_records = pickle.load(pickle_file)
gt_dataset = [Record(**fields) for fields in raw_records]
len(gt_dataset)

In [None]:
# Measure how often the annotator finds at least one entity in a paragraph
# heading of the ground-truth discharge summaries.
#   num_examples - headings evaluated (empty headings are skipped)
#   num_hits     - headings for which the returned doc has entities
#   misses       - lower-cased, stripped headings that produced no entity
num_hits = 0
num_examples = 0
misses = set()
# Compiled once outside the loops: finds the first ASCII letter so anything
# before it (numbering, bullets, punctuation) can be trimmed off a heading.
first_letter_re = re.compile(r"[a-zA-Z]")
for sample in tqdm(gt_dataset):
    for para in sample.discharge_summary.bhc_paragraphs:
        if not para.heading:
            continue
        heading_match = first_letter_re.search(para.heading)
        # Headings containing no letter at all are passed through unchanged.
        clean_heading = (
            para.heading[heading_match.start() :] if heading_match else para.heading
        )
        # Disabled experiment:
        # clean_heading = clean_heading.replace('/', ' ')
        doc = cat(clean_heading)
        # Disabled experiment:
        # if not(any(word in clean_heading.lower() for word in {"fen", "communication", "access", "code"})):
        num_examples += 1
        if doc.ents:
            num_hits += 1
        else:
            misses.add(clean_heading.lower().strip())

# Hit rate over all evaluated headings; 0.0 when nothing was evaluated, so an
# empty dataset no longer raises ZeroDivisionError.
num_hits / num_examples if num_examples else 0.0

In [None]:
# Headings (lower-cased, stripped) for which no entity was detected.
misses

In [None]:
# Confidence of the first entity on `doc`, i.e. whatever document was
# assigned to `doc` most recently.
# NOTE(review): presumably fails with IndexError when that document has no
# entities — confirm before relying on this cell.
doc._.ents[0]._.confidence

In [None]:
# Membership test: is "stroke" present in the CDB's `snames` collection?
"stroke" in cdb.snames

In [None]:
# Process the single word "stroke" with a stock spaCy model.
# NOTE(review): this rebinds `doc` (previously the last heading processed by
# `cat`) and loads en_core_web_sm rather than the en_core_sci_md model named
# in the MedCAT config — confirm that is intended for this check.
nlp = spacy.load("en_core_web_sm")
doc = nlp("stroke")

In [None]:
# Current value of the NER `min_name_len` setting.
config.ner["min_name_len"]

In [None]:
# ner(doc).ents

In [None]:
# Keep only the tokens not flagged `to_skip`, then collect two surface forms
# of the first remaining token: its `norm` extension value and its
# lower-cased text.
# NOTE(review): `_doc[0]` fails with IndexError when every token is skipped.
_doc = [tkn for tkn in doc if not tkn._.to_skip]
name_versions = [_doc[0]._.norm, _doc[0].lower_]
# Disabled check (needs an `ner` object, which no active cell creates):
# name_versions[1] in ner.cdb.name2cuis

In [None]:
# Display the two candidate forms collected for the first non-skipped token.
name_versions

In [None]:
# Run the annotator on a short sentence and show the entities on the result.
cat("This is about a stroke.").ents

In [None]:
# Sample sentence for a manual check; the statements after it in this cell
# are unfinished, so nothing visible consumes it yet.
test = "This is a test about heart attacks."



doc = 

snomed_ner