# Noisy ICD Code Labeling of Clinical Notes

In [None]:
# Load IPython's autoreload extension and select mode 2, which re-imports
# modified modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

## Preprocess the data

In [None]:
import json
from pathlib import Path

from sklearn.preprocessing import MultiLabelBinarizer

# Load the validation split: one JSON object per line, each providing a "text"
# and a "labels" entry.
# Decode explicitly as UTF-8 (the platform default encoding varies) and skip
# blank lines so that an empty file or a stray newline does not crash json.loads.
valid = [
    json.loads(line)
    for line in Path("../data/MIMIC-III/valid.jsonl").read_text(encoding="utf-8").split("\n")
    if line.strip()
]
texts = [example["text"] for example in valid]
labels = [example["labels"] for example in valid]

# Only the set of distinct labels is needed here, so fit the binarizer without
# building (and discarding) the indicator matrix. This also avoids rebinding
# `_`, which IPython uses for the last cell result.
mlb = MultiLabelBinarizer()
mlb.fit(labels)
whitelist = mlb.classes_

## (Noisy) Label the data

First, collect textual descriptions of each code from SNOMED CT.

In [None]:
import pandas as pd

from deep_patient_cohorts.common.utils import DISEASE_IDS

# Read just the concept identifier and its text definition from the
# tab-separated SNOMED CT release file, keeping both columns as strings.
descr_df = pd.read_csv(
    "../data/SnomedCT_USEditionRF2_PRODUCTION_20200901T120000Z/Full/Terminology/sct2_TextDefinition_Full-en_US1000124_20200901.txt",
    sep="\t",
    usecols=["conceptId", "term"],
    dtype=str,
)

# Map concept ID -> definition, restricted to the IDs listed in DISEASE_IDS.
# When a concept ID occurs on several rows, the last row wins.
descriptions = {
    concept_id: term
    for concept_id, term in descr_df[["conceptId", "term"]].itertuples(index=False, name=None)
    if concept_id in DISEASE_IDS
}

Then initialize the labeler and label the validation set.

In [None]:
from deep_patient_cohorts import NoisyLabeler
from deep_patient_cohorts.common.utils import DISEASE_IDS

# Build the labeler over the disease IDs, handing it the SNOMED descriptions
# collected in the previous step.
labeler = NoisyLabeler(labels=DISEASE_IDS, descriptions=descriptions)

In [None]:
# Apply the labeler to the validation texts. The result is consumed as a
# mapping further down (its values expose `.shape`); presumably one label
# matrix per disease ID — TODO confirm against NoisyLabeler.
noisy_labels = labeler(texts)

Finally, we can check the accuracy and abstain rate of each labeling function.

In [None]:
# Score the noisy labels against the reference labels loaded from the
# validation set. Both results are indexed per labeling function in the
# report that follows.
accuracy, abstain_rate = labeler.accuracy(noisy_labels, labels)

In [None]:
import numpy as np

# The number of labeling functions is read off the last axis of the first
# entry in noisy_labels.
first_label_matrix = list(noisy_labels.values())[0]
m = first_label_matrix.shape[-1]

# Print the mean accuracy and mean abstain rate of every labeling function
# as percentages.
for lf_index in range(m):
    accuracy_pct = np.mean(accuracy[lf_index]) * 100
    abstain_pct = np.mean(abstain_rate[lf_index]) * 100
    print(f"LF {lf_index}: Accuracy {accuracy_pct:.2f}%, Abstain rate {abstain_pct:.2f}%")