# Noisy ICD Code Labeling of Clinical Notes

In [None]:
%load_ext autoreload
%autoreload 2

import json
from pathlib import Path
from typing import Dict, List

import numpy as np
from deep_patient_cohorts import NoisyLabeler
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
def print_statistics(noisy_labels: Dict[str, np.ndarray], gold_labels: List[List[str]]) -> None:
    """Print the accuracy and abstain rate of each labelling function in `noisy_labels`,
    based on the given `gold_labels`.

    Both metrics are averaged over the classes found in `gold_labels`. A class for which a
    labelling function made no prediction is left out of that function's accuracy average,
    and a class that is missing from `noisy_labels` counts as a 100% abstain rate for every
    labelling function. A metric with nothing to average is reported as `nan`.

    Args:
        noisy_labels: Maps a class to an array of shape (no. of examples, no. of lfs) holding
            the vote of each labelling function, where 0 means the function abstained.
        gold_labels: The true classes of each example, in the same order as the rows of the
            arrays in `noisy_labels`.
    """
    if not noisy_labels:
        # Without a single label matrix the number of labelling functions is unknown.
        print("No noisy labels to evaluate.")
        return

    # Binarize the labels, setting the negative (absence) of a class to
    # -1 to match FlyingSquid's convention.
    mlb = MultiLabelBinarizer()
    gold = mlb.fit_transform(gold_labels)
    gold = np.where(gold == 0, -1, 1)

    # (no. of examples, no. of lfs)
    n, m = next(iter(noisy_labels.values())).shape

    accuracy = [[] for _ in range(m)]
    abstain_rate = [[] for _ in range(m)]
    for i, class_ in enumerate(mlb.classes_):
        votes = noisy_labels.get(class_)
        if votes is None:
            # No labelling function covers this class, so all of them abstain on it.
            for rates in abstain_rate:
                rates.append(1)
            continue

        for j in range(m):
            lf_votes = votes[:, j]
            num_predictions = np.sum(lf_votes != 0)
            if num_predictions != 0:
                # Gold labels are never 0, so abstentions cannot be counted as correct.
                accuracy[j].append(np.sum(lf_votes == gold[:, i]) / num_predictions)
            abstain_rate[j].append(np.sum(lf_votes == 0) / n)

    for j in range(m):
        # `np.mean` of an empty list warns before returning nan; report nan directly instead.
        mean_accuracy = np.mean(accuracy[j]) if accuracy[j] else float("nan")
        mean_abstain_rate = np.mean(abstain_rate[j]) if abstain_rate[j] else float("nan")
        print(f"LF {j}: Accuracy {mean_accuracy * 100:.2f}%, Abstain rate {mean_abstain_rate * 100:.2f}%")

## Preprocess the data

In [None]:
# Load the validation split, which stores one JSON object per line. JSON text is UTF-8, so
# decode it explicitly instead of relying on the platform's default encoding.
valid = [
    json.loads(line)
    for line in Path("../data/MIMIC-III/valid.jsonl").read_text(encoding="utf-8").strip().split("\n")
]
# Only the first 500 examples are used in this notebook.
texts = [example["text"] for example in valid[:500]]
labels = [example["labels"] for example in valid[:500]]
# Fit a binarizer purely to collect the distinct labels that occur in the sample; the
# binarized matrix itself is discarded.
mlb = MultiLabelBinarizer()
_ = mlb.fit_transform(labels)
whitelist = mlb.classes_

## (Noisy) Label the data

In [None]:
# Build the noisy labeler from the ICD diagnosis descriptions file.
# NOTE(review): `whitelist` presumably restricts labeling to the given ICD codes — confirm
# against the NoisyLabeler implementation.
labeler = NoisyLabeler(descriptions="../data/D_ICD_DIAGNOSES.csv", whitelist=whitelist)

In [None]:
# Run the labeler over the sampled note texts. The result is passed to `print_statistics`,
# which reads it as a dict mapping each class to a (no. of examples, no. of lfs) array.
noisy_labels = labeler(texts)

In [None]:
# Report the accuracy and abstain rate of each labelling function against the gold labels.
print_statistics(noisy_labels=noisy_labels, gold_labels=labels)