In [None]:
# first party
import ast

# third party
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
from torch.utils.data import DataLoader

# first party
from data import NBMEDataset, build_pseudo_data, load_training_data
from model import NBMEModel
from utils import (
    Configuration,
    build_pseudo_predictions,
    create_labels_for_scoring,
    get_character_probabilities,
    get_predictions,
    get_score,
    get_thresholded_sequences,
    training_function,
    validation_function,
)

In [None]:
# Project-wide settings shared by the dataset, model and scoring helpers.
config = Configuration()

# The "annotation" and "location" columns are stored in the CSV as
# Python-literal strings; parse them back into real Python objects.
data = pd.read_csv("./nbme_data/train_data_with_pseudo_labels.csv")
for literal_column in ("annotation", "location"):
    data[literal_column] = data[literal_column].apply(ast.literal_eval)

In [None]:
# Hold out fold 4 as the validation split.
valid_df = data.loc[data["fold_number"] == 4].reset_index(drop=True)
valid_patient_notes_texts = valid_df["pn_history"].values
valid_labels = valid_df["location"].apply(create_labels_for_scoring)

# Create the validation dataset
valid_dataset = NBMEDataset(valid_df, config)

# Validation loader. shuffle=False and drop_last=False keep one output per row
# of valid_df, in order; a later cell reshapes the predictions by len(valid_df).
valid_loader = DataLoader(
    valid_dataset, batch_size=4, shuffle=False, pin_memory=True, drop_last=False
)

# Build the model and restore the fine-tuned weights.
# map_location='cpu' lets the checkpoint load even when the device it was saved
# from is not present on this machine; the model is moved to its target device
# in a later cell.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
model = NBMEModel(config=config)
state_dict = torch.load(
    './models/deberta_v3_base_cpt_epoch_6.pt', map_location='cpu'
)
model.load_state_dict(state_dict)

# This notebook only runs inference, so switch off dropout and other
# train-time behaviour to keep the outputs deterministic.
model.eval()

In [None]:
# Get the probability outputs
# Prefer the second GPU (the original hard-coded choice), but fall back to the
# first GPU or the CPU so the notebook still runs on smaller machines.
if torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

model = model.to(device)
predictions, labels = validation_function(config, valid_loader, model, device)

In [None]:
import numpy as np

# Lay the flat model outputs out as one row per validation example and one
# column per position (config.max_length positions per example).
samples = len(valid_df)
token_grid_shape = (samples, config.max_length)
predictions = predictions.reshape(token_grid_shape)
labels = labels.reshape(token_grid_shape)

# Per-character probabilities for every patient note
character_probabilities = get_character_probabilities(
    valid_patient_notes_texts,
    predictions,
    config,
)

# Threshold the probabilities, then extract the predicted spans
results = get_thresholded_sequences(character_probabilities)
preds = get_predictions(results)

In [None]:
# First we want to see the actuals vs the predictions
from utils import pseudo_label
# Format every predicted span as a "start end" string, the same textual form
# the `location` column uses, and store the result next to the ground truth.
label_preds = [
    [f'{span[0]} {span[1]}' for span in predicted_spans]
    for predicted_spans in preds
]

valid_df['predicted_location'] = label_preds

In [None]:
import spacy
from spacy import displacy

# Pull out every (note, feature) row that belongs to one patient note.
pn_num = 16
df = valid_df.loc[valid_df['pn_num'] == pn_num].reset_index(drop=True)

# All selected rows share one note, so the first unique history is the note text.
text = df['pn_history'].unique()[0]

# Parallel per-feature lists: feature description, true spans, predicted spans.
features, locations, predicted_locations = (
    df[column].tolist()
    for column in ('feature_text', 'location', 'predicted_location')
)

In [None]:
# Show the full text of the selected patient note.
text

In [None]:
from pathlib import Path

# Build one displaCy entity per ground-truth character span, labelled with the
# feature the span was annotated for.
ents = []
for location, feature in zip(locations, features):
    for span_group in location:
        # A location entry is normally a single "start end" pair, but it may
        # also carry several pairs joined by ';'. Splitting on ';' first keeps
        # the single-pair case unchanged and stops the multi-pair case from
        # failing on unpacking.
        for span in span_group.split(';'):
            start, end = span.split()
            ents.append({'start': int(start), 'end': int(end), 'label': feature})

# Create the doc
# NOTE(review): the entities above are labelled with the feature text, so the
# "Annotation" colour key below presumably never matches a label -- confirm
# whether a per-feature colour mapping was intended.
colors = {
    "Annotation": "linear-gradient(90deg, darkviolet, palegreen)"
}
doc = {'text': text, 'ents': ents}
options = {'colors': colors, 'distance': 200, 'word_spacing': 60}

# NOTE(review): with jupyter=True displaCy appears to display the markup inline
# and may return None, in which case `svg` holds nothing to save. Confirm (or
# render with jupyter=False) before enabling the file write below.
svg = displacy.render(doc, manual=True, style='ent', options=options, jupyter=True, page=True)
# output_path = Path('./annotated_example.svg')
# output_path.open('w', encoding='utf-8').write(svg)

In [None]:
# Build one displaCy entity per predicted span; every prediction carries the
# same generic 'Annotation' label so all of them share a colour.
ents = [
    {'start': int(first), 'end': int(last), 'label': 'Annotation'}
    for location, _feature in zip(predicted_locations, features)
    for first, last in (span.split(' ') for span in location)
]

# Create the doc
doc = dict(text=text, ents=ents)
colors = dict(Annotation="lightblue")
options = dict(colors=colors)
displacy.render(doc, manual=True, style='ent', options=options)

In [None]:
from utils import get_score

In [None]:
# Spot-check the first validation example: ground-truth label next to the
# model's prediction for the same row.
valid_labels[0], preds[0]

In [None]:
# Score every validation example and keep the annotated / predicted text
# snippets next to the score for manual inspection.
#
# The notes get their own names (`note_texts` / `note_text`) so that the global
# `text` -- the single note rendered by the displaCy cells -- is not overwritten
# by this loop.
note_texts = valid_df['pn_history'].values

scores = []
for label, pred, note_text in zip(valid_labels.tolist(), preds, note_texts):
    # Examples without any ground-truth span get no score and empty snippets.
    score = None
    true_annotation = ''
    predicted_annotation = ''

    if len(label) != 0:
        score = get_score([label], [pred])
        # Slice the note with each (start, end) pair to recover the text.
        true_annotation = ', '.join(note_text[start:end] for start, end in label)
        predicted_annotation = ', '.join(note_text[start:end] for start, end in pred)

    scores.append((score, true_annotation, predicted_annotation))

In [None]:
# One row per validation example: score, annotated text, predicted text.
performance_columns = ['f1_score', 'true_annotation', 'predicted_annotation']
performance_df = pd.DataFrame(data=scores, columns=performance_columns)

In [None]:
# The 50 lowest-scoring examples (ascending sort); rows containing a missing
# value -- e.g. examples that received no score -- are dropped first.
performance_df.sort_values('f1_score').dropna().head(50)

In [None]:
# Inspect a single example by position.
# NOTE(review): 829 is a hard-coded row position; it raises IndexError if the
# validation split has fewer than 830 rows.
performance_df.iloc[829, :]

In [None]:
# matplotlib is otherwise only imported in a later cell, which makes this cell
# fail with NameError on a fresh top-to-bottom run; import it where it is used.
import matplotlib.pyplot as plt

plt.rcParams.update({'font.size': 20})

# Histogram of the per-example scores, with the mean marked in red.
ax = performance_df['f1_score'].hist(figsize=(10, 5), bins=20)
ax.axvline(performance_df['f1_score'].mean(), color='red')
ax.set_title('Distribution of F1 Scores on Holdout')
ax.set_xlabel('F1 Score')
ax.set_ylabel('Frequency');  # trailing ';' hides the Text(...) repr

In [None]:
# List the examples whose F1 score is below 0.90.
performance_df.loc[performance_df['f1_score'] < 0.90]

In [None]:
import transformers
import shap
import torch

In [None]:
from transformers import AutoTokenizer
# Tokenizer used below to encode inputs by hand and as the SHAP text masker.
# NOTE(review): presumably matches the backbone of the loaded checkpoint
# (its file name mentions deberta_v3_base) -- confirm against the model config.
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')

In [None]:
# Pick the single validation row to explain with SHAP.
index = 0  # 1885
text, feature_text, annotation = (
    valid_df[column].values[index]
    for column in ('pn_history', 'feature_text', 'annotation')
)

In [None]:
# Encode the (note, feature) pair, padded out to a fixed sequence length.
sequence_length = 466
inputs = tokenizer(
    text,
    feature_text,
    add_special_tokens=True,
    max_length=sequence_length,
    padding='max_length',
    return_offsets_mapping=False,
)

# Replace every encoded field, in place, with a (1, sequence_length) long tensor.
for field in list(inputs.keys()):
    inputs[field] = torch.tensor(inputs[field], dtype=torch.long).reshape(
        1, sequence_length
    )

In [None]:
# Single-element batch for SHAP: note text and feature text joined by the
# literal "[SEP]" marker, which the scoring function splits on again.
# Note: this rebinds `data`, which earlier held the training DataFrame.
data = [f'{text}[SEP]{feature_text}']
data

In [None]:
def make_annotation_scorer(annotation):
    """Build the scoring function that SHAP perturbs.

    Args:
        annotation: dict with two keys. ``'labels'`` holds the ground-truth
            labels passed to ``get_score``; ``'annotation'`` holds the
            annotation text(s), used only as the SHAP output names.

    Returns:
        A function ``f(notes)`` that takes an iterable of
        ``"<note>[SEP]<feature>"`` strings and returns a list with one
        single-element list ``[score]`` per input string. The function carries
        an ``output_names`` attribute.

    The returned function reads the notebook globals ``tokenizer``, ``model``,
    ``device`` and ``config``.
    """
    def f(notes):
        out = []
        # SHAP calls this with many masked variants of the original string.
        for n in notes:
            # Split the variant back into note and feature. partition() also
            # copes with a variant whose separator has been masked away (the
            # feature is then empty) instead of failing on unpacking.
            note_text, _, feature = n.partition("[SEP]")

            # Tokenise the *masked* note and its feature -- not the unmasked
            # globals -- so the model really sees each perturbation.
            # NOTE(review): 466 presumably equals config.max_length -- confirm.
            inputs = tokenizer(
                note_text,
                feature,
                add_special_tokens=True,
                max_length=466,
                # Guard the fixed-size reshape below against over-long variants.
                truncation='only_first',
                padding='max_length',
                return_offsets_mapping=False
            )

            for k, v in inputs.items():
                inputs[k] = torch.tensor(v, dtype=torch.long, device=device).reshape(1, 466)

            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                predictions = model(inputs)
            predictions = torch.sigmoid(predictions.flatten())
            predictions = predictions.reshape((1, 466)).detach().cpu()

            character_probabilities = get_character_probabilities(
                [note_text], predictions, config
            )
            results = get_thresholded_sequences(character_probabilities)
            preds = get_predictions(results)

            # One output per note: its score against the true labels.
            out.append([get_score(annotation['labels'], preds)])

        return out

    # NOTE(review): f yields exactly one score per note; if this list ever holds
    # more than one annotation text, the names will not line up with the single
    # output column -- confirm.
    f.output_names = annotation['annotation']
    return f

# Bundle the ground truth for the selected example: the labels used for
# scoring, plus the annotation text used to name the SHAP output.
annotations = dict(
    labels=[valid_labels[index]],
    annotation=annotation,
)
f_annotation = make_annotation_scorer(annotation=annotations)

In [None]:
import matplotlib.pyplot as plt

# Explain the scoring function: the tokenizer is passed as the masker that
# decides how the input string is perturbed, and `data` is the single
# "<note>[SEP]<feature>" string built earlier.
explainer_answers = shap.Explainer(f_annotation, tokenizer)
shap_values_answers = explainer_answers(data)

In [None]:
# Render the attributions over the input text.
shap.plots.text(shap_values_answers, display=True)