In [None]:
import re
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from ast import literal_eval
from matplotlib_venn import venn3
from matplotlib import pyplot as plt
from transformers import AutoTokenizer

warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the three competition tables, in this order: patient notes, feature
# definitions, and the annotated training rows.
patient_notes, features, train_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/nbme-score-clinical-patient-notes/patient_notes.csv",
        "../input/nbme-score-clinical-patient-notes/features.csv",
        "../input/nbme-score-clinical-patient-notes/train.csv",
    )
)

# patient_notes

In [None]:
# Print the (rows, columns) shape, then display two randomly drawn rows.
print(patient_notes.shape)
patient_notes.sample(n=2)

In [None]:
# pn_num - A unique identifier for each patient note.
# Number of distinct note identifiers in the table.
patient_notes["pn_num"].nunique()

In [None]:
# case_num - A unique identifier for the clinical case a patient note represents.
# The column is passed as the keyword `x`: positional data arguments are
# deprecated in seaborn (the FutureWarning is silenced in the import cell), and
# on seaborn >= 0.12 the first positional parameter is `data`, so the column
# would no longer be interpreted as the variable to count.
sns.countplot(x=patient_notes.case_num);

In [None]:
# Rows whose note text is shorter than 31 characters; every value is measured
# through str(), so non-string values are compared by their string form.
is_short_note = patient_notes["pn_history"].map(lambda note: len(str(note)) < 31)
patient_notes[is_short_note]

In [None]:
# Show the note text stored under index label 40711.
patient_notes.loc[40711, "pn_history"]

# features

In [None]:
# Print the (rows, columns) shape, then display two randomly drawn rows.
print(features.shape)
features.sample(n=2)

In [None]:
# feature_num - A unique identifier for each feature.
# Number of distinct feature identifiers in the table.
features["feature_num"].nunique()

In [None]:
# case_num - A unique identifier for each case.
# The column is passed as the keyword `x`: positional data arguments are
# deprecated in seaborn (the FutureWarning is silenced in the import cell), and
# on seaborn >= 0.12 the first positional parameter is `data`, so the column
# would no longer be interpreted as the variable to count.
sns.countplot(x=features.case_num);

In [None]:
# feature_text - A description of the feature.
# Show the description stored under index label 2.
features.loc[2, "feature_text"]

# train_data

In [None]:
# Print the (rows, columns) shape, then display two randomly drawn rows.
print(train_data.shape)
train_data.sample(n=2)

In [None]:
# Number of distinct values in each column of the training table.
train_data.nunique(axis=0)

* id - Unique identifier for each patient note / feature pair.
* pn_num - The patient note annotated in this row.
* feature_num - The feature annotated in this row.
* case_num - The case to which this patient note belongs.
* annotation - The text(s) within a patient note indicating a feature. A feature may be indicated multiple times within a single note.
* location - Character spans indicating the location of each annotation within the note.

In [None]:
# Number of training rows per clinical case.
# The column is passed as the keyword `x`: positional data arguments are
# deprecated in seaborn (the FutureWarning is silenced in the import cell), and
# on seaborn >= 0.12 the first positional parameter is `data`, so the column
# would no longer be interpreted as the variable to count.
sns.countplot(x=train_data.case_num);

# Example train_data

In [None]:
# At this point `annotation` still holds the raw text read from the CSV.
# A text length of exactly 2 is counted as "no annotation" (presumably the
# literal "[]" - confirm against the data); anything longer counts as annotated.
annotation_text_len = train_data["annotation"].str.len()
samples_with_annotation = int((annotation_text_len > 2).sum())
samples_without_annotation = int((annotation_text_len == 2).sum())
all_samples = len(train_data)
# Every row must fall into exactly one of the two groups.
assert samples_with_annotation + samples_without_annotation == all_samples

samples_with_annotation, samples_without_annotation

In [None]:
# Rows whose raw annotation text is longer than two characters.
train_data.loc[train_data["annotation"].str.len() > 2]

In [None]:
# Join one training row (id "00016_004") with its patient note and then with
# its feature description.
example = (
    train_data
    .query('id == "00016_004"')
    .merge(patient_notes, on=['case_num', 'pn_num'])
    .merge(features, on=['feature_num', 'case_num'])
)
example

In [None]:
def get_loc_ind(loc: str) -> list:
    """Extract the integer character offsets from a raw ``location`` string.

    The input is the text of a stringified list such as ``"['696 724']"``.
    Only the first ``[...]`` group is inspected. Quotes, whitespace, commas and
    semicolons are all treated as separators, so multi-span values such as
    ``"['85 99;126 138']"`` or ``"['1 2', '3 4']"`` are parsed instead of
    raising ``ValueError``.

    Args:
        loc: Raw location text.

    Returns:
        A flat list of ints in order of appearance (``[start, end]`` for a
        single span). ``[0, 0]`` when the brackets are empty or when the string
        contains no bracket group at all.

    Raises:
        ValueError: If a token inside the brackets is not an integer.
    """
    bracket_groups = re.findall(r'\[(.*?)\]', loc)
    if not bracket_groups:
        # No bracket group at all: same sentinel as for an empty list.
        return [0, 0]

    # Splitting on every separator at once leaves empty strings at the edges
    # (e.g. around the surrounding quotes), which are filtered out here.
    tokens = [tok for tok in re.split(r"[\s,;']+", bracket_groups[0]) if tok]
    if tokens:
        return [int(tok) for tok in tokens]
    return [0, 0]

In [None]:
# Offsets of the example row's annotation; the unpacking expects exactly two values.
# (`strart_index` keeps its existing spelling because the next cell reads it.)
strart_index, end_index = get_loc_ind(example["location"].iloc[0])

In [None]:
# Slice the example's note text with the two offsets.
example["pn_history"].iloc[0][strart_index:end_index]

In [None]:
# Frequency of each raw annotation value among rows with case_num 0 and feature_num 4.
is_case0_feature4 = (train_data["case_num"] == 0) & (train_data["feature_num"] == 4)
train_data.loc[is_case0_feature4, "annotation"].value_counts()

# Preprocessing

In [None]:
# `location` value of the row with index label 0, before parsing.
train_data.loc[0, "location"]

In [None]:
# `annotation` value of the row with index label 0, before parsing.
train_data.loc[0, "annotation"]

In [None]:
# Parse the textual `annotation` / `location` values into Python objects.
# Only values that are still strings are parsed, so re-running this cell is a
# no-op; calling literal_eval on an already-parsed list raises ValueError.
train_data['annotation'] = train_data.annotation.apply(
    lambda value: literal_eval(value) if isinstance(value, str) else value
)
train_data['location'] = train_data.location.apply(
    lambda value: literal_eval(value) if isinstance(value, str) else value
)

train_data.location[0]

In [None]:
# `annotation` value of the row with index label 0, after parsing.
train_data.loc[0, "annotation"]

In [None]:
# Attach the patient-note columns and then the feature columns to every
# training row; the joins are expected to leave the row count unchanged.
train_merge = (
    train_data
    .merge(patient_notes, on=['case_num', 'pn_num'])
    .merge(features, on=['feature_num', 'case_num'])
)
assert len(train_merge) == len(train_data)

print(train_merge.shape)
train_merge.sample(n=3)

In [None]:
# Lower-case both text columns with the vectorised string accessor.
# Unlike a per-element `x.lower()` call, `.str.lower()` leaves missing values
# as missing instead of raising AttributeError.
train_merge["feature_text"] = train_merge["feature_text"].str.lower()
train_merge["pn_history"] = train_merge["pn_history"].str.lower()
train_merge.sample(3)

In [None]:
def prepare_input(tokenizer, text, feature_text, max_length=416):
    """Tokenize a (text, feature_text) pair and convert every field to a tensor.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(text, feature_text, max_length=..., padding="max_length")``.
            It must return a mutable mapping from field name to a sequence of
            integers (presumably a Hugging Face tokenizer - confirm with callers).
        text: First sequence passed to the tokenizer.
        feature_text: Second sequence passed to the tokenizer.
        max_length: Padding length forwarded to the tokenizer. Defaults to 416,
            the value that was previously hard-coded.

    Returns:
        The mapping returned by the tokenizer, with each value replaced in
        place by a ``torch.long`` tensor.
    """
    # Imported here because the notebook's import cell does not import torch;
    # without this the function raised NameError on its first call.
    import torch

    # NOTE(review): truncation is not requested, so inputs longer than
    # `max_length` are presumably returned at their full length - confirm.
    inputs = tokenizer(text, feature_text,
                       max_length=max_length,
                       padding="max_length")
    # Snapshot the keys so the mapping can be updated while looping.
    for key in list(inputs.keys()):
        inputs[key] = torch.tensor(inputs[key], dtype=torch.long)
    return inputs

# Feature Words

In [None]:
# Distinct lower-cased pieces obtained by splitting every feature description on "-".
feature_set = {
    piece.lower()
    for description in features['feature_text']
    for piece in description.split('-')
}
len(feature_set)

# Tokenizer
* BERT (`bert-base-uncased`)
* RoBERTa-large (`roberta-large`)
* PubMedBERT (`microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract`)

In [None]:
# Identifiers of the pretrained tokenizers compared in the cells below.
tokenizers = [
    "bert-base-uncased",
    'roberta-large',
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
]

In [None]:
def token(tokenizer, feature_set):
    """Load a pretrained tokenizer and count the feature words absent from its vocabulary.

    Prints the tokenizer's vocabulary size and the number of missing feature
    words as a side effect.

    Args:
        tokenizer: Identifier passed to ``AutoTokenizer.from_pretrained``.
        feature_set: Set of feature words to look up in the vocabulary.

    Returns:
        A tuple ``(vocab_size, lower-cased vocabulary set, number of feature
        words not found in that set)``.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    voc_size_model = loaded_tokenizer.vocab_size
    print(voc_size_model)

    # Vocabulary entries are compared verbatim apart from lower-casing.
    # NOTE(review): tokenizer-specific markers inside entries are not stripped
    # before the comparison - confirm that is intended.
    voc = {entry.lower() for entry in loaded_tokenizer.get_vocab().keys()}

    missing_count = len(feature_set - voc)
    print(missing_count)

    return voc_size_model, voc, missing_count

# Compare

In [None]:
# Run the vocabulary comparison once per tokenizer; results keep the order of `tokenizers`.
res = [token(tok, feature_set) for tok in tqdm(tokenizers)]

assert len(res) == len(tokenizers)

In [None]:
# First field of each of the first three result tuples: the vocabulary size.
voc_size_berta, voc_size_roberta, voc_size_pubmedbert = (entry[0] for entry in res[:3])

# Last field of each tuple is the number of feature words missing from the
# vocabulary; express it as a percentage of all feature words.
undetected = np.array([entry[-1] for entry in res[:3]]) / len(feature_set) * 100

In [None]:
# Two side-by-side bar charts: vocabulary size (left) and the percentage of
# feature words found in each vocabulary (right).
fig, axes = plt.subplots(ncols=2, figsize=(12, 6))
labels = ['Bert', 'RoBerta_Large', 'PubMedBert']
bar_positions = [1, 2, 3]
bar_colors = ['#E7B8B9', '#BBDCC2', '#B7C6DF']

panels = [
    (axes[0],
     [voc_size_berta, voc_size_roberta, voc_size_pubmedbert],
     'vocabulary size',
     'Vocabulary size among 3 tokenizers'),
    (axes[1],
     100 - undetected,
     'detected feature words (%)',
     'Detected feature words among 3 tokenizers'),
]
for panel_ax, bar_heights, y_label, panel_title in panels:
    panel_ax.bar(bar_positions, bar_heights, tick_label=labels, color=bar_colors)
    panel_ax.set_ylabel(y_label, fontsize=15)
    panel_ax.set_title(panel_title, fontsize=15, fontweight='bold')
    panel_ax.tick_params(axis='x', labelsize=15)
fig.tight_layout()
plt.show()

In [None]:
# Venn diagram of the three lower-cased vocabularies (second field of each result tuple).
fig, ax = plt.subplots(figsize=(6, 6))

labels = ['Bert', 'RoBerta_Large', 'PubMedBert']
vocab_sets = [entry[1] for entry in res[:3]]

venn3(vocab_sets, set_labels=labels, ax=ax)
fig.tight_layout()
plt.show()

# Conclusion
* PubMedBERT has the smallest vocabulary, but it detects the most technical terms;
* PubMedBERT is likely to be more effective on biomedical notes because its vocabulary contains more technical terms;
* However, because PubMedBERT's vocabulary is smaller, it may be weaker at capturing the overall meaning of the text.