- **OBJECTIVES**
    - NBME texts probably contain many biomedical technical terms.
    - Microsoft researchers recently showed that, for domains with abundant unlabeled text such as biomedicine, pretraining a language model from scratch on in-domain text outperforms continuing pretraining from a general-domain language model. [PubMedBert](https://huggingface.co/microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract)
    - The aim of this notebook is to roughly explore how many technical terms in the NBME features cannot be detected by various tokenizers, especially Roberta, DeBerta_V2 and PubMedBert.

**VER3**
- added DeBerta.
- explored how many tokens the 'location' ground truths are divided into by the various tokenizers.

# Imports

In [None]:
!pip install transformers==4.16.2

In [None]:
!pip install tokenizers==0.11.0

In [None]:
# https://www.kaggle.com/nbroad/deberta-v2-3-fast-tokenizer
# Patch the installed `transformers` package with the DeBERTa-v2/v3 fast
# tokenizer files shipped in the Kaggle dataset above.
import importlib.util
import shutil
from pathlib import Path

# Locate the installed package instead of hard-coding the interpreter version
# in the path. find_spec() on a top-level name does not import the package, so
# the files are replaced before `transformers` is imported further down.
transformers_spec = importlib.util.find_spec('transformers')
if transformers_spec is None or transformers_spec.origin is None:
    raise ModuleNotFoundError(
        'transformers is not installed; run the pip install cells first.'
    )
# `origin` is the package's __init__.py; its parent is the package directory.
transformers_path = Path(transformers_spec.origin).parent
input_dir = Path('../input/deberta-v2-3-fast-tokenizer')

# Replace the slow->fast conversion module at the package root.
convert_file = input_dir / 'convert_slow_tokenizer.py'
conversion_path = transformers_path / convert_file.name
if conversion_path.exists():
    print('previous convert file exists. will be unlinked.')
    conversion_path.unlink()
shutil.copy(convert_file, transformers_path)

# Replace the DeBERTa-v2 tokenizer modules (slow and fast variants).
deberta_v2_path = transformers_path / 'models' / 'deberta_v2'
for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
    filepath = deberta_v2_path / filename
    if filepath.exists():
        print(f'previous {filename} exists. will be unlinked.')
        filepath.unlink()
    shutil.copy(input_dir / filename, filepath)

In [None]:
import os
import sys
import glob
import random
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
import ast

from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.model_selection import train_test_split

import transformers
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
from transformers import AutoConfig, AutoModel, AutoTokenizer

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib_venn import venn3, venn2
import seaborn as sns
# Switch on seaborn's plot styling for the figures below.
sns.set()

# Report the library versions this notebook runs with.
for lib_name, lib in (('TF', tf), ('transformers', transformers)):
    print(f'{lib_name} version,', lib.__version__)

# Configuration

In [None]:
# Seed handed to seed_everything() in the next cell.
seed = 42
# Maximum sequence length in tokens. No code cell visible here reads it; it is
# only quoted in the closing notes (4 / MAX_LEN).
MAX_LEN = 512

In [None]:
def seed_everything(seed=42):
    """Seed the NumPy, Python and TensorFlow random number generators.

    Also writes the seed to the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed shared by every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (np.random.seed, random.seed, tf.random.set_seed):
        set_seed(seed)


seed_everything(seed)

# Load Train

In [None]:
# Load the training annotations.
train = pd.read_csv('../input/nbme-score-clinical-patient-notes/train.csv')
# Both columns are stored as stringified Python literals; parse them back
# into Python objects.
for literal_col in ('annotation', 'location'):
    train[literal_col] = train[literal_col].apply(ast.literal_eval)
train

In [None]:
# Feature table; provides the 'feature_text' column that is split into feature
# words below and is joined onto `train` via ('feature_num', 'case_num').
features = pd.read_csv('../input/nbme-score-clinical-patient-notes/features.csv')
features

In [None]:
# Patient notes table; joined onto `train` via ('pn_num', 'case_num') below.
patient_notes = pd.read_csv('../input/nbme-score-clinical-patient-notes/patient_notes.csv')
patient_notes

- Merge

In [None]:
# Attach the feature and patient-note columns to every training row
# (left joins, so all rows of `train` are kept).
train = (
    train
    .merge(features, on=['feature_num', 'case_num'], how='left')
    .merge(patient_notes, on=['pn_num', 'case_num'], how='left')
)
# Number of annotation strings recorded for each row.
train['annotation_length'] = train['annotation'].map(len)
train

# Feature Words

In [None]:
# Unique lower-cased words of all feature texts; feature texts use '-' as the
# word separator.
feature_set = {
    word.lower()
    for feature_text in features['feature_text']
    for word in feature_text.split('-')
}

In [None]:
# Show every feature word, each left-justified to a 15-character column.
print(''.join(word.ljust(15) for word in feature_set), end='')

In [None]:
# Number of distinct feature words.
len(feature_set)

In [None]:
# Whitespace-separated word count of every annotation string, flattened over
# all training rows in row order (rows without annotations contribute nothing).
num_words = np.array([
    len(anno.split())
    for annotations in train['annotation']
    for anno in annotations
])

# Tokenizer

- Deberta_v2

In [None]:
# Fast DeBERTa-v2 tokenizer, using the tokenizer class patched into
# `transformers` at the top of the notebook.
deberta_v2_tokenizer = DebertaV2TokenizerFast.from_pretrained('kamalkraj/deberta-v2-xlarge')

In [None]:
# Vocabulary size reported by the tokenizer; plotted in the comparison section.
voc_size_deberta_v2 = deberta_v2_tokenizer.vocab_size
print(voc_size_deberta_v2)

In [None]:
# Lower-cased DeBERTa-v2 vocabulary as a set of plain word strings.
# SentencePiece vocabularies prefix word-initial pieces with '▁' (U+2581), so a
# whole word is stored as '▁word'. Strip that marker (before lower-casing) so
# whole-word entries can match the bare words in `feature_set`; without this,
# words that are in the vocabulary get reported as missing.
# NOTE: this lowers the "undetected" counts derived from this set, so the
# written conclusions should be re-checked after re-running.
voc_deberta_v2 = {
    token.lstrip('\u2581').lower()
    for token in deberta_v2_tokenizer.get_vocab()
}

In [None]:
# Feature words that have no entry in the DeBERTa-v2 vocabulary set, printed
# in 15-character columns.
diff_deberta_v2 = feature_set.difference(voc_deberta_v2)
print(''.join(word.ljust(15) for word in diff_deberta_v2), end='')

In [None]:
# Number of feature words not found in the DeBERTa-v2 vocabulary set.
len(diff_deberta_v2)

In [None]:
# Token count of every annotation string under the DeBERTa-v2 tokenizer
# (special tokens excluded), in the same order as `num_words`.
num_tokens_deberta_v2 = [
    len(deberta_v2_tokenizer(anno, add_special_tokens=False)['input_ids'])
    for annotations in train['annotation']
    for anno in annotations
]

In [None]:
# Convert to an array so the counts can be divided element-wise by `num_words`.
num_tokens_deberta_v2 = np.array(num_tokens_deberta_v2)
# Tokens per whitespace-separated word, one value per annotation.
devided_fold_deberta_v2 = num_tokens_deberta_v2 / num_words

- deberta

In [None]:
# DeBERTa (v1) tokenizer loaded through AutoTokenizer.
deberta_tokenizer = AutoTokenizer.from_pretrained('kamalkraj/deberta-base')

In [None]:
# Example: tokenize the first annotation of the first training row, without
# special tokens, to inspect the tokenizer output.
deberta_tokenizer(train.iloc[0]['annotation'][0], add_special_tokens=False)

In [None]:
# Vocabulary size reported by the tokenizer; plotted in the comparison section.
voc_size_deberta = deberta_tokenizer.vocab_size
print(voc_size_deberta)

In [None]:
# Lower-cased DeBERTa vocabulary as a set of plain word strings.
# Byte-level BPE vocabularies encode the leading space of a word as 'Ġ'
# (U+0120), so a whole word is usually stored as 'Ġword'. Strip that marker so
# whole-word entries can match the bare words in `feature_set`. The strip must
# happen before lower-casing, because 'Ġ'.lower() is 'ġ'.
# NOTE: this lowers the "undetected" counts derived from this set, so the
# written conclusions should be re-checked after re-running.
voc_deberta = {
    token.lstrip('\u0120').lower()
    for token in deberta_tokenizer.get_vocab()
}

In [None]:
# Feature words that have no entry in the DeBERTa vocabulary set, printed in
# 15-character columns.
diff_deberta = feature_set.difference(voc_deberta)
print(''.join(word.ljust(15) for word in diff_deberta), end='')

In [None]:
# Number of feature words not found in the DeBERTa vocabulary set.
len(diff_deberta)

In [None]:
# Token count of every annotation string under the DeBERTa tokenizer
# (special tokens excluded), in the same order as `num_words`.
num_tokens_deberta = [
    len(deberta_tokenizer(anno, add_special_tokens=False)['input_ids'])
    for annotations in train['annotation']
    for anno in annotations
]

In [None]:
# Convert to an array so the counts can be divided element-wise by `num_words`.
num_tokens_deberta = np.array(num_tokens_deberta)
# Tokens per whitespace-separated word, one value per annotation.
devided_fold_deberta = num_tokens_deberta / num_words

- Roberta

In [None]:
# RoBERTa-large tokenizer loaded through AutoTokenizer.
roberta_tokenizer = AutoTokenizer.from_pretrained('roberta-large')

In [None]:
# Vocabulary size reported by the tokenizer; plotted in the comparison section.
voc_size_roberta = roberta_tokenizer.vocab_size
print(voc_size_roberta)

In [None]:
# Lower-cased RoBERTa vocabulary as a set of plain word strings.
# Byte-level BPE vocabularies encode the leading space of a word as 'Ġ'
# (U+0120), so a whole word is usually stored as 'Ġword'. Strip that marker so
# whole-word entries can match the bare words in `feature_set`. The strip must
# happen before lower-casing, because 'Ġ'.lower() is 'ġ'.
# NOTE: this lowers the "undetected" counts derived from this set, so the
# written conclusions should be re-checked after re-running.
voc_roberta = {
    token.lstrip('\u0120').lower()
    for token in roberta_tokenizer.get_vocab()
}

In [None]:
# Feature words that have no entry in the RoBERTa vocabulary set, printed in
# 15-character columns.
diff_roberta = feature_set.difference(voc_roberta)
print(''.join(word.ljust(15) for word in diff_roberta), end='')

In [None]:
# Number of feature words not found in the RoBERTa vocabulary set.
len(diff_roberta)

In [None]:
# Token count of every annotation string under the RoBERTa tokenizer
# (special tokens excluded), in the same order as `num_words`.
num_tokens_roberta = [
    len(roberta_tokenizer(anno, add_special_tokens=False)['input_ids'])
    for annotations in train['annotation']
    for anno in annotations
]

In [None]:
# Convert to an array so the counts can be divided element-wise by `num_words`.
num_tokens_roberta = np.array(num_tokens_roberta)
# Tokens per whitespace-separated word, one value per annotation.
devided_fold_roberta = num_tokens_roberta / num_words

- PubMedBert

In [None]:
# PubMedBERT (uncased, abstracts-only checkpoint) tokenizer loaded through
# AutoTokenizer.
pubmedbert_tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

In [None]:
# Vocabulary size reported by the tokenizer; plotted in the comparison section.
voc_size_pubmedbert = pubmedbert_tokenizer.vocab_size
print(voc_size_pubmedbert)

In [None]:
# Lower-cased PubMedBERT vocabulary entries as a set, for comparison against
# `feature_set`.
voc_pubmedbert = {
    token.lower()
    for token in pubmedbert_tokenizer.get_vocab()
}

In [None]:
# Feature words that have no entry in the PubMedBERT vocabulary set, printed
# in 15-character columns.
diff_pubmedbert = feature_set.difference(voc_pubmedbert)
print(''.join(word.ljust(15) for word in diff_pubmedbert), end='')

In [None]:
# Number of feature words not found in the PubMedBERT vocabulary set.
len(diff_pubmedbert)

In [None]:
# Token count of every annotation string under the PubMedBERT tokenizer
# (special tokens excluded), in the same order as `num_words`.
num_tokens_pubmedbert = [
    len(pubmedbert_tokenizer(anno, add_special_tokens=False)['input_ids'])
    for annotations in train['annotation']
    for anno in annotations
]

In [None]:
# Convert to an array so the counts can be divided element-wise by `num_words`.
num_tokens_pubmedbert = np.array(num_tokens_pubmedbert)
# Tokens per whitespace-separated word, one value per annotation.
devided_fold_pubmedbert = num_tokens_pubmedbert / num_words

- Comparison

In [None]:
# Percentage of feature words missing from each vocabulary set, in the order
# DeBERTa-v2, DeBERTa, RoBERTa, PubMedBERT.
missing_counts = [
    len(diff)
    for diff in (diff_deberta_v2, diff_deberta, diff_roberta, diff_pubmedbert)
]
undetected = np.array(missing_counts) / len(feature_set) * 100

In [None]:
# Left panel: vocabulary size per tokenizer.
# Right panel: percentage of feature words found in each vocabulary set.
fig, axes = plt.subplots(ncols=2, figsize=(12, 6))
labels = ['DeBerta_V2', 'DeBerta', 'RoBerta', 'PubMedBert']
bar_colors = ['#E7B8B9', '#BBDCC2', '#BBDCC2', '#B7C6DF']
panels = [
    (
        [voc_size_deberta_v2, voc_size_deberta, voc_size_roberta, voc_size_pubmedbert],
        'vocabulary size',
        'Vocabulary size among 4 tokenizers',
    ),
    (
        100-undetected,
        'detected feature words (%)',
        'Detected feature words among 4 tokenizers',
    ),
]
for ax, (heights, ylabel, title) in zip(axes, panels):
    ax.bar([1, 2, 3, 4], heights, tick_label=labels, color=bar_colors)
    ax.set_ylabel(ylabel, fontsize=15)
    ax.set_title(title, fontsize=15, fontweight='bold')
    ax.tick_params(axis='x', labelsize=15)
fig.tight_layout()
plt.show()

- Deberta V2 has the largest vocabulary, but misses very important technical terms like asthma, bowel, hallucination, infarction, insomnia, nervous, stool, urinary etc.
- PubMedBert has the smallest vocabulary, but detects the most technical terms.

In [None]:
# Left panel: mean number of tokens per annotation for each tokenizer.
# Right panel: mean tokens-per-word ratio for each tokenizer.
fig, axes = plt.subplots(ncols=2, figsize=(12, 6))
labels = ['DeBerta_V2', 'DeBerta', 'RoBerta', 'PubMedBert']
bar_colors = ['#E7B8B9', '#BBDCC2', '#BBDCC2', '#B7C6DF']
panels = [
    (
        [num_tokens_deberta_v2.mean(), num_tokens_deberta.mean(), num_tokens_roberta.mean(), num_tokens_pubmedbert.mean()],
        'length of ground truth tokens',
        'Length of Annotation tokens among 4 tokenizers',
    ),
    (
        [devided_fold_deberta_v2.mean(), devided_fold_deberta.mean(), devided_fold_roberta.mean(), devided_fold_pubmedbert.mean()],
        'devided rate (tokens/words)',
        'Tokens/Words rate among 4 tokenizers',
    ),
]
for ax, (heights, ylabel, title) in zip(axes, panels):
    ax.bar([1, 2, 3, 4], heights, tick_label=labels, color=bar_colors)
    ax.set_ylabel(ylabel, fontsize=15)
    ax.set_title(title, fontsize=15, fontweight='bold')
    ax.tick_params(axis='x', labelsize=15)
fig.tight_layout()
plt.show()

- The annotation length is about 4 tokens.
- If we set MAX_LEN=512 and tackle this competition as an NER task, only about 0.78% (4/MAX_LEN) of the tokens in each patient note carry a positive label.
- The tokens/words rate of PubMedBert was higher than I expected.

In [None]:
# Venn diagram of the overlap between the lower-cased DeBERTa, RoBERTa and
# PubMedBERT vocabulary sets.
fig = plt.figure(figsize=(6,6))
ax = fig.add_subplot()
# venn3 takes exactly three sets, so name each circle after the set it shows.
# Reusing the four-entry `labels` list from the bar charts shifted every label
# by one ('DeBerta_V2' on the DeBERTa set, 'DeBerta' on RoBERTa, and so on).
venn_labels = ('DeBerta', 'RoBerta', 'PubMedBert')
venn3([voc_deberta, voc_roberta, voc_pubmedbert], set_labels=venn_labels, ax=ax)
fig.tight_layout()
plt.show()

- PubMedBert can be very effective on biomedical notes because its vocabulary contains many technical terms.
- But since PubMedBert has a smaller vocabulary, it may be poor at capturing the meaning of the whole text.
- Should we include PubMedBert when ensembling?

- thanks