I hope this notebook is helpful.

- By default, the transformers version is 4.5.1 when TPU acceleration is turned on.
- This notebook itself works successfully even with the old version.
- But upgrading it to 4.16.2 is very important because of how whitespace is treated, as mentioned in this [discussion](https://www.kaggle.com/c/nbme-score-clinical-patient-notes/discussion/310897).

In [None]:
!pip install transformers==4.16.2

In [None]:
!pip install tokenizers==0.11.0

# Imports

In [None]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import typing
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import tensorflow as tf
import tensorflow_addons as tfa

import transformers
from transformers import *

# Report the library versions actually loaded in this kernel.
print('TF version,', tf.__version__)
print('transformers version,', transformers.__version__)

# Configuration

In [None]:
# Hugging Face checkpoints, one per training section below:
# [0] RoBERTa, [1] PubMedBERT, [2] BioBERT.
MODEL_NAME = ['roberta-large', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'dmis-lab/biobert-large-cased-v1.1']
MAX_LEN = 512  # padded token length; also the sequence length of the Keras Input layers

folds = 5  # number of GroupKFold splits
epochs = 7  # epochs per fold; also the default schedule horizon of get_lr
seed = 660  # value passed to seed_everything
lr = 2e-5  # learning rate given to the AdamW optimizer
min_lr = 1e-6  # NOTE(review): not referenced anywhere else in the visible notebook
batch_size_single = 4  # per-replica batch size; multiplied by the replica count on TPU
num_warmup_steps = 0  # default warm-up length (counted in epochs) used by get_lr
num_cycles = 0.5  # cosine cycles over the whole schedule, read by get_lr

In [None]:
def hardware_config() -> tuple:
    """Pick a distribution strategy and derive the global batch size.

    A TPU connection is attempted first. If anything in that attempt fails,
    the current default strategy is used instead and the reason is printed
    rather than silently discarded.

    Returns:
        tuple: ``(strategy, tpu, batch_size)`` where ``tpu`` is the TPU cluster
        resolver (``None`` when no TPU was set up) and ``batch_size`` is
        ``batch_size_single`` multiplied by the number of replicas in sync.
    """
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
    except Exception as err:
        # Deliberate best-effort fallback: any failure means "no usable TPU".
        print('TPU not available, using the default strategy:', repr(err))
        tpu = None
        strategy = tf.distribute.get_strategy()

    # Same formula on every hardware, so the fallback follows the config
    # constant instead of a separate hard-coded literal.
    batch_size = batch_size_single * strategy.num_replicas_in_sync

    return strategy, tpu, batch_size

# Resolve the hardware once; `batch_size` is the module-level value read by get_dataset.
strategy, TPU, batch_size = hardware_config()
print('batch size, ', batch_size)

In [None]:
def seed_everything(seed=42):
    """Seed every random number source used by this notebook.

    Sets ``PYTHONHASHSEED`` and then seeds NumPy, Python's ``random`` module
    and TensorFlow with the same value.

    Args:
        seed (int): Seed value applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (np.random.seed, random.seed, tf.random.set_seed):
        seeder(seed)


seed_everything(seed)

# Load Train

In [None]:
def process_feature_text(text):
    """Normalise a feature description into plain words.

    Fixes the ``I-year`` typo, spells out ``-OR-`` as `` or `` and finally
    turns every remaining hyphen into a space.

    Args:
        text (str): Raw feature text.

    Returns:
        str: Cleaned feature text.
    """
    # Order matters: the generic hyphen rule must run after the specific ones.
    substitutions = (
        ('I-year', '1-year'),
        ('-OR-', " or "),
        ('-', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def clean_spaces(txt):
    """Replace each newline, tab and carriage return with a single space.

    Every matched character becomes exactly one space, so the length of the
    string is unchanged; runs of whitespace are not collapsed.

    Args:
        txt (str): Input text.

    Returns:
        str: Text with the three control characters replaced by spaces.
    """
    return re.sub('[\n\t\r]', ' ', txt)

In [None]:
# https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train
# Load the three competition tables. `annotation` and `location` are stored in the
# CSV as string representations of Python lists, so they are parsed back with
# ast.literal_eval.
train = pd.read_csv('../input/nbme-score-clinical-patient-notes/train.csv')
train['annotation'] = train['annotation'].apply(ast.literal_eval)
train['location'] = train['location'].apply(ast.literal_eval)
features = pd.read_csv('../input/nbme-score-clinical-patient-notes/features.csv')
# Overwrite the feature text of row 27 with a fixed spelling.
features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"
patient_notes = pd.read_csv('../input/nbme-score-clinical-patient-notes/patient_notes.csv')

# Quick look at the shape and first rows of each table.
print(f"train.shape: {train.shape}")
display(train.head(10))
print(f"features.shape: {features.shape}")
display(features.head(10))
print(f"patient_notes.shape: {patient_notes.shape}")
display(patient_notes.head(10))

In [None]:
# Attach the feature description and the patient-note columns to every training row.
# Left joins keep every row of `train`.
train = train.merge(features, on=['feature_num', 'case_num'], how='left')
train = train.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
display(train.head())

In [None]:
# https://www.kaggle.com/code/yasufuminakama/nbme-deberta-base-baseline-train/notebook
# incorrect annotation
# Manual corrections taken from the notebook linked above. Each pair of statements
# overwrites the `annotation` and `location` of one row, addressed by its index label.
# Location strings use "start end" character offsets; ';' separates the pieces of a
# discontinuous span.
# NOTE(review): the assigned values are nested lists; how pandas stores a nested list
# assigned to a single cell through .loc should be confirmed, because later code
# expects `location` to be a flat list of strings.
train.loc[338, 'annotation'] = ast.literal_eval('[["father heart attack"]]')
train.loc[338, 'location'] = ast.literal_eval('[["764 783"]]')

train.loc[621, 'annotation'] = ast.literal_eval('[["for the last 2-3 months"]]')
train.loc[621, 'location'] = ast.literal_eval('[["77 100"]]')

train.loc[655, 'annotation'] = ast.literal_eval('[["no heat intolerance"], ["no cold intolerance"]]')
train.loc[655, 'location'] = ast.literal_eval('[["285 292;301 312"], ["285 287;296 312"]]')

train.loc[1262, 'annotation'] = ast.literal_eval('[["mother thyroid problem"]]')
train.loc[1262, 'location'] = ast.literal_eval('[["551 557;565 580"]]')

train.loc[1265, 'annotation'] = ast.literal_eval('[[\'felt like he was going to "pass out"\']]')
train.loc[1265, 'location'] = ast.literal_eval('[["131 135;181 212"]]')

train.loc[1396, 'annotation'] = ast.literal_eval('[["stool , with no blood"]]')
train.loc[1396, 'location'] = ast.literal_eval('[["259 280"]]')

train.loc[1591, 'annotation'] = ast.literal_eval('[["diarrhoe non blooody"]]')
train.loc[1591, 'location'] = ast.literal_eval('[["176 184;201 212"]]')

train.loc[1615, 'annotation'] = ast.literal_eval('[["diarrhea for last 2-3 days"]]')
train.loc[1615, 'location'] = ast.literal_eval('[["249 257;271 288"]]')

train.loc[1664, 'annotation'] = ast.literal_eval('[["no vaginal discharge"]]')
train.loc[1664, 'location'] = ast.literal_eval('[["822 824;907 924"]]')

train.loc[1714, 'annotation'] = ast.literal_eval('[["started about 8-10 hours ago"]]')
train.loc[1714, 'location'] = ast.literal_eval('[["101 129"]]')

train.loc[1929, 'annotation'] = ast.literal_eval('[["no blood in the stool"]]')
train.loc[1929, 'location'] = ast.literal_eval('[["531 539;549 561"]]')

train.loc[2134, 'annotation'] = ast.literal_eval('[["last sexually active 9 months ago"]]')
train.loc[2134, 'location'] = ast.literal_eval('[["540 560;581 593"]]')

train.loc[2191, 'annotation'] = ast.literal_eval('[["right lower quadrant pain"]]')
train.loc[2191, 'location'] = ast.literal_eval('[["32 57"]]')

train.loc[2553, 'annotation'] = ast.literal_eval('[["diarrhoea no blood"]]')
train.loc[2553, 'location'] = ast.literal_eval('[["308 317;376 384"]]')

train.loc[3124, 'annotation'] = ast.literal_eval('[["sweating"]]')
train.loc[3124, 'location'] = ast.literal_eval('[["549 557"]]')

train.loc[3858, 'annotation'] = ast.literal_eval('[["previously as regular"], ["previously eveyr 28-29 days"], ["previously lasting 5 days"], ["previously regular flow"]]')
train.loc[3858, 'location'] = ast.literal_eval('[["102 123"], ["102 112;125 141"], ["102 112;143 157"], ["102 112;159 171"]]')

train.loc[4373, 'annotation'] = ast.literal_eval('[["for 2 months"]]')
train.loc[4373, 'location'] = ast.literal_eval('[["33 45"]]')

train.loc[4763, 'annotation'] = ast.literal_eval('[["35 year old"]]')
train.loc[4763, 'location'] = ast.literal_eval('[["5 16"]]')

train.loc[4782, 'annotation'] = ast.literal_eval('[["darker brown stools"]]')
train.loc[4782, 'location'] = ast.literal_eval('[["175 194"]]')

train.loc[4908, 'annotation'] = ast.literal_eval('[["uncle with peptic ulcer"]]')
train.loc[4908, 'location'] = ast.literal_eval('[["700 723"]]')

train.loc[6016, 'annotation'] = ast.literal_eval('[["difficulty falling asleep"]]')
train.loc[6016, 'location'] = ast.literal_eval('[["225 250"]]')

train.loc[6192, 'annotation'] = ast.literal_eval('[["helps to take care of aging mother and in-laws"]]')
train.loc[6192, 'location'] = ast.literal_eval('[["197 218;236 260"]]')

train.loc[6380, 'annotation'] = ast.literal_eval('[["No hair changes"], ["No skin changes"], ["No GI changes"], ["No palpitations"], ["No excessive sweating"]]')
train.loc[6380, 'location'] = ast.literal_eval('[["480 482;507 519"], ["480 482;499 503;512 519"], ["480 482;521 531"], ["480 482;533 545"], ["480 482;564 582"]]')

train.loc[6562, 'annotation'] = ast.literal_eval('[["stressed due to taking care of her mother"], ["stressed due to taking care of husbands parents"]]')
train.loc[6562, 'location'] = ast.literal_eval('[["290 320;327 337"], ["290 320;342 358"]]')

train.loc[6862, 'annotation'] = ast.literal_eval('[["stressor taking care of many sick family members"]]')
train.loc[6862, 'location'] = ast.literal_eval('[["288 296;324 363"]]')

train.loc[7022, 'annotation'] = ast.literal_eval('[["heart started racing and felt numbness for the 1st time in her finger tips"]]')
train.loc[7022, 'location'] = ast.literal_eval('[["108 182"]]')

train.loc[7422, 'annotation'] = ast.literal_eval('[["first started 5 yrs"]]')
train.loc[7422, 'location'] = ast.literal_eval('[["102 121"]]')

train.loc[8876, 'annotation'] = ast.literal_eval('[["No shortness of breath"]]')
train.loc[8876, 'location'] = ast.literal_eval('[["481 483;533 552"]]')

train.loc[9027, 'annotation'] = ast.literal_eval('[["recent URI"], ["nasal stuffines, rhinorrhea, for 3-4 days"]]')
train.loc[9027, 'location'] = ast.literal_eval('[["92 102"], ["123 164"]]')

train.loc[9938, 'annotation'] = ast.literal_eval('[["irregularity with her cycles"], ["heavier bleeding"], ["changes her pad every couple hours"]]')
train.loc[9938, 'location'] = ast.literal_eval('[["89 117"], ["122 138"], ["368 402"]]')

train.loc[9973, 'annotation'] = ast.literal_eval('[["gaining 10-15 lbs"]]')
train.loc[9973, 'location'] = ast.literal_eval('[["344 361"]]')

train.loc[10513, 'annotation'] = ast.literal_eval('[["weight gain"], ["gain of 10-16lbs"]]')
train.loc[10513, 'location'] = ast.literal_eval('[["600 611"], ["607 623"]]')

train.loc[11551, 'annotation'] = ast.literal_eval('[["seeing her son knows are not real"]]')
train.loc[11551, 'location'] = ast.literal_eval('[["386 400;443 461"]]')

train.loc[11677, 'annotation'] = ast.literal_eval('[["saw him once in the kitchen after he died"]]')
train.loc[11677, 'location'] = ast.literal_eval('[["160 201"]]')

train.loc[12124, 'annotation'] = ast.literal_eval('[["tried Ambien but it didnt work"]]')
train.loc[12124, 'location'] = ast.literal_eval('[["325 337;349 366"]]')

train.loc[12279, 'annotation'] = ast.literal_eval('[["heard what she described as a party later than evening these things did not actually happen"]]')
train.loc[12279, 'location'] = ast.literal_eval('[["405 459;488 524"]]')

train.loc[12289, 'annotation'] = ast.literal_eval('[["experienced seeing her son at the kitchen table these things did not actually happen"]]')
train.loc[12289, 'location'] = ast.literal_eval('[["353 400;488 524"]]')

train.loc[13238, 'annotation'] = ast.literal_eval('[["SCRACHY THROAT"], ["RUNNY NOSE"]]')
train.loc[13238, 'location'] = ast.literal_eval('[["293 307"], ["321 331"]]')

train.loc[13297, 'annotation'] = ast.literal_eval('[["without improvement when taking tylenol"], ["without improvement when taking ibuprofen"]]')
train.loc[13297, 'location'] = ast.literal_eval('[["182 221"], ["182 213;225 234"]]')

train.loc[13299, 'annotation'] = ast.literal_eval('[["yesterday"], ["yesterday"]]')
train.loc[13299, 'location'] = ast.literal_eval('[["79 88"], ["409 418"]]')

train.loc[13845, 'annotation'] = ast.literal_eval('[["headache global"], ["headache throughout her head"]]')
train.loc[13845, 'location'] = ast.literal_eval('[["86 94;230 236"], ["86 94;237 256"]]')

train.loc[14083, 'annotation'] = ast.literal_eval('[["headache generalized in her head"]]')
train.loc[14083, 'location'] = ast.literal_eval('[["56 64;156 179"]]')

In [None]:
# https://www.kaggle.com/code/theoviel/roberta-strikes-back/notebook
# Normalise the feature descriptions: hyphen handling first, then newline/tab/CR -> space.
train['feature_text'] = train['feature_text'].apply(process_feature_text)
train['feature_text'] = train['feature_text'].apply(clean_spaces)
# Whitespace-normalised copy of the note.
# NOTE(review): `clean_text` is not referenced again in the visible notebook.
train['clean_text'] = train['pn_history'].apply(clean_spaces)
# NOTE(review): strip() removes leading as well as trailing whitespace; a note that
# starts with whitespace would have every character offset in `location` shifted.
# Confirm that no note does, or consider rstrip().
train['pn_history'] = train['pn_history'].apply(lambda x: x.strip())

In [None]:
# Number of entries in each row's annotation list; the label-building loops below
# skip rows where this is 0.
train['annotation_length'] = train['annotation'].apply(len)
display(train['annotation_length'].value_counts())

# CV split

In [None]:
# Grouped K-fold: rows sharing a pn_num (the same patient note) always fall into the
# same fold.
Fold = GroupKFold(n_splits=folds)
groups = train['pn_num'].values
for n, (train_index, val_index) in enumerate(Fold.split(train, train['location'], groups)):
    # val_index holds positions; using them with .loc assumes `train` has the default
    # 0..n-1 index. NOTE(review): confirm this still holds if rows are ever filtered.
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

# Helper Functions

In [None]:
# From https://www.kaggle.com/theoviel/evaluation-metric-folds-baseline

def micro_f1(preds, truths):
    """
    Micro-averaged F1 over binary arrays.

    All instances are flattened into one long vector before scoring, so every
    position contributes equally regardless of which instance it came from.

    Args:
        preds (list of lists of ints): Binary predictions, one array per instance.
        truths (list of lists of ints): Binary ground truths, one array per instance.

    Returns:
        float: F1 score.
    """
    flat_preds = np.concatenate(preds)
    flat_truths = np.concatenate(truths)
    return f1_score(flat_truths, flat_preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): ``[start, end)`` spans; ``end`` is exclusive.
        length (int, optional): Length of the output array. When omitted, the
            largest value found in ``spans`` is used, or 0 if ``spans`` is empty.

    Returns:
        np array [length]: Binarized spans (1.0 inside a span, 0.0 elsewhere).
    """
    if length is None:
        # np.max raises on an empty sequence, so an empty span list maps to length 0.
        length = int(np.max(spans)) if len(spans) else 0
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Each prediction/truth pair is binarized over a common length (the largest
    offset found in either of them) and all pairs are then scored together.
    Pairs in which both sides are empty are left out.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds, bin_truths = [], []
    for pred, truth in zip(preds, truths):
        if len(pred) == 0 and len(truth) == 0:
            # Nothing predicted and nothing expected: no characters to score.
            continue
        pred_extent = np.max(pred) if len(pred) else 0
        truth_extent = np.max(truth) if len(truth) else 0
        length = max(pred_extent, truth_extent)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)

In [None]:
def get_char_probs(texts, predictions, tokenizer):
    """Spread per-token predictions over the characters each token covers.

    Args:
        texts: Sequence of strings.
        predictions: Per-text sequences of token-level values, aligned with the
            tokenizer output for that text (position 0 is skipped).
        tokenizer: Callable returning a mapping with an ``'offset_mapping'``
            entry when called with ``return_offsets_mapping=True``.

    Returns:
        list of np.ndarray: One array per text, of the text's length, holding
        the value of the covering token (0 where no token was written).
    """
    results = [np.zeros(len(t)) for t in texts]
    for char_probs, text, token_preds in zip(results, texts, predictions):
        offsets = tokenizer(text,
                            add_special_tokens=True,
                            return_offsets_mapping=True)['offset_mapping']
        # Position 0 is skipped on both sides.
        for span, pred in zip(offsets[1:], token_preds[1:]):
            start, end = span[0], span[1]
            char_probs[start:end] = pred
            if end == len(text):
                # The last character has been reached; stop before any later tokens.
                break
    return results

def get_results(char_probs, th=0.5):
    """Turn per-character probabilities into ``"start end;start end"`` strings.

    Character indices whose probability is at least ``th`` are shifted by +1,
    grouped into runs of consecutive values, and each run is written as
    ``"<first> <last>"``.

    Args:
        char_probs: Iterable of 1-D numpy arrays of probabilities.
        th (float): Inclusive threshold.

    Returns:
        list of str: One string per input array ("" when nothing passes ``th``).
    """
    results = []
    for char_prob in char_probs:
        hits = np.where(char_prob >= th)[0] + 1
        spans = []
        if len(hits):
            # A new run starts wherever two neighbouring hits are not adjacent.
            run_starts = np.where(np.diff(hits) != 1)[0] + 1
            for run in np.split(hits, run_starts):
                spans.append(f"{run[0]} {run[-1]}")
        results.append(";".join(spans))
    return results

def get_predictions(results):
    """Parse ``"start end;start end"`` strings into lists of ``[start, end]`` pairs.

    Args:
        results (list of str): Span strings as produced by ``get_results``.

    Returns:
        list of lists of two ints: One list of spans per input string; an empty
        string yields an empty list.
    """
    def _parse(result):
        if result == "":
            return []
        pieces = (chunk.split() for chunk in result.split(';'))
        return [[int(piece[0]), int(piece[1])] for piece in pieces]

    return [_parse(result) for result in results]

In [None]:
def get_score(y_true, y_pred):
    """Span-level micro F1 between ground-truth spans and predicted spans.

    Args:
        y_true (list of lists of two ints): Ground truth spans per sample.
        y_pred (list of lists of two ints): Predicted spans per sample.

    Returns:
        float: f1 score.
    """
    # span_micro_f1 is declared as (preds, truths); pass the arguments in that
    # order so the call matches the callee's contract.
    score = span_micro_f1(y_pred, y_true)
    return score

In [None]:
def create_labels_for_scoring(df):
    """Convert each row's ``location`` list into a flat list of ``[start, end]`` spans.

    Every entry of ``location`` is a string of ``"start end"`` pieces separated
    by ``';'``; all pieces of all entries of a row are collected in order.
    Example: ``['0 1', '3 4;6 8']`` -> ``[[0, 1], [3, 4], [6, 8]]``.

    The frame is only read: no helper column is added, and the rows may carry
    any index.

    Args:
        df (pd.DataFrame): Frame with a ``location`` column of lists of strings.

    Returns:
        list of lists of two ints: One list of spans per row (empty when the
        row has no locations).
    """
    truths = []
    for locations in df['location'].values:
        truth = []
        for location in locations:
            for piece in location.split(';'):
                bounds = piece.split()
                truth.append([int(bounds[0]), int(bounds[1])])
        truths.append(truth)
    return truths

In [None]:
def get_dataset(
    input_ids: np.array,
    attention_mask: np.array,
    labels: typing.Optional[np.array] = None,
    ordered: bool = False,
    repeated: bool = False,
    drop_remainder = True
) -> tf.data.Dataset:
    """Build a batched, prefetched ``tf.data`` pipeline from in-memory arrays.

    Args:
        input_ids: Token ids, one row per example.
        attention_mask: Attention masks aligned with ``input_ids``.
        labels: Optional targets; when given, elements are ``(features, labels)``
            pairs, otherwise just the feature dict.
        ordered: Keep the original order (no shuffling) when True.
        repeated: Repeat the data indefinitely when True.
        drop_remainder: Drop the final partial batch when truthy.

    Returns:
        tf.data.Dataset: Batches of size ``batch_size`` (the module-level value).
    """
    features = {"input_ids": input_ids, "attention_mask": attention_mask}
    elements = features if labels is None else (features, labels)
    dataset = tf.data.Dataset.from_tensor_slices(elements)

    # Same stage order as before: repeat, then shuffle, then batch.
    if repeated:
        dataset = dataset.repeat()
    if not ordered:
        dataset = dataset.shuffle(1024)
    dataset = dataset.batch(batch_size, drop_remainder=bool(drop_remainder))

    return dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
class CustomNonPaddingTokenLoss(tf.keras.losses.Loss):
    """Binary cross-entropy computed only over positions whose label is not -1."""

    def __init__(self, name="custom_ner_loss"):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        """Flatten both tensors, drop positions labelled -1 and apply BCE to the rest.

        ``y_pred`` is treated as probabilities (``from_logits=False``).
        """
        flat_true = tf.reshape(y_true, (-1,))
        flat_pred = tf.reshape(y_pred, (-1,))
        keep = flat_true != -1
        kept_true = tf.boolean_mask(flat_true, keep)
        kept_pred = tf.boolean_mask(flat_pred, keep)
        bce = tf.keras.losses.BinaryCrossentropy(
            from_logits=False, reduction=tf.keras.losses.Reduction.NONE
        )
        return bce(kept_true, kept_pred)

In [None]:
class F1Callback(tf.keras.callbacks.Callback):
    """Score span-level F1 on the validation set after every epoch.

    Whenever the score improves on the best one seen so far, the model weights
    are written to ``file_name`` in HDF5 format.
    """

    def __init__(self, model, val_dataset, val_texts, val_labels, tokenizer, file_name):
        """
        Args:
            model: Keras model used for prediction and weight saving.
            val_dataset: Dataset passed to ``model.predict``.
            val_texts: Note texts, in the same order as the rows of ``val_dataset``.
            val_labels: Ground-truth spans per text (list of ``[start, end]`` lists).
            tokenizer: Tokenizer handed to ``get_char_probs``.
            file_name: Path the best weights are saved to.
        """
        # Initialise the base class so its attributes exist before Keras uses the callback.
        super().__init__()
        self.model = model
        self.val_dataset = val_dataset
        self.valid_texts = val_texts
        self.valid_labels = val_labels
        self.tokenizer = tokenizer
        self.best_f1 = 0
        self.file_name = file_name

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation set, print the F1 and keep the best weights."""
        pred = self.model.predict(self.val_dataset)
        # Remove only the trailing unit axis of the Dense(1) output. A bare squeeze
        # would also collapse the batch axis when a single row is predicted.
        pred = np.squeeze(pred, axis=-1)
        # scoring
        char_probs = get_char_probs(self.valid_texts, pred, self.tokenizer)
        results = get_results(char_probs, th=0.5)
        preds = get_predictions(results)
        score = get_score(self.valid_labels, preds)

        print("f1_val =", score)

        if self.best_f1 < score:
            self.best_f1 = score
            self.model.save_weights(self.file_name, save_format='h5')

In [None]:
def get_lr(epoch, lr, num_warmup_steps=num_warmup_steps, num_training_steps=epochs, base_lr=lr):
    """Cosine learning-rate schedule with optional linear warm-up, stepped per epoch.

    Intended for ``tf.keras.callbacks.LearningRateScheduler``, which calls the
    schedule with the epoch index and the optimizer's *current* learning rate.
    Scaling that current value would compound the decay from one epoch to the
    next (and a warm-up factor of 0 at epoch 0 would pin the rate at 0), so the
    factor is applied to the fixed ``base_lr`` instead.

    Args:
        epoch (int): Zero-based epoch index.
        lr (float): Current optimizer learning rate supplied by Keras; accepted
            for interface compatibility and not used in the computation.
        num_warmup_steps (int): Number of warm-up epochs.
        num_training_steps (int): Total number of epochs in the schedule.
        base_lr (float): Peak learning rate the schedule scales; defaults to the
            module-level ``lr`` captured when the function is defined.

    Returns:
        float: Learning rate to use for this epoch.
    """
    if epoch < num_warmup_steps:
        return float(epoch) / float(max(1, num_warmup_steps)) * base_lr
    progress = float(epoch - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
    # num_cycles is read from module scope.
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) * base_lr

# RoBERTa

In [None]:
# Tokenizer for the RoBERTa checkpoint.
# NOTE(review): trim_offsets=False presumably keeps each token's leading space inside
# its character offsets; confirm against the tokenizer documentation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME[0], trim_offsets=False)

In [None]:
# Build the model inputs and the per-token labels for every (note, feature) row.
train_inp_ids = []
train_attn_mask = []
train_labels = []
for n, row in tqdm(train.iterrows(), total=len(train)):
    # Model input: the note paired with the feature text, padded to MAX_LEN.
    # NOTE(review): truncation is not requested; a pair longer than MAX_LEN tokens would
    # presumably produce a longer row and break the later np.stack. Confirm.
    encoded = tokenizer(row['pn_history'], row['feature_text'],
                       add_special_tokens=True,
                           max_length=MAX_LEN,
                           padding="max_length",
                           return_offsets_mapping=False)
    train_inp_ids.append(encoded['input_ids'])
    train_attn_mask.append(encoded['attention_mask'])
    # Second pass over the note alone, used only for the character offsets of its tokens.
    encoded = tokenizer(row['pn_history'],
                       add_special_tokens=True,
                           max_length=MAX_LEN,
                           padding="max_length",
                           return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    # Positions whose sequence id is not 0 (presumably special tokens and padding) get
    # label -1, which CustomNonPaddingTokenLoss excludes from the loss.
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idxes] = -1
    if row['annotation_length'] != 0:
        for location in row['location']:
            # A location may hold several "start end" pieces separated by ';'.
            for loc in [s.split() for s in location.split(';')]:
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    # First token that begins after `start`: the span starts one token earlier.
                    if (start_idx == -1) & (start < offset_mapping[idx][0]):
                        start_idx = idx - 1
                    # First token that ends at or after `end`: exclusive end of the label slice.
                    if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                        end_idx = idx + 1
                # No token begins after `start`: the slice below becomes empty.
                if start_idx == -1:
                    start_idx = end_idx
                if (start_idx != -1) & (end_idx != -1):
                    label[start_idx:end_idx] = 1
    train_labels.append(label)

In [None]:
# Turn the per-row lists into 2-D arrays (one row per training example).
# np.stack requires every row to have the same length.
train_inp_ids = np.stack(train_inp_ids)
train_attn_mask = np.stack(train_attn_mask)
train_labels = np.stack(train_labels)

In [None]:
def build_model():
    """Assemble and compile the RoBERTa token classifier.

    The backbone (``MODEL_NAME[0]``) receives ``input_ids`` and
    ``attention_mask`` of length ``MAX_LEN``; element 0 of its output goes
    through dropout (0.2) and a single-unit sigmoid Dense layer applied at
    every position. The model is compiled with AdamW and
    ``CustomNonPaddingTokenLoss``.

    Returns:
        tf.keras.Model: The compiled model.
    """
    backbone = TFRobertaModel.from_pretrained(
        MODEL_NAME[0],
        config=RobertaConfig.from_pretrained(MODEL_NAME[0]),
    )

    def _token_input(name):
        # Integer input of fixed length MAX_LEN, fed by name from the dataset dict.
        return tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name=name)

    input_ids = _token_input("input_ids")
    attention_mask = _token_input("attention_mask")

    hidden = backbone({"input_ids": input_ids, "attention_mask": attention_mask})[0]
    head = tf.keras.layers.Dense(1, activation='sigmoid')
    out = head(tf.keras.layers.Dropout(0.2)(hidden))

    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=out)
    optimizer = tfa.optimizers.AdamW(weight_decay=1e-5, learning_rate=lr)
    model.compile(optimizer=optimizer, loss=[CustomNonPaddingTokenLoss()])
    return model

In [None]:
# Run a garbage-collection pass before training starts.
gc.collect()

In [None]:
# Train RoBERTa fold by fold. Two checkpoints are written per fold: the weights with
# the lowest validation loss and the weights with the best validation span F1.
lr_callback = tf.keras.callbacks.LearningRateScheduler(get_lr, verbose=1)
for fold in range(folds):
    # Only folds 0 and 1 are trained in this notebook.
    if fold not in [0,1]:
        continue
    print(f'=============== Training fold {fold} =================')
    # Plain numpy masks, reused for every array and frame selection below.
    is_val = train['fold'].values == fold
    is_train = ~is_val
    train_dataset = get_dataset(
            input_ids=train_inp_ids[is_train],
            attention_mask=train_attn_mask[is_train],
            labels=train_labels[is_train],
            repeated=True,
            drop_remainder=True
        )
    val_dataset = get_dataset(
            input_ids=train_inp_ids[is_val],
            attention_mask=train_attn_mask[is_val],
            labels=train_labels[is_val],
            ordered=True,
            drop_remainder=True
        )
    train_total_steps = int(is_train.sum()) // batch_size
    val_total_steps = int(is_val.sum()) // batch_size
    # val_dataset drops its final partial batch, so only the first n_val_scored rows
    # receive predictions. Slicing with an explicit stop stays correct when nothing is
    # dropped, whereas a `[:-0]` slice would be empty.
    n_val_scored = val_total_steps * batch_size
    val_labels = create_labels_for_scoring(train[is_val].copy().reset_index())
    val_texts = train[is_val]['pn_history'].values
    with strategy.scope():
        model = build_model()
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        f'./best_roberta_large_fold{fold}.h5', save_best_only=True, monitor='val_loss', mode='min', save_weights_only=True)
    f1_callback = F1Callback(model, val_dataset, val_texts[:n_val_scored], val_labels[:n_val_scored], tokenizer,
                             f'./best_f1_roberta_large_fold{fold}.h5')
    history = model.fit(train_dataset,
                    validation_data = val_dataset,
                    callbacks =[checkpoint, f1_callback, lr_callback],
                    epochs = epochs,
                    steps_per_epoch=train_total_steps,
                    verbose = 1)
    gc.collect()

# PubMedBERT

- Some functions are changed here in order to handle whitespace properly.

In [None]:
def get_char_probs(texts, predictions, tokenizer):
    """Spread per-token predictions over characters, with whitespace handling.

    Compared with a plain offset lookup, this version
    * replaces every ``'\\r\\n'`` by ``'__'`` (same length) before tokenizing, and
    * widens a token one character to the left when exactly one character lies
      between it and the previous token, so that gap character is covered too.

    Args:
        texts: Sequence of strings.
        predictions: Per-text sequences of token-level values (position 0 is skipped).
        tokenizer: Callable returning a mapping with an ``'offset_mapping'`` entry.

    Returns:
        list of np.ndarray: One array per text, of the text's length.
    """
    results = [np.zeros(len(t)) for t in texts]
    for char_probs, raw_text, token_preds in zip(results, texts, predictions):
        text = raw_text.replace('\r\n', '__')
        raw_offsets = tokenizer(text,
                                add_special_tokens=True,
                                return_offsets_mapping=True)['offset_mapping']

        # Rebuild the offsets, pulling a start back by one when the previous token
        # ended exactly one character earlier.
        adjusted = []
        prev_end = None
        for start, end in raw_offsets:
            if prev_end is not None and prev_end == start - 1:
                start = start - 1
            adjusted.append((start, end))
            prev_end = end

        for (start, end), pred in zip(adjusted[1:], token_preds[1:]):
            char_probs[start:end] = pred
            if end == len(text):
                break
    return results

In [None]:
# Tokenizer for the PubMedBERT checkpoint; replaces the module-level `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME[1])

In [None]:
# Rebuild the model inputs and per-token labels with the PubMedBERT tokenizer.
train_inp_ids = []
train_attn_mask = []
train_labels = []
for n, row in tqdm(train.iterrows(), total=len(train)):
    text = row['pn_history']
    # '\r\n' and '__' are both two characters long, so character offsets stay aligned.
    text = text.replace('\r\n', '__')
    # Model input: the note paired with the feature text, padded to MAX_LEN.
    # NOTE(review): truncation is not requested; a pair longer than MAX_LEN tokens would
    # presumably produce a longer row and break the later np.stack. Confirm.
    encoded = tokenizer(text, row['feature_text'],
                       add_special_tokens=True,
                           max_length=MAX_LEN,
                           padding="max_length",
                           return_offsets_mapping=False)
    train_inp_ids.append(encoded['input_ids'])
    train_attn_mask.append(encoded['attention_mask'])
    # Second pass over the note alone, used only for the character offsets of its tokens.
    encoded = tokenizer(text,
                       add_special_tokens=True,
                           max_length=MAX_LEN,
                           padding="max_length",
                           return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    # `tmp` is the same list object as `offset_mapping`, not a copy. Only the start of an
    # entry is ever rewritten and the test below reads the previous entry's end, so the
    # aliasing does not affect the result.
    tmp = offset_mapping
    # This inner loop reuses the names `n`, `start` and `end`.
    for n, (start, end) in enumerate(tmp):
        if n == 0: continue
        # Exactly one character lies between this token and the previous one: widen this
        # token one character to the left so that character is covered.
        if tmp[n-1][1] == start - 1:
            offset_mapping[n] = (start - 1, end)
    # Positions whose sequence id is not 0 (presumably special tokens and padding) get
    # label -1, which CustomNonPaddingTokenLoss excludes from the loss.
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idxes] = -1
    if row['annotation_length'] != 0:
        for location in row['location']:
            # A location may hold several "start end" pieces separated by ';'.
            for loc in [s.split() for s in location.split(';')]:
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    # First token that begins after `start`: the span starts one token earlier.
                    if (start_idx == -1) & (start < offset_mapping[idx][0]):
                        start_idx = idx - 1
                    # First token that ends at or after `end`: exclusive end of the label slice.
                    if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                        end_idx = idx + 1
                # No token begins after `start`: the slice below becomes empty.
                if start_idx == -1:
                    start_idx = end_idx
                if (start_idx != -1) & (end_idx != -1):
                    label[start_idx:end_idx] = 1
    train_labels.append(label)

In [None]:
# Turn the per-row lists into 2-D arrays (one row per training example).
# np.stack requires every row to have the same length.
train_inp_ids = np.stack(train_inp_ids)
train_attn_mask = np.stack(train_attn_mask)
train_labels = np.stack(train_labels)

In [None]:
def build_model():
    """Assemble and compile the PubMedBERT token classifier.

    The backbone (``MODEL_NAME[1]``, loaded with ``from_pt=True``) receives
    ``input_ids`` and ``attention_mask`` of length ``MAX_LEN``; element 0 of its
    output goes through dropout (0.2) and a single-unit sigmoid Dense layer
    applied at every position. The model is compiled with AdamW and
    ``CustomNonPaddingTokenLoss``.

    Returns:
        tf.keras.Model: The compiled model.
    """
    backbone = TFBertModel.from_pretrained(
        MODEL_NAME[1],
        config=AutoConfig.from_pretrained(MODEL_NAME[1]),
        from_pt=True,
    )

    def _token_input(name):
        # Integer input of fixed length MAX_LEN, fed by name from the dataset dict.
        return tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name=name)

    input_ids = _token_input("input_ids")
    attention_mask = _token_input("attention_mask")

    hidden = backbone({"input_ids": input_ids, "attention_mask": attention_mask})[0]
    head = tf.keras.layers.Dense(1, activation='sigmoid')
    out = head(tf.keras.layers.Dropout(0.2)(hidden))

    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=out)
    optimizer = tfa.optimizers.AdamW(weight_decay=1e-5, learning_rate=lr)
    model.compile(optimizer=optimizer, loss=[CustomNonPaddingTokenLoss()])
    return model

In [None]:
# Run a garbage-collection pass before training starts.
gc.collect()

In [None]:
# Train PubMedBERT fold by fold. Two checkpoints are written per fold: the weights
# with the lowest validation loss and the weights with the best validation span F1.
lr_callback = tf.keras.callbacks.LearningRateScheduler(get_lr, verbose=1)
for fold in range(folds):
    # Only folds 0 and 1 are trained in this notebook.
    if fold not in [0,1]:
        continue
    print(f'=============== Training fold {fold} =================')
    # Plain numpy masks, reused for every array and frame selection below.
    is_val = train['fold'].values == fold
    is_train = ~is_val
    train_dataset = get_dataset(
            input_ids=train_inp_ids[is_train],
            attention_mask=train_attn_mask[is_train],
            labels=train_labels[is_train],
            repeated=True,
            drop_remainder=True
        )
    val_dataset = get_dataset(
            input_ids=train_inp_ids[is_val],
            attention_mask=train_attn_mask[is_val],
            labels=train_labels[is_val],
            ordered=True,
            drop_remainder=True
        )
    train_total_steps = int(is_train.sum()) // batch_size
    val_total_steps = int(is_val.sum()) // batch_size
    # val_dataset drops its final partial batch, so only the first n_val_scored rows
    # receive predictions. Slicing with an explicit stop stays correct when nothing is
    # dropped, whereas a `[:-0]` slice would be empty.
    n_val_scored = val_total_steps * batch_size
    val_labels = create_labels_for_scoring(train[is_val].copy().reset_index())
    val_texts = train[is_val]['pn_history'].values
    with strategy.scope():
        model = build_model()
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        f'./best_pubmedbert_fold{fold}.h5', save_best_only=True, monitor='val_loss', mode='min', save_weights_only=True)
    f1_callback = F1Callback(model, val_dataset, val_texts[:n_val_scored], val_labels[:n_val_scored], tokenizer,
                             f'./best_f1_pubmedbert_fold{fold}.h5')
    history = model.fit(train_dataset,
                    validation_data = val_dataset,
                    callbacks =[checkpoint, f1_callback, lr_callback],
                    epochs = epochs,
                    steps_per_epoch=train_total_steps,
                    verbose = 1)
    gc.collect()

# BioBERT

In [None]:
# Tokenizer for the BioBERT checkpoint; replaces the module-level `tokenizer`.
# NOTE(review): confirm that trim_offsets=False has any effect for this tokenizer class.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME[2], trim_offsets=False)

In [None]:
# Rebuild the model inputs and per-token labels with the BioBERT tokenizer.
train_inp_ids = []
train_attn_mask = []
train_labels = []
for n, row in tqdm(train.iterrows(), total=len(train)):
    text = row['pn_history']
    # '\r\n' and '__' are both two characters long, so character offsets stay aligned.
    text = text.replace('\r\n', '__')
    # Model input: the note paired with the feature text, padded to MAX_LEN.
    # NOTE(review): truncation is not requested; a pair longer than MAX_LEN tokens would
    # presumably produce a longer row and break the later np.stack. Confirm.
    encoded = tokenizer(text, row['feature_text'],
                       add_special_tokens=True,
                           max_length=MAX_LEN,
                           padding="max_length",
                           return_offsets_mapping=False)
    train_inp_ids.append(encoded['input_ids'])
    train_attn_mask.append(encoded['attention_mask'])
    # Second pass over the note alone, used only for the character offsets of its tokens.
    encoded = tokenizer(text,
                       add_special_tokens=True,
                           max_length=MAX_LEN,
                           padding="max_length",
                           return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    # `tmp` is the same list object as `offset_mapping`, not a copy. Only the start of an
    # entry is ever rewritten and the test below reads the previous entry's end, so the
    # aliasing does not affect the result.
    tmp = offset_mapping
    # This inner loop reuses the names `n`, `start` and `end`.
    for n, (start, end) in enumerate(tmp):
        if n == 0: continue
        # Exactly one character lies between this token and the previous one: widen this
        # token one character to the left so that character is covered.
        if tmp[n-1][1] == start - 1:
            offset_mapping[n] = (start - 1, end)
    # Positions whose sequence id is not 0 (presumably special tokens and padding) get
    # label -1, which CustomNonPaddingTokenLoss excludes from the loss.
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idxes] = -1
    if row['annotation_length'] != 0:
        for location in row['location']:
            # A location may hold several "start end" pieces separated by ';'.
            for loc in [s.split() for s in location.split(';')]:
                start_idx = -1
                end_idx = -1
                start, end = int(loc[0]), int(loc[1])
                for idx in range(len(offset_mapping)):
                    # First token that begins after `start`: the span starts one token earlier.
                    if (start_idx == -1) & (start < offset_mapping[idx][0]):
                        start_idx = idx - 1
                    # First token that ends at or after `end`: exclusive end of the label slice.
                    if (end_idx == -1) & (end <= offset_mapping[idx][1]):
                        end_idx = idx + 1
                # No token begins after `start`: the slice below becomes empty.
                if start_idx == -1:
                    start_idx = end_idx
                if (start_idx != -1) & (end_idx != -1):
                    label[start_idx:end_idx] = 1
    train_labels.append(label)

In [None]:
# Turn the per-row lists into 2-D arrays (one row per training example).
# np.stack requires every row to have the same length.
train_inp_ids = np.stack(train_inp_ids)
train_attn_mask = np.stack(train_attn_mask)
train_labels = np.stack(train_labels)

In [None]:
def build_model():
    """Assemble and compile the BioBERT token classifier.

    The backbone (``MODEL_NAME[2]``, loaded with ``from_pt=True``) receives
    ``input_ids`` and ``attention_mask`` of length ``MAX_LEN``; element 0 of its
    output goes through dropout (0.2) and a single-unit sigmoid Dense layer
    applied at every position. The model is compiled with AdamW and
    ``CustomNonPaddingTokenLoss``.

    Returns:
        tf.keras.Model: The compiled model.
    """
    backbone = TFBertModel.from_pretrained(
        MODEL_NAME[2],
        config=AutoConfig.from_pretrained(MODEL_NAME[2]),
        from_pt=True,
    )

    def _token_input(name):
        # Integer input of fixed length MAX_LEN, fed by name from the dataset dict.
        return tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name=name)

    input_ids = _token_input("input_ids")
    attention_mask = _token_input("attention_mask")

    hidden = backbone({"input_ids": input_ids, "attention_mask": attention_mask})[0]
    head = tf.keras.layers.Dense(1, activation='sigmoid')
    out = head(tf.keras.layers.Dropout(0.2)(hidden))

    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=out)
    optimizer = tfa.optimizers.AdamW(weight_decay=1e-5, learning_rate=lr)
    model.compile(optimizer=optimizer, loss=[CustomNonPaddingTokenLoss()])
    return model

In [None]:
# Run a garbage-collection pass before training starts.
gc.collect()

In [None]:
# Train BioBERT fold by fold. Two checkpoints are written per fold: the weights with
# the lowest validation loss and the weights with the best validation span F1.
lr_callback = tf.keras.callbacks.LearningRateScheduler(get_lr, verbose=1)
for fold in range(folds):
    # Only folds 0 and 1 are trained in this notebook.
    if fold not in [0,1]:
        continue
    print(f'=============== Training fold {fold} =================')
    # Plain numpy masks, reused for every array and frame selection below.
    is_val = train['fold'].values == fold
    is_train = ~is_val
    train_dataset = get_dataset(
            input_ids=train_inp_ids[is_train],
            attention_mask=train_attn_mask[is_train],
            labels=train_labels[is_train],
            repeated=True,
            drop_remainder=True
        )
    val_dataset = get_dataset(
            input_ids=train_inp_ids[is_val],
            attention_mask=train_attn_mask[is_val],
            labels=train_labels[is_val],
            ordered=True,
            drop_remainder=True
        )
    train_total_steps = int(is_train.sum()) // batch_size
    val_total_steps = int(is_val.sum()) // batch_size
    # val_dataset drops its final partial batch, so only the first n_val_scored rows
    # receive predictions. Slicing with an explicit stop stays correct when nothing is
    # dropped, whereas a `[:-0]` slice would be empty.
    n_val_scored = val_total_steps * batch_size
    val_labels = create_labels_for_scoring(train[is_val].copy().reset_index())
    val_texts = train[is_val]['pn_history'].values
    with strategy.scope():
        model = build_model()
    checkpoint = tf.keras.callbacks.ModelCheckpoint(
        f'./best_biobert_fold{fold}.h5', save_best_only=True, monitor='val_loss', mode='min', save_weights_only=True)
    f1_callback = F1Callback(model, val_dataset, val_texts[:n_val_scored], val_labels[:n_val_scored], tokenizer,
                             f'./best_f1_biobert_fold{fold}.h5')
    history = model.fit(train_dataset,
                    validation_data = val_dataset,
                    callbacks =[checkpoint, f1_callback, lr_callback],
                    epochs = epochs,
                    steps_per_epoch=train_total_steps,
                    verbose = 1)
    gc.collect()