In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**import libraries**

In [None]:
import gc
import json
import math
import string
import pickle
import warnings
import spacy
import random
import itertools
# Silence library warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

# Limits applied when pandas renders a DataFrame.
for option_name, option_value in (
    ('display.max_rows', 20),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

from sklearn.metrics import f1_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader,Dataset
from sklearn.model_selection import train_test_split
import tokenizers 
import transformers
import tensorflow as tf
import tensorflow_addons as tfa
from transformers import models
from transformers import AutoTokenizer, AutoConfig, TFAutoModel
%env TOKENIZERS_PARALLELISM=true

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(device)

**Load Data**

In [None]:
# Read the five competition CSV files from the Kaggle input directory into DataFrames.
train=pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/train.csv')
test=pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/test.csv')
features=pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/features.csv')
patient_notes=pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/patient_notes.csv')
sample_submission=pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/sample_submission.csv')

**config**

In [None]:
# ---------- Model ----------
MODEL_NAME = 'microsoft/deberta-base'  # checkpoint name passed to the Auto* `from_pretrained` calls
TOKENIZER_PATH = "microsoft/deberta-base_tokenizer"  # local directory where tokenizer and config are saved
MAX_LEN = 512  # fixed token length used for padding and for the dataset tensor shapes

# ---------- Training ----------
BATCH_SIZE = 8
EPOCHS = 10
LEARNING_RATE = 2e-5
CLIP_NORM = 1000  # passed to the Adam optimizer as `clipnorm`

# ---------- Dataset ----------
seed=42  # used as `random_state` for the train/validation split
n_fold=5  # NOTE(review): not referenced by any other cell visible in this notebook
trn_fold=[0, 1, 2, 3, 4]  # NOTE(review): not referenced by any other cell visible in this notebook

debug=False

# Debug mode shortens the run: fewer epochs and a single fold.
if debug:
    EPOCHS = 5
    trn_fold = [0]

**Parse annotations and preview the tables**

In [None]:
import ast
def _parse_literal(value):
    """Turn a stringified Python literal into an object; return already-parsed values unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value


# Both columns are stored as strings and must become Python objects.
# The isinstance guard in `_parse_literal` keeps this cell safe to re-run: without it,
# a second execution would hand already-parsed lists to `ast.literal_eval` and raise.
for column in ('annotation', 'location'):
    train[column] = train[column].apply(_parse_literal)
print(f"train.shape: {train.shape}")
train.head()

In [None]:
# Preview the first rows of the raw test table.
test.head()

In [None]:
# Preview the first rows of the features table.
features.head()

In [None]:
# Preview the first rows of the patient notes table.
patient_notes.head()

**Merging**

In [None]:
# Left-join the features table onto the training rows by feature and case number.
train = pd.merge(train, features, how='left', on=['feature_num', 'case_num'])
train.head()

In [None]:
# Left-join the patient notes onto the training rows by case and note number.
train = pd.merge(train, patient_notes, how='left', on=['case_num', 'pn_num'])
train.head()

In [None]:
# Count the annotations per row; zero means the row carries no labelled span.
train['annotation_length'] = train['annotation'].map(len)
train.head()

In [None]:
# Show the full note text stored under index label 5910 (the same row is visualised in the next cell).
train['pn_history'][5910]

In [None]:
# Highlight the annotated character spans of one training row inside its note text.
idx = 5910

locations = train.loc[idx, 'location']
pn_history = train.loc[idx, 'pn_history']

# One location entry may hold several "start end" spans separated by ';'.
spans = [
    segment.split()
    for location in locations
    for segment in location.split(';')
]
start_pos = [int(span[0]) for span in spans]
end_pos = [int(span[1]) for span in spans]

# displaCy's manual mode takes the text plus a list of labelled character spans.
ents = [
    {'start': begin, 'end': finish, "label": "Annotation"}
    for begin, finish in zip(start_pos, end_pos)
]
doc = {'text': pn_history, "ents": ents}

colors = {"Annotation": "linear-gradient(0deg, #888, #eeaaaa)"}
options = {"colors": colors}
spacy.displacy.render(doc, style="ent", options=options, manual=True, jupyter=True);

In [None]:
# Show the raw location entries of the row visualised above.
locations

In [None]:
# Keep only the columns the dataset generators read, then hold out 20% of the rows.
# Note: this rebinds both `train` and `test`; from here on `test` is the held-out split.
split_columns = ['pn_history', 'feature_text', 'annotation_length', 'location']
train, test = train_test_split(train[split_columns], test_size=0.2, random_state=seed)

In [None]:
# Preview the training portion of the split.
train.head()

In [None]:
# Show the feature text stored under index label 1.
# NOTE(review): `train` is now a shuffled subset, so label 1 may be absent and raise KeyError — consider `.iloc`.
train['feature_text'][1]

In [None]:
# Show the note text stored under index label 1.
# NOTE(review): `train` is now a shuffled subset, so label 1 may be absent and raise KeyError — consider `.iloc`.
train['pn_history'][1]

In [None]:
# Load the pretrained tokenizer and model config, then store both under TOKENIZER_PATH.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
config = AutoConfig.from_pretrained(MODEL_NAME)

tokenizer.save_pretrained(TOKENIZER_PATH)
config.save_pretrained(TOKENIZER_PATH)

**preprocess the data**

In [None]:
# ------------------------- prepare_location ------------------------------

def prepare_location(locations: "str | list[str]"):
    """
    Convert location annotations into a flat list of (start, end) integer tuples.

    Parameters
    ----------
    locations : list of str, or a single str
        Each string holds one or more "start end" pairs separated by ';'.
        A bare string is treated as a one-element list instead of being
        iterated character by character.

    Returns
    -------
    list of tuple(int, int)
        One (start, end) tuple per span, in input order. Empty input gives [].
    """
    if isinstance(locations, str):
        locations = [locations]

    location_tuple_list = []
    for location in locations:
        for segment in location.split(';'):
            bounds = segment.split()
            if not bounds:
                # Blank segment (empty string or trailing ';'): nothing to record.
                continue
            start, end = int(bounds[0]), int(bounds[1])
            location_tuple_list.append((start, end))

    return location_tuple_list
# ------------------------- prepare_input ------------------------------

def prepare_input(pn_history: str, feature_text: str):
    """
    Tokenize a (patient note, feature text) pair into fixed-length model inputs.

    Parameters
    ----------
    pn_history : str
        Patient note text; encoded as the first sequence of the pair.
    feature_text : str
        Feature description; encoded as the second sequence of the pair.

    Returns
    -------
    tuple(np.ndarray, np.ndarray)
        (input_ids, attention_mask), each padded or truncated to MAX_LEN.
    """
    tokens = tokenizer(
        pn_history,
        feature_text,
        max_length=MAX_LEN,
        padding="max_length",
        # Without truncation an over-long pair yields more than MAX_LEN tokens and
        # no longer fits the fixed-shape TensorSpec used by the datasets. Only the
        # note is trimmed so the feature text always stays intact.
        truncation="only_first",
        add_special_tokens=True,
    )

    return (np.array(tokens['input_ids']), np.array(tokens["attention_mask"]))

# ------------------------- prepare_labels ------------------------------

def prepare_labels(pn_history, annotation_length, location_list):
    """
    Build the per-token label vector for one patient note.

    The note is tokenized on its own (with special tokens, padded/truncated to
    MAX_LEN). Every token from the one containing a span's start up to the first
    token whose end offset reaches the span's end is marked 1; all others stay 0.

    Parameters
    ----------
    pn_history : str
        Patient note text.
    annotation_length : int
        Number of annotations for this row; 0 yields an all-zero vector.
    location_list : list of str
        Location strings understood by `prepare_location`.

    Returns
    -------
    np.ndarray
        Float vector of zeros (no entity) and ones (entity), one entry per token.
    """
    tokenized = tokenizer(
        pn_history,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding="max_length",
        # Keep the vector at MAX_LEN even for notes longer than the limit.
        truncation=True,
        return_offsets_mapping=True,
    )
    offset_mapping = tokenized["offset_mapping"]
    label = np.zeros(len(offset_mapping))
    if annotation_length == 0:
        return label

    for start, end in prepare_location(location_list):
        start_idx, end_idx = -1, -1
        for idx, (token_start, token_end) in enumerate(offset_mapping):
            # First token starting after `start`: the span begins in the previous token.
            if start_idx == -1 and start < token_start:
                start_idx = idx - 1
            # First token whose end reaches `end`: exclusive slice bound is idx + 1.
            if end_idx == -1 and end <= token_end:
                end_idx = idx + 1

        if end_idx == -1:
            # Span end lies beyond every token (e.g. truncated note): nothing to mark.
            continue
        if start_idx == -1:
            # No token starts after `start`, so the span sits entirely inside the
            # token that closes it. Using `end_idx` here would produce an empty
            # slice and silently drop the annotation.
            start_idx = end_idx - 1
        label[start_idx:end_idx] = 1

    return label

In [None]:
def Dataset_generator(dataframe: pd.DataFrame):
    """
    Return a zero-argument generator function over the rows of `dataframe`.

    Each yielded item is ((input_ids, attention_mask), labels), built from the
    `pn_history`, `feature_text`, `annotation_length` and `location` columns.
    """
    def arg_generator():
        rows = zip(
            dataframe["pn_history"].values,
            dataframe["feature_text"].values,
            dataframe['annotation_length'].values,
            dataframe['location'].values,
        )
        for note, feature, n_annotations, spans in rows:
            inputs, masks = prepare_input(note, feature)
            labels = prepare_labels(note, n_annotations, spans)
            yield (inputs, masks), labels
    return arg_generator

In [None]:
def _token_dataset(generator):
    """Wrap a row generator in a tf.data.Dataset of ((input_ids, attention_mask), labels)."""
    def spec(name):
        # Every element is a length-MAX_LEN int32 vector; only the name differs.
        return tf.TensorSpec(shape=(MAX_LEN,), dtype=tf.dtypes.int32, name=name)

    return tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            (spec("inputs"), spec("attention_masks")),
            spec("labels"),
        ),
    )


# Training batches.
ds_training = _token_dataset(Dataset_generator(train)).batch(BATCH_SIZE)

# Validation batches: `test` is the held-out part of the train/validation split.
ds_valid = _token_dataset(Dataset_generator(test)).batch(BATCH_SIZE)

In [None]:
# Save the model to disk whenever the validation loss reaches a new minimum.
model_save= tf.keras.callbacks.ModelCheckpoint(
    './model_deberta.h5', 
    save_best_only = True, 
    save_weights_only = False,
    monitor = 'val_loss', 
    mode = 'min', verbose = 1
)
# Stop after 5 epochs without a validation-loss improvement of at least 1e-5,
# then restore the weights of the best epoch.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    min_delta=1e-5, 
    patience=5, 
    verbose=1,
    mode='auto', 
    restore_best_weights=True
)

# Halve the learning rate after 2 epochs without a validation-loss improvement of at least 0.001.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', 
    factor=0.5, 
    patience=2, 
    mode='auto', 
    min_delta=0.001,
    verbose = 1
)

In [None]:
# metrics
class F1Score(tf.keras.metrics.Metric):
    """
    Keras metric that delegates to a micro-averaged `tfa.metrics.F1Score`.

    Targets and predictions are reshaped to (-1, MAX_LEN) before being passed
    to the wrapped metric; predictions are thresholded at 0.5 by that metric.
    """

    def __init__(self, name='f1', **kwargs):
        super().__init__(name=name, **kwargs)
        self.f1 = tfa.metrics.F1Score(num_classes=2, average='micro', threshold=0.50)

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate one batch; `sample_weight` is accepted but not used."""
        flat_shape = (-1, MAX_LEN)
        self.f1.update_state(
            tf.reshape(y_true, flat_shape),
            tf.reshape(y_pred, flat_shape),
        )

    def reset_state(self):
        """Clear the counts accumulated by the wrapped metric."""
        self.f1.reset_state()

    def result(self):
        """Return the wrapped metric's current F1 value."""
        return self.f1.result()
    
# Metrics passed to `model.compile`: the custom F1 wrapper plus Keras recall and precision at a 0.5 threshold.
metrics = [
    F1Score(), 
    tf.keras.metrics.Recall(thresholds=[0.5]), 
    tf.keras.metrics.Precision(thresholds=[0.5])
]

In [None]:
def create_model():
    """
    Build the token-classification network.

    Two int32 inputs of length MAX_LEN (token ids and attention mask) feed the
    pretrained backbone. The backbone's first output goes through dropout and a
    one-unit sigmoid Dense layer, giving one probability per token position.

    Returns
    -------
    tf.keras.Model
        Uncompiled model with inputs [input_tokens, attention_mask].
    """
    input_tokens = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32)

    # The original passed `output_hiddin_states=True`; the misspelled keyword is not a
    # config attribute and had no effect. Only output [0] is consumed below, so the
    # option is dropped rather than re-spelled.
    config = AutoConfig.from_pretrained(MODEL_NAME)
    backbone = TFAutoModel.from_pretrained(MODEL_NAME, config=config)

    out = backbone(input_tokens, attention_mask=attention_mask)[0]
    out = tf.keras.layers.Dropout(0.2)(out)
    out = tf.keras.layers.Dense(1, activation='sigmoid')(out)

    return tf.keras.Model(inputs=[input_tokens, attention_mask], outputs=out)

# Build the network and print its layer summary.
model=create_model()
model.summary()
# Adam with gradient-norm clipping at CLIP_NORM.
optimizer = tf.keras.optimizers.Adam(LEARNING_RATE, clipnorm=CLIP_NORM)
# NOTE(review): reduction="none" leaves the loss unreduced across the batch — confirm this is intended for `model.fit`.
loss = tf.keras.losses.BinaryCrossentropy(reduction="none")
model.compile(optimizer=optimizer,loss=loss,metrics=metrics)

In [None]:
# Train for up to EPOCHS epochs with checkpointing, early stopping and LR reduction; `his` keeps the Keras History.
his=model.fit(ds_training,epochs=EPOCHS,validation_data=ds_valid,callbacks=[model_save,early_stop,reduce_lr])

In [None]:
# `test` was rebound to the validation split by `train_test_split`, so reload the real
# test set here. The absolute path matches the one used by the initial data-loading cell
# instead of depending on the current working directory.
test = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/test.csv')
test = test.merge(features, on=['feature_num', 'case_num'], how='left')
test = test.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
test.head()

In [None]:
# Training already ran in the earlier `model.fit` cell. Repeating the identical call here
# would train for up to EPOCHS additional epochs and overwrite `his`, so this cell only
# reports the per-epoch metrics recorded by that run.
pd.DataFrame(his.history)

In [None]:
# Create the generator which yields inputs for test dataset
def Dataset_generator_test(dataframe: pd.DataFrame):
    """
    Return a zero-argument generator function over the rows of a test DataFrame.

    Each yielded item is ((input_ids, attention_mask), labels). The test set has
    no annotations, so `labels` is an all-zero placeholder of length MAX_LEN that
    only exists to give the dataset the same structure as the training one.
    """
    def arg_generator_test():
        notes = dataframe["pn_history"].values
        feature_texts = dataframe["feature_text"].values

        for note, feature in zip(notes, feature_texts):
            inputs, masks = prepare_input(note, feature)
            # Placeholder labels: a plain zero vector avoids tokenizing every
            # note a second time just to obtain zeros.
            labels = np.zeros(MAX_LEN, dtype=np.int32)
            yield (inputs, masks), labels
    return arg_generator_test



# Wrap the test generator in a tf.data.Dataset: every element is
# ((input_ids, attention_mask), labels), each a length-MAX_LEN int32 vector.
ds_test = tf.data.Dataset.from_generator(
        Dataset_generator_test(test),
         output_signature=(
                    (
                        tf.TensorSpec(shape=(MAX_LEN,), dtype=tf.dtypes.int32, name="inputs"),
                        tf.TensorSpec(shape=(MAX_LEN,), dtype=tf.dtypes.int32, name="attention_masks"),
                    ),
                    tf.TensorSpec(shape=(MAX_LEN,), dtype=tf.dtypes.int32, name="labels"),
                )
            )
# Group the elements into batches of BATCH_SIZE.
ds_test = ds_test.batch(BATCH_SIZE)

idxx = 0
# Pull a single batch from the test dataset and print the shapes of its two input tensors.
for dst in ds_test.take(1):
    inputs_masks, labels = dst  # labels are placeholders and are ignored
    inputs_ids, attention_masks = inputs_masks
    print("inputs_ids shape=", inputs_ids.shape)
    print("-----------------------------------------------------------")
    print("attention_masks shape=", attention_masks.shape)

In [None]:
# Thanks yasufuminakama 
# https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train

def get_char_probs(texts, predictions, tokenizer):
    """
    Spread per-token predictions over the characters of each text.

    Each text is tokenized with offsets; every character covered by a token
    receives that token's prediction, and uncovered characters stay at 0.

    Returns a list with one float array per text, each as long as its text.
    """
    char_probs = [np.zeros(len(text)) for text in texts]
    for probs, text, token_preds in zip(char_probs, texts, predictions):
        offsets = tokenizer(
            text,
            add_special_tokens=True,
            return_offsets_mapping=True,
        )['offset_mapping']
        for (char_start, char_end), token_pred in zip(offsets, token_preds):
            probs[char_start:char_end] = token_pred
    return char_probs


def get_results(char_probs, th=0.5):
    """
    Turn per-character probabilities into "start end" location strings.

    For each array, the indices with probability >= `th` are shifted by one,
    grouped into runs of consecutive values, and every run is written as
    "<first> <last>"; runs are joined with ';'. No hit gives an empty string.
    """
    span_strings = []
    for char_prob in char_probs:
        hits = np.where(char_prob >= th)[0] + 1
        if hits.size:
            # A new run starts wherever the step between neighbouring indices is not 1.
            run_starts = np.where(np.diff(hits) != 1)[0] + 1
            runs = np.split(hits, run_starts)
        else:
            runs = []
        span_strings.append(";".join(f"{run[0]} {run[-1]}" for run in runs))
    return span_strings

In [None]:
# Run the model over every test batch to get its sigmoid output per token position.
preds = model.predict(ds_test)
# Collapse to one row of MAX_LEN values per test example.
preds = preds.reshape(len(test), MAX_LEN)

In [None]:
# Map token-level predictions back onto the characters of each note.
char_probs = get_char_probs(test['pn_history'].values, preds, tokenizer)
# Threshold at 0.5 and format the predicted spans as location strings.
results = get_results(char_probs, th=0.5)

In [None]:
# Fill the submission template with the predicted locations and write it to the working directory.
# The absolute path matches the one used by the initial data-loading cell instead of
# depending on the current working directory.
submission = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/sample_submission.csv')
submission['location'] = results
display(submission.head())
submission[['id', 'location']].to_csv('submission.csv', index=False)