Hello Fellow Kagglers,

This notebook demonstrates the inference process by predicting the features each token belongs to.

The inference process is structured as follows:

1) For each token, predict the features it belongs to by thresholding the predicted probabilities, resulting in pairs of tokens and features, e.g. token 42 belongs to features 10, 20 and 25

2) For each pair, find the character location in the patient note, e.g. token 42 decodes to "doctor" and refers to patient note character span 100:106

3) Group the tokens that belong to a single feature and produce the location spans by merging consecutive tokens, e.g. character locations 1:5 and 6:10 will result in location span 1:10

[Training Notebook](https://www.kaggle.com/markwijkhuizen/nbme-albert-large-training-tpu)

[Preprocessing Notebook](https://www.kaggle.com/markwijkhuizen/nbme-preprocessing-albert)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn import metrics

from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
from transformers import PreTrainedTokenizerFast, TFAlbertModel, AlbertConfig
from sklearn.model_selection import train_test_split

import re
import os
import random
import math
import pickle

# Register tqdm with pandas so that `progress_apply` (used further below) is available
tqdm.pandas()

# Shorthand for the tf.data autotune constant
# NOTE(review): AUTO is not referenced in the cells shown in this notebook
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
# Token Input Length of AlBERT Model; notes are padded/truncated to this length when tokenized
SEQ_LENGTH = 512
# Global Random Seed
SEED = 42
# Models List; get_albert_model appends every loaded model and inference averages over all of them
MODELS = []

In [None]:
# Load the feature definitions
features = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/features.csv')

# Add Ordinal Encoding: the category code of each feature_num is stored as feature_num_ordinal
features['feature_num_ordinal'] = features['feature_num'].astype('category').cat.codes

# Number of labels equals the number of rows in features.csv; used as the model output width
N_LABELS = len(features)
print(f'N_LABELS: {N_LABELS}')

# Model

In [None]:
# Inference does not allow internet connection, so the AlBERT configurations are defined manually
# NOTE(review): albert_xxlarge_config is not used by the cells shown in this notebook
albert_xxlarge_config = AlbertConfig(
  hidden_size = 4096,
  intermediate_size = 16384,
  max_position_embeddings = 512,
  model_type = 'albert',
  num_attention_heads = 64,
)

# Base-sized AlBERT configuration, also defined manually for offline use
albert_base_config = AlbertConfig(
  hidden_size = 768,
  intermediate_size = 3072,
  max_position_embeddings = 512,
  model_type = 'albert',
  num_attention_heads = 12,
)

In [None]:
def get_albert_model(file_name, config):
    """Build the token classifier, load its trained weights and register it in MODELS.

    The network is a TFAlbertModel backbone followed by a dropout layer (rate 0.0)
    and a sigmoid Dense head that outputs N_LABELS probabilities per token.

    Args:
        file_name: Name (without the ``.h5`` extension) of a weights file in the
            ``nbme-albert-model-assemble`` dataset, or None to load ``model.h5``
            from the ``nbme-albert-large-training-tpu-dataset`` dataset.
        config: AlbertConfig describing the backbone architecture.
            Presumably it has to match the architecture the weights were trained
            with - verify against the training notebook.

    Returns:
        The Keras model with its weights loaded. As a side effect the model is
        appended to the global MODELS list.
    """
    # Input Layers; the shape is passed as a proper 1-tuple (note the trailing comma),
    # a bare `(SEQ_LENGTH)` is just an int
    input_ids = tf.keras.layers.Input(shape=(SEQ_LENGTH,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(SEQ_LENGTH,), dtype=tf.int32, name='attention_mask')

    # AlBERT Model, built from the config only (no pretrained download)
    albert = TFAlbertModel(config)

    # Get the last hidden state
    last_hidden_state = albert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

    # Dropout Layer
    do = tf.keras.layers.Dropout(0.00, name='dropout')(last_hidden_state)

    # Output Layer gives probabilities of each token to belong to each feature
    output = tf.keras.layers.Dense(N_LABELS, activation='sigmoid', name='head/classifier')(do)

    # Define Model
    model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=[output])

    # Load Weights
    if file_name is None:
        model.load_weights('/kaggle/input/nbme-albert-large-training-tpu-dataset/model.h5')
    else:
        model.load_weights(f'/kaggle/input/nbme-albert-model-assemble/{file_name}.h5')

    # Append to Models List
    MODELS.append(model)

    return model

In [None]:
# Clear Backend
tf.keras.backend.clear_session()

# Enable XLA optimizations
tf.config.optimizer.set_jit(True)

# Build the model with the base configuration and the default weights file (file_name=None)
# NOTE(review): the variable name says "xxlarge" but albert_base_config is passed - confirm which is intended
albert_xxlarge_v18_model = get_albert_model(None, albert_base_config)

In [None]:
# Show Models
for m in MODELS:
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would only add a spurious "None" line
    m.summary()
    print('\n' * 3)

# Clean Text

In [None]:
# Load the patient notes
patient_notes = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/patient_notes.csv')

# Set Case Number and Patient Number as Index for Convenient Access
patient_notes = patient_notes.set_index(['case_num', 'pn_num'])

# Lower-cased copy of the note text; this is the text that gets tokenized
patient_notes['pn_history_clean'] = patient_notes['pn_history'].str.lower()

display(patient_notes.head())

# info() prints its report directly and returns None, so it is not passed to display()
patient_notes.info()

# Tokenize

In [None]:
# Load saved Tokenizer from preprocessing notebook
# The absolute Kaggle input path is used, like every other path in this notebook,
# so loading does not depend on the current working directory
tokenizer = PreTrainedTokenizerFast.from_pretrained('/kaggle/input/nbme-preprocessing-albert-public/tokenizer')

In [None]:
def tokenize(note):
    """Tokenize a patient note with the loaded AlBERT tokenizer.

    The note is padded and truncated to SEQ_LENGTH tokens and the tokenizer is
    asked to return the offset mapping of the tokens as well.
    """
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': SEQ_LENGTH,
        'return_offsets_mapping': True,
    }
    return tokenizer(note, **tokenizer_options)

# Sample Submission

In [None]:
# Let's take a look at the sample submission to inspect the expected output format
sample_submission = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/sample_submission.csv')

display(sample_submission)

# Inference Helpers

In [None]:
def correct_prediction(t_dec_prev, t_dec, t_dec_next, row_feature_num, prob):
    """Apply hand-written, feature-specific overrides to a token probability.

    Args:
        t_dec_prev: Decoded text of the previous token, or None for the first token.
        t_dec: Decoded text of the current token.
        t_dec_next: Decoded text of the next token, or None for the last token.
        row_feature_num: Feature identifier the probability refers to; the inference
            loop passes the ordinal feature number.
        prob: Predicted probability of the current token belonging to the feature.

    Returns:
        1 when one of the rules forces the token to be positive, otherwise `prob` unchanged.
    """
    if row_feature_num == 70:
        if t_dec == 'ms':
            return 1
    elif row_feature_num == 107:
        # Raw string: '\d' in a normal string literal is an invalid escape sequence
        if re.fullmatch(r'\d', t_dec) or t_dec in ('months', 'mot', 'nh', 's'):
            return 1
    elif row_feature_num == 103:
        if t_dec in ('unprotected', 'sex'):
            return 1
    elif row_feature_num == 92:
        # The token pair "as" + "tham" is forced positive on both of its tokens
        if t_dec == 'as' and t_dec_next == 'tham':
            return 1
        elif t_dec_prev == 'as' and t_dec == 'tham':
            return 1
    elif row_feature_num == 93:
        # Only override when the model gives at least a minimal probability
        if prob > 0.01 and t_dec in ('chest', 'pain'):
            return 1

    # Default to return original pred
    return prob

# Inference

In [None]:
# This is the threshold that gave the best F1 score, determined in the training notebook
# A token is assigned to a feature when its probability is strictly greater than this value
THRESHOLD = 0.50
print(f'THRESHOLD: {THRESHOLD:.3f}')

In [None]:
# Load the test rows
test = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/test.csv')

# Look up the ordinal feature number of every test row via its feature_num
test['feature_num_ordinal'] = features.set_index('feature_num').loc[test['feature_num'], 'feature_num_ordinal'].values

# Same (case_num, pn_num) index as patient_notes, so rows can be grouped per patient note
test = test.set_index(['case_num', 'pn_num'])

display(test.head(10))

# Token Feature Probabilities

Identify tokens which belong to a feature

In [None]:
# Records of predicted (row, token) pairs; converted to a DataFrame in the next cell
pred_df = []

# Loop over all test rows, grouped per patient note so every note is tokenized and predicted once
for group_idx, group in tqdm(test.groupby(['case_num', 'pn_num'])):
    # Patient Note
    pn_history_clean = patient_notes.loc[group_idx, 'pn_history_clean']

    # Tokenize Patient Note
    tokens = tokenize(pn_history_clean)

    # Token Properties
    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']
    offset_mapping = tokens['offset_mapping']

    # Decode every token once per note; the decoded text does not depend on the
    # feature, so it is not recomputed inside the feature loop
    tokens_decoded = [tokenizer.decode(t) for t in input_ids]
    last_idx = len(tokens_decoded) - 1

    # Probabilities of each token belonging to each feature, averaged over all models
    y_pred = np.zeros(shape=[N_LABELS, SEQ_LENGTH], dtype=np.float32)

    # Single-note batch shared by all models
    model_inputs = {
        'input_ids': np.array([input_ids]),
        'attention_mask': np.array([attention_mask]),
    }
    for m in MODELS:
        # Drop the batch dimension and transpose so that y_pred is indexed by feature first
        y_pred += m.predict_on_batch(model_inputs).squeeze().T / len(MODELS)

    # Iterate over all features
    for row_id, row_feature_num in group[['id', 'feature_num_ordinal']].itertuples(index=False, name=None):
        annotation_found = False

        # Prediction per Feature Number
        y_pred_row = y_pred[row_feature_num]

        # Iterate over all offset mappings, input tokens, decoded tokens and prediction probabilities
        for idx, (om, t, t_dec, prob) in enumerate(zip(offset_mapping, input_ids, tokens_decoded, y_pred_row)):
            # Neighbouring decoded tokens, None at the sequence boundaries
            t_dec_prev = tokens_decoded[idx - 1] if idx > 0 else None
            t_dec_next = tokens_decoded[idx + 1] if idx < last_idx else None
            prob = correct_prediction(t_dec_prev, t_dec, t_dec_next, row_feature_num, prob)

            # Minimum prediction threshold and token should not be utility token (START, END, PAD etc.)
            if prob > THRESHOLD and len(t_dec) > 0 and t > 4:
                annotation_found = True
                pred_df.append({
                    'row_id': row_id,
                    'om': om,
                    't': t,
                    't_dec': t_dec,
                    'group_idx': group_idx,
                })

        # Add Empty Annotation if no annotation is found, so every row_id appears in the output
        if not annotation_found:
            pred_df.append({
                'row_id': row_id,
                'om': (-1, -1),
                't': -1,
                't_dec': chr(0),
                'group_idx': group_idx,
            })

In [None]:
# Show Prediction DataFrame
# The list of prediction records is converted into a DataFrame under the same name
pred_df = pd.DataFrame.from_dict(pred_df)
display(pred_df.head(10))

# Decode Predictions

Assigns the patient note location of all predicted tokens

In [None]:
def find_all(a, b, offset=0):
    """Find all non-overlapping occurrences of `b` in `a`, ignoring case.

    Args:
        a: Text to search in.
        b: Text to search for.
        offset: Value added to every returned index.

    Returns:
        List of start indices (plus `offset`) in ascending order. The list is
        empty when `b` is empty or does not occur in `a`.
    """
    # An empty needle matches at every position and would never advance the search
    if not b:
        return []

    # Lower-case once instead of once per match
    haystack = a.lower()
    needle = b.lower()

    starts = []
    # Iterative search, so many matches cannot exhaust the recursion limit
    cursor = haystack.find(needle)
    while cursor != -1:
        starts.append(offset + cursor)
        # Continue behind the current match so that matches do not overlap
        cursor = haystack.find(needle, cursor + len(b))
    return starts

In [None]:
# Returns the character position in the patient note
def get_char_pos(row, return_str):
    """Locate a predicted token in the original patient note.

    Args:
        row: Prediction record with 't_dec' (decoded token), 'om' (the tokenizer's
            (start, end) offsets) and 'group_idx' (index of the note in patient_notes).
        return_str: If True the matched text is returned, otherwise the (start, end) span.

    Returns:
        For the empty-annotation placeholder (offset -1): chr(0) or (-1, -1).
        Otherwise the first occurrence of the decoded token in the note that lies
        within one character of the tokenizer offsets. If no occurrence qualifies,
        the tokenizer offsets themselves are used, so the result is never None.
    """
    annotation = row['t_dec']
    start_ann, end_ann = row['om']

    # Placeholder row added when no token was predicted for the feature
    if start_ann == -1:
        return chr(0) if return_str else (-1, -1)

    patient_note = patient_notes.loc[row['group_idx'], 'pn_history']

    for start in find_all(patient_note, annotation):
        end = start + len(annotation)
        # Accept the occurrence only if it lies within one character of the token offsets
        if start >= start_ann - 1 and end <= end_ann + 1 and end <= len(patient_note):
            return patient_note[start:end] if return_str else (start, end)

    # No occurrence matched, e.g. because the decoded token text differs from the note text.
    # Fall back to the tokenizer offsets instead of implicitly returning None, which
    # would break the tuple unpacking when the spans are grouped later on.
    # NOTE(review): assumes lower-casing did not change the note length, so the offsets
    # computed on pn_history_clean are valid for pn_history - confirm
    return patient_note[start_ann:end_ann] if return_str else (start_ann, end_ann)
    
# Character span of every predicted token in the original patient note
pred_df['om_original'] = pred_df.progress_apply(get_char_pos, axis=1, return_str=False)
# Text of the patient note at that span
pred_df['om_original_str'] = pred_df.progress_apply(get_char_pos, axis=1, return_str=True)

display(pred_df.head(10))

# Submission Location

Groups token locations together

In [None]:
submission_rows = []

# One submission row per test row id
for row_id, group in tqdm(pred_df.groupby('row_id')):
    # -np.inf marks "no open span yet" (np.NINF was removed in NumPy 2.0)
    start_prev = -np.inf
    end_prev = -np.inf
    # Finished location spans, formatted as "start end"
    spans = []

    for start, end in group['om_original']:
        # Previous token also belongs to location, increase end location
        # i.e. 3:5 followed by 6:10 will have start 3 and end will be increased from 5 -> 10
        if start <= end_prev + 1:
            end_prev = end
        else:
            # Previous token does not belong to location, add the finished location span
            if end_prev != -np.inf:
                spans.append(f'{start_prev} {end_prev}')
            # Open a new location span, the (-1, -1) placeholder is skipped
            if start > -1 and end > -1:
                start_prev = start
                end_prev = end

    # Add last location
    if end_prev > -1:
        spans.append(f'{start_prev} {end_prev}')

    # Location spans are delimited with a ";", no span results in an empty string
    submission_rows.append({ 'id': row_id, 'location': ';'.join(spans) })

# Submission

In [None]:
# Create submission DataFrame with one row per id and its location string
submission = pd.DataFrame.from_dict(submission_rows)

display(submission.head())

In [None]:
# Save submission as CSV in the working directory, without the DataFrame index
submission.to_csv('submission.csv', index=False)