Hello Fellow Kagglers,

This notebook demonstrates the preprocessing of texts for the NBME - Score Clinical Patient Notes competition.

First, some properties of the training data are visualised; then the training texts are tokenized, and lastly the target labels are generated.

[TPU training notebook](https://www.kaggle.com/markwijkhuizen/nbme-albert-large-training-tpu)

Inference notebook coming soon!

**V3**
* Labels shape [Number of Tokens, Number of Features] to predict the features a token belongs to
* Saving labels as sparse tensors to reduce memory usage
* Added Train patient note corrections

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
from transformers import TFAlbertModel, PreTrainedTokenizerFast
from textblob import TextBlob

import re
import ast

tqdm.pandas()

# Features

features.csv - The rubric of features (or key concepts) for each clinical case.

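Only the `feature_num` column is used below (to ordinal-encode the labels and to count the classes); the remaining columns are currently not used.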

In [None]:
# Load the feature rubric, ordered by feature number so the ordinal encoding is reproducible
features = (
    pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/features.csv')
    .sort_values('feature_num')
)

# Ordinal code of every feature number (the category codes of the feature_num column)
features = features.assign(
    feature_num_ordinal=features['feature_num'].astype('category').cat.codes,
)

# First rows of the encoded rubric
display(features.head())

# Column overview of the rubric
display(features.info())

In [None]:
# One class per row of the feature rubric
N_CLASSES = features.shape[0]
print(f'N_CLASSES: {N_CLASSES}')

# Patient Notes

A collection of about 40,000 Patient Note history portions. Only a subset of these have features annotated. You may wish to apply unsupervised learning techniques on the notes without annotations. The patient notes in the test set are not included in the public version of this file.

In [None]:
# Load the patient notes and index them by (case number, patient note number) for convenient access
patient_notes = pd.read_csv(
    '/kaggle/input/nbme-score-clinical-patient-notes/patient_notes.csv',
).set_index(['case_num', 'pn_num'])

# Clean Patient History
def clean_text(s, ret_n_char_window):
    """
    Lower-case a patient note and apply a fixed list of hand-written text corrections.

    The corrections are applied one after another, in list order, on the
    already-corrected text, so an earlier correction can prevent a later one
    from matching.

    Args:
        s: the raw note; it is converted with str() first, so any value is accepted.
        ret_n_char_window: selects what is returned (see below).

    Returns:
        If ret_n_char_window is truthy: the number of corrections that were applied
        (every listed correction counts 1 per occurrence), otherwise the corrected,
        lower-cased text.
    """
    # (text to find, replacement, count added per occurrence)
    corrections = (
        ('20yof c/o abd pain', '20yo f c/o abd pain', 1),
        ('does endorse a historyof taking', 'does endorse a history of taking', 1),
        ('mr. dillon is a 17yo mm', 'mr. dillon is a 17yo m', 1),
        ('ms. montgomery is a 44 yof', 'ms. montgomery is a 44 yo f', 1),
        ('dolores montgomery is a 44yof', 'dolores montgomery is a 44yo f', 1),
        ('44 yof with history', '44 yo f with history', 1),
        ('s. montgomery is a 44yof', 's. montgomery is a 44yo f', 1),
        ('ms. moore is a 45yof', 'ms. moore is a 45yo f', 1),
        ('pt is a 45 y/o wf', 'pt is a 45 y/o w f', 1),
        ('45 yof who', '45 yo f who', 1),
        ('karin moore is a 45yof with', 'karin moore is a 45yo f with', 1),
        ('ms. moore is a 45 y/o wf', 'ms. moore is a 45 y/o w f', 1),
        ('mrs. moore is a 45 year old wf', 'mrs. moore is a 45 year old w f', 1),
        ('edie whelan is a 26 year old aaf', 'edie whelan is a 26 year old aa f', 1),
        ('ms tompkin is  35 yof with', 'ms tompkin is  35 yo f with', 1),
        ('35 yof', '35 yo f', 1),
        ('loraine wicks, a 67yof', 'loraine wicks, a 67yo f', 1),
        ('ms. madden 20yof w/ ha', 'ms. madden 20yo f w/ ha', 1),
        ('20yof with no', '20yo f with no', 1),
        ('ms. madden is a 20 yof female', 'ms. madden is a 20 yo f female', 1),
    )

    text = str(s).lower()
    window = 0

    for needle, corrected, per_hit in corrections:
        hits = text.count(needle)
        if hits:
            window += hits * per_hit
            text = text.replace(needle, corrected)

    return window if ret_n_char_window else text

# Corrected, lower-cased note text
patient_notes['pn_history_clean'] = patient_notes['pn_history'].transform(clean_text, ret_n_char_window=False)
# Second pass over the raw text: this time clean_text returns the number of applied corrections per note
patient_notes['n_char_window'] = patient_notes['pn_history'].transform(clean_text, ret_n_char_window=True)

# First rows of the patient notes, including the two new columns
display(patient_notes.head())

# Column overview of the patient notes
display(patient_notes.info())

In [None]:
# Note Word Count and Note Sentence Count, both computed on the raw (uncleaned) note text
for count_column, nltk_splitter in (('n_words', word_tokenize), ('n_sentences', sent_tokenize)):
    patient_notes[count_column] = patient_notes['pn_history'].progress_apply(nltk_splitter).map(len)

# Note Word Count Distribution

Input texts are around 160 words long with a maximum of 225. That's good news, as ALBERT has a maximum input length of 512 tokens!

In [None]:
# Summary statistics of the per-note word counts, rendered as a one-column table
display(patient_notes['n_words'].describe().to_frame())

In [None]:
# Histogram of the number of words per patient note
plt.figure(figsize=(12, 5))
ax = patient_notes['n_words'].plot(kind='hist', bins=32)
ax.set_title('Number of Words per Patient Note', size=24)
ax.set_xlabel('Note Word Count', size=18)
ax.set_ylabel('Frequency', size=18)
plt.xticks(size=16)
plt.yticks(size=16)
ax.grid()
# Trailing no-op so the cell does not echo a return value
pass

# Note Sentence Count Distribution

In [None]:
# Summary statistics of the per-note sentence counts, rendered as a one-column table
display(patient_notes['n_sentences'].describe().to_frame())

In [None]:
# Histogram of the number of sentences per patient note
plt.figure(figsize=(12, 5))
ax = patient_notes['n_sentences'].plot(kind='hist', bins=16)
ax.set_title('Number of Sentences per Patient Note', size=24)
ax.set_xlabel('Note Sentence Count', size=18)
ax.set_ylabel('Frequency', size=18)
plt.xticks(size=16)
plt.yticks(size=16)
ax.grid()
# Trailing no-op so the cell does not echo a return value
pass

# Train

In [None]:
# Load the annotated training rows; the default integer row index is kept for now
# because the annotation corrections below address rows by that index
train = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/train.csv')

# Annotation Correction

!!! This is not my work but that of [Y.NAKAMA](https://www.kaggle.com/yasufuminakama), a big thanks for these corrections !!!

In [None]:
# incorrect annotation source: https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train
# Each entry: (row label in train, corrected annotation literal, corrected location literal).
# Both literals are parsed with ast.literal_eval before being written to the row.
# NOTE(review): the parsed values are nested lists; how pandas stores them in a single cell
# has not been checked here and should be verified against the displayed frame.
annotation_corrections = [
    (338, '[["father heart attack"]]', '[["764 783"]]'),
    (621, '[["for the last 2-3 months"]]', '[["77 100"]]'),
    (655, '[["no heat intolerance"], ["no cold intolerance"]]', '[["285 292;301 312"], ["285 287;296 312"]]'),
    (1262, '[["mother thyroid problem"]]', '[["551 557;565 580"]]'),
    (1265, '[[\'felt like he was going to "pass out"\']]', '[["131 135;181 212"]]'),
    (1396, '[["stool , with no blood"]]', '[["259 280"]]'),
    (1591, '[["diarrhoe non blooody"]]', '[["176 184;201 212"]]'),
    (1615, '[["diarrhea for last 2-3 days"]]', '[["249 257;271 288"]]'),
    (1664, '[["no vaginal discharge"]]', '[["822 824;907 924"]]'),
    (1714, '[["started about 8-10 hours ago"]]', '[["101 129"]]'),
    (1929, '[["no blood in the stool"]]', '[["531 539;549 561"]]'),
    (2134, '[["last sexually active 9 months ago"]]', '[["540 560;581 593"]]'),
    (2191, '[["right lower quadrant pain"]]', '[["32 57"]]'),
    (2553, '[["diarrhoea no blood"]]', '[["308 317;376 384"]]'),
    (3124, '[["sweating"]]', '[["549 557"]]'),
    (3858, '[["previously as regular"], ["previously eveyr 28-29 days"], ["previously lasting 5 days"], ["previously regular flow"]]', '[["102 123"], ["102 112;125 141"], ["102 112;143 157"], ["102 112;159 171"]]'),
    (4373, '[["for 2 months"]]', '[["33 45"]]'),
    (4763, '[["35 year old"]]', '[["5 16"]]'),
    (4782, '[["darker brown stools"]]', '[["175 194"]]'),
    (4908, '[["uncle with peptic ulcer"]]', '[["700 723"]]'),
    (6016, '[["difficulty falling asleep"]]', '[["225 250"]]'),
    (6192, '[["helps to take care of aging mother and in-laws"]]', '[["197 218;236 260"]]'),
    (6380, '[["No hair changes"], ["No skin changes"], ["No GI changes"], ["No palpitations"], ["No excessive sweating"]]', '[["480 482;507 519"], ["480 482;499 503;512 519"], ["480 482;521 531"], ["480 482;533 545"], ["480 482;564 582"]]'),
    (6562, '[["stressed due to taking care of her mother"], ["stressed due to taking care of husbands parents"]]', '[["290 320;327 337"], ["290 320;342 358"]]'),
    (6862, '[["stressor taking care of many sick family members"]]', '[["288 296;324 363"]]'),
    (7022, '[["heart started racing and felt numbness for the 1st time in her finger tips"]]', '[["108 182"]]'),
    (7422, '[["first started 5 yrs"]]', '[["102 121"]]'),
    (8876, '[["No shortness of breath"]]', '[["481 483;533 552"]]'),
    (9027, '[["recent URI"], ["nasal stuffines, rhinorrhea, for 3-4 days"]]', '[["92 102"], ["123 164"]]'),
    (9938, '[["irregularity with her cycles"], ["heavier bleeding"], ["changes her pad every couple hours"]]', '[["89 117"], ["122 138"], ["368 402"]]'),
    (9973, '[["gaining 10-15 lbs"]]', '[["344 361"]]'),
    (10513, '[["weight gain"], ["gain of 10-16lbs"]]', '[["600 611"], ["607 623"]]'),
    (11551, '[["seeing her son knows are not real"]]', '[["386 400;443 461"]]'),
    (11677, '[["saw him once in the kitchen after he died"]]', '[["160 201"]]'),
    (12124, '[["tried Ambien but it didnt work"]]', '[["325 337;349 366"]]'),
    (12279, '[["heard what she described as a party later than evening these things did not actually happen"]]', '[["405 459;488 524"]]'),
    (12289, '[["experienced seeing her son at the kitchen table these things did not actually happen"]]', '[["353 400;488 524"]]'),
    (13238, '[["SCRACHY THROAT"], ["RUNNY NOSE"]]', '[["293 307"], ["321 331"]]'),
    (13297, '[["without improvement when taking tylenol"], ["without improvement when taking ibuprofen"]]', '[["182 221"], ["182 213;225 234"]]'),
    (13299, '[["yesterday"], ["yesterday"]]', '[["79 88"], ["409 418"]]'),
    (13845, '[["headache global"], ["headache throughout her head"]]', '[["86 94;230 236"], ["86 94;237 256"]]'),
    (14083, '[["headache generalized in her head"]]', '[["56 64;156 179"]]'),
]

# Apply the corrections row by row: annotation first, then location
for row_label, annotation_literal, location_literal in annotation_corrections:
    train.loc[row_label, 'annotation'] = ast.literal_eval(annotation_literal)
    train.loc[row_label, 'location'] = ast.literal_eval(location_literal)

In [None]:
def get_annotation_list(s):
    """
    Turn the textual form of an annotation list into a list of lower-cased annotation strings.

    The value is converted with str() and lower-cased, then rewritten purely with
    ordered string replacements (no literal parsing): the opening/closing bracket-quote
    pairs are removed, the quote-comma-quote separators become a '<split>' marker,
    escaped single quotes are unescaped, and the text is split on the marker.

    Args:
        s: the annotation cell; any value, it is passed through str() first.

    Returns:
        List of strings. A value without any separator yields a one-element list
        (so the text '[]' comes back as ['[]']).
    """
    # (text to find, replacement) pairs, applied strictly in this order
    rewrites = (
        # List delimiters
        ("['", ''),
        ("']", ''),
        ('["', ''),
        ('"]', ''),
        # Separators between two quoted items, for every quote combination
        ("', '", '<split>'),
        ('", \'', '<split>'),
        ('\', "', '<split>'),
        ('", "', '<split>'),
        # Custom: escaped single quote back to a plain one
        ("\\'", "'"),
    )

    text = str(s).lower()
    for old, new in rewrites:
        text = text.replace(old, new)

    return text.split('<split>')
    
# Parsed, lower-cased annotation strings per training row
# (computed before the empty annotations are dropped in the next cell)
train['annotation_list'] = train['annotation'].apply(get_annotation_list)

In [None]:
# Drop Empty Annotations: rows whose annotation equals the text '[]'
is_empty_annotation = train['annotation'] == '[]'
train = train.drop(index=train.loc[is_empty_annotation].index)

# Cast the annotation and location columns to string
for text_column in ('annotation', 'location'):
    train[text_column] = train[text_column].astype(str)

# Set Case Number and Patient Number as Index
train = train.set_index(['case_num', 'pn_num'])

# Add Feature Num Ordinal Encoded, looked up by feature number in the feature rubric
ordinal_by_feature_num = features.set_index('feature_num')
train['feature_num_ordinal'] = ordinal_by_feature_num.loc[train['feature_num'], 'feature_num_ordinal'].values

# First 25 prepared training rows
display(train.head(25))

# Column overview of the prepared training rows
display(train.info())

# AlBERT Tokenizer

In [None]:
# Maximum Token Input Size: every note is padded or truncated to this many tokens
SEQ_LENGTH = 512

# AlBERT Base is used here, but the tokenization process is the same for all AlBERT Sizes
tokenizer = PreTrainedTokenizerFast.from_pretrained('albert-base-v2')
# Register '<pad>' as the padding token of this tokenizer
tokenizer.add_special_tokens({'pad_token': '<pad>'})

# Save Tokenizer for Offline Inference Process
tokenizer.save_pretrained('./tokenizer/')

In [None]:
# Tokenizes a text with the AlBERT tokenizer loaded above
def tokenize(note):
    """
    Encode a note with the module-level `tokenizer`.

    Args:
        note: the input text.

    Returns:
        Whatever the tokenizer call returns for the options below; the rest of
        the notebook reads its 'input_ids', 'attention_mask' and 'offset_mapping' entries.
    """
    encoding_options = {
        # Add Pad Tokens to Maximum Length
        'padding': 'max_length',
        # Truncate input texts if they are too long
        'truncation': True,
        # Maximum token length
        'max_length': SEQ_LENGTH,
        # Return character offset to token mapping
        'return_offsets_mapping': True,
    }
    return tokenizer(note, **encoding_options)

# Example Model

In [None]:
# Load the pretrained 'albert-large-v2' weights for the example model.
# return_dict makes the outputs accessible by name (last_hidden_state is read below).
albert = TFAlbertModel.from_pretrained(
        'albert-large-v2',
        output_hidden_states = True,
        return_dict = True,
    )

In [None]:
# Model inputs: token ids and attention mask, each a sequence of SEQ_LENGTH integers.
# The shape is passed as a proper one-element tuple; `(SEQ_LENGTH)` is just the integer itself.
input_ids = tf.keras.layers.Input(shape=(SEQ_LENGTH,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input(shape=(SEQ_LENGTH,), dtype=tf.int32, name='attention_mask')

# Get the last hidden state
last_hidden_state = albert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

# Output Layer of Size [Tokens, Number of Features]: one sigmoid per feature for every token
output = tf.keras.layers.Dense(N_CLASSES, activation='sigmoid')(last_hidden_state)

model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=output)

In [None]:
# Print the layer-by-layer summary of the example model
model.summary()

In [None]:
# Draw the model graph with tensor shapes, dtypes and layer names (nested models stay collapsed)
tf.keras.utils.plot_model(model, show_shapes=True, show_dtype=True, show_layer_names=True, expand_nested=False)

In [None]:
# Use the cleaned note of case 0 / patient note 0 as the example input
example_text = patient_notes.loc[(0, 0), 'pn_history_clean']
albert_tokens = tokenize(example_text)

# Show all tokenization outputs
for k, v in albert_tokens.items():
    print(f'k: {k}, v shape: {np.array(v).shape}')
    
# Run the example model on a batch that contains only this one note
model_output = model.predict_on_batch({
        'input_ids': np.array([albert_tokens['input_ids']]),
        'attention_mask': np.array([albert_tokens['attention_mask']])
    })

# Check Output Shape
print(f'model_output shape: {model_output.shape}')

# Check Model Output: histogram of all predicted values (the output layer is a sigmoid, hence the 0-1 axis)
plt.figure(figsize=(8, 4))
plt.xlim(0, 1)
plt.xticks(np.arange(0, 1.1, 0.1))
pd.Series(model_output.flatten()).plot(kind='hist')
plt.plot()
# Trailing no-op so the cell does not echo a return value
pass

# Offset Mapping

In [None]:
# Look at the first 20 tokens of the example note and their character ranges
tokens = albert_tokens['input_ids'][:20]
offset_mapping = albert_tokens['offset_mapping'][:20]

print(f'example_text: {example_text}\n')

for idx, (start_char_index, end_char_index) in enumerate(offset_mapping):
    # The token tokenizes a part of the original text, this does not have to be a single word!!!
    # For example "-", "," and "." are also encoded as a single token
    # Slice the note with the token's character range to show the text it covers
    substr = example_text[start_char_index:end_char_index]
    print(
          f'start_char_index: {start_char_index:02d}, \tend_char_index: {end_char_index:02d},'
          f'\ttoken: {tokens[idx]}, \tsubstr: {substr}'
        )

# Train Utility Features

In [None]:
# Returns the start location and string length of annotations
def get_location_start_offset(s):
    """
    Parse a location string into [start, length] pairs, grouped per annotation.

    The list brackets and single quotes are stripped first. The remaining text is
    split on ',' into annotations and on ';' into the character ranges of one
    annotation; every range has the form 'start end'.

    Args:
        s: location string such as "['285 292;301 312', '285 287;296 312']".

    Returns:
        Nested list: one inner list per annotation, holding [start, end - start]
        for each of its character ranges.

    Raises:
        ValueError: if a range does not consist of exactly two space-separated integers.
    """
    stripped = re.sub(r"(\['\s*|'\]|\s*')", '', s)

    def to_start_and_length(char_range):
        start_idx, end_idx = char_range.split(' ')
        return [int(start_idx), int(end_idx) - int(start_idx)]

    return [
        [to_start_and_length(char_range) for char_range in annotation.split(';')]
        for annotation in stripped.split(',')
    ]

# Parsed [start, length] pairs of every annotation, per training row
train['location_start_offset'] = train['location'].apply(get_location_start_offset)

# Note location is a string, wheras location_start_offset is a nested list
display(train.head())

In [None]:
# Returns the start index of the annotation location, used for sorting annotations
def get_location_start(location_start_offset):
    """
    Smallest start position over all character ranges of all annotations of a row.

    Args:
        location_start_offset: nested list as produced by get_location_start_offset,
            i.e. one list of [start, length] pairs per annotation.

    Returns:
        The minimum start value, or positive infinity when there is no range at all.
    """
    # np.inf is used instead of the np.PINF alias, which no longer exists in NumPy 2.x
    return min(
        (start for ranges in location_start_offset for start, _ in ranges),
        default=np.inf,
    )

# Earliest annotated character position of every training row
train['location_start'] = train['location_start_offset'].apply(get_location_start)

# location_start is simply the minimum start location off annotations
display(train.head())

# Training Labels

Labels are saved as sparse tensor indices, which reduces memory usage because only the indices of the non-zero elements are stored. Have a look at the [Tensorflow SparseTensor documentation](https://www.tensorflow.org/api_docs/python/tf/sparse/SparseTensor).


In [None]:
# Maximum Annotations Per Patient Note: size of the second axis of the label array y,
# i.e. the number of (token index, feature) pairs that can be stored for one note
MAX_ANNOTATIONS = 256

In [None]:
# Find all function for multiple string occurances
def find_all(a, b, offset=0):
    """
    Start positions of all non-overlapping occurrences of `b` in `a`, left to right.

    Implemented as a loop rather than recursion, so the number of occurrences is
    not limited by the recursion depth and no substrings are copied.

    Args:
        a: the text to search in.
        b: the text to search for.
        offset: value added to every returned position.

    Returns:
        List of positions; empty when either string is empty or `b` does not occur.
    """
    # An empty needle matches everywhere and would never advance the search
    if len(a) == 0 or len(b) == 0:
        return []

    res = []
    search_from = 0
    while True:
        start_idx = a.find(b, search_from)
        if start_idx == -1:
            return res
        res.append(offset + start_idx)
        # Continue behind this occurrence, so matches never overlap
        search_from = start_idx + len(b)

print(find_all('I like cats and cats and cats', 'cats'))

In [None]:
"""
    This function is the beating heart of the target label generation.
    The challenge finding the indices of the tokens that belong to the annotation
    Especially as the texts are cleaned, thus the annotations are almost off by a few characters
"""
def get_token_indices(patient_note, ann, om, start_offset, n_char_window, recursion=False):
    """
    Find the indices of the tokens that belong to one annotation.

    Args:
        patient_note: the (cleaned) note text that was tokenized.
        ann: the annotation text to look for in the note.
        om: the token offset mapping, a sequence of (start, end) character positions per token.
        start_offset: list of [start, length] pairs of the annotation in the ORIGINAL
            (uncleaned) text; more than one pair means the annotation consists of several parts.
        n_char_window: tolerance in characters around the original position, to allow
            for the characters added while cleaning the note.
        recursion: not used; kept so existing calls stay valid.

    Returns:
        List of token indices. An index can occur more than once.
    """
    # Annotation with multiple parts, for example "heart racing and pounding" annotated as
    # "heart pounding": resolve every part on its own. The parts are taken from `ann` one
    # after another, skipping one separator character between them.
    if len(start_offset) > 1:
        token_indices = []
        offset = 0
        for start_idx, str_len in start_offset:
            ann_partial = ann[offset:offset+str_len]
            offset += str_len + 1
            token_indices += get_token_indices(patient_note, ann_partial, om, [[start_idx, str_len]], n_char_window)
        return token_indices

    # Annotation string start index and length in ORIGINAL text, thus not the clean text!
    start_str, str_len = start_offset[0]
    # Earliest and latest accepted token start position
    start_min = start_str - n_char_window
    start_max = start_str + str_len + n_char_window

    token_indices = []
    # The note is searched only here, i.e. only for single-part annotations
    for ann_start_idx in find_all(patient_note, ann):
        ann_end_idx = ann_start_idx + len(ann)
        for idx, (start, end) in enumerate(om):
            # Tokens mapped to (0, 0) carry no text of the note
            if start == 0 and end == 0:
                continue
            # Token must start within the window around the original annotation position
            if not (start_min <= start <= start_max):
                continue
            token_inside_annotation = start >= ann_start_idx and end <= ann_end_idx
            annotation_inside_token = ann_start_idx >= start and ann_end_idx <= end
            if token_inside_annotation or annotation_inside_token:
                token_indices.append(idx)

    return token_indices

In [None]:
# Function to get Training texts X and target labels y
# Use the debug flag to see what this function is doing!
def get_x_y(debug=False):
    """
    Tokenize every annotated patient note and collect its (token index, feature) label pairs.

    Reads the module-level `train` and `patient_notes` frames.

    Args:
        debug: when True only the hard-coded note (0, 82) is processed, intermediate
            results are printed and the function returns early (see Returns).

    Returns:
        Normally (X, y):
            X: int32 array [number of notes, SEQ_LENGTH] with the token ids.
            y: int16 array [number of notes, MAX_ANNOTATIONS, 2] with the unique, sorted
               (token index, feature ordinal) pairs of every note; unused rows hold -1.
        In debug mode (input_ids, offset_mapping) of the debug note, or
        (patient_note, ann, offset_mapping, start_offset) as soon as a decoded
        annotation contains '<pad>'.

    Raises:
        ValueError: if a note has more than MAX_ANNOTATIONS unique label pairs.
    """
    note_indices = train.index.unique()
    N_TRAIN_SAMPLES = note_indices.size
    error_c = 0
    # Training texts and labels numpy arrays
    X = np.zeros(shape=[N_TRAIN_SAMPLES, SEQ_LENGTH], dtype=np.int32)
    y = np.full(shape=[N_TRAIN_SAMPLES, MAX_ANNOTATIONS, 2], fill_value=-1, dtype=np.int16)
    for idx, train_idx in enumerate(tqdm(note_indices)):
        # Define your training patient note index herer
        if debug:
            train_idx = (0, 82)

        # Patient Note
        patient_note = patient_notes.loc[train_idx, 'pn_history_clean']
        # Character Window of Character Removed/Added during patient note correction
        n_char_window = patient_notes.loc[train_idx, 'n_char_window']
        if debug:
            print(f'n_chars_removed: {n_char_window}')
            print(f'patient_note: {patient_note}')

        # Tokenize Patient Note
        tokens = tokenize(patient_note)

        input_ids = np.array(tokens['input_ids'], dtype=np.int32)
        offset_mapping = tokens['offset_mapping']

        X[idx] = input_ids

        # Collect (token index, feature ordinal) pairs over all annotated rows of this note
        annotation_labels = []
        for row in train.loc[train_idx].itertuples(index=False, name='Pandas'):
            annotations = row.annotation_list
            location_start_offsets = row.location_start_offset
            feature_num = row.feature_num_ordinal

            if debug:
                print('\n', f'annotations: {annotations}, location_start_offsets: {location_start_offsets}')

            for ann, start_offset in zip(annotations, location_start_offsets):
                if debug:
                    print(f'ann: {ann}, start_offset: {start_offset}')

                token_indices = get_token_indices(patient_note, ann, offset_mapping, start_offset, n_char_window)
                if debug:
                    print(f'token_indices: {token_indices}, decoded: {tokenizer.decode(input_ids[token_indices])}')

                for t in token_indices:
                    annotation_labels.append((t, feature_num))

                if debug:
                    decoded = tokenizer.decode(input_ids[token_indices])
                    if ann != decoded and ann not in decoded:
                        print(error_c, train_idx, ann, '|', decoded, f' | same: {ann==decoded}', input_ids[token_indices])
                        error_c += 1
                        if '<pad>' in decoded:
                            return patient_note, ann, offset_mapping, start_offset

        # Add Sorted Annotations
        # Without any label pair there is nothing to store (an empty array has no second
        # axis and cannot be assigned); the row of y then keeps its -1 fill value.
        if annotation_labels:
            # np.unique with axis=0 removes duplicate pairs and returns the rows sorted
            unique_sorted_annotations = np.unique(np.array(annotation_labels), axis=0)
            if len(unique_sorted_annotations) > MAX_ANNOTATIONS:
                raise ValueError(
                    f'{len(unique_sorted_annotations)} label pairs for patient note {train_idx} '
                    f'exceed MAX_ANNOTATIONS={MAX_ANNOTATIONS}'
                )
            y[idx, :len(unique_sorted_annotations)] = unique_sorted_annotations

        if debug:
            return input_ids, offset_mapping

    return X, y

# Build the token ids X and the label index pairs y for all annotated notes
X, y = get_x_y(debug=False)

In [None]:
# Save X and y as .npy files in the working directory
np.save('./X.npy', X)
np.save('./y.npy', y)

# Target Label Validation

The next cell visualizes the assigned labels for the first 10 patient notes, to check whether this whole notebook produces the desired output.

In [None]:
class bcolors:
    """Escape sequences used to color or style text printed to the output."""
    # Color codes
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    # Printed after a styled piece of text (see the label validation cell)
    ENDC = '\033[0m'
    # Text styles
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

In [None]:
# Visual check of the generated labels for the first 10 annotated patient notes.
# The position in train.index.unique() is also the row of the note in X and y.
for label_idx, train_idx in enumerate(train.index.unique()[:10]):
    # Print patient Note Clean
    print(f'===== PATIENT NOTE ===== | {train_idx}')
    patient_note = patient_notes.loc[train_idx, 'pn_history_clean']
    print(patient_note)

    # Print the annotations of this note, ordered by their first character position
    print('===== ANNOTATION LABELS =====')
    display(train.loc[train_idx].sort_values('location_start')[['annotation_list', 'location']])

    # Print the token spans that received a label
    print('===== TARGET ANNOTATION LABELS =====')
    X_train_idx = X[label_idx]
    y_train_idx = y[label_idx]

    # Convert Indices to Dense Tensor: keep only the rows that are not the -1 padding
    y_train_idx = tf.constant(y_train_idx, dtype=tf.int64)
    idxs = tf.math.reduce_any(y_train_idx > -1, axis=1)
    idxs = tf.gather_nd(y_train_idx, tf.where(idxs))

    # Dense [tokens, features] label matrix; the feature axis is sized by N_CLASSES
    sp = tf.SparseTensor(indices=idxs, values=tf.ones(shape=len(idxs)), dense_shape=[SEQ_LENGTH, N_CLASSES])
    # Number of features assigned to every token
    y_train_idx = tf.sparse.to_dense(sp).numpy().sum(axis=1)

    annotation = []
    patient_note_colored = ''
    # Loop over all tokens and labels
    for x_token, y_label in zip(X_train_idx, y_train_idx):
        # A token is annotated when at least one feature is assigned to it
        # (a token can carry several features, so the count can exceed 1)
        is_annotated = y_label > 0

        # Token ids 0, 1 and 2 are treated as special tokens and left out of the colored text
        # NOTE(review): confirm that ids 0-2 are exactly the special tokens of this tokenizer
        if x_token > 2:
            # Annotated token, print it highlighted (bcolors.FAIL)
            if is_annotated:
                patient_note_colored += f'{bcolors.FAIL}{tokenizer.decode(x_token)}{bcolors.ENDC} '
            # Unannotated, simply print
            else:
                patient_note_colored += tokenizer.decode(x_token) + ' '

        # Collect consecutive annotated tokens and print them once the span ends
        if is_annotated:
            annotation.append(x_token)
        elif len(annotation) > 0:
            print(tokenizer.decode(annotation))
            annotation.clear()

    # Print a span that runs up to the very last token
    if len(annotation) > 0:
        print(tokenizer.decode(annotation))

    print('\n')
    print(patient_note_colored)
    print('\n')

    print('\n', '=' * 50, '\n\n')

# Save Train with Features

In [None]:
# Persist the three prepared frames as pickle files in the working directory

# FEATURES
features.to_pickle('features.pkl')

# PATIENT NOTES
patient_notes.to_pickle('patient_notes.pkl')

# TRAIN
train.to_pickle('train.pkl')