Hello Fellow Kagglers,
 
This notebook demonstrates the training process for the NBME - Score Clinical Patient Notes competition using the parameter efficient AlBERT Model. The model is trained on a 420TFLOPS TPU making the training process a matter of minutes.

AlBERT Paper: [ALBERT: A LITE BERT FOR SELF-SUPERVISED LEARNING OF LANGUAGE REPRESENTATIONS](https://arxiv.org/pdf/1909.11942.pdf)

AlBERT GitHub: [google-research/albert](https://github.com/google-research/albert)

AlBERT Hugging Face: [ALBERT - Hugging Face](https://huggingface.co/docs/transformers/model_doc/albert)

The model is trained as a token classifier. For each token, the model predicts the probability that the token is part of an annotation.

A two step training process is used where first only the classifier fully connected layer is trained, as this layer is initialized with random weights. Secondly the whole model, including the pretrained AlBERT model, is fine tuned.

This notebook first defines the model, followed by the data pipeline. Next the actual training process is done followed by a thorough validation performance analysis.

[Preprocessing Notebook](https://www.kaggle.com/markwijkhuizen/nbme-preprocessing-albert)

[Inference Notebook](https://www.kaggle.com/markwijkhuizen/nbme-albert-inference-public)

[Extra Training Data](https://www.kaggle.com/markwijkhuizen/nbme-albert-extra-training-data-public)

**V2**
* Per token feature prediction having a [Number of Tokens, Number of Features] output, predicting the probability of each token to belong to each feature
* [SigmoidFocalCrossEntropy](https://www.tensorflow.org/addons/api_docs/python/tfa/losses/SigmoidFocalCrossEntropy), a loss designed for high class imbalance. Only 0.061% of the labels are positive (1). The loss of each prediction is scaled by $(1-p)^\gamma$ for positive labels (1) and by $p^\gamma$ for negative labels (0). Easy negative examples quickly get a prediction near 0, which makes their loss practically zero, so they are effectively ignored when the weights are updated. The $\gamma$ parameter controls how strongly easy examples are down-weighted. For more details see the [Paper](https://arxiv.org/pdf/1708.02002.pdf)

**V8**
* Added clipnorm to get consistent optimization steps, solves loss spikes and slightly improves validation F1 score

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_addons as tfa
from sklearn import metrics

from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
from transformers import TFAlbertModel
from sklearn.model_selection import train_test_split

import re
import os
import random
import math
import time
import sys

# Enable the tqdm progress-bar integration for pandas
tqdm.pandas()

# Let tf.data choose the parallelism / prefetch buffer size (used by the dataset pipeline)
AUTO = tf.data.experimental.AUTOTUNE

# Log the runtime versions so results can be traced back to an environment
print(f'Python Version: {sys.version}')
print(f'Tensorflow Version: {tf.__version__}')
print(f'Tensorflow Keras Version: {tf.keras.__version__}')

In [None]:
# Seed every random number generator in use; TPU's are generally non-deterministic, so runs may still differ
def seed_everything(seed):
    """Seed Python hashing, `random`, NumPy and TensorFlow with the same `seed`."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)


SEED = 42
seed_everything(SEED)

In [None]:
# The Token Input length for the AlBERT Model
SEQ_LENGTH = 512
# Per-replica batch sizes. The global batch size is this value multiplied by the
# number of replicas, e.g. 4 x 8 = 32 and 16 x 8 = 128 on the 8 cores of a TPU V3-8
BATCH_SIZE_BASE = 4
BATCH_SIZE_BASE_EXTRA = 16

# Different Training Dataset
# 'train': use the annotated 1000 training samples
# 'train_extra': use both annotated and soft labelled samples
TRAIN_MODE = 'train_extra'

# AlBERT Version
ALBERT_VERSION = 'base'

# Number of Test Samples to Use (0 or 100); 0 disables the validation split and all validation analysis
N_TEST_SAMPLES = 0

# Epsilon Value, used as the decision threshold of the training metrics
EPSILON = tf.keras.backend.epsilon()

# Number of Labels

In [None]:
# One row per feature; every feature becomes one output label of the model
features = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/features.csv')

N_LABELS = len(features)
print(f'N_LABELS: {N_LABELS}')

# Hardware Configuration

In [None]:
# Detect hardware, return appropriate distribution strategy
try:
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection. No parameters necessary if TPU_NAME environment variable is set. On Kaggle this is always the case.
    print('Running on TPU ', TPU.master())
except ValueError:
    # No TPU could be resolved; fall back to the default strategy below
    print('Running on GPU')
    TPU = None

if TPU:
    # The TPU system has to be connected and initialised before the strategy is created
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    strategy = tf.distribute.experimental.TPUStrategy(TPU)
else:
    strategy = tf.distribute.get_strategy() # default distribution strategy in Tensorflow. Works on CPU and single GPU.

N_REPLICAS = strategy.num_replicas_in_sync
# Number of computing cores, is 8 for a TPU V3-8
print(f'N_REPLICAS: {N_REPLICAS}')

# F1 Score

In [None]:
def f1(y_true, y_pred):
    """Flatten labels and predictions to `[-1, N_LABELS]` and score them with `f1_score`.

    NOTE(review): `f1_score` is not defined or imported anywhere in the visible
    notebook, so calling this function would raise a NameError — confirm
    whether this helper is still needed.
    """
    y_true = tf.reshape(y_true, [-1, N_LABELS])
    y_pred = tf.reshape(y_pred, [-1, N_LABELS])
    
    return f1_score(y_true, y_pred)

In [None]:
class F1Score(tf.keras.metrics.Metric):
    """Micro-averaged F1 metric backed by `tfa.metrics.F1Score`.

    Labels and predictions are flattened to `[-1, N_LABELS]` before being
    forwarded to the wrapped metric, which uses `EPSILON` as its threshold.
    """

    def __init__(self, name='f1', **kwargs):
        super().__init__(name=name, **kwargs)
        # Create the wrapped metric under the distribution strategy
        with strategy.scope():
            self.f1 = tfa.metrics.F1Score(
                num_classes=N_LABELS,
                average='micro',
                threshold=EPSILON,
            )

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate one batch; `sample_weight` is accepted but not used."""
        flat_shape = [-1, N_LABELS]
        flat_true = tf.reshape(y_true, flat_shape)
        flat_pred = tf.reshape(y_pred, flat_shape)
        self.f1.update_state(flat_true, flat_pred)

    def reset_state(self):
        """Clear the accumulated statistics of the wrapped metric."""
        self.f1.reset_state()

    def result(self):
        """Return the current value of the wrapped metric."""
        return self.f1.result()

# Model

In [None]:
# Metric objects are created inside the strategy scope and passed to model.compile
with strategy.scope():
    # METRICS
    # EPSILON is used as the decision threshold for all three metrics
    model_metrics = [
        tf.keras.metrics.Precision(name='precision', thresholds=EPSILON),
        tf.keras.metrics.Recall(name='recall', thresholds=EPSILON),
        F1Score(),
    ]

In [None]:
# Looks up the current rate of the model's dropout layer
def get_dropout_rate():
    """Return the rate of the first layer named 'dropout', or None if there is none."""
    dropout_rates = (layer.rate for layer in model.layers if layer.name == 'dropout')
    return next(dropout_rates, None)

In [None]:
def set_layers_trainable(train_strategy, alpha=0, gamma=1):
    """Select the trainable layers, set the dropout rate and (re)compile the model.

    Args:
        train_strategy: 'classifier' trains only the layers whose name contains
            'head/' (learning rate 5e-3, dropout disabled); 'train' and
            'train_extra' train every layer (learning rate 5e-5, 20% dropout).
        alpha: alpha passed to SigmoidFocalCrossEntropy.
        gamma: gamma passed to SigmoidFocalCrossEntropy.

    Raises:
        ValueError: if `train_strategy` is not one of the supported values.
    """
    if train_strategy == 'classifier':
        train_all_layers = False
        learning_rate = 5e-3
        # No Dropout
        dropout_rate = 0.00
    elif train_strategy in ('train', 'train_extra'):
        train_all_layers = True
        learning_rate = 5e-5
        # 20% Dropout
        dropout_rate = 0.20
    else:
        # Fail early with a clear message instead of an unbound-variable error at compile time
        raise ValueError(f'Unknown train_strategy: {train_strategy!r}')

    with strategy.scope():
        for layer in model.layers:
            layer.trainable = train_all_layers or 'head/' in layer.name
            if layer.name == 'dropout':
                layer.rate = dropout_rate

        # OPTIMIZER and LOSS are created once, not once per layer
        model_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=1.0)
        model_loss = tfa.losses.SigmoidFocalCrossEntropy(from_logits=True, alpha=alpha, gamma=gamma)

        # Recompile so the new trainable flags, optimizer and loss take effect
        model.compile(optimizer=model_optimizer, loss=model_loss, metrics=model_metrics)

In [None]:
def get_model():
    """Build the token classification model: pretrained AlBERT + dropout + dense head.

    Returns:
        An uncompiled Keras model taking `input_ids` and `attention_mask`
        (both int32 with SEQ_LENGTH tokens) and producing N_LABELS raw
        logits per token (the head has no activation).
    """
    # Clear Backend
    tf.keras.backend.clear_session()

    # Enable XLA optimizations
    tf.config.optimizer.set_jit(True)

    with strategy.scope():
        # Input Layers; the shape is given as a proper 1-tuple for both inputs
        input_ids = tf.keras.layers.Input(shape=(SEQ_LENGTH,), dtype=tf.int32, name='input_ids')
        attention_mask = tf.keras.layers.Input(shape=(SEQ_LENGTH,), dtype=tf.int32, name='attention_mask')

        # AlBERT Model
        albert = TFAlbertModel.from_pretrained(
            f'albert-{ALBERT_VERSION}-v2',
            output_hidden_states = True,
            return_dict = True,
        )

        # Get the last hidden state
        last_hidden_state = albert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

        # Dropout starts disabled; the rate is changed later by set_layers_trainable
        do = tf.keras.layers.Dropout(0.00, name='dropout')(last_hidden_state)

        # Custom Kernel Initializer
        initializer = tf.keras.initializers.GlorotNormal(seed=SEED)
        # Can be used to scale the randomly initialized weights
        initializer.scale = 1.0

        # Classification layer; the 'head/' prefix is what set_layers_trainable matches on
        output = tf.keras.layers.Dense(N_LABELS, kernel_initializer=initializer, activation=None, name='head/classifier')(do)

        # Define Model
        model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=[output])

    return model

In [None]:
# Build the model once; `model` is read as a global by the helper functions in this notebook
model = get_model()

In [None]:
# Print the layer overview of the freshly built model
model.summary()

In [None]:
# Render the model graph with shapes, dtypes and layer names (nested models are not expanded)
tf.keras.utils.plot_model(model, show_shapes=True, show_dtype=True, show_layer_names=True, expand_nested=False)

# Dataset Configuration

In [None]:
# Training configuration
# Global batch sizes: the per-replica size multiplied by the number of replicas
BATCH_SIZE = BATCH_SIZE_BASE * N_REPLICAS
BATCH_SIZE_EXTRA = BATCH_SIZE_BASE_EXTRA * N_REPLICAS

print(f'BATCH_SIZE_BASE: {BATCH_SIZE_BASE}, BATCH SIZE: {BATCH_SIZE}, BATCH_SIZE_EXTRA: {BATCH_SIZE_EXTRA}')

In [None]:
# Annotated training data produced by the preprocessing notebook
X = np.load('/kaggle/input/nbme-preprocessing-albert-public/X.npy')
y = np.load('/kaggle/input/nbme-preprocessing-albert-public/y.npy')

# Extra training data: the full set when no validation split is used, otherwise the
# '_no_val' variant (presumably with the validation samples removed — confirm against the dataset)
if N_TEST_SAMPLES == 0:
    X_extra = np.load('/kaggle/input/nbme-albert-extra-training-data-public-dataset/X_extra.npy')
    y_extra_indices = np.load('/kaggle/input/nbme-albert-extra-training-data-public-dataset/y_extra_indices.npy')
    y_extra_values = np.load('/kaggle/input/nbme-albert-extra-training-data-public-dataset/y_extra_values.npy')
else:
    X_extra = np.load('/kaggle/input/nbme-albert-extra-training-data-public-dataset/X_extra_no_val.npy')
    y_extra_indices = np.load('/kaggle/input/nbme-albert-extra-training-data-public-dataset/y_extra_indices_no_val.npy')
    y_extra_values = np.load('/kaggle/input/nbme-albert-extra-training-data-public-dataset/y_extra_values_no_val.npy')

# Report shapes and dtypes of every loaded array (y_extra_values was previously never reported)
print(f'X shape: {X.shape}, y shape: {y.shape}')
print(f'X_extra shape: {X_extra.shape}, y_extra_indices shape: {y_extra_indices.shape}, y_extra_values shape: {y_extra_values.shape}')
print(f'X dtype: {X.dtype}, y dtype: {y.dtype}')
print(f'X_extra dtype: {X_extra.dtype}, y_extra_indices dtype: {y_extra_indices.dtype}, y_extra_values dtype: {y_extra_values.dtype}')

# Label Distribution

In [None]:
# Positive entries: rows of the label index array whose first coordinate is above -1
# (-1 presumably marks padding rows — confirm against the preprocessing notebook)
y_1s = (y[:, :, 0] > -1).sum()
# Every other cell of the [samples, SEQ_LENGTH, N_LABELS] label grid counts as negative
y_0s = len(y) * SEQ_LENGTH * N_LABELS - y_1s

y_value_counts_df = pd.DataFrame({'label': ['0', '1'], 'count': [y_0s, y_1s]})

display(y_value_counts_df)

In [None]:
# Pie chart of the negative/positive label counts
plt.figure(figsize=(8, 8))
plt.title('Label Distribution', size=24)
label_counts = y_value_counts_df['count']
label_counts.plot(kind='pie', autopct='%1.3f%%', textprops={'fontsize': 16})
plt.ylabel('Label Distribution', size=18)
plt.show()

In [None]:
# Hold out N_TEST_SAMPLES samples for validation, or train on everything when it is 0.
# X_val / y_val only exist when N_TEST_SAMPLES > 0; all later uses are guarded the same way.
if N_TEST_SAMPLES > 0:
    # Train Test Split
    # train_test_split is given the validation size as a fraction of the data
    test_size = N_TEST_SAMPLES / len(X)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=SEED)
    print(f'X_train shape: {X_train.shape}, X_val shape: {X_val.shape}')
    print(f'y_train shape: {y_train.shape}, y_val shape: {y_val.shape}')
else:
    X_train = X
    y_train = y
    print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')

# Dataset Utility Functions

In [None]:
# Simple function to benchmark the dataset
def benchmark_dataset(dataset, num_epochs=3, n_steps_per_epoch=100):
    """Iterate over `dataset` and print the throughput of every epoch.

    One extra batch is requested per epoch and the clock only starts once that
    first batch has arrived, so pipeline start-up is excluded from the timing.

    Args:
        dataset: object with a `take(n)` method yielding `(inputs, labels)` pairs.
        num_epochs: number of timed passes over the dataset.
        n_steps_per_epoch: maximum number of timed batches per pass.
    """
    bs = None
    for epoch_num in range(num_epochs):
        epoch_start = None
        n_timed_steps = 0
        for idx, (inputs, labels) in enumerate(dataset.take(n_steps_per_epoch + 1)):
            if bs is None:
                # Batch size is taken from the first batch seen
                bs = len(labels)
            if idx == 0:
                # Epoch Start Time
                epoch_start = time.perf_counter()
            else:
                n_timed_steps = idx

        if n_timed_steps == 0:
            # Empty or single-batch dataset: there is no interval to measure
            print(f'epoch {epoch_num}: dataset yielded fewer than 2 batches, nothing to time')
            continue

        epoch_t = time.perf_counter() - epoch_start
        # Divide by the batches actually timed, which can be fewer than requested
        step_t = epoch_t / n_timed_steps
        mean_step_t = round(step_t * 1000, 1)
        # Use the unrounded step time: the rounded value can be 0.0 for fast pipelines
        n_texts_per_s = int(bs / step_t) if step_t > 0 else float('inf')
        print(f'epoch {epoch_num} took: {round(epoch_t, 2)} sec, mean step duration: {mean_step_t}ms, texts/s: {n_texts_per_s} (bs={bs})')

In [None]:
@tf.function()
def add_attention_mask(X_sample, y_sample):
    """Turn one raw sample into model inputs and a dense float label matrix.

    Args:
        X_sample: dict holding the token ids under 'input_ids'.
        y_sample: dict holding sparse label coordinates under 'indices' and
            the matching label values under 'values'.

    Returns:
        A dict with 'input_ids' and 'attention_mask', and a float32 label
        tensor of shape [SEQ_LENGTH, N_LABELS].
    """
    token_ids = X_sample['input_ids']
    # Attention Mask: 0 where the token id equals 0, 1 everywhere else
    mask = tf.where(tf.math.equal(token_ids, 0), x=0, y=1)

    sparse_coords = tf.cast(y_sample['indices'], dtype=tf.int64)
    # Keep only the rows that have at least one coordinate above -1
    keep_rows = tf.where(tf.math.reduce_any(sparse_coords > -1, axis=1))
    coords = tf.gather_nd(sparse_coords, keep_rows)
    vals = tf.gather_nd(y_sample['values'], keep_rows)

    # Scatter the kept values into a dense matrix and cast the labels to float
    sparse_labels = tf.SparseTensor(indices=coords, values=vals, dense_shape=[SEQ_LENGTH, N_LABELS])
    labels = tf.cast(tf.sparse.to_dense(sparse_labels), tf.float32)

    model_inputs = { 'input_ids': token_ids, 'attention_mask': mask }
    return model_inputs, labels

In [None]:
def get_dataset(X, y_indices, shuffle_repeat, y_values=None, bs=BATCH_SIZE, dr=True):
    """Create the batched tf.data pipeline for the given samples.

    Args:
        X: token ids, one row per sample.
        y_indices: sparse label coordinates per sample.
        shuffle_repeat: shuffle over the full dataset and repeat indefinitely.
        y_values: label values matching `y_indices`; all ones when omitted.
        bs: batch size.
        dr: drop the final incomplete batch.

    Returns:
        A dataset yielding (inputs dict, dense labels) batches.
    """
    if y_values is None:
        label_values = np.ones(y_indices.shape[:2], dtype=np.int8)
    else:
        label_values = y_values

    # Create dataset from numpy arrays
    inputs = { 'input_ids': X }
    labels = { 'indices': y_indices, 'values': label_values }
    pipeline = tf.data.Dataset.from_tensor_slices((inputs, labels))

    if shuffle_repeat:
        pipeline = pipeline.shuffle(len(X)).repeat()

    return (
        pipeline
        # Add the attention mask and densify the labels
        .map(add_attention_mask, num_parallel_calls=AUTO, deterministic=False)
        # Buffer mapped samples ahead of batching
        .prefetch(AUTO)
        # Batch Samples
        .batch(bs, drop_remainder=dr)
        # Always have a batch ready
        .prefetch(1)
    )

# Train Dataset

In [None]:
# Build the shuffled, endlessly repeating training dataset for the selected TRAIN_MODE
if TRAIN_MODE == 'train':
    print(f'Using Train Dataset With {len(X_train)} Samples')
    # TRAIN DATASET
    # Label values default to all ones inside get_dataset
    train_dataset = get_dataset(X_train, y_train, shuffle_repeat=True)
elif TRAIN_MODE == 'train_extra':
    print(f'Using Train Extra Dataset With {len(X_extra)} Samples')
    # TRAIN EXTRA DATASET
    # Uses the explicit label values and the larger batch size
    train_dataset = get_dataset(X_extra, y_extra_indices, y_values=y_extra_values, shuffle_repeat=True, bs=BATCH_SIZE_EXTRA)
else:
    raise Exception('Training Mode Invalid!!!')

In [None]:
# Example of a batch: pull one batch and report its structure
train_x, train_y = next(iter(train_dataset))
print(f'train_x keys: {list(train_x.keys())}')
print(f'train_x input ids shape: {train_x["input_ids"].shape}')
print(f'train_x input ids dtype: {train_x["input_ids"].dtype}')
print(f'train_y shape: {train_y.shape}, train_y dtype: {train_y.dtype}')

# Measure how fast the input pipeline can deliver batches
print('\n===== Benchmarking Dataset =====')
benchmark_dataset(train_dataset)

# Validation Dataset

In [None]:
if N_TEST_SAMPLES > 0:
    # VALIDATION DATASET
    # The batch size is the number of held-out samples; the previously used
    # name TEST_SIZE is not defined anywhere in this notebook
    val_dataset = get_dataset(X_val, y_val, shuffle_repeat=False, bs=N_TEST_SAMPLES)

    # Example of a batch
    val_x, val_y = next(iter(val_dataset))
    print(f'val_x keys: {list(val_x.keys())}')
    print(f'val_x input ids shape: {val_x["input_ids"].shape}')
    print(f'val_x input ids dtype: {val_x["input_ids"].dtype}')
    print(f'val_y shape: {val_y.shape}, val_y dtype: {val_y.dtype}')

# Weight Initialization Test

This is a sanity check for the model architecture, the output should be between 0 and 1. Several seeds are tried to get an initial bias towards 0, which is in line with the label distribution. This should speed up training. The output should also be roughly evenly distributed between 0-1, if all output is 0 or 1 this is a red flag for the model architecture/weight initialization.

In [None]:
# Number of full batches in one pass over the selected training set
TRAIN_STEPS_PER_EPOCH = (
    len(X_train) // BATCH_SIZE
    if TRAIN_MODE == 'train'
    else len(X_extra) // BATCH_SIZE_EXTRA
)

print(f'TRAIN_STEPS_PER_EPOCH: {TRAIN_STEPS_PER_EPOCH}')

In [None]:
# Sanity check for output layer
# The model emits raw logits (the head has no activation), so the sigmoid is applied
# here to inspect probabilities in [0, 1], matching the axis label below.
output_batches = []
train_dataset_iter = iter(get_dataset(X_train, y_train, False, bs=100))
for X_batch, _ in tqdm(train_dataset_iter, total=len(X_train) // 100):
    logits_batch = model.predict_on_batch(X_batch)
    # Keep compact float32 arrays instead of growing a Python list of boxed floats
    output_batches.append(tf.nn.sigmoid(logits_batch).numpy().astype(np.float32).ravel())

outputs = np.concatenate(output_batches) if output_batches else np.empty(0, dtype=np.float32)

display(pd.Series(outputs, dtype=np.float32).describe(percentiles=[0.05, 0.10, 0.90, 0.95]).to_frame())

plt.figure(figsize=(15,6))
plt.title('Output Distribution', size=24)
pd.Series(outputs).plot(kind='hist', bins=16)
plt.grid()
plt.yticks(size=16)
plt.xticks(size=16)
plt.xlabel('Sigmoid Output', size=18)
plt.ylabel('Count', size=18)
plt.show()

# Callbacks

In [None]:
# Checkpoint Callback: keep only the weights of the epoch with the highest F1.
# The validation F1 is monitored when a validation split exists, otherwise the training F1.
checkpoint_monitor = 'val_f1' if N_TEST_SAMPLES > 0 else 'f1'
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    'model.h5',
    monitor=checkpoint_monitor,
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
    mode='max',
)

# Training

In [None]:
# Epoch budget per phase: classifier head only first, then the whole model.
# The 'train' mode gets more epochs than the extra-data mode.
N_EPOCHS_CLASSIFIER = 10 if TRAIN_MODE == 'train' else 1
N_EPOCHS_ALL = 10 if TRAIN_MODE == 'train' else 8
N_TOTAL_EPOCHS = N_EPOCHS_CLASSIFIER + N_EPOCHS_ALL
print(f'N_EPOCHS_CLASSIFIER: {N_EPOCHS_CLASSIFIER}, N_EPOCHS_ALL: {N_EPOCHS_ALL}')

# Classifier Only

In [None]:
# Set Only The Classification Layer Trainable
set_layers_trainable('classifier')

# Verify Learning Rate
print(f'Optimzier Learning Rate: {model.optimizer.learning_rate.numpy():.1e}')
print(f'Optimzier Epsilon: {model.optimizer.epsilon:.1e}')
print(f'Dropout Rate: {get_dropout_rate():.2f} \n')

# Verify we only train the head/classifier layer!
# summary() prints by itself and returns None, so it is not wrapped in print()
model.summary()

In [None]:
# Phase 1: train the classifier head only (the rest of the model was frozen by set_layers_trainable)
history_classifier = model.fit(
    train_dataset,
    # The training dataset repeats forever, so the epoch length has to be given explicitly
    steps_per_epoch = TRAIN_STEPS_PER_EPOCH,
    validation_data = val_dataset if N_TEST_SAMPLES > 0 else None,
    epochs = N_EPOCHS_CLASSIFIER,
    verbose = 1,
    callbacks = [
        checkpoint_callback,
    ],
)

# Fine Tuning Whole Model

In [None]:
# Make every layer trainable, including the pretrained AlBERT model
set_layers_trainable(TRAIN_MODE)

# Verify Learning Rate
print(f'Optimzier Learning Rate: {model.optimizer.learning_rate.numpy():.1e}')
print(f'Optimzier Epsilon: {model.optimizer.epsilon:.1e}')
print(f'Dropout Rate: {get_dropout_rate():.2f} \n')

# Verify that all layers are now trainable
# summary() prints by itself and returns None, so it is not wrapped in print()
model.summary()

In [None]:
# Phase 2: train the whole model, thus also the AlBERT model
history_whole_model = model.fit(
    train_dataset,
    # The training dataset repeats forever, so the epoch length has to be given explicitly
    steps_per_epoch = TRAIN_STEPS_PER_EPOCH,
    validation_data = val_dataset if N_TEST_SAMPLES > 0 else None,
    epochs = N_EPOCHS_ALL,
    verbose = 1,
    # Same callback instance as in the classifier phase
    callbacks = [
        checkpoint_callback,
    ],
)

In [None]:
# Load Best Weights: restore the checkpoint written by checkpoint_callback
model.load_weights('model.h5')

# Training History

In [None]:
# Merge History: append the whole-model metrics to the classifier-phase metrics.
# Entries are paired by position, so both histories must list their metrics in the same order.
h1 = history_classifier.history.items()
h2 = history_whole_model.history.items()
HISTORY = {
    k: v1 + v2
    for (k, v1), (_, v2) in zip(h1, h2)
}

In [None]:
def plot_history_metric(metric, f_best=np.argmax, yscale='linear'):
    """Plot one metric from HISTORY over all epochs and mark its best value.

    Args:
        metric: key in HISTORY; `val_<metric>` is plotted too when N_TEST_SAMPLES > 0.
        f_best: function returning the index of the best value (np.argmax or np.argmin).
        yscale: matplotlib y-axis scale.
    """
    # Epochs are numbered from 1
    x = np.arange(1, len(HISTORY[metric]) + 1)
    y_train = HISTORY[metric]
    plt.figure(figsize=(20, 8))
    # TRAIN
    plt.plot(x, y_train, color='tab:blue', lw=3, label='train')
    plt.title(f'Training {metric}', fontsize=24, pad=10)
    plt.ylabel(metric, fontsize=20, labelpad=10)
    plt.xlabel('epoch', fontsize=20, labelpad=10)
    plt.xticks([1] + np.arange(5, N_TOTAL_EPOCHS + 1, 5).tolist(), fontsize=16) # ticks at epoch 1 and then every 5th epoch
    plt.yticks(fontsize=16)
    plt.yscale(yscale)
    
    # Train Best Marker
    # f_best returns a 0-based index, hence the + 1 on the x position
    x_best = f_best(y_train)
    y_best = y_train[x_best]
    plt.scatter(x_best + 1, y_best, color='purple', s=100, marker='o', label=f'train best: {y_best:.4f}')
    
    if N_TEST_SAMPLES > 0:
        # VALIDATION
        y_val = HISTORY[f'val_{metric}']
        plt.plot(x, y_val, color='tab:orange', lw=3, label='validation')

        # Validation Best Marker
        x_best = f_best(y_val)
        y_best = y_val[x_best]
        plt.scatter(x_best + 1, y_best, color='red', s=100, marker='o', label=f'validation best: {y_best:.4f}')
    
    # Classifier Part: dashed line at the first whole-model epoch.
    # Drawn after the curves so plt.ylim() already spans the plotted data.
    plt.vlines(N_EPOCHS_CLASSIFIER + 1, *plt.ylim(), color='black', lw=3, linestyles='dashed', alpha=0.50, label='Training Classifier/Whole Model')

    plt.grid()
    plt.legend(prop={'size': 18})
    plt.show()

In [None]:
# Loss: lower is better, so the best epoch is the minimum; log scale on the y-axis
plot_history_metric('loss', f_best=np.argmin, yscale='log')

In [None]:
# Precision: higher is better
plot_history_metric('precision', f_best=np.argmax)

In [None]:
# Recall: higher is better
plot_history_metric('recall', f_best=np.argmax)

In [None]:
# F1: higher is better
plot_history_metric('f1', f_best=np.argmax)

# Per Validation Sample Metrics

The validation metrics are means, you should always be careful with means!!! This next function plots the validation metrics for each individual sample to check for variability in performance. As can be observed the performance is quite consistent, giving confidence in the model. A red flag would be if the performance is split in two groups, one with near 0 metrics and one group with near perfect performance. This would indicate there are errors in for example the data processing.

These metrics exclude the start/end/pad token, resulting in worse performance as it is trivial for a model to learn that these tokens should result in 0.

In [None]:
def get_y_true_and_y_pred():
    """Predict every validation sample and collect labels, probabilities and per-sample metrics.

    Reads the globals `X_val`, `y_val` and `model`, so it may only be called
    when a validation split exists (N_TEST_SAMPLES > 0).

    Returns:
        y_true: flat uint8 array with all validation labels.
        y_pred: flat float32 array with the matching sigmoid probabilities.
        val_metrics: dict mapping a metric name to a list with one value per sample.
    """
    y_true_chunks = []
    y_pred_chunks = []
    val_metrics = {
        'focal_crossentropy': [],
        'precision': [],
        'recall': [],
        'f1': [],
    }

    # Batch size 1 so that every metric is computed per individual sample
    for X_batch, y_true_batch in tqdm(get_dataset(X_val, y_val, False, bs=1)):

        y_pred_batch = model.predict_on_batch(X_batch)
        y_pred_batch_sigmoid = tf.nn.sigmoid(y_pred_batch).numpy()
        # Store compact typed arrays; Python lists of boxed floats need far more memory
        y_pred_chunks.append(y_pred_batch_sigmoid.astype(np.float32).ravel())
        y_true_chunks.append(y_true_batch.numpy().astype(np.uint8).ravel())

        # Fresh metric objects per sample so that no state is carried over
        val_metrics['focal_crossentropy'].append(tfa.losses.SigmoidFocalCrossEntropy()(y_true_batch, y_pred_batch_sigmoid).numpy().mean())
        val_metrics['precision'].append(tf.keras.metrics.Precision()(y_true_batch, y_pred_batch_sigmoid).numpy())
        val_metrics['recall'].append(tf.keras.metrics.Recall()(y_true_batch, y_pred_batch_sigmoid).numpy())
        # F1Score thresholds at EPSILON, so it receives the raw logits rather than the probabilities
        val_metrics['f1'].append(F1Score()(y_true_batch, y_pred_batch).numpy())

    # y as uint8 and y_pred as float32 to reduce memory usage
    y_true = np.concatenate(y_true_chunks) if y_true_chunks else np.empty(0, dtype=np.uint8)
    y_pred = np.concatenate(y_pred_chunks) if y_pred_chunks else np.empty(0, dtype=np.float32)

    return y_true, y_pred, val_metrics

if N_TEST_SAMPLES > 0:
    y_true, y_pred, val_metrics = get_y_true_and_y_pred()
    print(f'y_true shape: {y_true.shape}, y_pred shape: {y_pred.shape}')

# Save Unflattened Predictions

In [None]:
if N_TEST_SAMPLES > 0:
    # Restore the [sample, token, label] layout of the flattened arrays
    y_true_tensor = y_true.reshape([len(X_val), SEQ_LENGTH, N_LABELS])
    y_pred_tensor = y_pred.reshape([len(X_val), SEQ_LENGTH, N_LABELS])

    # Persist both tensors to the working directory
    np.save('y_true_tensor.npy', y_true_tensor)
    np.save('y_pred_tensor.npy', y_pred_tensor)

# Per Sample Validation Analysis

In [None]:
if N_TEST_SAMPLES > 0:
    # One histogram per metric, showing how the per-sample values are distributed
    for metric, values in val_metrics.items():
        plt.figure(figsize=(15, 8))
        plt.title(f'Validation {metric}', size=24)
        pd.Series(values).plot(kind='hist', bins=16, color='tab:orange')

        # Precision, recall and F1 are bounded by [0, 1]; the loss is not
        if metric != 'focal_crossentropy':
            plt.xlim(0, 1)

        plt.xticks(size=16)
        plt.yticks(size=16)
        plt.xlabel(metric, size=18)
        plt.ylabel('Count', size=18)
        plt.grid()
        # Render the figure (the argument-less plt.plot() used before draws nothing)
        plt.show()

# Precision/Recall Curve

**Precision:** this can be thought of as number of hits when shooting. In a binary classification shooting means assigning a 1 to the token and hits is the ratio of the token actually being labelled as 1.

**Recall:** this can be thought of as the number of found targets. In a binary classification this is the ratio of tokens labelled as 1 which are actually predicted as 1.

The trade-off here is that generally whenever a high precision is achieved the model is overly cautious, resulting in a low recall. Whenever a high recall is achieved the model is generally overconfident, resulting in a low precision.

The art is to find a threshold, the sigmoid output above which a token is classified as 1, which has both a high precision and a high recall. For this model this seems to be 0.60.

In [None]:
if N_TEST_SAMPLES > 0:
    precision, recall, thresholds = metrics.precision_recall_curve(y_true, y_pred)
    # precision/recall have one more element than thresholds: element i belongs to
    # thresholds[i] and the extra LAST element (precision 1, recall 0) has no threshold.
    # Pad at the end so the arrays stay aligned; prepending would shift every
    # threshold by one position. 1 is the upper bound of the sigmoid outputs.
    thresholds = np.concatenate((thresholds, [1]))

In [None]:
if N_TEST_SAMPLES > 0:
    # Precision on the x-axis against recall on the y-axis, one point per threshold
    plt.figure(figsize=(15,8))
    plt.plot(precision, recall, color='darkorange', label='Precision/Recall', linewidth=3)
    plt.title('Precision/Recall Curve', size=24)
    plt.xlabel('Precision', size=18)
    plt.ylabel('Recall', size=18)
    plt.xticks(size=16)
    plt.yticks(size=16)
    plt.grid()
    plt.legend(prop={'size': 16})
    plt.show()

In [None]:
if N_TEST_SAMPLES > 0:
    # Recall as a function of the decision threshold
    plt.figure(figsize=(15,8))
    # x = threshold, y = recall, matching the axis labels below
    plt.plot(thresholds, recall,  color='darkorange', label='Recall/Threshold', linewidth=3)
    plt.title('Threshold/Recall Curve', size=24)
    plt.xlabel('Threshold', size=18)
    plt.ylabel('Recall', size=18)
    plt.xticks(size=16)
    plt.yticks(size=16)
    plt.grid()
    plt.legend(prop={'size': 16})
    plt.show()

In [None]:
if N_TEST_SAMPLES > 0:
    # Precision as a function of the decision threshold
    plt.figure(figsize=(15,8))
    plt.plot(thresholds, precision,  color='darkorange', label='Precision/Threshold', linewidth=3)
    plt.title('Threshold/Precision Curve', size=24)
    plt.xlabel('Threshold', size=18)
    plt.ylabel('Precision', size=18)
    plt.xticks(size=16)
    plt.yticks(size=16)
    plt.grid()
    plt.legend(prop={'size': 16})
    plt.show()

In [None]:
if N_TEST_SAMPLES > 0:
    # F1 per threshold. Where precision + recall is 0 the F1 is defined as 0 here;
    # a plain division would give NaN, which np.argmax and max() would then report as the best value.
    # NOTE: this rebinds the module-level name `f1`, which was a function earlier in the notebook.
    pr_sum = precision + recall
    f1 = np.divide(
        2 * (precision * recall),
        pr_sum,
        out=np.zeros_like(pr_sum, dtype=np.float64),
        where=pr_sum > 0,
    )
    f1_best_threshold = thresholds[np.argmax(f1)]
    f1_best_value = f1.max()
    print(f'Threshold Best F1({f1_best_value:.3f}): {f1_best_threshold:.3f}')

    plt.figure(figsize=(15,8))
    plt.plot(thresholds, f1,  color='darkorange', label='Threshold/F1', linewidth=3)
    plt.scatter(f1_best_threshold, f1_best_value, color='red', s=100, marker='o', label=f'F1 best: {f1_best_value:.3f}')
    plt.title('Threshold/F1', size=24)
    plt.xlabel('Threshold', size=18)
    plt.ylabel('F1', size=18)
    plt.xticks(size=16)
    plt.yticks(size=16)
    plt.grid()
    plt.legend(prop={'size': 16})
    plt.show()