In [None]:
import os
%run ../../base-notebook.ipynb

In [None]:
extensions_loaded = False
if not extensions_loaded:
    %load_ext autoreload
    %load_ext tensorboard
    extensions_loaded = True

%autoreload 2
%matplotlib inline

In [None]:
import sys
import os
import random

sys.path.append("./ext")

import ext.ml_callback as mlc
import ext.ml_io as mli
import ext.ml_layer as mll
import ext.ml_plot as ml_plot
import ext.ml_util as mlu
import ext.ml_data as ml_data
import ext.ml_view as mlv
import ext.ml_analyze as mla
import ext.ml_shell as mls
import ext.ml_nlp as ml_nlp

import tensorflow as tf
import pandas as pd
import numpy as np

import tensorflow_hub as hub

from keras import Model
from keras import layers
from keras import losses
from keras import optimizers

# Milestone Project 2: SkimLit

The purpose of this notebook is to build an NLP model to make reading medical abstracts easier.

The paper we're replicating (the source of the dataset we'll be using) is available here: https://arxiv.org/abs/1710.06071

And reading through the paper above, we see that the model architecture that they use to achieve their best result is available here: https://aclanthology.org/E17-2110.pdf

## Get Data

Since we'll be replicating the paper above (PubMed 200k RCT), let's download the dataset they used.

We can do so from the authors' GitHub: https://github.com/Franck-Dernoncourt/pubmed-rct

In [None]:
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct.git

In [None]:
!ls pubmed-rct

In [None]:
# Check what kind of files are in the PubMed_20K
!ls pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign

In [None]:
# Start our experiments using the 20K data set with numbers replaced with @ sign.
data_dir = './pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign'

In [None]:
# Build the full path of every file in the target directory.
# os.path.join inserts the path separator that plain concatenation left out
# (data_dir has no trailing slash, so the old paths ran the two names together).
filenames = [os.path.join(data_dir, filename) for filename in os.listdir(data_dir)]
filenames

## Preprocess data

Now we've got some text data, it's time to become one with it.

And one of the best ways to become one with the data is to... visualize visualize visualize...

So with that in mind, let's write a function to read in all of the lines of a target text file.

In [None]:
# Create a function to read the lines of a document
def get_lines(filename: str, encoding: str = "utf-8") -> list[str]:
    """
    Read a text file and return all of its lines as a list.

    Each returned string keeps its trailing newline, exactly as ``readlines`` yields it.

    :param filename: path of the text file to read
    :param encoding: text encoding used to decode the file; defaults to UTF-8 so the
        result does not depend on the platform's locale encoding
    :return: list of lines, in file order
    :raises OSError: if the file cannot be opened
    :raises UnicodeDecodeError: if the content is not valid in ``encoding``
    """
    with open(filename, 'r', encoding=encoding) as f:
        return f.readlines()

In [None]:
# Let's read in the training lines
train_lines = get_lines(f"{data_dir}/train.txt")
train_lines[:20]

In [None]:
len(train_lines)

Let's think about how we want our data to look...

How I think our data would be best represented...

[{line_number: 0, target: 'BACKGROUND', text: 'Emotional eating is associated with overeating and the development of obesity .', total_lines: 11}]

In [None]:
def preprocess_text_with_line_numbers(filepath: str) -> list[dict]:
    """
    Parse a PubMed RCT text file into one dictionary per labelled sentence.

    The file is read as a sequence of abstracts: a line starting with ``###`` opens an
    abstract, each following ``LABEL<TAB>sentence`` line belongs to it, and a
    whitespace-only line closes it. Every sentence becomes a dictionary with:

    - ``line_number``: zero-based position of the sentence within its abstract
    - ``target``: the label in front of the first tab
    - ``text``: the sentence, lower-cased
    - ``total_lines``: number of sentences in that abstract

    An abstract is also emitted when the file ends, or the next ``###`` header arrives,
    without a closing blank line, and repeated blank lines never emit the same
    abstract twice. The file is decoded as UTF-8.

    :param filepath: the file path
    :return: list of dictionaries, in file order
    :raises ValueError: if a sentence line contains no tab separator
    """
    abstract_samples: list[dict] = []
    pending_lines: list[str] = []  # sentences of the abstract currently being read

    def emit_pending() -> None:
        # Turn the buffered abstract into samples, then empty the buffer so the same
        # abstract can never be emitted a second time.
        total_lines = len(pending_lines)
        for line_number, labelled_line in enumerate(pending_lines):
            # maxsplit=1: only the first tab separates the label from the sentence
            target, text = labelled_line.split('\t', 1)
            abstract_samples.append({
                "line_number": line_number,
                "target": target,
                "text": text.lower(),
                "total_lines": total_lines
            })
        pending_lines.clear()

    # Stream the file line by line instead of loading it whole
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith('###') or line.isspace():
                # Header or separator: whatever was buffered is a complete abstract
                emit_pending()
            else:
                pending_lines.append(line.rstrip('\n'))

    # The last abstract may not be followed by a blank line
    emit_pending()

    return abstract_samples

In [None]:
%%time
train_samples = preprocess_text_with_line_numbers(f"{data_dir}/train.txt")
val_samples = preprocess_text_with_line_numbers(f"{data_dir}/dev.txt")
test_samples = preprocess_text_with_line_numbers(f"{data_dir}/test.txt")
len(train_samples), len(val_samples), len(test_samples)

In [None]:
# Check the first abstract of our training data
train_samples[:14]

Now that our data is in the format of dictionaries, how about we turn it into a dataframe to further visualize it.

In [None]:
# One DataFrame per split, built from the parsed sample dictionaries
train_df, val_df, test_df = (
    pd.DataFrame(samples)
    for samples in (train_samples, val_samples, test_samples)
)
train_df.head(14)

In [None]:
# Distributions of labels in the training set
train_df.target.value_counts()

In [None]:
# Check the length of different lines
train_df.total_lines.plot.hist()

In [None]:
### Get list of sentences
train_sentences = train_df['text'].to_list()
val_sentences = val_df['text'].to_list()
test_sentences = test_df['text'].to_list()
len(train_sentences), len(val_sentences), len(test_sentences)

In [None]:
train_sentences[:10]

In [None]:
train_df['text'].map(lambda l: len(l.split())).plot.hist(figsize=(16,4), bins=300)

## Making numeric labels (ML Models require numeric labels)

In [None]:
# One hot encode labels
# NOTE(review): ml_data.one_hot_encode_column is a project helper that is not shown here;
# presumably it fits the encoder on the first column (train) and reuses it for the
# validation and test columns — confirm against ext/ml_data.
encoder, train_labels_one_hot, val_labels_one_hot, test_labels_one_hot = \
    ml_data.one_hot_encode_column(
        train_df['target'],
        val_df['target'],
        test_df['target']
    )

train_labels_one_hot

In [None]:
train_df['target'].to_numpy().ndim, train_df['target'].to_numpy().reshape(-1, 1).ndim

### Label encode labels

In [None]:
# Extract labels ("target" column) and encode them into integers
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# Fit the label -> integer mapping on the training labels only ...
train_labels_encoded = label_encoder.fit_transform(train_df["target"].to_numpy())
# ... and reuse that exact mapping for the other splits. Re-fitting on each split would
# silently assign different integers if a split were missing a class; `transform`
# raises instead if it meets a label the training data never contained.
val_labels_encoded = label_encoder.transform(val_df["target"].to_numpy())
test_labels_encoded = label_encoder.transform(test_df["target"].to_numpy())

# check what training labels look like
train_labels_encoded

In [None]:
# Read the fitted classes off the label encoder, then derive how many there are
class_names = label_encoder.classes_
num_classes = len(class_names)
num_classes, class_names

In [None]:
test_df['target'].to_numpy()

## Starting a series of modelling experiments...

As usual, we're going to be trying out a bunch of different models and seeing which one works best. And as always, we're going to start with a baseline (TF-IDF) Multinomial Naive Bayes classifier.

# Model 0: Getting a baseline

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create a pipeline: TF-IDF features feeding a Multinomial Naive Bayes classifier
model_0 = Pipeline([
    ("tf-idf", TfidfVectorizer()),
    ("clf", MultinomialNB())
])

# Fit the pipeline to the training data
model_0.fit(X=train_sentences, y=train_labels_encoded) # Use the integer-encoded labels, not the one-hot matrix: the classifier takes one class label per sample

In [None]:
# Evaluate our base pipeline on validation dataset
model_0.score(X=val_sentences, y=val_labels_encoded)

In [None]:
# Make predictions using our baseline model
model_0_preds = model_0.predict(X=val_sentences)

In [None]:
model_0_results = ml_plot.table_quality_metrics(y_true=val_labels_encoded, y_pred=model_0_preds)
model_0_results

## Preparing our data (the text) for deep sequence model

Before we start building deep models, we've got to create vectorization and embedding layers.

In [None]:
# How long is each sentence on average
average_sentence_length = ml_nlp.calculate_average_word_length(train_sentences)
average_sentence_length

In [None]:
# How long of a sentence length covers 95% of examples?
output_sequence_length = ml_nlp.calculate_q_precentile_word_lengths(lines=train_sentences, q=95)
output_sequence_length

### Create Text Vectorizer layer

We want to make a layer which maps our text from words to numbers.

In [None]:
# How many words are there in our vocabulary? Counted here from the training sentences with a project helper
# (for comparison, the paper reports its vocabulary size in table 2: https://arxiv.org/pdf/1710.06071.pdf )
max_tokens = ml_nlp.count_unique_words(train_sentences)

# Layer that maps each sentence to a sequence of token ids, padded/truncated to output_sequence_length
text_vectorizer = layers.TextVectorization(
    max_tokens=max_tokens,
    output_sequence_length=output_sequence_length,
    pad_to_max_tokens=True
)

max_tokens

In [None]:
# Adapt TextVectorizer to training sentences
text_vectorizer.adapt(train_sentences)

In [None]:
# Test out text vectorizer on random sentences
target_sentence = random.choice(train_sentences)
len(target_sentence.split()), target_sentence

In [None]:
text_vectorizer(target_sentence)

In [None]:
# How many words in our vocabulary
len(text_vectorizer.get_vocabulary())

In [None]:
# What are the most common words in our vocabulary
text_vectorizer.get_vocabulary()[:5]

In [None]:
# What are the least common words in our vocabulary
text_vectorizer.get_vocabulary()[-5:]

In [None]:
# Get the config of our TextVectorizer
text_vectorizer.get_config()

## Create custom text embedding

In [None]:
# Create token embedding layer
token_embedding = layers.Embedding(
    input_dim=max_tokens, # Length of vocabulary
    output_dim=128, # Note: Different embedding sizing results in drastically different number of parameters to train.
    mask_zero=True, # Use masking to handle variable sequence lengths (save space)
    input_length=output_sequence_length,
    name="token_embedding"
)
token_embedding

In [None]:
# Show the sentence being embedded. Separate print calls avoid the stray indentation and
# carriage return that the backslash-continued string literal put into the output.
print(f"Original text: {target_sentence}")
print("Embedded version:")

# Embed the random sentence (turn it into dense vectors of fixed size)
sample_embed = token_embedding(text_vectorizer([target_sentence]))
sample_embed, sample_embed.shape

## Creating Datasets (making sure our data loads as fast as possible)

https://www.tensorflow.org/guide/data_performance
https://www.tensorflow.org/guide/data

We're going to setup our data to run as fast as possible with the TensorFlow tf.data API, many of the steps here are discussed at length in these two resources.

In [None]:
# Pair every split's sentences with its one-hot labels as a tf.data dataset.
train_dataset, val_dataset, test_dataset = (
    tf.data.Dataset.from_tensor_slices((sentences, labels_one_hot))
    for sentences, labels_one_hot in (
        (train_sentences, train_labels_one_hot),
        (val_sentences, val_labels_one_hot),
        (test_sentences, test_labels_one_hot),
    )
)

train_dataset

In [None]:
train_labels_one_hot.shape, train_labels_one_hot[:1]

In [None]:
# Take the TensorSliceDataset's and turn them into prefetched datasets (batches of 32).
# NOTE: each name is rebound to its own batched version, so running this cell twice
# batches the batches; re-run the dataset-creation cell above before re-running this one.
train_dataset = train_dataset.batch(32).prefetch(buffer_size=tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(32).prefetch(buffer_size=tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(32).prefetch(buffer_size=tf.data.AUTOTUNE)

train_dataset

# Model 1: Conv1D model to process sequences

In [None]:
# Token-level Conv1D classifier:
# raw string -> token ids -> token embeddings -> Conv1D -> pooled features -> class probabilities
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = token_embedding(x)
x = layers.Conv1D(filters=64, kernel_size=5, activation="relu", padding="same")(x)
x = layers.GlobalAveragePooling1D()(x) # condense the output from feature vector from conv layer
outputs = layers.Dense(units=num_classes, activation="softmax")(x)

# The Keras name now matches the variable (it was mislabelled "model_0_conv1d",
# which is the number of the Naive Bayes baseline).
model_1 = Model(inputs, outputs, name="model_1_conv1d")


In [None]:
model_1.compile(loss=losses.CategoricalCrossentropy(),
                optimizer=optimizers.Adam(),
                metrics=["accuracy"])
model_1.summary()

In [None]:
# Fit the model
model_1_history = model_1.fit(train_dataset,
                              steps_per_epoch=int(0.1*len(train_dataset)), # Only use 10% of the dataset.
                              epochs=3,
                              validation_data=val_dataset,
                              validation_steps=int(0.1*len(val_dataset)) # Only use 10% of the dataset.
                              )

In [None]:
ml_plot.plot_history(model_1_history)

In [None]:
# Evaluate dataset
model_1.evaluate(val_dataset)

In [None]:
# Make some predictions (our model predicts prediction probabilities for each class)
model_1_pred_probs = model_1.predict(val_dataset)
model_1_pred_probs, model_1_pred_probs.shape

In [None]:
# Convert pred probs to classes
model_1_preds = tf.argmax(model_1_pred_probs, axis=1)
model_1_preds

In [None]:
model_1_results = ml_plot.table_quality_metrics(y_true=val_labels_encoded,
                                                y_pred=model_1_preds)
model_1_results

In [None]:
# TODO: Create function to show difference including percentage.
model_0_results

## Feature Extraction with pretrained token embeddings

Now let's use pretrained word embeddings from TensorFlow Hub, more specifically the Universal Sentence Encoder (USE).

The paper originally used GloVe embeddings, however, we're going to stick with the more recently created USE pretrained embeddings.

In [None]:
# Download pretrained TensorFlow HUB USE
# hub.load(handle="https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
# Create Keras Layer using the USE pretrained layer from TensorFlow Hub
tf_hub_embedding_layer = hub.KerasLayer(handle='https://tfhub.dev/google/universal-sentence-encoder/4',
                                        input_shape=[], # Since the input is defined as english text of variable length the layer in itself will make sure it fits
                                        dtype=tf.string,
                                        trainable=False, # we are doing feature extraction so we do not want to train this layer
                                        name='USE'
                                      )

In [None]:
# Test out the pretrained embedding on some random sentence
tf_hub_embedding_layer([target_sentence])[0][:30]

# Model 2: Building and fitting a feature extraction model using pretrained embeddings from TensorFlow Hub

In [None]:
 # Create model using the Functional API: USE sentence embedding -> Dense -> class probabilities

# Named `inputs`/`outputs` (as in the model_1 cell) so the builtin `input` is not shadowed
inputs = layers.Input(shape=[], dtype=tf.string)
x = tf_hub_embedding_layer(inputs)
x = layers.Dense(units=128, activation='relu')(x) # Using 64 units did not achieve as nice results.
outputs = layers.Dense(units=num_classes, activation='softmax')(x)

model_2 = Model(inputs, outputs)

model_2.compile(loss=losses.categorical_crossentropy,
                optimizer=optimizers.Adam(),
                metrics=['accuracy'])

model_2.summary()

In [None]:
model_2_history = model_2.fit(train_dataset,
                              epochs=3,
                              steps_per_epoch=int(0.1 * len(train_dataset)),
                              validation_data=val_dataset,
                              validation_steps=int(0.1 * len(val_dataset)))

In [None]:
ml_plot.plot_history(model_2_history)

In [None]:
# Evaluate the feature extraction model
model_2.evaluate(val_dataset)

In [None]:
# Let's make some predictions with the feature extraction model
model_2_pred_probs = model_2.predict(val_dataset)
model_2_preds = tf.argmax(model_2_pred_probs, axis=1)
model_2_preds

In [None]:
# Calculate results from TensorFlow Hub pretrained embedding
model_2_results = ml_plot.table_quality_metrics(y_true=val_labels_encoded,
                                                y_pred=model_2_preds)
model_2_results

In [None]:
model_0_results

# Model 3: Conv1D with character embeddings

The paper which we're replicating states they used a combination of token and character-level embeddings. Previously we've used token-level embeddings, but we'll need to do similar steps for characters if we want to use char-level embeddings.

## Creating a character-level tokenizer

In [None]:
train_sentences[:5]

In [None]:
# make function to split sentences into characters
def split_chars(text: str):
    """Return ``text`` with a single space inserted between each pair of adjacent characters."""
    separator = " "
    return separator.join(text)

# Text splitting non-character level sequence into characters
split_chars(target_sentence)

In [None]:
# Split sequence-level data splits into character level data splits
train_chars = [split_chars(sentence) for sentence in train_sentences]
val_chars = [split_chars(sentence) for sentence in val_sentences]
test_chars = [split_chars(sentence) for sentence in test_sentences]
train_chars[:5]

In [None]:
# Find what character length covers 95% of sequences
# NOTE(review): no percentile is passed, so this relies on the helper's default — confirm it is 95.
output_sequence_char_length = ml_nlp.calculate_q_precentile_character_lengths(train_sentences)
# Bind the helper's second return value to a real name: assigning to `_` overrides
# IPython's "last output" variable for the rest of the session.
num_char_tokens, char_tokens = ml_nlp.count_unique_chars(train_sentences)

output_sequence_char_length, num_char_tokens, str(char_tokens)

In [None]:
# Create char-level token vectorizer instance
char_vectorizer = layers.TextVectorization(max_tokens=num_char_tokens + 1, # Add one for OOV, space is already included
                                           output_sequence_length=output_sequence_char_length,
                                           standardize="lower_and_strip_punctuation",
                                           name="char_vectorizer")

In [None]:
# Adapt character vectorizer to training characters
char_vectorizer.adapt(train_chars)

In [None]:
character_vocabulary = char_vectorizer.get_vocabulary()
print(f"Number of different characters in vocabulary: {len(character_vocabulary)}")
print(f"Characters: {character_vocabulary}")

In [None]:
# Test out character vectorization
target_chars = random.choice(train_chars)
print(f"Value of target_chars:\n {target_chars}")
print(f"\nLength of target_chars : {len(target_chars.split())}")

vectorized_chars = char_vectorizer([target_chars])
print(f"\nVectorized of target_chars : {vectorized_chars}")
print(f"\nLength of vectorized target_chars : {len(vectorized_chars[0])}")

## Creating a character level embedding

In [None]:
# Create character embedding layer
char_embedding = layers.Embedding(
    input_dim=len(char_vectorizer.get_vocabulary()), # Length of vocabulary
    output_dim=25, # Note: Character embedding (how many features per character)
    mask_zero=True, # Use masking to handle variable sequence lengths (save space)
    name="char_embedding"
)

In [None]:
# Test our character embedding layer
print(f"Charified text (length: {len(target_chars)}):\n {target_chars}")
target_chars_embedded = char_embedding(char_vectorizer([target_chars]))
print(f"Embedded charified text (after vectorization):\n {target_chars_embedded}")
print(f"Embedded charified text shape: {target_chars_embedded.shape}")

## Build the model

In [None]:
# Character-level Conv1D classifier:
# raw string -> character ids -> character embeddings -> Conv1D -> pooled features -> class probabilities
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = char_vectorizer(inputs)
x = char_embedding(x)
x = layers.Conv1D(filters=64, kernel_size=5, activation="relu", padding="same")(x)
x = layers.GlobalMaxPooling1D()(x) # condense the output from feature vector from conv layer
outputs = layers.Dense(units=num_classes, activation="softmax")(x)

# The Keras name now matches the variable (it was mislabelled "model_4_conv1d").
model_3 = Model(inputs, outputs, name="model_3_conv1d")

In [None]:
model_3.compile(loss=losses.CategoricalCrossentropy(),
                optimizer=optimizers.Adam(),
                metrics=['accuracy'])
model_3.summary()

In [None]:
# Create datasets with batching and prefetching
train_chars_dataset = tf.data.Dataset.from_tensor_slices((train_chars, train_labels_one_hot))
val_chars_dataset = tf.data.Dataset.from_tensor_slices((val_chars, val_labels_one_hot))
test_chars_dataset = tf.data.Dataset.from_tensor_slices((test_chars, test_labels_one_hot))

train_chars_dataset = train_chars_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
val_chars_dataset = val_chars_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
test_chars_dataset = test_chars_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

train_chars_dataset

In [None]:
# Fit the model on characters only
model_3_history = model_3.fit(train_chars_dataset,
                              steps_per_epoch=int(0.1 * len(train_chars_dataset)),
                              validation_data=val_chars_dataset,
                              validation_steps=int(0.1 * len(val_chars_dataset)),
                              epochs=3)

In [None]:
# Make predictions with character model 3
model_3_pred_props = model_3.predict(val_chars_dataset)
model_3_preds = tf.argmax(model_3_pred_props, axis=1)
model_3_preds

In [None]:
model_3_results = ml_plot.table_quality_metrics(y_true=val_labels_encoded, y_pred=model_3_preds)
model_3_results

# Model 4: Combining pretrained token embeddings + character embeddings (hybrid embeddings)

1. Create a token level embedding model (similar to model_1)
2. Create a character level model (similar to model_3 with a slight modification)
3. Combine 1 and 2 with a concatenate layer (layers.Concatenate)
4. Build a series of output layers on top of 3, similar to figure 1 / section 4.2 of the paper (https://aclanthology.org/E17-2110.pdf)
5. Construct a model which takes token and character-level sequences as input and produces sequence label probabilities as output.

In [None]:
# 1. Setup token inputs/model
token_inputs = layers.Input(shape=[], dtype=tf.string, name="token-input")
token_embeddings = tf_hub_embedding_layer(token_inputs)
token_output = layers.Dense(units=128, activation="relu")(token_embeddings)
token_model = tf.keras.Model(inputs=token_inputs, outputs=token_output)

# 2. Setup character inputs/model
character_inputs = layers.Input(shape=[], dtype=tf.string, name="character-input")
character_vectors = char_vectorizer(character_inputs)
character_embeddings = char_embedding(character_vectors)
character_bi_lstm = layers.Bidirectional(layers.LSTM(units=24))(character_embeddings) # bi-lstm as shown in Figure 1 (https://aclanthology.org/E17-2110.pdf)
character_model = tf.keras.Model(inputs=character_inputs, outputs=character_bi_lstm)
characer_model = character_model  # old misspelled name, kept as an alias for any cell still using it

# 3. Concatenate token and character outputs (create hybrid token embedding)
token_character_concat = layers.Concatenate(name="token_character_hybrid")([token_model.output, character_model.output])

# 4. Create output layers adding in dropout (this is discussed in section 4.2 of the paper: https://aclanthology.org/E17-2110.pdf)
combined_dropout = layers.Dropout(rate=.5)(token_character_concat)
combined_dense = layers.Dense(units=128, activation="relu")(combined_dropout)
final_dropout = layers.Dropout(rate=.5)(combined_dense)
# One output unit per class, taken from the label encoder instead of a hard-coded 5
output = layers.Dense(units=num_classes, activation="softmax")(final_dropout)

# 5. Construct model with token and character inputs.
# `.input` (a single tensor each) gives a flat two-element input list that matches the
# (token, character) pairs fed to the model; `.inputs` would nest a list inside a list.
model_4 = tf.keras.Model(inputs=[token_model.input, character_model.input], outputs=output, name="model_4_token_and_character_embeddings")

In [None]:
# Get a summary
model_4.summary()

In [None]:
# Plot hybrid token and character model
from keras.utils import plot_model
plot_model(model_4, show_shapes=True)

In [None]:
# Compile token / character model
model_4.compile(loss=losses.CategoricalCrossentropy(),
                optimizer=optimizers.Adam(),
                metrics=["accuracy"])

## Combining token and character data into a tf.Dataset

In [None]:
# Important: the order must be the same as the order specified in the inputs of model_4 (token then character)
# Each split is built the same way: (sentences, chars) features zipped with the one-hot labels, then batched.
train_token_character_data = tf.data.Dataset.from_tensor_slices((train_sentences, train_chars)) # Make data
train_token_character_label = tf.data.Dataset.from_tensor_slices(train_labels_one_hot) # Make labels
train_token_character_dataset = tf.data.Dataset.zip((train_token_character_data, train_token_character_label)) # Combine data and label
train_token_character_dataset = train_token_character_dataset.batch(32).prefetch(tf.data.AUTOTUNE) # Setup batching / prefetching

val_token_character_data = tf.data.Dataset.from_tensor_slices((val_sentences, val_chars)) # Make data
val_token_character_label = tf.data.Dataset.from_tensor_slices(val_labels_one_hot) # Make labels
val_token_character_dataset = tf.data.Dataset.zip((val_token_character_data, val_token_character_label)) # Combine data and label
val_token_character_dataset = val_token_character_dataset.batch(32).prefetch(tf.data.AUTOTUNE) # Setup batching / prefetching

In [None]:
# Check our training char and token embedding dataset
train_token_character_dataset, val_token_character_dataset

## Fitting a model on token and character sequences

In [None]:
model_4_history = model_4.fit(train_token_character_dataset,
                              steps_per_epoch=int(.1 * len(train_token_character_dataset)),
                              validation_data=val_token_character_dataset,
                              validation_steps=int(.1 * len(val_token_character_dataset)),
                              epochs=3)

In [None]:
# Evaluate on the whole validation set
model_4.evaluate(val_token_character_dataset)

In [None]:
# Make predictions using the token-character model hybrid
model_4_pred_props = model_4.predict(val_token_character_dataset)
model_4_preds = tf.argmax(model_4_pred_props, axis=1)
model_4_results = ml_plot.table_quality_metrics(y_true=val_labels_encoded, y_pred=model_4_preds)
model_4_results

## Model 5: Transfer learning with pretrained token embedding + character embedding + positional embeddings

In [None]:
train_df.head()

> **Note**: Any engineered features used to train the model need to be available during test-time. In our case, line-numbers and total lines are available.

### Positional embeddings

In [None]:
# How many different line numbers are there?
# The column is named "line_number" (underscore), as set when the samples were parsed.
train_df['line_number'].value_counts()

In [None]:
# Check the distribution of the "line_number" column (underscore, not hyphen)
train_df['line_number'].plot.hist()

In [None]:
# Use TensorFlow to create one-hot encoded tensors of our "line_number" column.
# One-hot encoding avoids implying that line number 2 is somehow "more" than line
# number 1, which a numeric scaling such as MinMaxScaler would suggest.
# With depth=15, line numbers 0-14 get their own slot; any larger value becomes an all-zero row.
train_line_numbers_one_hot = tf.one_hot(train_df['line_number'].to_numpy(), depth=15)
val_line_numbers_one_hot = tf.one_hot(val_df['line_number'].to_numpy(), depth=15)
test_line_numbers_one_hot = tf.one_hot(test_df['line_number'].to_numpy(), depth=15)
train_line_numbers_one_hot[:10], train_line_numbers_one_hot.shape

In [None]:
# How many abstracts' worth of sentences fall under each total line count?
# (`value_counts`, plural, is the pandas method; `value_count` raises AttributeError.)
train_df['total_lines'].value_counts()