<a href="https://colab.research.google.com/github/shitote/go_crud/blob/main/Skimlit_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SkimLit

This is an NLP model to make reading medical abstracts easier.

The papers being replicated, and the source of the dataset, are at https://arxiv.org/abs/1710.06071 and https://arxiv.org/abs/1612.05251

In [None]:
!invidia-smi -L

## Get data

In [None]:
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct
!ls pubmed-rct

In [None]:
# Check what files are in the PubMed 20k dataset
!ls pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/

In [139]:
# Start experimenting with the dataset variant where numbers are replaced by the "@" sign
# NOTE(review): absolute path; presumably assumes the repo was cloned from Colab's
# default /content working directory - confirm before running elsewhere.
data_dir = "/content/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/"

In [None]:
# Build the full path of every file in the target directory.
# Sorted so the listing is the same on every run (os.listdir order is arbitrary),
# and a distinct loop name so the comprehension does not shadow `filenames`.
import os
filenames = [os.path.join(data_dir, entry) for entry in sorted(os.listdir(data_dir))]
filenames

## Preprocess data

Become one with the data (visualize, visualize, visualize).

In [141]:
# Create a function to read the lines of a document.
def get_lines(filename):
  """
  Reads a text file and returns its lines of text as a list.

  Args:
    filename: a string containing the target filepath.

  Returns:
    A list of strings with one string per line from the target filename.
    Each string keeps its trailing newline, as returned by `readlines()`.
  """
  # Explicit encoding so the result does not depend on the platform default.
  with open(filename, 'r', encoding='utf-8') as f:
    return f.readlines()

In [None]:
# Read in the train lines
train_lines = get_lines(data_dir+"train.txt")
train_lines[:10]

In [None]:
len(train_lines)

##### Think about the format of the data.

How the data should be represented.

```
[{'line_number': 0,
  'target': 'BACKGROUND',
  'text': 'Emotional eating is associated with overeating and the development of obesity .\n',
  'total_lines': 11},
  -----]

```

Function to create the dataset to be in the same format as above.

In [144]:
def preprocess_text_with_line_numbers(filename):
  """
  Returns a list of dictionaries of abstract line data.

  Takes in a filename, reads its contents and sorts through each line,
  extracting things like the target label, the text of the sentence,
  how many sentences are in the current abstract and what sentence
  number the target line is.

  Args:
    filename: a string containing the target filepath.

  Returns:
    A list with one dict per abstract line, each holding the keys
    'target' (the label before the tab), 'text' (the lowercased text after
    the tab), 'line_number' (0-based position within the abstract) and
    'total_lines' (number of lines in the abstract minus one).

  Raises:
    IndexError: if a non-blank, non-"###" line has no tab separator.
  """
  input_lines = get_lines(filename)
  abstract_samples = []
  current_abstract = []  # raw "LABEL\ttext" lines of the abstract being read

  def _flush(collected_lines):
    """Appends one sample dict per collected line to `abstract_samples`."""
    for abstract_line_number, abstract_line in enumerate(collected_lines):
      target_text_split = abstract_line.split("\t")
      abstract_samples.append({
          'target': target_text_split[0],
          'text': target_text_split[1].lower(),
          'line_number': abstract_line_number,
          'total_lines': len(collected_lines) - 1,
      })

  for line in input_lines:
    if line.startswith("###"):
      # ID line of a new abstract: emit anything still pending, then start afresh.
      _flush(current_abstract)
      current_abstract = []
    elif line.isspace():
      # Blank line marks the end of the current abstract.
      _flush(current_abstract)
      # Reset so a repeated blank line cannot re-emit stale data.
      current_abstract = []
    else:
      current_abstract.extend(line.splitlines())

  # Emit the final abstract even when the file does not end with a blank line.
  _flush(current_abstract)

  return abstract_samples

In [None]:
# Get data from file and preprocess it
%%time
train_samples = preprocess_text_with_line_numbers(data_dir + "train.txt")
val_samples = preprocess_text_with_line_numbers(data_dir + 'dev.txt')
test_samples = preprocess_text_with_line_numbers(data_dir + 'test.txt')
print(len(train_samples), len(val_samples), len(test_samples))

In [None]:
# Check the abstract of the training data
train_samples[:20]

Now that the data is in the form of a list of dicts, it is easy to turn it into a DataFrame.

In [None]:
import pandas as pd
train_df = pd.DataFrame(train_samples)
val_df = pd.DataFrame(val_samples)
test_df = pd.DataFrame(test_samples)
train_df.head(14)

In [None]:
# Find the distributions of labels.
train_df.target.value_counts()

In [None]:
# Check the distribution of the `total_lines` value across training samples.
train_df.total_lines.plot.hist()

### Get list of sentences.

In [None]:
# Convert abstract text lines into lists
train_sentences = train_df["text"].tolist()
val_sentences = val_df['text'].tolist()
test_sentences = test_df['text'].tolist()
len(train_sentences), len(val_sentences), len(test_sentences)

In [None]:
train_sentences[:10]

### Make numerical labels (ML models require numeric labels)

In [None]:
# One hot encode labels
from sklearn.preprocessing import OneHotEncoder

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and the
# old name was removed in 1.4; fall back to the old keyword on older installations.
try:
  one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
  one_hot_encoder = OneHotEncoder(sparse=False)

# Fit on the training labels only, then reuse the same encoding for val/test.
train_labels_one_hot = one_hot_encoder.fit_transform(train_df['target'].to_numpy().reshape(-1, 1))
val_labels_one_hot = one_hot_encoder.transform(val_df['target'].to_numpy().reshape(-1, 1))
test_labels_one_hot = one_hot_encoder.transform(test_df['target'].to_numpy().reshape(-1, 1))

# Check what one hot encoded labels look like.
train_labels_one_hot

In [None]:
import sklearn
print(sklearn.__version__)

In [None]:
import tensorflow as tf
tf.constant(train_labels_one_hot)

### Label encode labels


In [None]:
# Extract labels ("Target" columns) and encode them into integers
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_df['target'].to_numpy())
val_labels_encoded = label_encoder.transform(val_df['target'].to_numpy())
test_labels_encoded = label_encoder.transform(test_df['target'].to_numpy())

train_labels_encoded

In [None]:
# Pull the class names out of the fitted LabelEncoder instance and count them
class_names = label_encoder.classes_
num_classes = len(class_names)
num_classes, class_names

## Starting a series of modelling experiments...

Try different models and see what works best for the problem at hand.

Start with the baseline model.

## Model 0: Getting a baseline

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create a pipeline.
model_0 = Pipeline([
    ('tf-idf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Fit the pipeline to the training data.
model_0.fit(X=train_sentences,
            y=train_labels_encoded)

In [158]:
model_0.score(X=val_sentences,
              y=val_labels_encoded)

0.7218323844829869

In [None]:
# Make predictions using the baseline model.
baseline_preds = model_0.predict(val_sentences)
baseline_preds

In [None]:
val_labels_encoded

#### Download the helper functions.

In [None]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

In [162]:
from helper_functions import calculate_results

In [None]:
# Calculate baseline results
baseline_results = calculate_results(y_true=val_labels_encoded,
                                     y_pred=baseline_preds)
baseline_results

### Preparing the data (that is, the texts) for deep sequence models

Before any deep learning modelling, you need to create vectorization and embedding layers.

In [164]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
# How long is each sentence on average?
sent_lens = [len(sentence.split()) for sentence in train_sentences]
avg_sent_len = np.mean(sent_lens)
avg_sent_len

In [None]:
# What's the distribution look like?
import matplotlib.pyplot as plt
plt.hist(sent_lens, bins=25);

In [None]:
# What sentence length (in tokens) covers 95% of the training examples?
output_seq_len = int(np.percentile(sent_lens, 95))
output_seq_len

In [None]:
# Find the maximum length of the sentences
max(sent_lens)

### Create text vectorizer layer

Make a layer which maps texts from words to numbers.

In [169]:
max_tokens = 68000

In [170]:
# Create a text vectorizer
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Use the default TextVectorization parameters.
text_vectorizer = TextVectorization(max_tokens=max_tokens,
                                    output_sequence_length=output_seq_len
                                    )

In [171]:
text_vectorizer.adapt(train_sentences)

In [None]:
# Test the text vectorizer on a random training sentence
import random
target_sentence = random.choice(train_sentences)
print(f'Text: \n {target_sentence}')
print(f'\nLength of text: {len(target_sentence.split())}')
print(f'\nVectorized text: {text_vectorizer([target_sentence])}')

In [None]:
 # Number of words in the training vocabulary
 rct_20k_text_vocab = text_vectorizer.get_vocabulary()
 print(f"Number of words in vocab: {len(rct_20k_text_vocab)}")
 print(f'Most common words in the vocab: {rct_20k_text_vocab[:5]}')
 print(f'Least common words in the vocab: {rct_20k_text_vocab[-5:]}')

In [None]:
# Get the configuration of the text vectorizer
text_vectorizer.get_config()

### Create custom text embedding

In [175]:
# Create token embedding layer
# input_dim is the size of the adapted vocabulary; each token id is mapped to a
# 128-dimensional vector. mask_zero=True treats id 0 as padding to be masked.
token_embed = layers.Embedding(input_dim=len(rct_20k_text_vocab),
                                   output_dim=128,
                                   mask_zero=True,
                                   name="token_embedding")

In [None]:
# Example of a sentence that has undergone vectorization and embedding.
print(f'Sentence before vectorization: \n {target_sentence}')
vectorized_sentence = text_vectorizer([target_sentence])
# "\n" (not "/n") so the vectorized tensor starts on its own line.
print(f"The vectorized text: \n {vectorized_sentence}")
embedded_sentence = token_embed(vectorized_sentence)
print(f'The embedded sentence:\n\n {embedded_sentence}')
print(f'Embedding sentence shape: {embedded_sentence.shape}')

## Creating datasets (make sure the data loads as fast as possible)

Use: tf.data API

In [None]:
# Turn the data into tensorflow Datasets
train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences, train_labels_one_hot))
valid_dataset = tf.data.Dataset.from_tensor_slices((val_sentences, val_labels_one_hot))
test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences, test_labels_one_hot))

len(train_dataset)

In [None]:
# Take the TensorSliceDatasets and turn them into batched, prefetched datasets
train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
valid_dataset = valid_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# After batching, len() reports the number of batches rather than samples.
len(train_dataset)

## Model 1: Conv1D with token embeddings

In [179]:
# Create 1D conv model
inputs = layers.Input(shape=(1,), dtype=tf.string)  # one raw string per example
text_vectors = text_vectorizer(inputs)  # strings -> integer token ids
token_embeddings = token_embed(text_vectors)  # token ids -> 128-dim embeddings
x = layers.Conv1D(64, kernel_size=5, padding='same', activation='relu')(token_embeddings)
x = layers.GlobalAveragePooling1D()(x)  # average over the sequence axis
outputs = layers.Dense(num_classes, activation='softmax')(x)  # one probability per class
model_1 = tf.keras.Model(inputs, outputs)

# Compile the model
# categorical_crossentropy is used because the datasets carry one-hot labels.
model_1.compile(loss='categorical_crossentropy',
                optimizer = tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [None]:
model_1.summary()

In [None]:
len(train_dataset)

In [None]:
# Fit model_1 (each epoch trains on only 10% of the batches to keep experiments fast)
history_model_1 = model_1.fit(train_dataset,
                              steps_per_epoch=int(0.1*len(train_dataset)),
                              epochs=3,
                              validation_data=valid_dataset,
                              validation_steps=int(0.1*len(valid_dataset))  # Validation on only 10% of the batches.
                              )

In [None]:
# Make Evaluation on the whole validation dataset
model_1.evaluate(valid_dataset)

In [None]:
# Make prediction (The model displays a prediction probability to each class.)
model_1_pred_probs = model_1.predict(valid_dataset)
model_1_pred_probs

In [None]:
# Convert the prediction probabilities to classes
# argmax returns the index of the class with the highest prediction probability.
model_1_preds = tf.argmax(model_1_pred_probs, axis=1)
model_1_preds[:10]

In [None]:
class_names

In [None]:
# Calculate model_1 results
model_1_results = calculate_results(y_true=val_labels_encoded,
                                    y_pred=model_1_preds)
model_1_results

## Model 2: Feature extraction with pretrained token embeddings

This model uses the Universal Sentence Encoder (USE) from TensorFlow Hub.


In [188]:
# Download pretrained TensorFlow Hub use
import tensorflow_hub as hub
model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
tf_hub_embedding_layer = hub.KerasLayer(model_url,
                                        trainable=False,
                                        name="Universal_sentence_encoder")

In [None]:
# Test the pretrained embedding on a random training sentence.
random_train_sentence = random.choice(train_sentences)
print(f"Random sentences:\n {random_train_sentence}")
use_embedding_sentence = tf_hub_embedding_layer([random_train_sentence])
# Show only the first 30 values of the embedding vector to keep the output short.
print(f'sentence after embedding:\n{use_embedding_sentence[0][:30]} \n')
print(f'Length of sentence embedding: {len(use_embedding_sentence[0])}')

### Building and fitting an NLP feature extraction model using pretrained embeddings

In [190]:
# Define feature extraction layer.
inputs = layers.Input(shape=[], dtype=tf.string)
pretrained_embedding = tf_hub_embedding_layer(inputs)
x = layers.Dense(128, activation='relu')(pretrained_embedding)
outputs = layers.Dense(num_classes, activation='softmax')(x)
model_2 = tf.keras.Model(inputs=inputs,
                         outputs=outputs,
                         name='model_2_USE_feature_extraction')

# Compile the model.
model_2.compile(loss='categorical_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [None]:
model_2.summary()

In [None]:
# Fit the model 2 to the data
history_model_2 = model_2.fit(train_dataset,
                              epochs=3,
                              steps_per_epoch=int(0.1 * len(train_dataset)),
                              validation_data=valid_dataset,
                              validation_steps=int(0.1 * len(valid_dataset)))

In [None]:
# Evaluate on the whole validation dataset
model_2.evaluate(valid_dataset)

In [None]:
# Model 2 predictions
model_2_pred_probs = model_2.predict(valid_dataset)
model_2_pred_probs

In [None]:
# Convert the prediction probabilities to labels.
model_2_preds = tf.argmax(model_2_pred_probs, axis=1)
model_2_preds

In [None]:
# Calculate results on the validation dataset.
model_2_results = calculate_results(y_true=val_labels_encoded,
                                    y_pred=model_2_preds)
model_2_results

## Model 3: Conv1D with character embeddings

In the research paper they used both character- and token-level embeddings.


### Creating a character-level tokenizer

In [None]:
# Helper to split a sentence into space-separated characters.
def split_chars(text):
  """Returns `text` with a single space inserted between consecutive characters."""
  return ' '.join(text)

# Test the split
split_chars(random_train_sentence)

In [None]:
 # Split sequence-level data splits into character-level data splits
train_chars = [split_chars(sentence) for sentence in train_sentences]
val_chars = [split_chars(sentence) for sentence in val_sentences]
test_chars = [split_chars(sentence) for sentence in test_sentences]
train_chars[:5]

In [199]:
# What is the average character length of a training sentence?
char_lens = [len(sentence) for sentence in train_sentences]
mean_char_len = np.mean(char_lens)
mean_char_len

149.3662574983337

In [None]:
# Check the distribution of the sequences at character-level
import matplotlib.pyplot as plt
plt.hist(char_lens, bins=10)

In [201]:
# Find what character length covers 95% of sequences
output_seq_char_len = int(np.percentile(char_lens, 95))
output_seq_char_len

290

In [202]:
# Get all keyboard characters
import string
alphabet = string.ascii_lowercase + string.digits + string.punctuation
alphabet


'abcdefghijklmnopqrstuvwxyz0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [203]:
# Create a char-level token vectorizer instance
# The +2 presumably reserves slots for the padding and out-of-vocabulary tokens - confirm.
# NOTE(review): `alphabet` includes punctuation, but standardize='lower_and_strip_punctuation'
# removes punctuation before tokenizing, so the adapted vocabulary may end up smaller than
# NUM_CHAR_TOKENS - confirm this is intended.
NUM_CHAR_TOKENS = len(alphabet) + 2
char_vectorizer = TextVectorization(max_tokens=NUM_CHAR_TOKENS,
                                    output_sequence_length=output_seq_char_len,
                                    standardize='lower_and_strip_punctuation',
                                    name='char_vectorizer')

In [204]:
# Adapt character vectorizer to training character
char_vectorizer.adapt(train_chars)

In [None]:
# Check char vocab start
char_vocab = char_vectorizer.get_vocabulary()
print(f"number of diff chars: {len(char_vocab)}")
print(f'5 most common char: {char_vocab[:5]}')
print(f'5 least common char: {char_vocab[-5:]}')

In [None]:
# Test out character vectorizer
random_train_chars = random.choice(train_chars)
print(f"charified text:\n {random_train_chars}")
print(f"Length of random_train_chars: {len(random_train_chars.split())}")
vectorized_chars = char_vectorizer([random_train_chars])
print(f'\nvectorized chars: \n{vectorized_chars}')
print(f'\n Length of the vectorized chars: {len(vectorized_chars[0])}')

### Creating char embedding layer


In [207]:
char_embed = layers.Embedding(input_dim=len(char_vocab),
                              output_dim=25,
                              mask_zero=True,
                              name="char_embed")


In [None]:
# Test character embedding layer.
print(f"Charified text:\n {random_train_chars} \n")
char_embed_example = char_embed(char_vectorizer([random_train_chars]))
print(f"Embedded chars. \n {char_embed_example}")
print(f'Character embedding shape: {char_embed_example.shape}')

### Character-level embedding model

In [209]:
inputs = layers.Input(shape=(1,), dtype='string')
char_vectors = char_vectorizer(inputs)
char_embeddings = char_embed(char_vectors)
x = layers.Conv1D(64, kernel_size=5, padding='same', activation='relu')(char_embeddings)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(num_classes, activation='softmax')(x)
model_3 = tf.keras.Model(inputs=inputs,
                         outputs=outputs,
                         name='model_3_conv1d_char_embeddings')

# Compile the model.
model_3.compile(loss='categorical_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

In [None]:
model_3.summary()

In [None]:
# Create character level datasets
train_char_dataset = tf.data.Dataset.from_tensor_slices((train_chars, train_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)
val_char_dataset = tf.data.Dataset.from_tensor_slices((val_chars, val_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)
test_char_dataset = tf.data.Dataset.from_tensor_slices((test_chars, test_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)

train_char_dataset

In [None]:
# Fit the model on chars only
model_3_history = model_3.fit(train_char_dataset,
                              steps_per_epoch=int(0.1*len(train_char_dataset)),
                              epochs=3,
                              validation_data=val_char_dataset,
                              validation_steps=int(0.1*len(val_char_dataset)))

In [None]:
# Make predictions with the character-level model
model_3_pred_probs = model_3.predict(val_char_dataset)
model_3_pred_probs

In [214]:
# Convert the prediction probabilities to class labels
model_3_preds = tf.argmax(model_3_pred_probs, axis=1)
model_3_preds

<tf.Tensor: shape=(30212,), dtype=int64, numpy=array([1, 1, 2, ..., 4, 4, 0])>

In [None]:
# Calc the results of model 3
model_3_results = calculate_results(y_true=val_labels_encoded,
                                    y_pred=model_3_preds)
model_3_results

## Model 4: Combining pretrained token embeddings and character embeddings (hybrid embedding layer)

1. Create a token-level embedding model (similar to `model_1`)
2. Create a character-level model (similar to `model_3` with a slight modification)
3. Combine 1 & 2 with a concatenate (`layers.Concatenate`)
4. Build a series of output layers on top of 3, similar to Figure 1 and section 4.2 of the paper
5. Construct a model that takes token and character-level sequences as inputs and produces sequence label probabilities

In [216]:
# 1. Setup token inputs/model
token_inputs = layers.Input(shape=[], dtype=tf.string, name='token_input')
token_embeddings = tf_hub_embedding_layer(token_inputs)
token_outputs = layers.Dense(128, activation='relu')(token_embeddings)
token_model = tf.keras.Model(inputs=token_inputs,
                             outputs=token_outputs)

# 2. Setup char inputs/model
char_inputs = layers.Input(shape=(1,), dtype=tf.string, name='char_input')
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = layers.Bidirectional(layers.LSTM(24)) (char_embeddings)
char_model = tf.keras.Model(inputs=char_inputs,
                            outputs=char_bi_lstm)

# 3. Concatenate token and char model outputs (create hybrid token embedding)
token_char_concat = layers.Concatenate(name="token_char_hybrid")([token_model.output,
                                                                  char_model.output])

# 4. Create an output layer - adding in Dropout
combined_dropout = layers.Dropout(0.5)(token_char_concat)
combined_dense = layers.Dense(128, activation='relu')(combined_dropout)
final_dropout = layers.Dropout(0.5)(combined_dense)
output_layer = layers.Dense(num_classes, activation='softmax')(final_dropout)

# 5. Construct model with token and char inputs
# The input order here (token, char) must match the order of the data tuple fed to fit().
model_4 = tf.keras.Model(inputs=[token_model.input, char_model.input],
                      outputs=output_layer,
                      name='model_4_token_and_char'
                      )

In [None]:
model_4.summary()

In [None]:
# Plot the hybrid token and character model
# Import from tensorflow.keras (as the rest of the notebook does) so the plotting
# utility comes from the same Keras that built the model.
from tensorflow.keras.utils import plot_model
plot_model(model_4, show_shapes=True)

In [219]:
# Compile token char model
model_4.compile(loss='categorical_crossentropy',
                optimizer=tf.keras.optimizers.Adam(),  # you can as well use SGD
                metrics=["accuracy"])

### Combining token and character data into a tf.data.Dataset


In [220]:
# Combine chars and tokens into a dataset.
# The tuple order (sentences, chars) matches model_4's input order (token, char).
train_char_token_data = tf.data.Dataset.from_tensor_slices((train_sentences, train_chars)) # make data.
train_char_token_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)   # make labels.
train_char_token_dataset = tf.data.Dataset.zip((train_char_token_data, train_char_token_labels))  # Combines the data and labels

# Batch and prefetch train data.
# Batch size 32 (was the typo 23) to match the validation set and every other dataset here.
train_char_token_dataset = train_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

In [221]:
# Repeat the process for the validation dataset.
val_char_token_data = tf.data.Dataset.from_tensor_slices((val_sentences, val_chars))
val_char_token_label = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_char_token_dataset = tf.data.Dataset.zip((val_char_token_data, val_char_token_label))

val_char_token_dataset = val_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

In [None]:
# Check the training and validation char/token datasets.
train_char_token_dataset, val_char_token_dataset

### Fitting a model on token and char level sequences

In [None]:
# Fit the model on the token and char datasets (10% of the batches per epoch).
history_model_4 = model_4.fit(train_char_token_dataset,
                              steps_per_epoch=int(0.1*len(train_char_token_dataset)),
                              epochs=3,
                              validation_data=val_char_token_dataset,
                              validation_steps=int(0.1 * len(val_char_token_dataset)))

In [None]:
# Evaluate the model.
model_4.evaluate(val_char_token_dataset)

In [None]:
# Make predictions for model 4 on the validation char token dataset.
model_4_pred_probs = model_4.predict(val_char_token_dataset)
model_4_pred_probs[:5]

In [None]:
# Convert the prediction probabilities into class labels.
model_4_preds = tf.argmax(model_4_pred_probs, axis=1)
model_4_preds

In [None]:
# Get results of token-char-hybrid model.
model_4_results = calculate_results(y_true=val_labels_encoded,
                                   y_pred=model_4_preds)
model_4_results

## Model 5: Transfer learning with pretrained token embeddings + character embeddings + positional embeddings

In [None]:
train_df.head()

**NOTE:** Any engineered feature that was used at train time needs to be available at test time.

Here, the line numbers and total lines are available.

In [None]:
# How many different line numbers are there?
train_df['line_number'].value_counts()

In [None]:
# Find the distribution of the line number
train_df.line_number.plot.hist()

In [None]:
# Create one-hot-encoded tensors using tensorflow for the line number in the dataset.
train_line_numbers_one_hot = tf.one_hot(train_df['line_number'].to_numpy(), depth=15)
val_line_number_one_hot = tf.one_hot(val_df['line_number'].to_numpy(), depth=15)
test_line_number_one_hot = tf.one_hot(test_df['line_number'].to_numpy(), depth=15)
train_line_numbers_one_hot[:10], train_line_numbers_one_hot.shape

In [None]:
# One hot encode the total numbers of lines in the dataset.
train_df['total_lines'].value_counts()

In [None]:
# Check which total_lines value covers 98% of the training samples
# (this motivates the one-hot depth of 20 used for total_lines).
np.percentile(train_df.total_lines, 98)

In [None]:
train_total_lines_one_hot = tf.one_hot(train_df['total_lines'].to_numpy(), depth=20)
val_total_lines_one_hot = tf.one_hot(val_df['total_lines'].to_numpy(), depth=20)
test_total_lines_one_hot = tf.one_hot(test_df['total_lines'].to_numpy(), depth=20)
train_total_lines_one_hot[:10], train_total_lines_one_hot.shape

### Building a tribrid embedding model

1. Create a token-level model
2. Create a character-level model.
3. Create a model for the 'line_number' features.
4. Create a model for the 'total_lines' features.
5. Combine the outputs of 1 and 2 using tf.keras.layers.Concatenate.
6. Combine the outputs of 3, 4, 5 using tf.keras.layers.Concatenate.
7. Create an output layer to accept the tribrid embedding and output label probabilities.
8. Combine the inputs of 1, 2, 3, 4 and the outputs of 7 into tf.keras.Model.

In [249]:
# 1. Token inputs: pretrained sentence embedding followed by a dense layer
token_inputs = layers.Input(shape=[], dtype=tf.string, name='token_inputs')
token_embeddings = tf_hub_embedding_layer(token_inputs)
token_outputs = layers.Dense(128, activation='relu')(token_embeddings)
token_model = tf.keras.Model(inputs=token_inputs,
                             outputs=token_outputs)

# 2. Char inputs: char vectorizer -> char embedding -> bidirectional LSTM
char_inputs = layers.Input(shape=(1,), dtype=tf.string, name='char_input')
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = layers.Bidirectional(layers.LSTM(24))(char_embeddings)
char_model = tf.keras.Model(inputs=char_inputs,
                            outputs=char_bi_lstm)

# 3. Line numbers model (input width matches the one-hot depth of 15)
line_number_inputs = layers.Input(shape=(15,), dtype=tf.float32, name='line_number_input')
line_number_outputs = layers.Dense(32, activation='relu')(line_number_inputs)
line_number_model = tf.keras.Model(inputs=line_number_inputs,
                                   outputs=line_number_outputs)

# 4. Total lines model (input width matches the one-hot depth of 20)
total_lines_inputs = layers.Input(shape=(20,), dtype=tf.float32, name='total_lines_input')
total_lines_outputs = layers.Dense(32, activation='relu')(total_lines_inputs)
total_lines_model = tf.keras.Model(inputs=total_lines_inputs,
                                   outputs=total_lines_outputs)

# 5. Combine the token and char outputs by passing them as a list.
combined_embeddings = layers.Concatenate(name='char_token_hybrid_embedding')([token_model.output,
                                                                             char_model.output])
hybrid_dense = layers.Dense(256, activation='relu')(combined_embeddings)
hybrid_dropout = layers.Dropout(0.5)(hybrid_dense)

# 6. Combine positional embeddings with the combined token and char embeddings
tribrid_embeddings = layers.Concatenate(name='char_token_positional_embedding')([line_number_model.output,
                                                                                 total_lines_model.output,
                                                                                 hybrid_dropout])

# 7. Create output layer: one probability per class (num_classes rather than a
#    hard-coded 5, consistent with the other models in this notebook).
output_layer = layers.Dense(num_classes, activation='softmax', name='output_layer')(tribrid_embeddings)

# 8. Finally put it all together.
#    The input order (line number, total lines, token, char) must match the order
#    of the data tuple fed to fit()/predict().
model_5 = tf.keras.Model(inputs=[line_number_model.input,
                                 total_lines_model.input,
                                 token_model.input,
                                 char_model.input],
                         outputs=output_layer,
                         name='tribrid_embedding')

In [None]:
# Get the summary for the model.
model_5.summary()

In [None]:
# Plot model_5 to explore it visualy
from tensorflow.keras.utils import plot_model
plot_model(model_5, show_shapes=True)

In [251]:
# Compile the token, char, and positional embedding model.
# Label smoothing (0.2) softens the one-hot targets to reduce overfitting and
# improve generalization.
model_5.compile(loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=['accuracy'])

### Create tribrid embedding data using tf.data

In [246]:
# Create training and validation dataset (with all four kinds of input data)

train_char_token_pos_data = tf.data.Dataset.from_tensor_slices((train_line_numbers_one_hot,
                                                               train_total_lines_one_hot,
                                                               train_sentences,
                                                               train_chars))
train_char_token_pos_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
train_char_token_pos_dataset = tf.data.Dataset.zip((train_char_token_pos_data, train_char_token_pos_labels))
train_char_token_pos_dataset = train_char_token_pos_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# for the validation dataset.
val_char_token_pos_data = tf.data.Dataset.from_tensor_slices((val_line_number_one_hot,
                                                             val_total_lines_one_hot,
                                                             val_sentences,
                                                             val_chars))
val_char_token_pos_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_char_token_pos_dataset = tf.data.Dataset.zip((val_char_token_pos_data, val_char_token_pos_labels))
val_char_token_pos_dataset = val_char_token_pos_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

In [None]:
# Check input shapes
train_char_token_pos_dataset, val_char_token_pos_dataset

In [None]:
# Fit the model (10% of the batches per epoch, as with the other experiments).
history_model_5 = model_5.fit(train_char_token_pos_dataset,
                              steps_per_epoch=int(0.1 * len(train_char_token_pos_dataset)),
                              epochs=5,
                              validation_data=val_char_token_pos_dataset,
                              validation_steps=int(0.1*len(val_char_token_pos_dataset)))
# Backward-compatible alias for the original, misspelled variable name.
hitory_model_5 = history_model_5

In [None]:
# Make predictions with the char/token/positional model.
model_5_pred_probs = model_5.predict(val_char_token_pos_dataset, verbose=1)
model_5_pred_probs

In [260]:
# Convert pred probs to pred labels'
model_5_preds = tf.argmax(model_5_pred_probs, axis=1)
model_5_preds

<tf.Tensor: shape=(30212,), dtype=int64, numpy=array([0, 0, 3, ..., 4, 4, 1])>

In [None]:
# Calculate the results of model_5
model_5_results = calculate_results(y_true=val_labels_encoded,
                                   y_pred=model_5_preds)
model_5_results

In [266]:
### Compare the models results.
all_model_results = pd.DataFrame({"model_0_baseline": baseline_results,
                                  "model_1_custom_token_embedding": model_1_results,
                                  "model_2_pretrained_token_embeddin": model_2_results,
                                  "model_3_custom_char_embedding": model_3_results,
                                  "model_4_hybrid_char_token_embedding": model_4_results,
                                  "model_5_pos_char_token_embedding": model_5_results,
                                 })

In [None]:
# Transpose so each model is a row and each metric is a column.
# Guarded so that re-running this cell does not flip the table back.
if "accuracy" in all_model_results.index:
  all_model_results = all_model_results.transpose()
all_model_results

In [275]:
# Bring accuracy onto the same 0-1 scale as the other metrics.
# Guarded so that re-running this cell does not divide by 100 a second time.
if all_model_results["accuracy"].max() > 1:
  all_model_results["accuracy"] = all_model_results["accuracy"] / 100

In [None]:
all_model_results.plot(kind='bar', figsize=(10, 7)).legend(bbox_to_anchor=(1.0, 1.0))

In [None]:
# Sort the results by f1 score
all_model_results.sort_values('f1', ascending=True)['f1'].plot(kind='bar', figsize=(10, 7))

## Save and load model

In [282]:
model_5.save("skimlit_tribrid_model")

In [283]:
loaded_model = tf.keras.models.load_model("skimlit_tribrid_model")

In [None]:
loaded_model_pred_probs = loaded_model.predict(val_char_token_pos_dataset)
loaded_preds = tf.argmax(loaded_model_pred_probs, axis=1)
loaded_preds[:10]

In [None]:
loaded_model_results = calculate_results(y_true=val_labels_encoded,
                                         y_pred=loaded_preds)
loaded_model_results

In [287]:
# Saving and reloading can introduce tiny floating-point differences, so compare
# the metric values with a tolerance instead of exact equality.
# Assumes calculate_results returns a dict of numeric metrics - confirm against helper_functions.py.
assert model_5_results.keys() == loaded_model_results.keys()
assert np.allclose([model_5_results[metric] for metric in model_5_results],
                   [loaded_model_results[metric] for metric in model_5_results])