# Chapter 3 - Transfer Learning

In this chapter we'll be exploring *transfer learning*, where a model trained for one purpose is used for another. We'll take our initial model and enhance it using text and source code embeddings.

Since we have already explained the workings of this model in the previous chapter, the comments for the model basics have been removed. See [Chapter 2 (add link)]() and [Weakly Supervised Learning - Stack Overflow Tag Labeler.ipynb](../ch02/Weakly%20Supervised%20Learning%20-%20Stack%20Overflow%20Tag%20Labeler.ipynb) to learn more about the model itself.

In [1]:
import gc
import json
import math
import os
import re
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow
import tensorflow as tf
import tensorflow_hub as hub

# Append the parent of the current working directory to the import path so
# that the `lib` package imported just below can be resolved from there.
parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)

import lib.utils

# Suppress every warning for the rest of the session; this keeps the cell
# output short but also hides deprecation notices.
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /home/rjurney/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rjurney/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Seed NumPy's global random number generator with a fixed value so that
# NumPy-driven randomness is repeatable between runs.
np.random.seed(seed=1337)

In [3]:
# Ask TensorFlow whether a GPU is available, without restricting the check to
# CUDA devices or to a minimum compute capability.
# NOTE(review): `tf.test.is_gpu_available` is reported as deprecated in later
# TensorFlow 2.x releases - confirm against the installed version.
gpu_avail = tf.test.is_gpu_available(
    cuda_only=False,
    min_cuda_compute_capability=None
)
print(f'1 or more GPUs is available: {gpu_avail}')

# List the physical GPU devices that TensorFlow reports
avail_gpus = tf.compat.v2.config.experimental.list_physical_devices('GPU')
print(f'GPUs on tap: {avail_gpus}')

1 or more GPUs is available: True
GPUs on tap: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [5]:
# Maximum column width pandas uses when displaying DataFrames; applied through
# the 'display.max_colwidth' option
COLUMN_WIDTH = 50
pd.set_option('display.max_colwidth', COLUMN_WIDTH)

In [6]:
# Shared settings used by the cells below
BATCH_SIZE  = 128    # batch size passed to model.fit
MAX_LEN     = 200    # length every token sequence is padded/truncated to
TOKEN_COUNT = 10000  # vocabulary size for the tokenizer and embedding layers
EMBED_SIZE  = 300    # embedding vector width (matches the 300d GloVe file)
TEST_SPLIT  = 0.3    # fraction held out by train_test_split / validation_split

In [8]:
# Tag limit defines which dataset to load - those with tags having at least 50K, 20K, 10K, 5K or 2K instances
TAG_LIMIT = 2000

# Pre-computed sorted list of tag/index pairs. A context manager closes the
# file handle as soon as the JSON has been parsed.
with open(f'../data/stackoverflow/sorted_all_tags.{TAG_LIMIT}.json') as tags_file:
    sorted_all_tags = json.load(tags_file)

# One label column exists per index from 0 up to the last index in the sorted list
max_index = sorted_all_tags[-1][0] + 1

# Load the parquet file using pyarrow for this tag limit, using the sorted tag
# index to select the text column plus every label column
posts_df = pd.read_parquet(
    f'../data/stackoverflow/Questions.Stratified.Final.{TAG_LIMIT}.parquet',
    columns=['_Body'] + [f'label_{i}' for i in range(max_index)],
    engine='pyarrow'
)
posts_df.head(2)

Unnamed: 0,_Body,label_0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,...,label_776,label_777,label_778,label_779,label_780,label_781,label_782,label_783,label_784,label_785
0,"[How, animate, Flutter, layout, keyboard, appe...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[Creating, Carousel, using, FutureBuilder, I, ...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Report the size of the loaded dataset. The threshold in the message comes
# from TAG_LIMIT so the text stays correct when a different dataset is loaded.
print(
    f'{len(posts_df.index):,} Stack Overflow questions with a tag having '
    f'at least {TAG_LIMIT:,} occurrences'
)

1,554,788 Stack Overflow questions with a tag having at least 2,000 occurrences


In [10]:
# Pull the label columns out as a NumPy array. `to_numpy()` is used here (as it
# is when building `labels` later) because `DataFrame.as_matrix()` is no longer
# available in current pandas releases.
test_matrix = posts_df[[f'label_{i}' for i in range(0, max_index)]].to_numpy()

# Number of tags attached to each question, computed once and reused below
row_sums = test_matrix.sum(axis=1)

# (questions with at least one tag, total questions, fewest tags, most tags)
tests = (
    np.count_nonzero(row_sums),
    row_sums.shape[0],
    row_sums.min(),
    row_sums.max(),
)

print(f'Non-zero rows: {tests[0]:,}, Total rows: {tests[1]:,}, Non-zero ratio: {tests[0]/tests[1]:,}, Least tags: {tests[2]:,}, Most tags: {tests[3]:,}')

Non-zero rows: 1,554,788, Total rows: 1,554,788, Non-zero ratio: 1.0, Least tags: 1, Most tags: 6


In [11]:
# Load the tag -> index and index -> tag lookups for this tag limit, closing
# each file as soon as it has been parsed
with open(f'../data/stackoverflow/tag_index.{TAG_LIMIT}.json') as tag_index_file:
    tag_index = json.load(tag_index_file)
with open(f'../data/stackoverflow/index_tag.{TAG_LIMIT}.json') as index_tag_file:
    index_tag = json.load(index_tag_file)

# Sanity check the different files: all three must describe the same number of tags
assert len(tag_index) == len(index_tag) == len(sorted_all_tags)

In [12]:
# Every column after the first (the '_Body' text column) is a label column
label_columns = list(posts_df.columns)[1:]
labels = posts_df[label_columns].to_numpy()

# Trim the dataset to a whole multiple of BATCH_SIZE * MAX_LEN; the original
# author notes this is required by the Elmo embedding layer
rows_per_unit = BATCH_SIZE * MAX_LEN
highest_factor = math.floor(len(posts_df.index) / rows_per_unit)
training_count = highest_factor * rows_per_unit
print('Highest Factor: {:,} Training Count: {:,}'.format(highest_factor, training_count))

# Each '_Body' value is an array of tokens; join them back into one string per question
documents = [
    ' '.join(body.tolist())
    for body in posts_df[0:training_count]['_Body'].values.tolist()
]

labels = labels[:training_count]

# The DataFrame is no longer needed - release it to conserve RAM
del posts_df
gc.collect()

# Documents (x) and labels (y) must line up one-to-one
assert len(documents) == training_count == labels.shape[0]

Highest Factor: 60 Training Count: 1,536,000


In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit a tokenizer on the documents. Note that, despite its name, '__PAD__' is
# configured as the out-of-vocabulary token; padding itself uses the value 0 in
# pad_sequences below.
tokenizer = Tokenizer(
    num_words=TOKEN_COUNT + 1,
    oov_token='__PAD__'
)
tokenizer.fit_on_texts(documents)

# Keep only vocabulary entries whose index is at most TOKEN_COUNT
tokenizer.word_index = {e: i for e, i in tokenizer.word_index.items() if i <= TOKEN_COUNT}

sequences = tokenizer.texts_to_sequences(documents)

# Pad or truncate (both at the end) so that every sequence has MAX_LEN entries
padded_sequences = pad_sequences(
    sequences,
    maxlen=MAX_LEN,
    dtype='int32',
    padding='post',
    truncating='post',
    value=0,
)
# The earlier `tokenizer.sequences_to_matrix(padded_sequences, mode='tfidf')`
# call was dropped: its return value was never stored or used, so it only cost
# time and memory across every document.

# Conserve RAM
del documents
del sequences
gc.collect()

# Verify that all padded documents are now the same length; the array's second
# dimension is the length of every row
assert padded_sequences.shape[1] == MAX_LEN

padded_sequences.shape

(1536000, 200)

## Load GloVe Embeddings

Stanford defines [GloVe Embeddings](https://nlp.stanford.edu/projects/glove/) as:

> GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.

We'll try them out to see if they can beat our own embedding, specific to our data.

In [14]:
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Args:
        word: The token the vector belongs to.
        *arr: The vector components; any values ``np.asarray`` can convert to
            ``float32`` (for example the numeric strings of one embedding line).

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D ``float32`` array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

# Parse the GloVe text file into a {word: vector} dictionary. Each line is split
# on whitespace and handed to get_coefs (first field = word, rest = vector).
# The context manager closes the file once it has been read, and the encoding
# is stated explicitly so the result does not depend on the platform default.
with open('../data/GloVe/glove.6B.300d.txt', encoding='utf-8') as glove_file:
    embeddings_index = dict(get_coefs(*line.strip().split()) for line in glove_file)

In [15]:
# Stack every GloVe vector into one array to get the overall mean and standard
# deviation (used by the random-normal alternative kept in comments below).
# np.stack is given a real list because it expects a sequence, not a dict view.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

# Create embedding matrix using our vocabulary
word_index = tokenizer.word_index
nb_words = min(TOKEN_COUNT, len(word_index))
print(nb_words)

# Random normal for missing entries
# embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, EMBED_SIZE))
# for word, i in word_index.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None: 
#         embedding_matrix[i] = embedding_vector

# Zero for missing entries: row i holds the GloVe vector for the word with
# tokenizer index i, and stays all-zero when GloVe has no vector for it
embedding_matrix = np.zeros((nb_words, EMBED_SIZE))
for word, i in word_index.items():
    # The matrix has rows 0..nb_words-1, but word_index can contain an index
    # equal to nb_words; skip it instead of raising an IndexError
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

embedding_matrix.shape, word_index

10000


((10000, 300),
 {'__PAD__': 1,
  'pad': 2,
  'i': 3,
  'using': 4,
  'like': 5,
  'code': 6,
  'the': 7,
  'get': 8,
  'use': 9,
  'how': 10,
  'file': 11,
  'want': 12,
  'would': 13,
  'way': 14,
  'error': 15,
  'one': 16,
  'data': 17,
  'is': 18,
  '1': 19,
  'need': 20,
  '2': 21,
  'following': 22,
  'problem': 23,
  'trying': 24,
  'this': 25,
  'app': 26,
  'work': 27,
  'know': 28,
  'user': 29,
  'function': 30,
  'c': 31,
  'class': 32,
  'what': 33,
  'but': 34,
  'tried': 35,
  'application': 36,
  'example': 37,
  'so': 38,
  'also': 39,
  'method': 40,
  'time': 41,
  'set': 42,
  'new': 43,
  'server': 44,
  '0': 45,
  'in': 46,
  'something': 47,
  'run': 48,
  'thanks': 49,
  'if': 50,
  'value': 51,
  'make': 52,
  'my': 53,
  'it': 54,
  'help': 55,
  'create': 56,
  'works': 57,
  'project': 58,
  '3': 59,
  'first': 60,
  'could': 61,
  'however': 62,
  'page': 63,
  'working': 64,
  'find': 65,
  'see': 66,
  'object': 67,
  'list': 68,
  'files': 69,
  'questio

## Load Elmo Embedding layer

Here we load the Elmo embedding layer using Tensorflow Hub.

In [16]:
# Load the ELMo module (version 2) from TensorFlow Hub, marked as trainable
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

# Apply the module to two sample sentences using its "default" signature and
# keep the "elmo" entry of the returned dictionary.
# NOTE(review): `embeddings` is not referenced again in the visible part of the
# notebook - presumably this call is only a smoke test; confirm.
embeddings = elmo(
    ["the cat is on the mat", "dogs are in the fog"],
    signature="default",
    as_dict=True)["elmo"]

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [None]:
from sklearn.model_selection import train_test_split

# Hold out TEST_SPLIT of the rows for testing; the fixed random_state keeps the
# split identical between runs
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=TEST_SPLIT, random_state=1337
)

# The unsplit arrays are no longer needed - release them to conserve RAM
del padded_sequences, labels
gc.collect()

# Each split must pair every row with a label row and keep the padded length
for features, targets in ((X_train, y_train), (X_test, y_test)):
    assert features.shape[0] == targets.shape[0]
    assert features.shape[1] == MAX_LEN

In [None]:
# Weight each class by (count of the most frequent class) / (count of this
# class), so rarer tags receive proportionally larger weights
train_label_counts = np.sum(y_train, axis=0)
train_weight_vec = list(np.max(train_label_counts) / train_label_counts)
train_class_weights = dict(enumerate(train_weight_vec))

# Same calculation for the test split
test_label_counts = np.sum(y_test, axis=0)
test_weight_vec = list(np.max(test_label_counts) / test_label_counts)
test_class_weights = dict(enumerate(test_weight_vec))

# Show both weight tables ordered from the lightest to the heaviest class
(
    sorted(train_class_weights.items(), key=lambda pair: pair[1]),
    sorted(test_class_weights.items(), key=lambda pair: pair[1]),
)

## Create a Performance Log for the Model

We keep a log of every run, so that the first model's performance serves as a reference point and each new run can be compared with the one before it.

In [17]:
# Load the experiment log exactly once per session. If `simple_log` already
# exists (the cell is being re-run) it is left untouched: reading the file
# again would append every stored entry a second time, and those duplicates
# would then be written back to disk by the logging cells below.
try:
    simple_log
except NameError:
    simple_log = []
    try:
        with open('simple_log.jsonl') as f:
            # One JSON object per line; blank lines are skipped
            simple_log.extend(json.loads(line) for line in f if line.strip())
    except FileNotFoundError:
        # No log yet - this is the first experiment
        pass

# Continue numbering from the most recent logged run, or start at 0
SEQUENCE = simple_log[-1]['sequence'] if simple_log else 0

SEQUENCE

0

## Try a Simple CNN Model to Classify Questions to their Corresponding Tags

Now we’re ready to train a model to classify/label questions with tag categories. We start with a simple model with one `Conv1D`/`GlobalMaxPool1D`. We use the Keras `Sequential` API and we’ve heavily parametrized the code so as to facilitate experimentation.

In [None]:
from tensorflow.keras.initializers import RandomUniform
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.losses import binary_crossentropy, kld
from tensorflow.keras.optimizers import Adam

import lib.utils


# Hyperparameters shared by the three embedding variants trained below
FILTER_COUNT        = 128
FILTER_SIZE         = 3
EPOCHS              = 8
ACTIVATION          = 'selu'
CONV_PADDING        = 'same'
STRIDES             = 1
EMBED_SIZE          = 300
EMBED_DROPOUT_RATIO = 0.1
CONV_DROPOUT_RATIO  = 0.1

EXPERIMENT_NAME = 'simple_cnn_again'

# Refuse to run when the previous log entry carries the same experiment name
if simple_log and EXPERIMENT_NAME == simple_log[-1]['name']:
    print('RENAME YOUR EXPERIMENT')
    raise Exception('RENAME YOUR EXPERIMENT')

SEQUENCE += 1


# Weights and Biases Monitoring
# import wandb
# from wandb.keras import WandbCallback
# wandb.init(project="weakly-supervised-learning", name=EXPERIMENT_NAME)
# config = wandb.config

# config_dict = {
#     'name': EXPERIMENT_NAME,
#     'embedding': 'own',
#     'architecture': 'Simple Conv1D',
#     'epochs': EPOCHS,
#     'batch_size': BATCH_SIZE,
#     'filter_count': FILTER_COUNT,
#     'filter_size': FILTER_SIZE,
#     'activation': ACTIVATION,
#     'conv_padding': CONV_PADDING,
#     'sequence': SEQUENCE
# }
# print(config_dict)
# config.update(
#     config_dict
# )

# One embedding layer per experiment; `titles` supplies the name each result is
# logged under, in the same order
titles = ['Own Embedding', 'Static GloVe', 'Retrained GloVe']
embedding_layers = [

    # Randomly Initialized Embedding
    Embedding(
        TOKEN_COUNT,
        EMBED_SIZE,
        input_length=X_train.shape[1],
        embeddings_initializer=RandomUniform(),
    ),

    # Static transfer of GloVe embedding
    Embedding(
        TOKEN_COUNT,
        EMBED_SIZE,
        weights=[embedding_matrix],
        input_length=MAX_LEN,
        trainable=False
    ),

    # Retraining of GloVe embedding
    Embedding(
        TOKEN_COUNT,
        EMBED_SIZE,
        weights=[embedding_matrix],
        input_length=MAX_LEN,
        trainable=True
    ),
]

for title, emb_layer in zip(titles, embedding_layers):
    # Embedding -> dropout -> one convolution -> global max pool -> one sigmoid
    # output per tag
    model = Sequential()

    model.add(emb_layer)
    model.add(Dropout(EMBED_DROPOUT_RATIO))
    model.add(
        Conv1D(
            FILTER_COUNT,
            FILTER_SIZE,
            padding=CONV_PADDING,
            activation=ACTIVATION,
            strides=STRIDES
        )
    )
    model.add(GlobalMaxPool1D())
    model.add(
        Dense(
            y_train.shape[1],
            activation='sigmoid',
        )
    )

    # NOTE(review): fresh metric objects are created on every pass; if Keras
    # suffixes their names on later passes (e.g. `precision_1`), the
    # 'val_categorical_accuracy' monitors below may not match - confirm.
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[
            tf.keras.metrics.CategoricalAccuracy(),
            tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall(),
            tf.keras.metrics.AUC(),
            tf.keras.metrics.TruePositives(),
            tf.keras.metrics.FalsePositives(),
            tf.keras.metrics.TrueNegatives(),
            tf.keras.metrics.FalseNegatives(),
        ]
    )
    model.summary()

    callbacks = [
        ReduceLROnPlateau(
            monitor='val_categorical_accuracy',
            factor=0.1,
            patience=1,
            verbose=1,
        ),
        EarlyStopping(
            monitor='val_categorical_accuracy',
            patience=2,
            verbose=1,
        ),
        ModelCheckpoint(
            filepath='models/cnn_tagger.weights.hdf5',
            monitor='val_categorical_accuracy',
            save_best_only=True,
            verbose=1,
        ),
        # WandbCallback()
    ]

    history = model.fit(X_train, y_train,
                        class_weight=train_class_weights,
                        epochs=EPOCHS,
                        batch_size=BATCH_SIZE,
                        validation_split=TEST_SPLIT,
                        callbacks=callbacks)

    # Score the checkpointed model, not the final-epoch weights, on the test split
    model = tf.keras.models.load_model('models/cnn_tagger.weights.hdf5')
    metrics = model.evaluate(X_test, y_test)

    # fix_metric presumably normalises the Keras metric name and converts the
    # value to a plain Python number - confirm in lib.utils
    log = {}
    for name, val in zip(model.metrics_names, metrics):
        repeat_name, py_val = lib.utils.fix_metric(name, val)
        log[repeat_name] = py_val

    # Add a name and sequence number and an F1 score
    log.update({'name': title})
    log.update({'sequence': SEQUENCE})

    # F1 is the harmonic mean of precision and recall: 2PR / (P + R). It is
    # reported as 0.0 when both are zero rather than dividing by zero.
    precision, recall = log['precision'], log['recall']
    precision_plus_recall = precision + recall
    log.update({
        'f1': 2 * precision * recall / precision_plus_recall if precision_plus_recall else 0.0
    })

    simple_log.append(log)

    # Overwrite the old log with one JSON object per line
    with open('simple_log.jsonl', 'w') as f:
        for entry in simple_log:
            f.write(json.dumps(entry) + '\n')

pd.DataFrame(simple_log)

## Plot the Epoch Accuracy

We want to know the performance at each epoch so that we don't train for a needlessly large number of epochs.

In [None]:
%matplotlib inline

new_history = {}
for key, metrics in history.history.items():
    new_history[lib.utils.fix_metric_name(key)] = metrics

import matplotlib.pyplot as plt


# summarize history for accuracy
fig = plt.gcf()
fig.set_size_inches(12, 8, forward=True)

viz_keys = ['val_categorical_accuracy', 'val_precision', 'val_recall']
for key in viz_keys:
    plt.plot(new_history[key])
plt.title('model accuracy')
plt.ylabel('metric')
plt.xlabel('epoch')
plt.legend(viz_keys, loc='upper left')
plt.show()


# summarize history for loss
fig = plt.gcf()
fig.set_size_inches(12, 8, forward=True)

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

## Train a Kim-CNN Model to Label Stack Overflow Questions

Once again we’re ready to train a model to classify/label questions with tag categories. The model is based on [Kim-CNN](https://arxiv.org/abs/1408.5882), a commonly used convolutional neural network for sentence and document classification. We use the functional API and we’ve heavily parametrized the code so as to facilitate experimentation. 

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.initializers import RandomUniform
from tensorflow.keras.layers import (
    Dense, Activation, Embedding, Flatten, MaxPool1D, GlobalMaxPool1D, 
    Dropout, Conv1D, Input, concatenate, Reshape
)
from tensorflow.keras.losses import binary_crossentropy, kld
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# from keras_radam import RAdam

tf.compat.v1.disable_eager_execution()

EXPERIMENT_NAME = 'kim_cnn_2000_3_4_5_7_again_2'

# Hyperparameters for this experiment; FILTER_SIZE lists the kernel widths of
# the parallel convolutions
FILTER_COUNT        = 128
FILTER_SIZE         = [3, 4, 5, 7]
EPOCHS              = 8
ACTIVATION          = 'selu'
CONV_PADDING        = 'same'
EMBED_SIZE          = 50
EMBED_DROPOUT_RATIO = 0.1
CONV_DROPOUT_RATIO  = 0.1

# Refuse to run when the previous log entry carries the same experiment name
if simple_log and EXPERIMENT_NAME == simple_log[-1]['name']:
    print('RENAME YOUR EXPERIMENT')
    raise Exception('RENAME YOUR EXPERIMENT')

SEQUENCE += 1

# # Weights and Biases Monitoring
# import wandb
# from wandb.keras import WandbCallback
# wandb.init(project="weakly-supervised-learning", name=EXPERIMENT_NAME)
# config = wandb.config

# config.update(
#     {
#         'name': EXPERIMENT_NAME,
#         'embedding': 'own',
#         'architecture': 'Kim CNN',
#         'epochs': EPOCHS,
#         'batch_size': BATCH_SIZE,
#         'filter_count': FILTER_COUNT,
#         'filter_size': FILTER_SIZE,
#         'activation': ACTIVATION,
#         'conv_padding': CONV_PADDING,
#         'sequence': SEQUENCE
#     }
# )

# Integer token ids, one padded sequence per question
padded_input = Input(
    shape=(X_train.shape[1],),
    dtype='int32'
)

# Randomly initialised embedding learned from scratch
emb = Embedding(
    TOKEN_COUNT, 
    EMBED_SIZE,
    embeddings_initializer=RandomUniform(),
    input_length=X_train.shape[1]
)(padded_input)
# emb = Embedding(
#     TOKEN_COUNT,
#     EMBED_SIZE,
#     weights=[embedding_matrix],
#     input_length=MAX_LEN,
#     trainable=True,
# )(padded_input)
drp = Dropout(EMBED_DROPOUT_RATIO)(emb)

# Create convolutions of different sizes, each followed by max pooling with a
# pool size equal to its kernel width. (An unused Reshape of the convolution
# output was removed: its target size, MAX_LEN * EMBED_SIZE, does not match the
# MAX_LEN * FILTER_COUNT values the convolution produces.)
convs = []
for filter_size in FILTER_SIZE:
    f_conv = Conv1D(
        filters=FILTER_COUNT,
        kernel_size=filter_size,
        padding=CONV_PADDING,
        activation=ACTIVATION
    )(drp)
    f_pool = MaxPool1D(filter_size)(f_conv)
    convs.append(f_pool)

# Join the pooled feature maps along the sequence axis and convolve once more
l_merge = concatenate(convs, axis=1)
l_conv = Conv1D(
    128,
    5,
    activation=ACTIVATION
)(l_merge)
l_pool = GlobalMaxPool1D()(l_conv)
l_flat = Flatten()(l_pool)
l_drp  = Dropout(CONV_DROPOUT_RATIO)(l_flat)
l_dense = Dense(
    60,
    activation=ACTIVATION
)(l_drp)

# One sigmoid output per tag (multi-label classification)
out_dense = Dense(
    y_train.shape[1],
    activation='sigmoid'
)(l_dense)

model = Model(inputs=padded_input, outputs=out_dense)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        tf.keras.metrics.CategoricalAccuracy(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.AUC(),
        tf.keras.metrics.TruePositives(),
        tf.keras.metrics.FalsePositives(),
        tf.keras.metrics.TrueNegatives(),
        tf.keras.metrics.FalseNegatives(),
    ]
)
model.summary()

callbacks = [
    ReduceLROnPlateau(
        monitor='val_categorical_accuracy',
        factor=0.1,
        patience=1,
        verbose=1,
    ), 
    EarlyStopping(
        monitor='val_categorical_accuracy',
        patience=2,
        verbose=1,
    ), 
    ModelCheckpoint(
        filepath='models/cnn_tagger.weights.hdf5',
        monitor='val_categorical_accuracy',
        save_best_only=True,
        verbose=1,
    ),
    # WandbCallback()
]

# NOTE(review): the test split doubles as validation data here, so checkpoint
# selection and early stopping see the same rows that are later used for the
# final evaluation; the simple CNN cell uses validation_split=TEST_SPLIT
# instead - confirm which is intended.
history = model.fit(X_train, y_train,
                    class_weight=train_class_weights,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    validation_data=(X_test, y_test),
                    callbacks=callbacks)

In [None]:
# Reload the model from the path the ModelCheckpoint callback writes to
# (save_best_only=True, monitored on val_categorical_accuracy) and evaluate it
# on the test split; `metrics` holds the values returned by evaluate()
model = tf.keras.models.load_model('models/cnn_tagger.weights.hdf5')
metrics = model.evaluate(X_test, y_test)

In [None]:
# Build the log entry from the evaluation metrics. fix_metric presumably
# normalises the Keras metric name and converts the value to a plain Python
# number - confirm in lib.utils.
log = {}
for name, val in zip(model.metrics_names, metrics):
    repeat_name, py_val = lib.utils.fix_metric(name, val)
    log[repeat_name] = py_val

# Add a name and sequence number and an F1 score
log.update({'name': EXPERIMENT_NAME})
log.update({'sequence': SEQUENCE})

# F1 is the harmonic mean of precision and recall: 2PR / (P + R). It is
# reported as 0.0 when both are zero rather than dividing by zero.
precision, recall = log['precision'], log['recall']
precision_plus_recall = precision + recall
log.update({
    'f1': 2 * precision * recall / precision_plus_recall if precision_plus_recall else 0.0
})

simple_log.append(log)

# Overwrite the old log with one JSON object per line
with open('simple_log.jsonl', 'w') as f:
    for entry in simple_log:
        f.write(json.dumps(entry) + '\n')

pd.DataFrame([log])

In [None]:
%matplotlib inline

new_history = {}
for key, metrics in history.history.items():
    new_history[lib.utils.fix_metric_name(key)] = metrics

import matplotlib.pyplot as plt

fig = plt.gcf()
fig.set_size_inches(12, 8, forward=True)

viz_keys = ['val_categorical_accuracy', 'val_precision', 'val_recall']
# summarize history for accuracy
for key in viz_keys:
    plt.plot(new_history[key])
plt.title('model accuracy')
plt.ylabel('metric')
plt.xlabel('epoch')
plt.legend(viz_keys, loc='upper left')
plt.show()

fig = plt.gcf()
fig.set_size_inches(12, 8, forward=True)

# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

## Compare this Run to the 1st and Previous Run

To get an idea of performance we need to see where we started and where we just came from.

In [None]:
def _log_delta(previous, current):
    """Return per-metric differences (current - previous) plus both run names.

    Every key of `previous` other than 'name' and 'sequence' is compared; a
    metric missing from either entry counts as 0.
    """
    delta = {
        key: current.get(key, 0) - previous.get(key, 0)
        for key in previous.keys()
        if key not in ['name', 'sequence']
    }
    delta['current'] = current['name']
    delta['previous'] = previous['name']
    return delta


# The latest run is always the last entry; with a single entry it is compared
# against itself in both rows
d2 = simple_log[-1]

# Compare to original
d1 = simple_log[0]
log_diff_1 = _log_delta(d1, d2)

# Compare to last run
d1 = simple_log[-2] if len(simple_log) > 1 else simple_log[0]
log_diff_2 = _log_delta(d1, d2)

# Show the run names first, followed by the metric differences
df = pd.DataFrame.from_dict([log_diff_1, log_diff_2])
cols = [col for col in df.columns.tolist() if col not in ('previous', 'current')]
show_cols = ['previous', 'current'] + cols
df[show_cols]

## View the Last 10 Experiments

It can be helpful to see trends of performance among experiments.

In [None]:
log_df = pd.DataFrame(simple_log)

# Recompute F1 for every logged run as the harmonic mean of precision and
# recall, 2PR / (P + R); rows where both are zero come out as NaN
log_df['f1'] = (
    2 * log_df['precision'] * log_df['recall']
    / (log_df['precision'] + log_df['recall'])
)

# Columns to show, in display order. Not every log entry has every metric (the
# models compiled in this notebook do not report 'hinge' or
# 'mean_absolute_error'), so only the columns actually present are selected.
display_columns = [
    'sequence',
    'name',
    'loss',
    'categorical_accuracy',
    'precision',
    'recall',
    'f1',
    'auc',
    'true_positives',
    'false_positives',
    'true_negatives',
    'false_negatives',
    'hinge',
    'mean_absolute_error',
]
present_columns = [col for col in display_columns if col in log_df.columns]

# The most recent (up to) 10 experiments, i.e. the end of the log
log_df[present_columns].tail(10)

## Check the Actual Prediction Outputs

It is not enough to know theoretical performance. We need to see the actual output of the tagger at different confidence thresholds.

In [None]:
# Number of test rows to inspect by eye
TEST_COUNT = 1000

# Turn the first TEST_COUNT padded sequences back into text
X_test_text = tokenizer.sequences_to_texts(X_test[:TEST_COUNT])

# For each of those rows, collect the tag names whose label column is set to 1
y_test_tags = [
    [index_tag[str(i)] for i, col in enumerate(row) if col == 1]
    for row in y_test[:TEST_COUNT].tolist()
]

## Adjust the threshold for classification

This lets us see how well the model generalizes to labeling more classes.

In [None]:
# Probability above which a tag counts as predicted
CLASSIFY_THRESHOLD = 0.5

# Predict per-tag probabilities for the whole test split and binarise them
# into 0/1 at the threshold
probabilities = model.predict(X_test)
y_pred = (probabilities > CLASSIFY_THRESHOLD).astype(int)

# Tag names predicted for each of the first TEST_COUNT rows; the values are
# already 0/1 at this point, so a predicted tag is simply a 1
y_pred_tags = [
    [index_tag[str(i)] for i, col in enumerate(row) if col == 1]
    for row in y_pred[:TEST_COUNT].tolist()
]

## See How Far off we are per Class

In [None]:
# Per-tag difference between the number of predicted positives and the number
# of actual positives, summed over all test rows (positive = over-predicted)
np.around(y_pred, 0).sum(axis=0) - y_test.sum(axis=0)

### View Prediction Results

It is better to view the results in a DataFrame.

In [None]:
# One record per inspected question: its text, its true tags and its predicted
# tags, with each tag list sorted and joined into a single string
prediction_tests = [
    {
        'Question': question,
        'Actual': ' '.join(sorted(actual_tags)),
        'Predictions': ' '.join(sorted(predicted_tags)),
    }
    for question, predicted_tags, actual_tags in zip(X_test_text, y_pred_tags, y_test_tags)
]

# Widen the column display so most of each question is readable
pd.set_option('display.max_colwidth', 300)
pd.DataFrame(prediction_tests)[['Question', 'Actual', 'Predictions']]

## The Big Finish

That is the big finish!