<a href="https://colab.research.google.com/github/prp20/tensorflow_learning/blob/main/nlp_learning_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds

tf.__version__

'2.9.2'

In [40]:
train_data, val_data, test_data = tfds.load(name="ag_news_subset", split=('train[:80%]', 'train[80%:]', 'test'), batch_size=-1, as_supervised=True)

In [41]:
train_examples, train_labels = tfds.as_numpy(train_data)
train_labels = np.asarray(train_labels).astype('float32').reshape((-1,1))
val_examples, val_labels = tfds.as_numpy(val_data)
val_labels = np.asarray(val_labels).astype('float32').reshape((-1,1))
test_examples, test_labels = tfds.as_numpy(test_data)
test_labels = np.asarray(test_labels).astype('float32').reshape((-1,1))

In [63]:
from sklearn.preprocessing import OneHotEncoder
one_hot_encoder = OneHotEncoder(sparse=False)
train_labels_one_hot = one_hot_encoder.fit_transform(train_labels)
val_labels_one_hot = one_hot_encoder.transform(val_labels)
test_labels_one_hot = one_hot_encoder.transform(test_labels)

# Check what training labels look like
train_labels_one_hot

array([[0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.]])

In [42]:
len(train_examples), len(val_examples), len(test_examples)

(96000, 24000, 7600)

In [43]:
train_examples[:10]

array([b'AMD #39;s new dual-core Opteron chip is designed mainly for corporate computing applications, including databases, Web services, and financial transactions.',
       b'Reuters - Major League Baseball\\Monday announced a decision on the appeal filed by Chicago Cubs\\pitcher Kerry Wood regarding a suspension stemming from an\\incident earlier this season.',
       b'President Bush #39;s  quot;revenue-neutral quot; tax reform needs losers to balance its winners, and people claiming the federal deduction for state and local taxes may be in administration planners #39; sights, news reports say.',
       b'Britain will run out of leading scientists unless science education is improved, says Professor Colin Pillinger.',
       b'London, England (Sports Network) - England midfielder Steven Gerrard injured his groin late in Thursday #39;s training session, but is hopeful he will be ready for Saturday #39;s World Cup qualifier against Austria.',
       b'TOKYO - Sony Corp. is banking on

In [44]:
train_labels[:10]

array([[3.],
       [1.],
       [2.],
       [3.],
       [1.],
       [0.],
       [3.],
       [0.],
       [0.],
       [1.]], dtype=float32)

In [15]:
from tensorflow.keras.layers import TextVectorization

In [45]:
output_seq_length = round(sum([len(i.split()) for i in train_examples])/len(train_examples))

In [46]:
output_seq_length

31

In [47]:
tv_layer = TextVectorization(max_tokens=10000, standardize="lower_and_strip_punctuation", split="whitespace", output_mode="int", output_sequence_length=output_seq_length, pad_to_max_tokens=True)
tv_layer.adapt(train_examples)

In [48]:
from tensorflow.keras import layers

embedding_layer = layers.Embedding(input_dim=10000, output_dim=128, input_length=output_seq_length)

## Model 0: Baseline Model

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB())
])
model_0.fit(train_examples, train_labels)

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [50]:
#Evaluate the baseline model

model_0_score = model_0.score(val_examples, val_labels)
model_0_score

0.8945833333333333

In [51]:
model_0_preds = model_0.predict(val_examples)

In [52]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def calculate_results(y_true, y_pred):
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  model_precision, model_recall, model_f1_score, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  model_results = {
      "accuracy": model_accuracy,
      "precision":model_precision,
      "recall": model_recall,
      "f1_score": model_f1_score
  }
  return model_results

def return_callbacks(model_name):
  callbacks_list = []
  callbacks_list.append(tf.keras.callbacks.ModelCheckpoint("saved_models/"+model_name, monitor='val_loss', save_best_only='True', verbose=1))
  callbacks_list.append(tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights='True'))
  return callbacks_list

In [53]:
model_0_results = calculate_results(val_labels, model_0_preds)
model_0_results

{'accuracy': 89.45833333333333,
 'precision': 0.8942110979847542,
 'recall': 0.8945833333333333,
 'f1_score': 0.8942914902694554}

In [54]:
model_results_dataset = {}
model_results_dataset['model_0'] = model_0_results

In [64]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_examples, train_labels_one_hot))
valid_dataset = tf.data.Dataset.from_tensor_slices((val_examples, val_labels_one_hot))
test_dataset = tf.data.Dataset.from_tensor_slices((test_examples, test_labels_one_hot))

# Take the TensorSliceDataset's and turn them into prefetched batches
train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
valid_dataset = valid_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

train_dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 4), dtype=tf.float64, name=None))>

## Model 1: Simple NN Model

In [65]:
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype=tf.string)
x = tv_layer(inputs)
x = embedding_layer(x)
x = layers.GlobalAveragePooling1D()(x)
output = layers.Dense(4, activation='softmax')(x)

model_1 = tf.keras.Model(inputs, output, name="nlp_model_1")
model_1.summary()

Model: "nlp_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 31)               0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 31, 128)           1280000   
                                                                 
 global_average_pooling1d_3   (None, 128)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_3 (Dense)             (None, 4)                 516       
                                                                 
Total params: 1,280,516
Trainable params: 1,280,516
Non

In [66]:
model_1.compile(loss="categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(), metrics=["accuracy"])
history_1 = model_1.fit(train_dataset, epochs=20, validation_data=valid_dataset, callbacks=return_callbacks(model_1.name))

Epoch 1/20
Epoch 1: val_loss improved from inf to 0.29563, saving model to saved_models/nlp_model_1
Epoch 2/20
Epoch 2: val_loss did not improve from 0.29563
Epoch 3/20
Epoch 3: val_loss did not improve from 0.29563
Epoch 4/20
Epoch 4: val_loss did not improve from 0.29563
Epoch 5/20
Epoch 5: val_loss did not improve from 0.29563
Epoch 6/20
Epoch 6: val_loss did not improve from 0.29563
Restoring model weights from the end of the best epoch: 1.
Epoch 6: early stopping


In [69]:
model_1.evaluate(valid_dataset)
model_1_loaded = tf.keras.models.load_model("saved_models/nlp_model_1")
model_1_preds = model_1_loaded.predict(valid_dataset)



In [71]:
model_1_results = calculate_results(val_labels, tf.argmax(model_1_preds, axis=1))
model_results_dataset['model_1'] = model_1_results

In [72]:
model_results_dataset

{'model_0': {'accuracy': 89.45833333333333,
  'precision': 0.8942110979847542,
  'recall': 0.8945833333333333,
  'f1_score': 0.8942914902694554},
 'model_1': {'accuracy': 90.15416666666667,
  'precision': 0.9018170319238884,
  'recall': 0.9015416666666667,
  'f1_score': 0.9014919230703585}}

In [73]:
tf.keras.backend.clear_session()

## Model 2

In [74]:
from tensorflow.keras import layers
model_2_embedding = layers.Embedding(input_dim=10000, output_dim=128, embeddings_initializer="uniform", input_length=output_seq_length, name="embedding_2")
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = tv_layer(inputs)
x = model_2_embedding(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(64, activation='relu')(x)
output = layers.Dense(4, activation='softmax')(x)

model_2 = tf.keras.Model(inputs, output, name="nlp_model_2")
model_2.summary()

Model: "nlp_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 31)               0         
 ectorization)                                                   
                                                                 
 embedding_2 (Embedding)     (None, 31, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 4)                 

In [75]:
model_2.compile(loss="categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(), metrics=["accuracy"])
history_2 = model_2.fit(train_dataset, epochs=20, validation_data=valid_dataset,callbacks=return_callbacks(model_2.name))

Epoch 1/20
Epoch 1: val_loss improved from inf to 0.29245, saving model to saved_models/nlp_model_2
Epoch 2/20
Epoch 2: val_loss did not improve from 0.29245
Epoch 3/20
Epoch 3: val_loss did not improve from 0.29245
Epoch 4/20
Epoch 4: val_loss did not improve from 0.29245
Epoch 5/20
Epoch 5: val_loss did not improve from 0.29245
Epoch 6/20
Epoch 6: val_loss did not improve from 0.29245
Restoring model weights from the end of the best epoch: 1.
Epoch 6: early stopping


In [76]:
model_2.evaluate(valid_dataset)
model_2_loaded = tf.keras.models.load_model("saved_models/nlp_model_2")
model_2_preds = model_2_loaded.predict(valid_dataset)
model_2_results = calculate_results(val_labels, tf.argmax(model_2_preds, axis=1))
model_results_dataset['model_2'] = model_2_results



## Model 3: LSTM

In [77]:
tf.keras.backend.clear_session()

In [78]:
from tensorflow.keras import layers
model_3_embedding = layers.Embedding(input_dim=10000, output_dim=128, embeddings_initializer="uniform", input_length=output_seq_length, name="embedding_2")

inputs = layers.Input(shape=(1,), dtype=tf.string)
x = tv_layer(inputs)
x = model_3_embedding(x)
x = layers.LSTM(64)(x)
x = layers.Dense(64, activation='relu')(x)
output = layers.Dense(4, activation='softmax')(x)

model_3 = tf.keras.Model(inputs, output, name="nlp_model_3")
model_3.summary()

Model: "nlp_model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 31)               0         
 ectorization)                                                   
                                                                 
 embedding_2 (Embedding)     (None, 31, 128)           1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 4)                 260       
                                                       

In [80]:
model_3.compile(loss="categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(), metrics=["accuracy"])
history_3 = model_3.fit(train_dataset, epochs=20, validation_data=valid_dataset,callbacks=return_callbacks(model_3.name))

Epoch 1/20
Epoch 1: val_loss improved from inf to 0.32339, saving model to saved_models/nlp_model_3




Epoch 2/20
Epoch 2: val_loss did not improve from 0.32339
Epoch 3/20
Epoch 3: val_loss did not improve from 0.32339
Epoch 4/20
Epoch 4: val_loss did not improve from 0.32339
Epoch 5/20
Epoch 5: val_loss did not improve from 0.32339
Epoch 6/20
Epoch 6: val_loss did not improve from 0.32339
Restoring model weights from the end of the best epoch: 1.
Epoch 6: early stopping


In [81]:
model_3.evaluate(valid_dataset)
model_3_loaded = tf.keras.models.load_model("saved_models/nlp_model_3")
model_3_preds = model_3_loaded.predict(valid_dataset)
model_3_results = calculate_results(val_labels, tf.argmax(model_3_preds, axis=1))
model_results_dataset['model_3'] = model_3_results



## Model 4: GRU

In [82]:
tf.keras.backend.clear_session()

In [83]:
from tensorflow.keras import layers
model_4_embedding = layers.Embedding(input_dim=10000, output_dim=128, embeddings_initializer="uniform", input_length=output_seq_length, name="embedding_4")

inputs = layers.Input(shape=(1,), dtype=tf.string)
x = tv_layer(inputs)
x = model_4_embedding(x)
x = layers.LSTM(64, return_sequences=True)(x)
x = layers.GRU(64)(x)
x = layers.Dense(64, activation='relu')(x)
output = layers.Dense(4, activation='softmax')(x)

model_4 = tf.keras.Model(inputs, output, name="nlp_model_4")
model_4.summary()

Model: "nlp_model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 31)               0         
 ectorization)                                                   
                                                                 
 embedding_4 (Embedding)     (None, 31, 128)           1280000   
                                                                 
 lstm (LSTM)                 (None, 31, 64)            49408     
                                                                 
 gru (GRU)                   (None, 64)                24960     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                       

In [84]:
model_4.compile(loss="categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(), metrics=["accuracy"])
history_4 = model_4.fit(train_dataset, epochs=20, validation_data=valid_dataset,callbacks=return_callbacks(model_4.name))

Epoch 1/20
Epoch 1: val_loss improved from inf to 0.31742, saving model to saved_models/nlp_model_4




Epoch 2/20
Epoch 2: val_loss did not improve from 0.31742
Epoch 3/20
Epoch 3: val_loss did not improve from 0.31742
Epoch 4/20
Epoch 4: val_loss did not improve from 0.31742
Epoch 5/20
Epoch 5: val_loss did not improve from 0.31742
Epoch 6/20
Epoch 6: val_loss did not improve from 0.31742
Restoring model weights from the end of the best epoch: 1.
Epoch 6: early stopping


In [85]:
model_4.evaluate(valid_dataset)
model_4_loaded = tf.keras.models.load_model("saved_models/nlp_model_4")
model_4_preds = model_4_loaded.predict(valid_dataset)
model_4_results = calculate_results(val_labels, tf.argmax(model_4_preds, axis=1))
model_results_dataset['model_4'] = model_4_results



## Model 5: Bi-Directional RNN

In [86]:
tf.keras.backend.clear_session()

In [87]:
from tensorflow.keras import layers
model_5_embedding = layers.Embedding(input_dim=10000, output_dim=128, embeddings_initializer="uniform", input_length=output_seq_length, name="embedding_5")

inputs = layers.Input(shape=(1,), dtype=tf.string)
x = tv_layer(inputs)
x = model_5_embedding(x)
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.GRU(64))(x)
x = layers.Dense(64, activation='relu')(x)
output = layers.Dense(4, activation='softmax')(x)

model_5 = tf.keras.Model(inputs, output, name="nlp_model_5")
model_5.summary()

Model: "nlp_model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 31)               0         
 ectorization)                                                   
                                                                 
 embedding_5 (Embedding)     (None, 31, 128)           1280000   
                                                                 
 bidirectional (Bidirectiona  (None, 31, 128)          98816     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              74496     
 nal)                                                            
                                                       

In [88]:
model_5.compile(loss="categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(), metrics=["accuracy"])
history_5 = model_5.fit(train_dataset, epochs=20, validation_data=valid_dataset,callbacks=return_callbacks(model_5.name))

Epoch 1/20
Epoch 1: val_loss improved from inf to 0.29922, saving model to saved_models/nlp_model_5




Epoch 2/20
Epoch 2: val_loss did not improve from 0.29922
Epoch 3/20
Epoch 3: val_loss did not improve from 0.29922
Epoch 4/20
Epoch 4: val_loss did not improve from 0.29922
Epoch 5/20
Epoch 5: val_loss did not improve from 0.29922
Epoch 6/20
Epoch 6: val_loss did not improve from 0.29922
Restoring model weights from the end of the best epoch: 1.
Epoch 6: early stopping


In [89]:
model_5.evaluate(valid_dataset)
model_5_loaded = tf.keras.models.load_model("saved_models/nlp_model_5")
model_5_preds = model_5_loaded.predict(valid_dataset)
model_5_results = calculate_results(val_labels, tf.argmax(model_5_preds, axis=1))
model_results_dataset['model_5'] = model_5_results



## Model 6: CNN

In [90]:
tf.keras.backend.clear_session()

In [91]:
model_6_embedding = layers.Embedding(input_dim=10000, output_dim=128, embeddings_initializer="uniform", input_length=output_seq_length, name="embedding_6")

# Create 1D convolutional model to process sequences
inputs = layers.Input(shape=(1,), dtype=tf.string)
text_vectors = tv_layer(inputs) # vectorize text inputs
token_embeddings = model_6_embedding(text_vectors) # create embedding
x = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(token_embeddings)
x = layers.GlobalAveragePooling1D()(x) # condense the output of our feature vector
outputs = layers.Dense(4, activation="softmax")(x)
model_6 = tf.keras.Model(inputs, outputs, name="nlp_model_6")

# Compile
model_6.compile(loss="categorical_crossentropy", # if your labels are integer form (not one hot) use sparse_categorical_crossentropy
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])
history_6 = model_6.fit(train_dataset, epochs=20, validation_data=valid_dataset,callbacks=return_callbacks(model_6.name))

Epoch 1/20
Epoch 1: val_loss improved from inf to 0.30199, saving model to saved_models/nlp_model_6




Epoch 2/20
Epoch 2: val_loss did not improve from 0.30199
Epoch 3/20
Epoch 3: val_loss did not improve from 0.30199
Epoch 4/20
Epoch 4: val_loss did not improve from 0.30199
Epoch 5/20
Epoch 5: val_loss did not improve from 0.30199
Epoch 6/20
Epoch 6: val_loss did not improve from 0.30199
Restoring model weights from the end of the best epoch: 1.
Epoch 6: early stopping


In [92]:
model_6.evaluate(valid_dataset)
model_6_loaded = tf.keras.models.load_model("saved_models/nlp_model_6")
model_6_preds = model_6_loaded.predict(valid_dataset)
model_6_results = calculate_results(val_labels, tf.argmax(model_6_preds, axis=1))
model_results_dataset['model_6'] = model_6_results



## Model 7: Pre-trained Layer

In [93]:
tf.keras.backend.clear_session()

In [94]:
# Download pretrained TensorFlow Hub USE
import tensorflow_hub as hub
tf_hub_embedding_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        trainable=False,
                                        name="universal_sentence_encoder")

In [96]:
model_7 = tf.keras.Sequential([
  layers.Input(shape=[], dtype=tf.string),
  tf_hub_embedding_layer, # take in sentences and then encode them into an embedding
  layers.Dense(128, activation="relu"),
  layers.Dense(4, activation="softmax")
], name="nlp_model_7")

model_7.summary()

Model: "nlp_model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 universal_sentence_encoder   (None, 512)              256797824 
 (KerasLayer)                                                    
                                                                 
 dense_2 (Dense)             (None, 128)               65664     
                                                                 
 dense_3 (Dense)             (None, 4)                 516       
                                                                 
Total params: 256,864,004
Trainable params: 66,180
Non-trainable params: 256,797,824
_________________________________________________________________


In [97]:
model_7.compile(loss="categorical_crossentropy", # if your labels are integer form (not one hot) use sparse_categorical_crossentropy
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])
history_7 = model_7.fit(train_dataset, epochs=20, validation_data=valid_dataset,callbacks=return_callbacks(model_7.name))

Epoch 1/20
Epoch 1: val_loss improved from inf to 0.31757, saving model to saved_models/nlp_model_7
Epoch 2/20
Epoch 2: val_loss improved from 0.31757 to 0.30670, saving model to saved_models/nlp_model_7
Epoch 3/20
Epoch 3: val_loss improved from 0.30670 to 0.29768, saving model to saved_models/nlp_model_7
Epoch 4/20
Epoch 4: val_loss improved from 0.29768 to 0.29041, saving model to saved_models/nlp_model_7
Epoch 5/20
Epoch 5: val_loss improved from 0.29041 to 0.28544, saving model to saved_models/nlp_model_7
Epoch 6/20
Epoch 6: val_loss improved from 0.28544 to 0.28303, saving model to saved_models/nlp_model_7
Epoch 7/20
Epoch 7: val_loss improved from 0.28303 to 0.28269, saving model to saved_models/nlp_model_7
Epoch 8/20
Epoch 8: val_loss did not improve from 0.28269
Epoch 9/20
Epoch 9: val_loss did not improve from 0.28269
Epoch 10/20
Epoch 10: val_loss did not improve from 0.28269
Epoch 11/20
Epoch 11: val_loss did not improve from 0.28269
Epoch 12/20
Epoch 12: val_loss did not i

In [98]:
model_7.evaluate(valid_dataset)
model_7_loaded = tf.keras.models.load_model("saved_models/nlp_model_7")
model_7_preds = model_7_loaded.predict(valid_dataset)
model_7_results = calculate_results(val_labels, tf.argmax(model_7_preds, axis=1))
model_results_dataset['model_7'] = model_7_results



## Model 8: Conv1D with character embeddings

In [99]:
tf.keras.backend.clear_session()

In [104]:
# Make function to split sentences into characters
def split_chars(text):
  text = text.decode()
  return " ".join(list(text))

# Split sequence-level data splits into character-level data splits
train_chars = [split_chars(sentence) for sentence in train_examples]
val_chars = [split_chars(sentence) for sentence in val_examples]
test_chars = [split_chars(sentence) for sentence in test_examples]
print(train_chars[0])
char_lens = [len(sentence) for sentence in train_examples]
mean_char_len = np.mean(char_lens)
mean_char_len
output_seq_char_len = int(np.percentile(char_lens, 95))
output_seq_char_len


A M D   # 3 9 ; s   n e w   d u a l - c o r e   O p t e r o n   c h i p   i s   d e s i g n e d   m a i n l y   f o r   c o r p o r a t e   c o m p u t i n g   a p p l i c a t i o n s ,   i n c l u d i n g   d a t a b a s e s ,   W e b   s e r v i c e s ,   a n d   f i n a n c i a l   t r a n s a c t i o n s .


296

In [105]:
import string
alphabet = string.ascii_lowercase + string.digits + string.punctuation
alphabet

'abcdefghijklmnopqrstuvwxyz0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [106]:
# Create char-level token vectorizer instance
NUM_CHAR_TOKENS = len(alphabet) + 2 # num characters in alphabet + space + OOV token
char_vectorizer = TextVectorization(max_tokens=NUM_CHAR_TOKENS,  
                                    output_sequence_length=output_seq_char_len,
                                    standardize="lower_and_strip_punctuation",
                                    name="char_vectorizer")

# Adapt character vectorizer to training characters
char_vectorizer.adapt(train_chars)

In [107]:
# Create char embedding layer
char_embed = layers.Embedding(input_dim=NUM_CHAR_TOKENS, # number of different characters
                              output_dim=25, # embedding dimension of each character (same as Figure 1 in https://arxiv.org/pdf/1612.05251.pdf)
                              mask_zero=False, # don't use masks (this messes up model_5 if set to True)
                              name="char_embed")

In [108]:
# Make Conv1D on chars only
inputs = layers.Input(shape=(1,), dtype="string")
char_vectors = char_vectorizer(inputs)
char_embeddings = char_embed(char_vectors)
x = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(char_embeddings)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(4, activation="softmax")(x)
model_8 = tf.keras.Model(inputs=inputs,
                         outputs=outputs,
                         name="nlp_model_8")

# Compile model
model_8.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [109]:
train_char_dataset = tf.data.Dataset.from_tensor_slices((train_chars, train_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)
val_char_dataset = tf.data.Dataset.from_tensor_slices((val_chars, val_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)

train_char_dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None, 4), dtype=tf.float64, name=None))>

In [110]:
history_8 = model_8.fit(train_char_dataset, epochs=20, validation_data=val_char_dataset,callbacks=return_callbacks(model_8.name))

Epoch 1/20
Epoch 1: val_loss improved from inf to 0.60444, saving model to saved_models/nlp_model_8




Epoch 2/20
Epoch 2: val_loss improved from 0.60444 to 0.55433, saving model to saved_models/nlp_model_8




Epoch 3/20
Epoch 3: val_loss improved from 0.55433 to 0.53091, saving model to saved_models/nlp_model_8




Epoch 4/20
Epoch 4: val_loss improved from 0.53091 to 0.51683, saving model to saved_models/nlp_model_8




Epoch 5/20
Epoch 5: val_loss improved from 0.51683 to 0.50799, saving model to saved_models/nlp_model_8




Epoch 6/20
Epoch 6: val_loss improved from 0.50799 to 0.50322, saving model to saved_models/nlp_model_8




Epoch 7/20
Epoch 7: val_loss improved from 0.50322 to 0.49822, saving model to saved_models/nlp_model_8




Epoch 8/20
Epoch 8: val_loss improved from 0.49822 to 0.49356, saving model to saved_models/nlp_model_8




Epoch 9/20
Epoch 9: val_loss improved from 0.49356 to 0.49057, saving model to saved_models/nlp_model_8




Epoch 10/20
Epoch 10: val_loss improved from 0.49057 to 0.48777, saving model to saved_models/nlp_model_8




Epoch 11/20
Epoch 11: val_loss improved from 0.48777 to 0.48587, saving model to saved_models/nlp_model_8




Epoch 12/20
Epoch 12: val_loss improved from 0.48587 to 0.48456, saving model to saved_models/nlp_model_8




Epoch 13/20
Epoch 13: val_loss improved from 0.48456 to 0.48218, saving model to saved_models/nlp_model_8




Epoch 14/20
Epoch 14: val_loss improved from 0.48218 to 0.48016, saving model to saved_models/nlp_model_8




Epoch 15/20
Epoch 15: val_loss improved from 0.48016 to 0.47911, saving model to saved_models/nlp_model_8




Epoch 16/20
Epoch 16: val_loss improved from 0.47911 to 0.47703, saving model to saved_models/nlp_model_8




Epoch 17/20
Epoch 17: val_loss improved from 0.47703 to 0.47653, saving model to saved_models/nlp_model_8




Epoch 18/20
Epoch 18: val_loss improved from 0.47653 to 0.47639, saving model to saved_models/nlp_model_8




Epoch 19/20
Epoch 19: val_loss improved from 0.47639 to 0.47459, saving model to saved_models/nlp_model_8




Epoch 20/20
Epoch 20: val_loss improved from 0.47459 to 0.47411, saving model to saved_models/nlp_model_8






In [111]:
model_8.evaluate(valid_dataset)
model_8_loaded = tf.keras.models.load_model("saved_models/nlp_model_8")
model_8_preds = model_8_loaded.predict(valid_dataset)
model_8_results = calculate_results(val_labels, tf.argmax(model_8_preds, axis=1))
model_results_dataset['model_8'] = model_8_results



  _warn_prf(average, modifier, msg_start, len(result))


## Model 9: Combining pretrained token embeddings + character embeddings

In [112]:
tf.keras.backend.clear_session()

In [113]:
# 1. Setup token inputs/model
token_inputs = layers.Input(shape=[], dtype=tf.string, name="token_input")
token_embeddings = tf_hub_embedding_layer(token_inputs)
token_output = layers.Dense(128, activation="relu")(token_embeddings)
token_model = tf.keras.Model(inputs=token_inputs,
                             outputs=token_output)

# 2. Setup char inputs/model
char_inputs = layers.Input(shape=(1,), dtype=tf.string, name="char_input")
char_vectors = char_vectorizer(char_inputs)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = layers.Bidirectional(layers.LSTM(25))(char_embeddings) # bi-LSTM shown in Figure 1 of https://arxiv.org/pdf/1612.05251.pdf
char_model = tf.keras.Model(inputs=char_inputs,
                            outputs=char_bi_lstm)

# 3. Concatenate token and char inputs (create hybrid token embedding)
token_char_concat = layers.Concatenate(name="token_char_hybrid")([token_model.output, 
                                                                  char_model.output])

# 4. Create output layers - addition of dropout discussed in 4.2 of https://arxiv.org/pdf/1612.05251.pdf
combined_dropout = layers.Dropout(0.5)(token_char_concat)
combined_dense = layers.Dense(200, activation="relu")(combined_dropout) # slightly different to Figure 1 due to different shapes of token/char embedding layers
final_dropout = layers.Dropout(0.5)(combined_dense)
output_layer = layers.Dense(4, activation="softmax")(final_dropout)

# 5. Construct model with char and token inputs
model_9 = tf.keras.Model(inputs=[token_model.input, char_model.input],
                         outputs=output_layer,
                         name="nlp_model_9")

In [114]:
# Compile token char model
model_9.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(), # section 4.2 of https://arxiv.org/pdf/1612.05251.pdf mentions using SGD but we'll stick with Adam
                metrics=["accuracy"])


In [115]:
# Combine chars and tokens into a dataset
train_char_token_data = tf.data.Dataset.from_tensor_slices((train_examples, train_chars)) # make data
train_char_token_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot) # make labels
train_char_token_dataset = tf.data.Dataset.zip((train_char_token_data, train_char_token_labels)) # combine data and labels

# Prefetch and batch train data
train_char_token_dataset = train_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE) 

# Repeat same steps validation data
val_char_token_data = tf.data.Dataset.from_tensor_slices((val_examples, val_chars))
val_char_token_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_char_token_dataset = tf.data.Dataset.zip((val_char_token_data, val_char_token_labels))
val_char_token_dataset = val_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

In [None]:
history_9 = model_9.fit(train_char_token_dataset, epochs=20, validation_data=val_char_token_dataset, callbacks=return_callbacks(model_9.name))

Epoch 1/20
Epoch 1: val_loss improved from inf to 0.30257, saving model to saved_models/nlp_model_9




Epoch 2/20
Epoch 2: val_loss improved from 0.30257 to 0.29026, saving model to saved_models/nlp_model_9




Epoch 3/20
Epoch 3: val_loss improved from 0.29026 to 0.28210, saving model to saved_models/nlp_model_9




Epoch 4/20
Epoch 4: val_loss improved from 0.28210 to 0.27618, saving model to saved_models/nlp_model_9




Epoch 5/20
Epoch 5: val_loss improved from 0.27618 to 0.27489, saving model to saved_models/nlp_model_9






In [None]:
model_9.evaluate(valid_dataset)
model_9_loaded = tf.keras.models.load_model("saved_models/nlp_model_9")
model_9_preds = model_9_loaded.predict(valid_dataset)
model_9_results = calculate_results(val_labels, tf.argmax(model_9_preds, axis=1))
model_results_dataset['model_9'] = model_9_results