## BERT Embeddings on SMS Spam Dataset

In [1]:
%matplotlib inline

import pandas as pd
import tensorflow_text as text
import numpy as np

2023-11-30 02:08:59.109618: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-30 02:08:59.109652: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-30 02:08:59.110571: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-30 02:08:59.116434: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import tensorflow_hub as hub
import tensorflow as tf
from bert.tokenization import FullTokenizer
from tensorflow.keras.models import Model
print("TF version: ", tf.__version__)
print("Hub version: ", hub.__version__)

TF version:  2.15.0
Hub version:  0.15.0


In [21]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model

max_seq_length = 256  # Your choice here.

input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")

# Load BERT model with the correct input signature
bert_url = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/multi-cased-preprocess/versions/3"  # Adjust the URL to the specific BERT model you want
bert_layer = hub.KerasLayer(bert_url, trainable=True)

# Ensure the BERT layer has the correct input signature
bert_layer_resized = hub.KerasLayer(
    "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/multi-cased-preprocess/versions/3",
    trainable=True,
    input_shape=[max_seq_length],
    dtype=tf.int32,
)

pooled_output, sequence_output = bert_layer_resized([input_word_ids, input_mask, segment_ids])

model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[pooled_output, sequence_output])


In [20]:


max_seq_length = 256  # Your choice here.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                    name="segment_ids")
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=True)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[pooled_output, sequence_output])



TypeError: You are passing KerasTensor(type_spec=TensorSpec(shape=(None, 256), dtype=tf.int32, name='input_word_ids'), name='input_word_ids', description="created by layer 'input_word_ids'"), an intermediate Keras symbolic input/output, to a TF API that does not allow registering custom dispatchers, such as `tf.cond`, `tf.function`, gradient tapes, or `tf.map_fn`. Keras Functional model construction only supports TF API calls that *do* support dispatching, such as `tf.math.add` or `tf.reshape`. Other APIs cannot be called directly on symbolic Kerasinputs/outputs. You can work around this limitation by putting the operation in a custom Keras layer `call` and calling that layer on this symbolic input/output.

In [None]:
def get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))


def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))


def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

In [None]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [None]:
def get_ids_masks_segmenets(sentence):
    stokens = tokenizer.tokenize(sentence)
    stokens = ["[CLS]"] + stokens + ["[SEP]"]
    input_ids = get_ids(stokens, tokenizer, max_seq_length)
    input_masks = get_masks(stokens, max_seq_length)
    input_segments = get_segments(stokens, max_seq_length)
    return input_ids, input_masks, input_segments


def build_ids_masks_segments(sentences):
    input_ids_ = []
    input_masks_ = []
    input_segments_ = []
    for s in sentences:
        input_ids, input_masks, input_segments = get_ids_masks_segmenets(s)
        input_ids_.append(input_ids)
        input_masks_.append(input_masks)
        input_segments_.append(input_segments)
    return input_ids_, input_masks_, input_segments_

def get_embeddings(sentences):
    input_ids, input_masks, input_segments = build_ids_masks_segments(sentences)
    pool_embs, all_embs = model.predict([input_ids,input_masks,input_segments])
    return all_embs

In [None]:
print(get_embeddings(["hey", "yay"]).shape)

In [None]:
df = pd.read_csv('./data/SMSSpamCollection', sep='\t', names=['label', 'data'])
df.head()

In [None]:
X = df['data']
X_train = X[0:int(X.shape[0]*.6)]
X_val = X[int(X.shape[0]*.6):int(X.shape[0]*.75)]
X_test = X[int(X.shape[0]*.75):]


In [None]:
X_train = get_embeddings(X_train.values)
X_val = get_embeddings(X_val.values)
X_test = get_embeddings(X_test.values)

In [None]:
y = df['label']
y_train = y[0:int(y.shape[0]*.6)]
y_val = y[int(y.shape[0]*.6):int(y.shape[0]*.75)]
y_test = y[int(y.shape[0]*.75):]

print(y_train.value_counts())
print(y_val.value_counts())
print(y_test.value_counts())

HAM = 0
SPAM = 1

y_train = np.array(list(map(lambda x: HAM if x == 'ham' else SPAM, y_train)))
y_val = np.array(list(map(lambda x: HAM if x == 'ham' else SPAM, y_val)))
y_test = np.array(list(map(lambda x: HAM if x == 'ham' else SPAM, y_test)))

In [None]:
import tensorflow as tf
print(X_test.shape)

model = tf.keras.Sequential([
    tf.keras.layers.LSTM(256, input_shape=(X_test.shape[1], X_test.shape[2])),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adam(2e-5),
              metrics=['accuracy'])

In [None]:
history = model.fit(x=X_train, y=y_train, epochs=10,
                    validation_data=(X_val, y_val),
                    shuffle=True,
                    validation_steps=30)

In [None]:
import matplotlib.pyplot as plt
training_loss = history.history['loss']
test_loss = history.history['val_loss']
epoch_count = range(1, len(training_loss) + 1)

plt.plot(epoch_count, training_loss, 'r--')
plt.plot(epoch_count, test_loss, 'b-')
plt.legend(['Training Loss', 'Test Loss'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

In [None]:
test_results = model.test_on_batch(
    X_test,
    y=y_test,
    sample_weight=None,
    reset_metrics=True
)
print(list(zip(model.metrics_names, test_results)))

In [None]:
y_pred = model.predict(X_test).round()
y_pred = list(map(lambda x: int(x[0]), y_pred))
confusion = tf.math.confusion_matrix(labels=y_test, predictions=y_pred, num_classes=2)
confusion = confusion.numpy()

In [None]:
fp = confusion[0][1]
tn = confusion[0][0]
fp_rate = float(fp / (fp+tn))
print(f"False Positive Rate: {round(fp_rate*100,3)}%")

In [None]:
import seaborn as sns

df_cm = pd.DataFrame(confusion, index = ["HAM", "SPAM"],
                  columns = ["HAM", "SPAM"])
plt.figure(figsize = (12,8))
sns.set(font_scale=1.5)
annot_kws = {"ha": 'left',"va": 'bottom'}
hm = sns.heatmap(df_cm, cmap="Pastel1", fmt="d", annot=True, annot_kws=annot_kws)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()