# About

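This notebook uses `keras-bert` to pre-train a small BERT model on sequences of call IDs and then fine-tunes it as an 8-class classifier.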

## Setup


In [None]:
# Colab only: mount Google Drive so the dataset CSVs under
# "/content/drive/My Drive/..." can be read by the cells below.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


In [None]:
pip install keras-bert


Collecting keras-bert
  Downloading https://files.pythonhosted.org/packages/e2/7f/95fabd29f4502924fa3f09ff6538c5a7d290dfef2c2fe076d3d1a16e08f0/keras-bert-0.86.0.tar.gz
Collecting keras-transformer>=0.38.0
  Downloading https://files.pythonhosted.org/packages/89/6c/d6f0c164f4cc16fbc0d0fea85f5526e87a7d2df7b077809e422a7e626150/keras-transformer-0.38.0.tar.gz
Collecting keras-pos-embd>=0.11.0
  Downloading https://files.pythonhosted.org/packages/09/70/b63ed8fc660da2bb6ae29b9895401c628da5740c048c190b5d7107cadd02/keras-pos-embd-0.11.0.tar.gz
Collecting keras-multi-head>=0.27.0
  Downloading https://files.pythonhosted.org/packages/e6/32/45adf2549450aca7867deccfa04af80a0ab1ca139af44b16bc669e0e09cd/keras-multi-head-0.27.0.tar.gz
Collecting keras-layer-normalization>=0.14.0
  Downloading https://files.pythonhosted.org/packages/a4/0e/d1078df0494bac9ce1a67954e5380b6e7569668f0f3b50a9531c62c1fc4a/keras-layer-normalization-0.14.0.tar.gz
Collecting keras-position-wise-feed-forward>=0.6.0
  Downloading

## Prepare dataset


In [None]:
vocab_size = 283 # includes special tokens
# Sequence length used for the model (passed as seq_len / pos_num further down)
maxlen = 100

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the pre-split train / validation / test tables from Drive.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/My Drive/Research/CyberBERT/data/train_multiclass.csv",
        "/content/drive/My Drive/Research/CyberBERT/data/valid_multiclass.csv",
        "/content/drive/My Drive/Research/CyberBERT/data/test_multiclass.csv",
    )
)

In [None]:
def split_and_convert(row):
  """Parse a whitespace-separated string of integers into a list of ints.

  Splitting on arbitrary whitespace (``row.split()``) instead of a single
  space means repeated, leading or trailing whitespace no longer produces
  empty fields that make ``int('')`` raise.

  Args:
    row: string such as ``"12 7 7 230"``.

  Returns:
    List of ints in the order they appear; an empty or all-whitespace
    string yields ``[]``.

  Raises:
    ValueError: if any field is not a valid integer literal.
  """
  return [int(x) for x in row.split()]


In [None]:
# Replace each "calls" string with its parsed list of ints, in all three splits.
for split_df in (train, valid, test):
    split_df["calls"] = split_df["calls"].apply(split_and_convert)


In [None]:
# Stack the per-row call lists into one array per split; labels become 1-D arrays.
x_train = np.stack(train["calls"].to_numpy(), axis=0)
y_train = train["label"].to_numpy()

x_val = np.stack(valid["calls"].to_numpy(), axis=0)
y_val = valid["label"].to_numpy()


In [None]:
# Same conversion for the held-out test split.
x_test = np.stack(test["calls"].to_numpy(), axis=0)
y_test = test["label"].to_numpy()

## Pre-train a BERT model on our data



In [None]:
from random import randrange

# Build the sentence pairs used for BERT pre-training: every call sequence is
# cut at a random position into a [first_part, second_part] pair.
training_input = []

# Iterate over the column directly: no need for iterrows(), and it avoids the
# positional `row[0]` lookup on a label-indexed Series.
for row_value in train["calls"]:
  # Split point drawn independently per row from 10..89.
  # NOTE(review): assumes each sequence is long enough (about 100 tokens) for
  # both parts to be non-empty; the draw is unseeded, so pairs differ per run.
  sep_id = randrange(10, 90)
  training_input.append([row_value[:sep_id], row_value[sep_id:]])


In [None]:
import keras
from keras_bert import get_base_dict, get_model, compile_model, gen_batch_inputs


# Use training_input

In [None]:

### SET MAX LEN
maxlen = 100

# Start from keras-bert's base dictionary (special tokens) and give every
# token seen in the training pairs the next free id, in first-seen order.
token_dict = get_base_dict()
for first_part, second_part in training_input:
    for token in first_part + second_part:
        # setdefault only inserts when the token is new, so existing ids are kept
        token_dict.setdefault(token, len(token_dict))
token_list = list(token_dict)  # candidate tokens for random replacement
len(token_dict)

253

In [None]:
# # Ensure token dict has all the required tokens
## ONLY NEEDED IF TRIMMING
# for token in x_train.flatten():
#   if token not in token_dict:
#     token_dict[token] = len(token_dict)
# token_list = list(token_dict.keys())  # Used for selecting a random word

In [None]:


# Build and compile the BERT pre-training model (training happens in a later cell).
bert_hparams = dict(
    token_num=len(token_dict),
    head_num=2,
    transformer_num=2,
    embed_dim=20,
    feed_forward_dim=25,
    seq_len=maxlen,
    pos_num=maxlen,
    dropout_rate=0.05,
)
model = get_model(**bert_hparams)
compile_model(model)
model.summary()


Model: "functional_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, 100)]        0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, 100)]        0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 100, 20), (2 5060        Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 100, 20)      40          Input-Segment[0][0]              
_______________________________________________________________________________________

In [None]:

def _generator():
    """Endlessly yield pre-training batches built by ``gen_batch_inputs``.

    Every batch is generated from the module-level ``training_input``,
    ``token_dict`` and ``token_list`` with ``seq_len=maxlen``, a mask rate
    of 0.3 and sentence swapping disabled.
    """
    while True:
        batch = gen_batch_inputs(
            training_input,
            token_dict,
            token_list,
            seq_len=maxlen,
            mask_rate=0.3,
            swap_sentence_rate=0.0,  # don't apply sentence swapping
        )
        yield batch

# Pre-train on generated batches. `fit_generator` is deprecated in TF 2.x Keras;
# `fit` accepts Python generators directly with the same step arguments.
# NOTE(review): the validation generator draws from the same `training_input`
# as training, so val_loss is not a held-out estimate.
# NOTE(review): with epochs=3 and patience=5 the EarlyStopping callback can
# never trigger; it only matters if `epochs` is raised.
model.fit(
    _generator(),
    steps_per_epoch=10,
    epochs=3,
    validation_data=_generator(),
    validation_steps=5,
    callbacks=[
        keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
    ],
)

Epoch 1/3


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f6d987f6668>

In [None]:

# # Use the trained model
# inputs, output_layer = get_model(
#     token_num=len(token_dict),
#     head_num=5,
#     transformer_num=12,
#     embed_dim=25,
#     feed_forward_dim=100,
#     seq_len=maxlen,
#     pos_num=maxlen,
#     dropout_rate=0.05,
#     training=False,      # The input layers and output layer will be returned if `training` is `False`
#     trainable=False,     # Whether the model is trainable. The default value is the same with `training`
#     output_layer_num=4,  # The number of layers whose outputs will be concatenated as a single output.
#                          # Only available when `training` is `False`.
# )

## Fine-tune the model

In [None]:
# NOTE: this binds a second name to the same model object; it is not a copy.
# The layers (and their pre-trained weights) are shared with `model`.
classification_model = model

In [None]:
# Inspect the architecture before the classification head is attached
classification_model.summary()

Model: "functional_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, 100)]        0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, 100)]        0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 100, 20), (2 5060        Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 100, 20)      40          Input-Segment[0][0]              
_______________________________________________________________________________________

In [None]:
# Re-use the first two model inputs and the output of the 'NSP-Dense' layer,
# then put an 8-unit softmax layer on top to obtain the classifier.
inputs = classification_model.inputs[:2]
dense = classification_model.get_layer('NSP-Dense').output
classifier_head = keras.layers.Dense(units=8, activation='softmax')
outputs = classifier_head(dense)

# Labels are integer class ids, hence the sparse loss / accuracy.
classification_model = keras.models.Model(inputs=inputs, outputs=outputs)
classification_model.compile(
    optimizer="adam",
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy', 'accuracy'],
)

In [None]:
# Summary of the fine-tuning model built in the previous cell
classification_model.summary()

Model: "functional_13"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, 100)]        0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, 100)]        0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 100, 20), (2 5060        Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 100, 20)      40          Input-Segment[0][0]              
______________________________________________________________________________________

## Prepare input data

In [None]:
def apply_tokenizer(x, token_dict=None, unk_id=1):
  """Map one raw token to its id in the token dictionary.

  Args:
    x: token to look up (the notebook passes ints).
    token_dict: mapping from token to id. When omitted, the module-level
      ``TOKEN_DICT`` is used; it is resolved at call time, so it only has to
      exist before the first call.
    unk_id: id returned for tokens missing from the dictionary. The default
      of 1 matches the previously hard-coded value (presumably the unknown
      token's id in keras-bert's base dictionary -- confirm).

  Returns:
    The id stored for ``x``, or ``unk_id`` when ``x`` is not in the dictionary.
  """
  vocab = TOKEN_DICT if token_dict is None else token_dict
  try:
    return vocab[x]
  except KeyError:
    # Only a missing key means "unknown token"; any other error (e.g. an
    # unhashable token) is a real bug and is allowed to propagate.
    print(f"assigning unknown to {x}")
    return unk_id

In [None]:
# Apply tokenizer to data
# TODO: this is an ad-hoc workaround -- replace it with a proper tokenization
# step for all splits.

# TOKEN_DICT is the module-level name apply_tokenizer reads; point it at the
# dictionary built from the training pairs.
TOKEN_DICT = token_dict

def split_convert_tokenize(row):
  """Parse a whitespace-separated string of ints and map each one to a token id.

  The string is split on arbitrary whitespace so repeated or trailing
  whitespace does not produce empty fields that make ``int('')`` raise.

  Args:
    row: string such as ``"12 7 7 230"``.

  Returns:
    List with the result of ``apply_tokenizer`` for every parsed int, in order.

  Raises:
    ValueError: if any field is not a valid integer literal.
  """
  # Parse everything first so a malformed row fails before any lookup runs.
  seq = [int(x) for x in row.split()]

  return [apply_tokenizer(x) for x in seq]

# Reload the raw CSVs: earlier cells replaced the "calls" strings with int
# lists, and tokenization needs to start from the original strings again.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/My Drive/Research/CyberBERT/data/train_multiclass.csv",
        "/content/drive/My Drive/Research/CyberBERT/data/valid_multiclass.csv",
        "/content/drive/My Drive/Research/CyberBERT/data/test_multiclass.csv",
    )
)

# Parse and tokenize the "calls" column of every split (train, then valid, then test).
for split_df in (train, valid, test):
    split_df["calls"] = split_df["calls"].apply(split_convert_tokenize)


# Stack the token-id lists into one array per split; labels become 1-D arrays.
x_train = np.stack(train["calls"].to_numpy(), axis=0)
y_train = train["label"].to_numpy()

x_val = np.stack(valid["calls"].to_numpy(), axis=0)
y_val = valid["label"].to_numpy()

x_test = np.stack(test["calls"].to_numpy(), axis=0)
y_test = test["label"].to_numpy()


assigning unknown to 260
assigning unknown to 237
assigning unknown to 237
assigning unknown to 239
assigning unknown to 239
assigning unknown to 239
assigning unknown to 257
assigning unknown to 257
assigning unknown to 257
assigning unknown to 257
assigning unknown to 257
assigning unknown to 239
assigning unknown to 239
assigning unknown to 239


In [None]:
def format_data(x_train):
    """Pair a token-id array with an all-zero array of the same shape and dtype.

    Returns a two-element list ``[x_train, zeros]``; the zeros are intended
    for the model's second (segment) input.
    """
    segment_ids = np.zeros_like(x_train)
    return [x_train, segment_ids]

In [None]:
# Build the [token_ids, segment_ids] input pair for each split.
x_train_formatted, x_val_formatted, x_test_formatted = map(
    format_data, (x_train, x_val, x_test)
)

In [None]:
# Batch size for fine-tuning. Kept at 100 (the value `maxlen` had when it was
# passed here) but decoupled from the sequence length, so changing `maxlen`
# no longer silently changes the batch size.
BATCH_SIZE = 100

history = classification_model.fit(
    x_train_formatted,
    y_train,
    epochs=10,
    batch_size=BATCH_SIZE,
    validation_data=(x_val_formatted, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

In [None]:
from sklearn.metrics import confusion_matrix
from keras.preprocessing.text import Tokenizer
from keras.layers import LSTM, Dense, Dropout, Embedding
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import SpatialDropout1D
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt 

In [None]:
# Predict class probabilities on the test split and keep the most likely class.
y_test_pred = classification_model.predict(x_test_formatted)
y_classes = np.argmax(y_test_pred, axis=-1)
cm = confusion_matrix(y_test, y_classes)

# Confusion matrix annotated with both absolute counts and normalised values.
plot_confusion_matrix(
    conf_mat=cm,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
plt.show()

def _plot_history(history, metric):
    """Plot the training and validation curves of one metric, save and show them.

    Args:
        history: object whose ``history`` dict holds per-epoch values under
            ``metric`` and ``'val_' + metric`` (as returned by ``fit``).
        metric: key of the training metric, e.g. ``'accuracy'`` or ``'loss'``.

    Side effects:
        Writes ``'<metric>.png'`` to the working directory and displays the figure.
    """
    fig, ax = plt.subplots()
    ax.plot(history.history[metric])
    ax.plot(history.history[f'val_{metric}'])
    ax.set_title(f'model {metric}')
    ax.set_ylabel(metric)
    ax.set_xlabel('epoch')
    ax.legend(['train', 'validation'], loc='upper left')
    ax.grid()
    # Save before show(): some backends clear the figure once it is shown.
    fig.savefig(f"{metric}.png")
    plt.show()


# One figure per metric, saved as accuracy.png and loss.png.
for _metric in ('accuracy', 'loss'):
    _plot_history(history, _metric)