## Setup


In [None]:
# Mount Google Drive at /content/drive; the CSV and model paths used in the
# cells below all live under this mount point.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install keras-bert




In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import pandas as pd
import numpy as np

import keras
from keras_bert import get_base_dict, get_model, compile_model, gen_batch_inputs

from random import randrange


## Prepare dataset


In [None]:
# Sequence settings shared by the cells below.
# NOTE(review): vocab_size is not referenced by any cell visible here; the
# model's token count is taken from len(token_dict) instead.
vocab_size = 283 # includes special tokens
maxlen = 100  # passed as seq_len/pos_num to the BERT model and as seq_len to the batch generator

In [None]:
def split_and_convert(row):
  """Parse a single-space-separated string of integers into a list of ints."""
  pieces = row.split(" ")
  return list(map(int, pieces))


def load_data(class_label):
  """Read the train/validation/test CSVs for one class from Google Drive.

  Args:
    class_label: class name substituted into the three file names.

  Returns:
    A (train, valid, test) tuple of DataFrames, read in that order.
  """
  return (
      pd.read_csv(f"/content/drive/My Drive/Research/CyberBERT/data/train_over_{class_label}.csv"),
      pd.read_csv(f"/content/drive/My Drive/Research/CyberBERT/data/valid_over_{class_label}.csv"),
      pd.read_csv(f"/content/drive/My Drive/Research/CyberBERT/data/test_{class_label}.csv"),
  )

def process_data(train, valid, test):
  """Convert the "calls" column of each split to int lists and build arrays.

  The "calls" column of every input DataFrame is overwritten in place with
  the parsed lists, so the frames passed in are modified.

  Returns:
    (y_train, x_train, y_val, x_val, y_test, x_test), where each y is the
    "label" column as an array and each x is the stacked "calls" lists.
  """
  splits = (train, valid, test)

  # First pass: parse every split's "calls" strings (mutates the frames).
  for frame in splits:
    frame["calls"] = frame["calls"].apply(split_and_convert)

  # Second pass: emit label/feature arrays in train, valid, test order.
  arrays = []
  for frame in splits:
    arrays.append(np.asarray(frame["label"]))
    arrays.append(np.stack(np.asarray(frame["calls"]), axis=0))

  return tuple(arrays)


def create_training_input_one_class(train_df, min_split=10, max_split=90):
  """Split every "calls" sequence into a two-part "sentence pair".

  Each sequence is cut at an independently drawn random index so it can be
  fed to the pair-based BERT batch generator.

  Args:
    train_df: DataFrame whose "calls" column holds sliceable sequences
      (lists of ints after process_data has run).
    min_split: smallest allowed cut index (inclusive).
    max_split: upper bound for the cut index (exclusive).

  Returns:
    A list with one [first_part, second_part] entry per row; concatenating
    the two parts gives back the original sequence.
  """
  training_input = []

  # Iterate the column directly instead of iterrows() + row[0]: positional
  # integer indexing on a label-indexed Series is deprecated in pandas.
  for calls in train_df["calls"]:
    sep_id = randrange(min_split, max_split)
    training_input.append([calls[:sep_id], calls[sep_id:]])

  return training_input

def load_and_process_data(class_label):
  """Load one class's CSVs and prepare arrays plus BERT pre-training pairs.

  Returns:
    (y_train, x_train, y_val, x_val, y_test, x_test, training_input).
  """
  frames = load_data(class_label)
  arrays = process_data(*frames)

  # process_data has already replaced the train frame's "calls" strings with
  # int lists, which is what the pair builder slices.
  sentence_pairs = create_training_input_one_class(frames[0])

  return (*arrays, sentence_pairs)



## Pre-train a BERT model on our data



In [None]:
# Token dict is the same for all datasets but let's build separately anyways

def build_token_dict(training_input):
  """Build the token -> id mapping from the pre-training sentence pairs.

  Args:
    training_input: iterable of [first_part, second_part] token lists.

  Returns:
    (token_list, token_dict): token_dict starts from keras-bert's base
    dictionary and gives each unseen token the next free id; token_list is
    the list of its keys.
  """
  token_dict = get_base_dict()   # A dict that contains some special tokens
  for pairs in training_input:
    for token in pairs[0] + pairs[1]:
      if token not in token_dict:
        token_dict[token] = len(token_dict)
  token_list = list(token_dict.keys())  # Used for selecting a random word

  return token_list, token_dict

In [None]:
def build_model(token_dict):
  """Create, compile and summarise a small keras-bert model.

  Args:
    token_dict: token -> id mapping; its length sets the model's token count.

  Returns:
    The compiled model (its summary is printed as a side effect).
  """
  bert_config = dict(
      token_num=len(token_dict),
      head_num=2,
      transformer_num=2,
      embed_dim=20,
      feed_forward_dim=25,
      seq_len=maxlen,   # module-level sequence length
      pos_num=maxlen,
      dropout_rate=0.05,
  )
  bert_model = get_model(**bert_config)
  compile_model(bert_model)
  bert_model.summary()
  return bert_model


In [None]:
def train_model(class_label, training_input, token_dict, token_list, model):
  """Pre-train `model` on masked batches generated from `training_input`.

  Args:
    class_label: unused here; kept so existing callers keep working.
    training_input: list of [first_part, second_part] token lists.
    token_dict: token -> id mapping handed to the batch generator.
    token_list: list of tokens handed to the batch generator.
    model: compiled keras-bert model to train.

  Returns:
    (history, model): the Keras History object and the same model instance.
  """

  def _generator():
    # Endless stream of freshly generated batches.
    while True:
      yield gen_batch_inputs(
          training_input,
          token_dict,
          token_list,
          seq_len=maxlen,
          mask_rate=0.3,
          swap_sentence_rate=0.0, # don't apply sentence swapping
      )

  # Model.fit accepts Python generators directly; fit_generator is deprecated
  # in TF 2.x and absent from newer Keras releases.
  # NOTE(review): validation batches come from the same training_input as the
  # training batches, and patience=5 exceeds epochs=3, so early stopping
  # cannot trigger within this run.
  history = model.fit(
      _generator(),
      steps_per_epoch=10,
      epochs=3,
      validation_data=_generator(),
      validation_steps=5,
      callbacks=[
          keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
      ],
  )
  return history, model

## Train and evaluate the pre-training model

In [None]:
# Class labels; each name is substituted into the per-class CSV and model
# file names by the loading and saving code in this notebook.
CLASSES = ["Adware", "Backdoor", "Downloader", "Dropper","Spyware", "Trojan", "Virus", "Worms"]

## Fine tune the model

## Prepare input data

**TODO:** Update this so that it is done for each binary classification model.

In [None]:

def format_data(x_train):
    """Pair a token-id array with an all-zero array of the same shape/dtype.

    Returns:
        [x_train, zeros] -- the two inputs the fine-tuned model is fed.
    """
    zero_segments = np.zeros_like(x_train)
    return [x_train, zero_segments]

def apply_tokenizer(x, token_dict=None):
  """Map one raw token to its id, falling back to 1 when it is unknown.

  Args:
    x: raw token to look up.
    token_dict: token -> id mapping. When omitted, the module-level
      TOKEN_DICT is used, which must have been assigned beforehand.

  Returns:
    The id stored for `x`, or 1 if `x` is not in the mapping.

  Raises:
    NameError: if no mapping is passed and TOKEN_DICT is not defined. The
      previous bare `except:` hid this and mapped every token to 1.
  """
  if token_dict is None:
    token_dict = TOKEN_DICT

  try:
    return token_dict[x]
  except KeyError:
    # Assign unknown
    print(f"assigning unknown to {x}")
    return 1

def split_convert_tokenize(row):
  """Parse a space-separated string of ints and map each through apply_tokenizer."""
  # Parse the whole row first, then tokenize, so a malformed value raises
  # before any lookup happens.
  raw_ids = list(map(int, row.split(" ")))
  return list(map(apply_tokenizer, raw_ids))

def prepare_for_finetuning(class_label):
  """Load one class's splits, tokenize them, and build fine-tuning inputs.

  Reuses load_data for the CSV reads instead of repeating the same three
  paths, and applies the same tokenize/stack/format steps to each split.

  Args:
    class_label: class name used to locate the CSV files.

  Returns:
    (x_train_formatted, x_val_formatted, x_test_formatted,
     y_train, y_val, y_test), where each x is the [token_ids, zeros] pair
    from format_data and each y is the "label" column as an array.
  """
  splits = load_data(class_label)

  # Tokenize every split before building any array (same order as before).
  for frame in splits:
    frame["calls"] = frame["calls"].apply(split_convert_tokenize)

  labels = []
  formatted_inputs = []
  for frame in splits:
    labels.append(np.asarray(frame["label"]))
    token_ids = np.stack(np.asarray(frame["calls"]), axis=0)
    formatted_inputs.append(format_data(token_ids))

  x_train_formatted, x_val_formatted, x_test_formatted = formatted_inputs
  y_train, y_val, y_test = labels

  return x_train_formatted, x_val_formatted, x_test_formatted, y_train, y_val, y_test



In [None]:
def _plot_history_metric(history, metric, class_label):
  """Plot train/validation curves for one metric, save the PNG, then show it."""
  plt.plot(history.history[metric])
  plt.plot(history.history[f'val_{metric}'])
  plt.title(f'model {metric}')
  plt.ylabel(metric)
  plt.xlabel('epoch')
  plt.legend(['train', 'validation'], loc='upper left')
  plt.grid()
  plt.savefig(f"{metric}_bert_{class_label}.png")
  plt.show()


def get_prediction_accuracy_bert(x_test_formatted, y_test, class_label, history, model=None):
  """Plot the test confusion matrix and the accuracy/loss training curves.

  Args:
    x_test_formatted: model inputs for the test split.
    y_test: true labels for the test split.
    class_label: class name used in the saved figure file names.
    history: Keras History whose `.history` dict has 'accuracy',
      'val_accuracy', 'loss' and 'val_loss'.
    model: model to evaluate. When omitted, the module-level
      `classification_model` is used, as before.

  Side effects:
    Saves confusion_matrix_bert_<label>, accuracy_bert_<label>.png and
    loss_bert_<label>.png in the working directory and displays each figure.
  """
  if model is None:
    model = classification_model

  y_test_pred = model.predict(x_test_formatted)
  y_classes = y_test_pred.argmax(axis=-1)
  cm = confusion_matrix(y_test, y_classes)

  plot_confusion_matrix(conf_mat=cm,
                        show_absolute=True,
                        show_normed=True,
                        colorbar=True)
  # Save before show(): once show() has run, the figure is gone and savefig
  # would write an empty image.
  plt.savefig(f"confusion_matrix_bert_{class_label}")
  plt.show()

  _plot_history_metric(history, 'accuracy', class_label)
  _plot_history_metric(history, 'loss', class_label)

In [None]:
def fine_tune_model(class_label, model):
  """Attach a 2-way softmax head to the pre-trained BERT and fine-tune it.

  The classifier is built on the output of `model`'s 'NSP-Dense' layer and
  takes the first two of `model`'s inputs. It shares layers with `model`, so
  fitting it also updates the weights of the model passed in.

  Args:
    class_label: class name used to load data and name the saved model.
    model: pre-trained keras-bert model (passed in directly rather than
      reloaded from SAVE_PATH).

  Returns:
    The fine-tuned classification model.

  Side effects:
    Saves the model under SAVE_PATH, assigns the module-level
    `classification_model`, and produces the evaluation figures.
  """
  # get_prediction_accuracy_bert reads the module-level `classification_model`,
  # which nothing else assigns, so publish the model being evaluated.
  global classification_model

  inputs = model.inputs[:2]
  nsp_dense = model.get_layer('NSP-Dense').output
  outputs = keras.layers.Dense(units=2, activation='softmax')(nsp_dense)

  finetuned_bert = keras.models.Model(inputs, outputs)
  # compile() returns nothing useful, so its result is not kept.
  finetuned_bert.compile(
      "adam",
      loss='sparse_categorical_crossentropy',
      metrics=['sparse_categorical_accuracy', 'accuracy'],
  )

  # get data
  x_train_formatted, x_val_formatted, x_test_formatted, y_train, y_val, y_test = prepare_for_finetuning(class_label)

  # NOTE(review): batch_size reuses the value of maxlen (the sequence length).
  history = finetuned_bert.fit(
    x_train_formatted, y_train, epochs=10, batch_size=maxlen, validation_data=(x_val_formatted,y_val)
  )
  finetuned_bert.save(SAVE_PATH.format(filename= f"{class_label}_bert_finetuned_model"))

  classification_model = finetuned_bert
  get_prediction_accuracy_bert(x_test_formatted, y_test, class_label, history)
  return finetuned_bert


In [None]:
from sklearn.metrics import confusion_matrix
from keras.preprocessing.text import Tokenizer
from keras.layers import LSTM, Dense, Dropout, Embedding
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import SpatialDropout1D
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt 

In [None]:

# Template for every saved model; {filename} is filled in per class and stage.
# NOTE(review): the template ends in ".csv" although it is used for models --
# confirm this is intended before changing it, since existing files use it.
SAVE_PATH = "/content/drive/My Drive/Research/CyberBERT/model/{filename}.csv"

for class_label in CLASSES[:1]:

  y_train, x_train, y_val, x_val, y_test, x_test, training_input = load_and_process_data(class_label)
  # build token dict
  token_list, token_dict = build_token_dict(training_input)
  # apply_tokenizer looks tokens up in the module-level TOKEN_DICT; without
  # this assignment every token falls back to the unknown id.
  TOKEN_DICT = token_dict
  # build model
  bert = build_model(token_dict)
  # train
  history, bert = train_model(class_label, training_input, token_dict, token_list, bert)
  # Save the pre-trained model BEFORE fine-tuning and under its own name:
  # the fine-tuned classifier shares layers with `bert`, and the previous
  # "_bert_finetuned_model" name overwrote the file fine_tune_model saves.
  print(f"save pretraeind bert model for {class_label}")
  bert.save(SAVE_PATH.format(filename= f"{class_label}_bert_pretrained_model"))
  finetuned_bert = fine_tune_model(class_label, bert)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, 100)]        0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, 100)]        0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 100, 20), (2 4620        Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 100, 20)      40          Input-Segment[0][0]              
_______________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
assigning unknown to 27
assigning unknown to 103
assigning unknown to 27
assigning unknown to 103
assigning unknown to 27
assigning unknown to 103
assigning unknown to 103
assigning unknown to 25
assigning unknown to 103
assigning unknown to 27
assigning unknown to 103
assigning unknown to 27
assigning unknown to 103
assigning unknown to 103
assigning unknown to 103
assigning unknown to 103
assigning unknown to 103
assigning unknown to 103
assigning unknown to 103
assigning unknown to 25
assigning unknown to 103
assigning unknown to 27
assigning unknown to 103
assigning unknown to 27
assigning unknown to 103
assigning unknown to 27
assigning unknown to 103
assigning unknown to 27
assigning unknown to 103
assigning unknown to 27
assigning unknown to 103
assigning unknown to 27
assigning unknown to 103
assigning unknown to 27
assigning unknown to 103
assigning unknown to 27
assigning unknown to 103
assigning unknown to 27
a