# This notebook formats the data and runs it through our bidirectional transformer architecture.

### Data formatting:

Refer to this notebook: https://github.com/ocatak/lstm_malware_detection/blob/master/deep_learnin_lstm_malware_detection.ipynb

### Bidirectional architecture:

**TODO:** architecture description to be added.

In [1]:
import warnings
# Suppress every warning emitted from this point on, including any raised while
# the libraries below are being imported.
warnings.filterwarnings("ignore")

import sys
# Prepend the local ./keras_bert_rec directory (relative to the working
# directory) to the module search path before importing the package.
# NOTE(review): presumably this is a locally modified copy of keras-bert — confirm.
sys.path.insert(0, './keras_bert_rec')
import importlib
import keras_bert_rec as rec
# Reload so that edits made to the module's source since it was first imported
# are picked up without restarting the kernel.
importlib.reload(rec)

import argparse
import keras
import numpy as np
import pandas as pd
import pickle
import sys
import tensorflow as tf
import importlib

from itertools import chain
from keras import backend as K
from keras.models import load_model, Sequential
from keras.layers import Dense, Dropout, Activation, Flatten

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import LSTM, Dense, Dropout, Embedding
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import SpatialDropout1D

Using TensorFlow backend.


## Formatting data

In [None]:
# Directory holding the input archives (calls.zip, types.zip). Keep the trailing
# slash: file paths are built by plain string interpolation, e.g.
# f"{DATA_PATH}calls.zip".
DATA_PATH = "../data/"

In [None]:
# Load the two raw inputs. Each zip archive is parsed as a tab-separated table
# with no header row; the single column name is supplied explicitly.
## Calls
malware_calls_df = pd.read_csv(
    f"{DATA_PATH}calls.zip",
    names=["API_Calls"],
    sep="\t",
    compression="zip",
)
## Labels ("types")
malware_labels_df = pd.read_csv(
    f"{DATA_PATH}types.zip",
    names=["API_Labels"],
    sep="\t",
    compression="zip",
)

In [None]:
## Concat data
# Labels are attached by index alignment, so a row-count mismatch would leave
# NaN labels that the binarisation below would silently turn into 0. Fail
# loudly instead.
assert len(malware_calls_df) == len(malware_labels_df), (
    "calls and labels have different row counts: "
    f"{len(malware_calls_df)} vs {len(malware_labels_df)}"
)
malware_calls_df["API_Labels"] = malware_labels_df.API_Labels
# Turn each comma-separated call list into a space-separated string
# (literal replacement, not a regular expression).
malware_calls_df["API_Calls"] = malware_calls_df.API_Calls.str.replace(",", " ", regex=False)

# Binary target: 1 for the "Virus" type, 0 for every other type.
malware_calls_df["API_Labels"] = malware_calls_df.API_Labels.eq("Virus").astype("int64")

In [None]:
# Preprocessing hyperparameters consumed by the tokenisation cell.
max_words = 800  # handed to Tokenizer(num_words=...)
max_len = 100    # handed to pad_sequences(maxlen=...)

# Features are the API-call strings; targets are the label column re-expressed
# as integer category codes.
X = malware_calls_df["API_Calls"]
Y = malware_calls_df["API_Labels"].astype('category').cat.codes

In [None]:
# Fit the vocabulary on the raw call strings. They are read from the DataFrame
# (rather than from `X`, which this cell overwrites) so the cell can be re-run
# without first re-running the previous one.
api_call_texts = malware_calls_df.API_Calls
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(api_call_texts)
print('Found %s unique tokens.' % len(tok.word_index))
# Encode every sample as a list of integer token ids, then bring all samples
# to a common length of `max_len`.
X = tok.texts_to_sequences(api_call_texts.values)
X = sequence.pad_sequences(X, maxlen=max_len)
print('Shape of data tensor:', X.shape)

# Fixed seed so the train/test partition is identical on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.15,
                                                    random_state=42)

# Fit the label encoder on the training labels only, then one-hot encode.
le = LabelEncoder()
Y_train_enc = le.fit_transform(Y_train)
# Pin the one-hot width to the number of classes the encoder saw; otherwise
# to_categorical infers it from the largest value present, and the train and
# test matrices could end up with different numbers of columns.
num_classes = len(le.classes_)
Y_train_enc = np_utils.to_categorical(Y_train_enc, num_classes=num_classes)

Y_test_enc = le.transform(Y_test)
Y_test_enc = np_utils.to_categorical(Y_test_enc, num_classes=num_classes)

## Bidirectional model

Start by following the tutorial here: https://pypi.org/project/keras-bert/#Train-&-Use

In [20]:
from keras_bert_rec import get_base_dict, get_model, compile_model, gen_batch_inputs


In [21]:
# Toy corpus: three [first sentence, second sentence] pairs of word lists.
sentence_pairs = [
    [['all', 'work', 'and', 'no', 'play'], ['makes', 'jack', 'a', 'dull', 'boy']],
    [['from', 'the', 'day', 'forth'], ['my', 'arm', 'changed']],
    [['and', 'a', 'voice', 'echoed'], ['power', 'give', 'me', 'more', 'power']],
]


# Vocabulary: start from the base dictionary (which holds some special tokens)
# and give every word not yet present the next free integer id.
token_dict = get_base_dict()
for first_sentence, second_sentence in sentence_pairs:
    for word in chain(first_sentence, second_sentence):
        if word not in token_dict:
            token_dict[word] = len(token_dict)
token_list = list(token_dict.keys())  # token pool used for selecting a random word


# Construct the model from the hyperparameters below, compile it, and print
# its layer table.
model = rec.get_model(
    token_num=len(token_dict),
    head_num=5,
    transformer_num=12,
    embed_dim=25,
    feed_forward_dim=100,
    seq_len=20,
    pos_num=20,
    dropout_rate=0.05,
)
compile_model(model)
model.summary()



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, 20)           0                                            
__________________________________________________________________________________________________
Input-Sequence (InputLayer)     (None, 20)           0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 20, 25), (28 700         Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 20, 25)       50          Input-Sequence[0][0]             
__________________________________________________________________________________________________
Embedding-

In [22]:
def _generator():
    """Yield batches from ``gen_batch_inputs`` over the toy sentence pairs, forever.

    Every batch is built from the notebook-level ``sentence_pairs``,
    ``token_dict`` and ``token_list`` with ``seq_len=20``, ``mask_rate=0.3``
    and ``swap_sentence_rate=1.0``. The generator never stops on its own, so
    the consumer has to bound it (e.g. with a step count).
    """
    batch_options = {
        'seq_len': 20,
        'mask_rate': 0.3,
        'swap_sentence_rate': 1.0,
    }
    while True:
        yield gen_batch_inputs(sentence_pairs, token_dict, token_list, **batch_options)

# Train for a single epoch of 100 generated batches, validating on 10 further
# batches. Training and validation each draw from their own generator instance.
# NOTE(review): the output saved with this cell ends in
# `TypeError: unhashable type: 'list'`; the cause is not visible from this cell.
training_batches = _generator()
validation_batches = _generator()
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

model.fit_generator(
    generator=training_batches,
    steps_per_epoch=100,
    epochs=1,
    validation_data=validation_batches,
    validation_steps=10,
    callbacks=[early_stopping],
)



Epoch 1/1


TypeError: unhashable type: 'list'

In [None]:

# Use the trained model
# NOTE(review): this call builds the network from hyperparameters alone; nothing
# visible here copies weights over from the `model` fitted above — confirm how
# the trained weights are meant to reach it.
inference_config = dict(
    token_num=len(token_dict),
    head_num=5,
    transformer_num=12,
    embed_dim=25,
    feed_forward_dim=100,
    seq_len=20,
    pos_num=20,
    dropout_rate=0.05,
    # The input layers and output layer will be returned if `training` is `False`.
    training=False,
    # Whether the model is trainable. The default value is the same with `training`.
    trainable=False,
    # The number of layers whose outputs will be concatenated as a single output.
    # Only available when `training` is `False`.
    output_layer_num=4,
)
inputs, output_layer = get_model(**inference_config)