In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from pathlib import Path
import numpy as np
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint
import itertools

In [2]:
max_length = 0
corpus = list()
labels = list()

with open(Path('./output/benign/all_benign.txt'), 'r') as f:
  lines = f.read().split('\n')[:-1]
  # print(lines[0][33:])
for line in lines:
  doc = line[33:]
  corpus.append(doc)
  labels.append(0)
  

In [3]:
with open(Path('./output/malware/all_malware.txt'), 'r') as f:
  lines = f.read().split('\n')[:-1]
  # print(lines[0][33:])
for line in lines:
  doc = line[33:]
  corpus.append(doc)
  labels.append(1)

In [4]:
labels = np.array(labels)

In [5]:
def tokenize(sentences, encode_start_end = False):
  if encode_start_end:
    setneces = ["문장시작 " + s + "문장 끝" for s in sentences]
    
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(sentences)
  tokenized_sentences = tokenizer.texts_to_sequences(sentences)
  
  return tokenized_sentences, tokenizer

In [6]:
def pad(sentences, length=None):
  if length is None:
    length = max([len(s) for s in sentences])
  
  padded_sentences = pad_sequences(sentences,
                                   maxlen = length,
                                   padding = 'post',
                                   truncating = 'post')
  
  return padded_sentences

In [7]:
X_tokenized, X_tokenizer = tokenize(corpus)

In [8]:
X_encoded = pad(X_tokenized)

In [9]:
X_vocab_size = len(X_tokenizer.word_index)

In [11]:
print("총 opcode vocab size: ", X_vocab_size)

총 opcode vocab size:  25


In [12]:
X_seq_len = len(X_encoded)
print("가장 긴 opcode 문장 길이: ", X_seq_len)

가장 긴 opcode 문장 길이:  2800


In [15]:
from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Model

encoder_input = Input(shape = (None, ),
                      name="encoder_input")
embedding_dim = 200
embedded_input = Embedding(input_dim = X_vocab_size,
                           output_dim = embedding_dim,
                           name = "Embedding_Layer")(encoder_input)
encoder_lstm = LSTM(units=256,
                    activation  = 'relu',
                    return_sequences=False,
                    return_state=True,
                    name="Encoder_LSTM")

_, last_h_encoder, last_c_encoder = encoder_lstm(embedded_input)

decoder_input = Input(shape=(None, 1),
                      name = 'Decoder_Input')

decoder_lstm = LSTM(units=256,
                    activation='relu',
                    return_sequences=True,
                    return_state=True,
                    name='Decoder_LSTM')

all_h_decoder, _, _ = decoder_lstm(decoder_input,
                                   initial_state=[last_h_encoder, last_c_encoder])

final_dense = Dense(1, activation='sigmoid', name='final_dense_layer')

logits = final_dense(all_h_decoder)

seq2seq_model = Model([encoder_input, decoder_input],
                      logits)

seq2seq_model.compile(loss='binary_crossentopy', optimizer='adam', metrics=['accuracy'])



In [None]:
import tensorflow_addons as tfa
import tqdm

decoder_X_input = X_encoded.reshape((-1, X_seq_len, 1))
decoder_X_target = labels.reshape((-1, X_seq_len, 1))

tqdm_callback = tfa.callbacks.TQDMProgressBar()
mc = ModelCheckpoint('./log/s2q_model.h5', save_best_only=True)

seq2seq_model.fit([X_encoded, decoder_X_input],
                  [X],
                  epochs = 100,
                  batch_size = 512,
                  validation_split=0.1,
                  callbacks=[mc, tqdm_callback])