In [1]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle
import random
import tensorflow as tf
import pandas as pd
from keras import backend as K
from keras.models import Sequential
from keras.layers import Activation, BatchNormalization
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard

# Sequence / embedding hyper-parameters shared by the preprocessing and model cells.
MAX_SEQUENCE_LENGTH = 200  # length each tokenised domain is padded or truncated to
EMBEDDING_DIM = 100        # size of each embedding vector

# Input datasets consumed by load_dga() and load_alexa().
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook on another machine.
dga_file = "/Users/leeyn/desktop/work/prac/dga.txt"
alexa_file = "/Users/leeyn/desktop/work/prac/top-1m.csv"

def load_alexa(path=None):
    """Return the domain names listed in the Alexa top-sites CSV.

    The file is parsed as headerless, comma-separated text and the second
    column (index 1) of every row is returned.

    Args:
        path: CSV file to read. When omitted, the module-level ``alexa_file``
            path is used.

    Returns:
        list of str: one entry per row, in file order.
    """
    if path is None:
        path = alexa_file
    # dtype=str together with keep_default_na=False keeps every cell as the
    # literal text found in the file, so a value such as "nan", "null" or "NA"
    # is not silently converted to a float NaN (a non-string that would break
    # downstream text processing).
    frame = pd.read_csv(path, sep=",", header=None, dtype=str,
                        keep_default_na=False)
    return frame[1].tolist()

def load_dga(path=None):
    """Return the domain names listed in the DGA sample file.

    The file is parsed as headerless, tab-separated text and the second
    column (index 1) of every row is returned.

    Args:
        path: TSV file to read. When omitted, the module-level ``dga_file``
            path is used.

    Returns:
        list of str: one entry per row, in file order.
    """
    if path is None:
        path = dga_file
    # dtype=str together with keep_default_na=False keeps every cell as the
    # literal text found in the file, so a value such as "nan", "null" or "NA"
    # is not silently converted to a float NaN (a non-string that would break
    # downstream text processing).
    frame = pd.read_csv(path, sep="\t", header=None, dtype=str,
                        keep_default_na=False)
    return frame[1].tolist()

# Fix every source of randomness used below (shuffle, weight init, dropout)
# so that the train/test split and training run are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.set_random_seed(SEED)

# --- Load the two corpora ---------------------------------------------------
good_data = load_alexa()
bad_data = load_dga()

# Benign domains first, DGA domains second; labels are built in the same order
# (0 = Alexa entry, 1 = DGA entry).
domains = good_data + bad_data
labels = [0] * len(good_data) + [1] * len(bad_data)

# --- Character-level tokenisation -------------------------------------------
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(domains)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(domains)

# Every sequence is brought to exactly MAX_SEQUENCE_LENGTH entries.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Persist the fitted tokenizer; the context manager guarantees the file is
# flushed and closed even if pickling fails.
token_path = 'tokenizer.pkl'
with open(token_path, 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# --- Shuffle samples and labels with the same permutation -------------------
index = list(range(len(data)))
random.shuffle(index)
data = data[index]
labels = np.array(labels)[index]
assert len(data) == len(labels), "samples and labels went out of sync"

# --- 80 / 20 train / test split ---------------------------------------------
TRAIN_SIZE = int(0.8 * len(data))

X_train, X_test = data[:TRAIN_SIZE], data[TRAIN_SIZE:]
Y_train, Y_test = labels[:TRAIN_SIZE], labels[TRAIN_SIZE:]

# Hand an explicit TensorFlow session to the Keras backend.
session = tf.Session()
K.set_session(session)

# Model hyper-parameters used by the model-building cell.
QA_EMBED_SIZE = 64
DROPOUT_RATE = 0.3



Using TensorFlow backend.


In [9]:
# Training hyper-parameters.
EPOCHS = 10
BATCH_SIZE = 64 * 4
VALIDATION_SPLIT = 0.3
# Must be smaller than EPOCHS, otherwise early stopping can never fire
# (the previous value of 10 with 10 epochs made the callback a no-op).
PATIENCE = 3

# Character embedding -> bidirectional LSTM -> dense head with a sigmoid output.
model = Sequential()
# Vocabulary size is len(word_index) + 1 because index 0 is used for padding.
model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM,
                    input_length=MAX_SEQUENCE_LENGTH))
model.add(
    Bidirectional(LSTM(QA_EMBED_SIZE, return_sequences=False,
                       dropout=DROPOUT_RATE, recurrent_dropout=DROPOUT_RATE)))
model.add(Dense(QA_EMBED_SIZE))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dense(1))
# NOTE(review): batch-normalising the single output logit makes each
# prediction depend on batch statistics during training; consider dropping
# this layer. Left in place to keep the architecture unchanged.
model.add(BatchNormalization())
model.add(Activation('sigmoid'))
model.summary()

# Stop once val_loss has not improved for PATIENCE consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=PATIENCE)
# Keep only the best full model (architecture + weights) seen so far.
model_checkpoint = ModelCheckpoint(
    'model-blstm.h5', save_best_only=True, save_weights_only=False)
tensor_board = TensorBoard(
    'log/tflog-blstm', write_graph=True, write_images=True)

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Keep the History object so the training curves can be inspected later.
history = model.fit(X_train, Y_train, epochs=EPOCHS, batch_size=BATCH_SIZE,
                    validation_split=VALIDATION_SPLIT, shuffle=True,
                    callbacks=[early_stopping, model_checkpoint, tensor_board])


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 100)          3900      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               84480     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
batch_normalization_3 (Batch (None, 64)                256       
_________________________________________________________________
activation_3 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
_________________________________________________________________
batch_normalization_4 (Batch (None, 1)                 4         
__________

<keras.callbacks.History at 0x64188eda0>

In [10]:
from sklearn.metrics import classification_report
from sklearn import metrics

# Raw sigmoid scores from the model, one row per test sample.
y_predict_list = model.predict(X_test)

# Hard labels: 1 when the score is strictly above 0.5, otherwise 0.
y_predict = [1 if score[0] > 0.5 else 0 for score in y_predict_list]

# Per-class precision / recall / F1, followed by the raw confusion matrix.
print(classification_report(Y_test, y_predict))
print(metrics.confusion_matrix(Y_test, y_predict))

              precision    recall  f1-score   support

           0       0.96      0.95      0.96      2018
           1       0.95      0.96      0.95      1959

    accuracy                           0.95      3977
   macro avg       0.95      0.95      0.95      3977
weighted avg       0.95      0.95      0.95      3977

[[1923   95]
 [  85 1874]]


In [11]:
# Score the trained model on the held-out test split; the result lists the
# loss followed by the accuracy metric the model was compiled with.
model.evaluate(x=X_test, y=Y_test, batch_size=BATCH_SIZE, verbose=1)




[0.2809591689450315, 0.9547397544973305]