In [282]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import utils, preprocessing

# This code was tested with TensorFlow v1.4
print("You have TensorFlow version", tf.__version__)

You have TensorFlow version 1.8.0


In [283]:
data = pd.read_csv("train.csv", dtype = {'eventid':'object','result':'object'})
test_data= pd.read_csv("test.csv")
token_data = 'dll_token.pickle'
encode_data = 'dll_encode.pickle'
model_data = 'dll_model.h5'

In [284]:
print("Part of training data")
data['eventid'].value_counts()
data.head()

Part of training data


Unnamed: 0,eventid,result
0,4624 4769 4624 4624 4769 4768 4769 4768 4624 4...,normal
1,4624 4624 4624 4624 4769 4624 4624 4624 4624 4...,normal
2,4624 4769 4769 4768 4624 4624 4769 4768 4624 4...,normal
3,5140 5140 5140 5140 5140 5140 5140 5140 5140 5...,normal
4,4672 4672 4672 4672 4672 4672 4672 4672 4672 4...,normal


In [285]:
tag_num = data['result'].nunique()
data['result'].value_counts()

normal    250
attack     57
Name: result, dtype: int64

In [286]:
max_words  = 10000
tokenizer = text.Tokenizer(num_words=max_words, char_level=False)

In [287]:
max_len = 50
print(type(data['eventid']))
tokenizer.fit_on_texts(data['eventid'])
sequences = tokenizer.texts_to_sequences(data['eventid'])
test_sequences = tokenizer.texts_to_sequences(test_data['eventid'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data_sec = pad_sequences(sequences, maxlen=max_len)
test_data_sec = pad_sequences(test_sequences, maxlen=max_len)

<class 'pandas.core.series.Series'>
Found 9 unique tokens.


In [288]:
import pickle
# save the token data in a file
with open(token_data, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [289]:
# Split data into train data and test data
train_size = int(len(data_sec) * 1.0)
print ("The number of train data: %d" % train_size)
print ("The number of test data: %d" % (len(data_sec) - train_size))

The number of train data: 307
The number of test data: 0


In [290]:
x_train = data_sec[:train_size]
x_test = test_data_sec

test_posts_doc = test_data['eventid']

In [291]:
train_tags = data['result'][:train_size]
test_tags = test_data['result']

# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

In [292]:
# save the encoder in a file
with open(encode_data, 'wb') as handle:
    pickle.dump(encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [293]:
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)
print(y_train.shape)

(307, 2)


In [294]:
from keras.layers import LSTM
# Build the model
model = Sequential()
model.add(Embedding(10000, 128, input_length=max_len))
#model.add(Flatten())
model.add(LSTM(32))
model.add(Dense(tag_num, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              #optimizer='adam',
              optimizer='rmsprop',
              #optimizer='sgd',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 50, 128)           1280000   
_________________________________________________________________
lstm_17 (LSTM)               (None, 32)                20608     
_________________________________________________________________
dense_17 (Dense)             (None, 2)                 66        
Total params: 1,300,674
Trainable params: 1,300,674
Non-trainable params: 0
_________________________________________________________________


In [295]:
# This model trains very quickly and 2 epochs are already more than enough
# Training for more epochs will likely lead to overfitting on this dataset
# You can try tweaking these hyperparamaters when using this model with your own data
batch_size = 32
epochs = 2

In [296]:
# Inspect the dimenstions of our training and test data (this is helpful to debug)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (307, 50)
x_test shape: (57, 50)
y_train shape: (307, 2)
y_test shape: (57, 2)


In [303]:
# model.fit trains the model# model. 
# The validation_split param tells Keras what % of our training data should be used in the validation set
# You can see the validation loss decreasing slowly when you run this
# Because val_loss is no longer decreasing we stop training to prevent overfitting
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=50,
                    verbose=1,
                    validation_data=(x_test, y_test))

Train on 307 samples, validate on 57 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [304]:
# Evaluate the accuracy of our trained model
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 1.0036012195704276
Test accuracy: 0.42982455669787895


In [305]:
#save model if you want to
model.save(model_data)

In [306]:
# Here's how to generate a prediction on individual examples

text_labels = encoder.classes_ 

import csv
with open('result.csv', 'w') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(['Actual label','Predicted label','dlls']) 

    for i in range(len(x_test)):
        #print(np.array([x_test[i]]))
        prediction = model.predict(np.array([x_test[i]]))
        predicted_label = text_labels[np.argmax(prediction)]
        act_label=str(test_tags.iloc[i])
        dlls=str(test_posts_doc.iloc[i])
        writer.writerow([act_label,predicted_label,dlls]) 
        #print(test_posts_doc.iloc[i])
        #print(prediction)
        #print('Actual label:' + test_tags.iloc[i])
        #print('Predicted label: ' + predicted_label + "\n")


In [307]:
#golden
print('golden attack')
request = '''
4624 4769 4624 4769 4769 
'''
request = [request]

req_mat = tokenizer.texts_to_sequences(request)
data_sec = pad_sequences(req_mat, maxlen=max_len)
prediction = model.predict(np.array(data_sec))
print(np.array(data_sec))
predicted_label = encoder.classes_[np.argmax(prediction)]
print(prediction)
print(predicted_label)

print('Eternal attack')
request = '''
5140 5140 5140 5140 5140 5140 5140 5140 5140 5140 5140 5140 5140 5140 5140 5140 5140 5140 5140 5140 5140 5140 5140 5140 5140 5140 
'''
request = [request]

req_mat = tokenizer.texts_to_sequences(request)
data_sec = pad_sequences(req_mat, maxlen=max_len)
prediction = model.predict(np.array(data_sec))
predicted_label = encoder.classes_[np.argmax(prediction)]
print(prediction)
print(predicted_label)

golden attack
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 1 5 1 5 5]]
[[0.09676401 0.9078897 ]]
normal
Eternal attack
[[0.14978842 0.84790593]]
normal


In [302]:
print(tokenizer.word_index)

{'4624': 1, '4672': 2, '5140': 3, '4674': 4, '4769': 5, '4768': 6, '4688': 7, '4673': 8, '4776': 9}
