In [1]:
import tensorflow as tf
import json


In [3]:
from data_utils import Data


# Read the run configuration. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it to the garbage collector.
with open("./config.json") as config_file:
    config = json.load(config_file)


def _load_split(source_key):
    """Create the Data object for one dataset split and load it.

    Args:
        source_key: key inside config["data"] that holds the data source of
            the split (e.g. "training_data_source").

    Returns:
        The Data instance after load_data() has been called on it.
    """
    data_config = config["data"]
    split = Data(data_source=data_config[source_key], alphabet=data_config["alphabet"],
                 input_size=data_config["input_size"], num_of_classes=data_config["num_of_classes"])
    split.load_data()
    return split


# NOTE: batch_texts is reassigned by every split below, so after this cell it
# holds the third value returned for the test split only.

# Load training data
training_data = _load_split("training_data_source")
training_inputs, training_labels, batch_texts = training_data.get_all_data()

# Load validation data
validation_data = _load_split("validation_data_source")
validation_inputs, validation_labels, batch_texts = validation_data.get_all_data()

# Load test data
test_data = _load_split("test_data_source")
test_inputs, test_labels, batch_texts = test_data.get_all_data()


Data loaded from ./_train.csv
Data loaded from ./_validate.csv
Data loaded from ./_test.csv


In [4]:
# Build model ###
from keras.models import Model
from keras.layers import Input, Dense, Flatten
from keras.layers import Convolution1D
from keras.layers import MaxPooling1D
from keras.layers import Embedding
from keras.layers import ThresholdedReLU
from keras.layers import Dropout
import keras_metrics

# Hyper-parameters, read from the "data" and "char_cnn_zhang" config sections
data_cfg = config["data"]
input_size = data_cfg["input_size"]
alphabet_size = data_cfg["alphabet_size"]

zhang_cfg = config["char_cnn_zhang"]
embedding_size = zhang_cfg["embedding_size"]
conv_layers = zhang_cfg["conv_layers"]
fully_connected_layers = zhang_cfg["fully_connected_layers"]
num_of_classes = data_cfg["num_of_classes"]
threshold = zhang_cfg["threshold"]
dropout_p = zhang_cfg["dropout_p"]
print("Dropout param: " + str(dropout_p))
optimizer = zhang_cfg["optimizer"]
loss = zhang_cfg["loss"]

# Integer input of length input_size, mapped to embedding vectors.
# The embedding table has alphabet_size + 1 rows.
inputs = Input(shape=(input_size,), name='sent_input', dtype='int64')
x = Embedding(alphabet_size + 1, embedding_size, input_length=input_size)(inputs)

# Convolution stack: each spec is indexed as [0] -> first Convolution1D
# argument, [1] -> second Convolution1D argument, [2] -> pooling size.
# A pooling size of -1 means "no pooling after this convolution".
for conv_spec in conv_layers:
    num_filters, kernel_size = conv_spec[0], conv_spec[1]
    x = Convolution1D(num_filters, kernel_size)(x)
    x = ThresholdedReLU(threshold)(x)
    pool_size = conv_spec[2]
    if pool_size == -1:
        continue
    x = MaxPooling1D(pool_size)(x)

x = Flatten()(x)

# Dense stack: every entry is the unit count of one Dense layer, followed by
# the thresholded activation and dropout.
for units in fully_connected_layers:
    x = Dense(units)(x)
    x = ThresholdedReLU(threshold)(x)
    x = Dropout(dropout_p)(x)

# Softmax output with one unit per class
predictions = Dense(num_of_classes, activation='softmax')(x)

# Assemble the model and compile it, tracking precision and recall
tracked_metrics = [keras_metrics.precision(), keras_metrics.recall()]
model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=optimizer, loss=loss, metrics=tracked_metrics)
print("CharCNNZhang model built: ")
model.summary()

Using TensorFlow backend.


Dropout param: 0.5
CharCNNZhang model built: 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sent_input (InputLayer)      (None, 400)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 400, 128)          8960      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 394, 256)          229632    
_________________________________________________________________
thresholded_re_lu_1 (Thresho (None, 394, 256)          0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 131, 256)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 125, 256)          459008    
_________________________________________________________________
thresholded_re_lu_2 (Thresho (

In [5]:
# Train

# Training settings come from the "training" section of the config.
training_cfg = config["training"]
epochs = training_cfg["epochs"]
batch_size = training_cfg["batch_size"]
checkpoint_every = training_cfg["checkpoint_every"]  # read here, not used by this cell

print("Training CharCNNZhang model: ")
# To train without validation, drop the validation_data argument.
validation_pair = (validation_inputs, validation_labels)
model.fit(
    training_inputs,
    training_labels,
    validation_data=validation_pair,
    epochs=epochs,
    batch_size=batch_size,
    verbose=2,
    callbacks=[]
)

# Persist the architecture as JSON and the weights as HDF5
model_json = model.to_json()
with open("model.json", "w") as architecture_file:
    architecture_file.write(model_json)
model.save_weights("model.h5")
print("Saved model to disk")

Training CharCNNZhang model: 
Train on 18640 samples, validate on 6213 samples
Epoch 1/7
 - 17s - loss: 0.2826 - precision: 0.9575 - recall: 0.9069 - val_loss: 0.1016 - val_precision: 0.9910 - val_recall: 0.9925
Epoch 2/7
 - 11s - loss: 0.0579 - precision: 0.9943 - recall: 0.9990 - val_loss: 0.0408 - val_precision: 0.9951 - val_recall: 1.0000
Epoch 3/7
 - 11s - loss: 0.0311 - precision: 0.9974 - recall: 0.9990 - val_loss: 0.0168 - val_precision: 0.9992 - val_recall: 1.0000
Epoch 4/7
 - 11s - loss: 0.0149 - precision: 0.9991 - recall: 0.9997 - val_loss: 0.0176 - val_precision: 0.9987 - val_recall: 1.0000
Epoch 5/7
 - 11s - loss: 0.0157 - precision: 0.9989 - recall: 0.9997 - val_loss: 0.0098 - val_precision: 0.9997 - val_recall: 1.0000
Epoch 6/7
 - 11s - loss: 0.0089 - precision: 0.9996 - recall: 0.9999 - val_loss: 0.0172 - val_precision: 0.9997 - val_recall: 1.0000
Epoch 7/7
 - 11s - loss: 0.0134 - precision: 0.9991 - recall: 0.9995 - val_loss: 0.0347 - val_precision: 1.0000 - val_recal

In [6]:
# load json and create model
from keras.models import model_from_json
import keras_metrics

# Read the serialized architecture. The `with` block guarantees the file
# handle is closed even if reading raises.
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

# Evaluate the reloaded model on the test data. It has to be compiled again
# here with the same optimizer, loss and metrics used for training.
loaded_model.compile(optimizer=optimizer, loss=loss, metrics=[keras_metrics.precision(), keras_metrics.recall()])
score = loaded_model.evaluate(test_inputs, test_labels, batch_size=batch_size, verbose=1)

# Print each reported value next to its metric name.
# NOTE: every value, including the loss, is scaled by 100 and printed with a
# "%" sign, so the "loss" line is not a true percentage.
for metric_name, metric_value in zip(loaded_model.metrics_names, score):
    print("%s: %.2f%%" % (metric_name, metric_value*100))

Loaded model from disk
loss: 3.50%
precision: 99.90%
recall: 99.25%


In [7]:
from sklearn.metrics import classification_report
import numpy as np

# Per-class report for the model reloaded from disk, on the test split.
model = loaded_model
y_test = test_labels

# Ground truth: collapse each one-hot row to its class index.
Y_test = np.argmax(y_test, axis=1)

# Predictions: take the highest-scoring class of every output row.
class_scores = model.predict(test_inputs)
y_pred = np.argmax(class_scores, axis=-1)

print(classification_report(Y_test, y_pred))

# for ECMLPKDD
# valid = 0
# xss =  1
# sqlinjection = 2
# ldapinjection  = 3
# xpathinjection  = 4
# pathtransversal = 5
# oscommanding  = 6
# ssi = 7

# for CSIC
# valid = 0
# malicious = 1

# for Morzeux_HttpParamsDataset
# valid = 0
# sqli  = 1
# xss   = 2
# path-traversal = 3
# cmdi = 4

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      3860
           1       1.00      0.99      1.00      2170
           2       0.82      0.95      0.88       110
           3       1.00      0.90      0.95        48
           4       0.38      0.81      0.52        26

   micro avg       0.99      0.99      0.99      6214
   macro avg       0.84      0.93      0.87      6214
weighted avg       0.99      0.99      0.99      6214

