In [1]:
import tensorflow as tf
import json

In [3]:
from data_utils import Data


# Read the experiment configuration. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open for the
# lifetime of the kernel.
with open("./config.json") as config_file:
    config = json.load(config_file)


def _load_split(source_key):
    """Create the Data reader for one dataset split and load it.

    Args:
        source_key: Key inside config["data"] whose value is passed to Data
            as ``data_source`` (e.g. "training_data_source").

    Returns:
        The Data instance, after ``load_data()`` has been called on it.
    """
    data_cfg = config["data"]
    split = Data(data_source=data_cfg[source_key], alphabet=data_cfg["alphabet"],
                 input_size=data_cfg["input_size"], num_of_classes=data_cfg["num_of_classes"])
    split.load_data()
    return split


# NOTE: batch_texts is rebound by every split below, so after this cell it
# holds the third value returned for the test split only.

# Load training data
training_data = _load_split("training_data_source")
training_inputs, training_labels, batch_texts = training_data.get_all_data()

# Load validation data
validation_data = _load_split("validation_data_source")
validation_inputs, validation_labels, batch_texts = validation_data.get_all_data()

# Load test data
test_data = _load_split("test_data_source")
test_inputs, test_labels, batch_texts = test_data.get_all_data()


Data loaded from ./_train.csv
Data loaded from ./_validate.csv
Data loaded from ./_test.csv


In [7]:
# Build model ###
from keras.models import Model
from keras.layers import Input, Dense, Flatten
from keras.layers import Convolution1D
from keras.layers import MaxPooling1D
from keras.layers import Embedding
from keras.layers import ThresholdedReLU
from keras.layers import Dropout
import keras_metrics

# Hyper-parameters, all taken from the config dict loaded earlier in the notebook.
data_cfg = config["data"]
zhang_cfg = config["char_cnn_zhang"]

input_size = data_cfg["input_size"]
alphabet_size = data_cfg["alphabet_size"]
embedding_size = zhang_cfg["embedding_size"]
conv_layers = zhang_cfg["conv_layers"]
fully_connected_layers = zhang_cfg["fully_connected_layers"]
num_of_classes = data_cfg["num_of_classes"]
threshold = zhang_cfg["threshold"]
dropout_p = zhang_cfg["dropout_p"]
print("Dropout param: " + str(dropout_p))
optimizer = zhang_cfg["optimizer"]
loss = zhang_cfg["loss"]

# Network input: one row of `input_size` integer ids per sample.
inputs = Input(shape=(input_size,), name='sent_input', dtype='int64')

# Embedding table has alphabet_size + 1 rows.
# NOTE(review): the extra row is presumably for a padding/unknown id -- confirm in data_utils.
x = Embedding(alphabet_size + 1, embedding_size, input_length=input_size)(inputs)

# Convolution stack. Each config entry is indexed as
# [filters, kernel size, pool size]; a pool size of -1 means "no pooling".
for conv_spec in conv_layers:
    num_filters, kernel_width, pool_width = conv_spec[0], conv_spec[1], conv_spec[2]
    x = Convolution1D(num_filters, kernel_width)(x)
    x = ThresholdedReLU(threshold)(x)
    if pool_width == -1:
        continue
    x = MaxPooling1D(pool_width)(x)

x = Flatten()(x)

# Dense stack: Dense -> thresholded ReLU -> dropout for every configured width.
for dense_units in fully_connected_layers:
    x = Dense(dense_units)(x)
    x = ThresholdedReLU(threshold)(x)
    x = Dropout(dropout_p)(x)

# Softmax head with one unit per class.
predictions = Dense(num_of_classes, activation='softmax')(x)

# Assemble and compile.
# NOTE(review): precision()/recall() are created without a label argument, and
# the values they report during training (~98%) are far above the per-class
# scores in the classification report at the end of this notebook. Confirm
# which class these metrics actually score before relying on them.
tracked_metrics = [keras_metrics.precision(), keras_metrics.recall()]
model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=optimizer, loss=loss, metrics=tracked_metrics)
print("CharCNNZhang model built: ")
model.summary()

Dropout param: 0.5
tracking <tf.Variable 'Variable:0' shape=() dtype=int32, numpy=0> tp
tracking <tf.Variable 'Variable:0' shape=() dtype=int32, numpy=0> fp
tracking <tf.Variable 'Variable:0' shape=() dtype=int32, numpy=0> tp
tracking <tf.Variable 'Variable:0' shape=() dtype=int32, numpy=0> fn
CharCNNZhang model built: 
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sent_input (InputLayer)      (None, 400)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 400, 128)          8960      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 394, 256)          229632    
_________________________________________________________________
thresholded_re_lu_1 (Thresho (None, 394, 256)          0         
_________________________________________________________________
m

In [8]:
# Train

# Training schedule from the config. checkpoint_every is read here but is not
# used by anything in this cell (no callbacks are registered).
training_cfg = config["training"]
epochs = training_cfg["epochs"]
batch_size = training_cfg["batch_size"]
checkpoint_every = training_cfg["checkpoint_every"]

print("Training CharCNNZhang model: ")
# Fit on the training split and score the validation split after every epoch.
# To train without validation, drop the validation_data argument.
model.fit(
    training_inputs,
    training_labels,
    validation_data=(validation_inputs, validation_labels),
    epochs=epochs,
    batch_size=batch_size,
    verbose=2,
    callbacks=[],
)

# Persist the model as two files: architecture as JSON, weights as HDF5.
model_json = model.to_json()
with open("model.json", "w") as architecture_file:
    architecture_file.write(model_json)
model.save_weights("model.h5")
print("Saved model to disk")

Training CharCNNZhang model: 


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 30069 samples, validate on 10023 samples
Epoch 1/7
 - 246s - loss: 1.1519 - precision: 0.7236 - recall: 0.9079 - val_loss: 1.0142 - val_precision: 0.7753 - val_recall: 0.9889
Epoch 2/7
 - 242s - loss: 0.8636 - precision: 0.8630 - recall: 0.9404 - val_loss: 0.7990 - val_precision: 0.9805 - val_recall: 0.9073
Epoch 3/7
 - 241s - loss: 0.6828 - precision: 0.9692 - recall: 0.9788 - val_loss: 0.6638 - val_precision: 0.9659 - val_recall: 0.9826
Epoch 4/7
 - 252s - loss: 0.6400 - precision: 0.9820 - recall: 0.9850 - val_loss: 0.6474 - val_precision: 0.9615 - val_recall: 0.9881
Epoch 5/7
 - 244s - loss: 0.5777 - precision: 0.9861 - recall: 0.9900 - val_loss: 0.5883 - val_precision: 0.9720 - val_recall: 0.9815
Epoch 6/7
 - 242s - loss: 0.5165 - precision: 0.9846 - recall: 0.9891 - val_loss: 0.5263 - val_precision: 0.9840 - val_recall: 0.9644
Epoch 7/7
 - 238s - loss: 0.4424 - precision: 0.9870 - recall: 0.9871 - val_loss: 0.5218 - val_precision: 0.9754 - val_recall: 0.9806
Saved model 

In [9]:
# load json and create model
from keras.models import model_from_json
import keras_metrics

# Rebuild the network from the architecture JSON written by the training cell.
# The context manager closes the file even if reading raises.
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

# evaluate loaded model on test data
# The model is compiled again here with the same optimizer, loss and metrics
# that were used for training, then scored on the test split.
loaded_model.compile(optimizer=optimizer, loss=loss, metrics=[keras_metrics.precision(), keras_metrics.recall()])
score = loaded_model.evaluate(test_inputs, test_labels, batch_size=batch_size, verbose=1)

# evaluate() returns one value per entry in metrics_names. The loss is not a
# ratio, so it is printed as a raw value; only the metrics are shown as
# percentages.
for metric_name, metric_value in zip(loaded_model.metrics_names, score):
    if metric_name == "loss":
        print("%s: %.4f" % (metric_name, metric_value))
    else:
        print("%s: %.2f%%" % (metric_name, metric_value * 100))

Loaded model from disk
tracking <tf.Variable 'Variable:0' shape=() dtype=int32, numpy=0> tp
tracking <tf.Variable 'Variable:0' shape=() dtype=int32, numpy=0> fp
tracking <tf.Variable 'Variable:0' shape=() dtype=int32, numpy=0> tp
tracking <tf.Variable 'Variable:0' shape=() dtype=int32, numpy=0> fn
loss: 50.94%
precision: 98.20%
recall: 98.30%


In [11]:
from sklearn.metrics import classification_report
import numpy as np

# Per-class precision/recall/F1 of the reloaded network on the test split.
# Note that `model` is rebound to the reloaded model from here on.
model = loaded_model
y_test = test_labels
# Collapse each one-hot label row to its integer class index.
Y_test = np.argmax(y_test, axis=1)
# predict() yields one score per class for every sample; the arg-max over the
# last axis is the predicted class index.
y_pred = np.argmax(model.predict(test_inputs), axis=-1)
print(classification_report(Y_test, y_pred))

# Legend: meaning of the integer class labels in the report, per dataset.
# Use the list that matches the dataset the model was trained on.

# for ECMLPKDD
# valid = 0
# xss = 1
# sqlinjection = 2
# ldapinjection = 3
# xpathinjection = 4
# pathtransversal = 5
# oscommanding = 6
# ssi = 7

# for CISC
# valid = 0
# malicious = 1

# for Morzeux_HttpParamsDataset
# valid = 0
# sqli = 1
# xss = 2
# path-traversal = 3
# cmdi = 4

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      6971
           1       0.37      0.29      0.33       374
           2       0.30      0.63      0.41       439
           3       0.52      0.47      0.50       442
           4       0.60      0.67      0.63       476
           5       0.55      0.49      0.52       462
           6       0.23      0.06      0.09       470
           7       0.38      0.26      0.31       390

    accuracy                           0.81     10024
   macro avg       0.49      0.48      0.47     10024
weighted avg       0.81      0.81      0.81     10024

