In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Conv2D, GlobalMaxPool2D
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [2]:
def target_row(line, rest=True):
    '''
    Filter logs for lines with
    greater variation.
    '''
    if rest:
        return line.startswith('GET') or line.startswith('POST') or line.startswith('modo=') or line.startswith('id=')
    else:
        return line.startswith('GET') or line.startswith('POST')

def read_format(file_name):
    '''
    Filter out http://localhost:8080... not informative
    '''
    with open(file_name, 'r') as infile:
        data = infile.readlines()
    data = [line.strip() for line in data]
    data = [line for line in data if target_row(line)]
    r_data = []
    start_string = data[0].lower()
    for line in data[1:]:
        if target_row(line, rest=False):
            r_data.append(start_string)
            start_string = line.replace('http://localhost:8080', '').lower()
        else:
            start_string += ' ' + line.replace('http://localhost:8080', '').lower()
    return r_data

In [3]:
norm_data = read_format('normalTrafficTest.txt')
anom_data = read_format('anomalousTrafficTest.txt')

In [4]:
anom_data[:5]

['get http://localhost:8080/tienda1/publico/anadir.jsp?id=2&nombre=jam%f3n+ib%e9rico&precio=85&cantidad=%27%3b+drop+table+usuarios%3b+select+*+from+datos+where+nombre+like+%27%25&b1=a%f1adir+al+carrito http/1.1',
 'post /tienda1/publico/anadir.jsp http/1.1 id=2&nombre=jam%f3n+ib%e9rico&precio=85&cantidad=%27%3b+drop+table+usuarios%3b+select+*+from+datos+where+nombre+like+%27%25&b1=a%f1adir+al+carrito',
 'get /tienda1/publico/anadir.jsp?id=2%2f&nombre=jam%f3n+ib%e9rico&precio=85&cantidad=49&b1=a%f1adir+al+carrito http/1.1',
 'post /tienda1/publico/anadir.jsp http/1.1 id=2%2f&nombre=jam%f3n+ib%e9rico&precio=85&cantidad=49&b1=a%f1adir+al+carrito',
 'get /asf-logo-wide.gif~ http/1.1']

In [5]:
# perhaps byte-pair encodings would be most efficient  https://github.com/rsennrich/subword-nmt
# filtering out the url

In [6]:
char_dict = {}
char_smpl = ' '.join(anom_data)
char_smpl = sorted(list(set(char_smpl)))
for idx, ch in enumerate(char_smpl):
    char_dict[ch] = idx

In [7]:
anom_data = [[char_dict[el] for el in line] for line in anom_data]
norm_data = [[char_dict[el] for el in line] for line in norm_data]
data = anom_data + norm_data
target = np.ones(len(anom_data)).tolist() + np.zeros(len(norm_data)).tolist()

In [8]:
n_steps = np.max([len(line) for line in data])
n_inputs = len(char_dict)

In [9]:
train = []
for line in data:
    ret_mat = np.zeros((n_steps, n_inputs))
    for idx, val in enumerate(line):
        ret_mat[idx] = np.eye(n_inputs)[val]
    train.append(ret_mat)    

In [10]:
train = np.asarray(train)
target = np.asarray(target)
train = np.expand_dims(train, axis=-1)
rnd_idx = list(range(train.shape[0]))
np.random.shuffle(rnd_idx)
N = 1000
train, target, test, tst_target = train[rnd_idx][:-N], target[rnd_idx][:-N], train[rnd_idx][-N:], target[rnd_idx][-N:]

In [11]:
def build_network():

    model = Sequential()
    model.add(Conv2D(32, (5, n_inputs), input_shape=(n_steps, n_inputs, 1)))
    
    model.add(GlobalMaxPool2D())
    model.add(Dense(32))
    model.add(Activation('relu'))
    model.add(Dropout(.3))
    
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dropout(.3))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dropout(.3))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model


In [12]:
NN = build_network()
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.03, patience=2, verbose=0, mode='auto')

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [13]:
NN.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 875, 1, 32)        7872      
_________________________________________________________________
global_max_pooling2d_1 (Glob (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
activation_1 (Activation)    (None, 32)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                528       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
__________

In [14]:
NN.fit(x=train, y=target, epochs=20, validation_split=0.1, batch_size=32, callbacks=[early_stopping])

Train on 53699 samples, validate on 5967 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


<keras.callbacks.History at 0x7fd8f51e5eb8>

In [15]:
NN.evaluate(test, tst_target)



[0.062154329518321901, 0.96499999999999997]