## Load data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Both inputs share one layout: space-separated rows with no header line,
# read into the columns 'domain' and 'label'.
df_legit, df_dga = (
    pd.read_csv(path, sep=' ', names=['domain', 'label'])
    for path in ('./data/all_legit.txt', './data/all_dga.txt')
)

In [3]:
# Stack the legit and DGA rows into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each input
# keeps its own 0-based row labels, so the combined frame would contain every
# low label twice and label-based lookups would match two rows.
df_data = pd.concat([df_legit, df_dga], ignore_index=True)

In [4]:
# Let's ignore the different DGA sources for now. Group into 2 classes: legit or dga.
# Any label above 0 becomes True, everything else False
# (presumably legit rows carry label 0 -- confirm against the data files).
# The comparison runs on the whole column at once instead of invoking a Python
# lambda for each row, which yields the same boolean column far more cheaply.
df_data['label'] = df_data['label'] > 0

In [5]:
# (rows, columns) of each source frame and of the combined frame.
# print(...) with a single argument emits the same text on Python 2 and
# Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print(df_legit.shape)
print(df_dga.shape)
print(df_data.shape)

(1000000, 2)
(801667, 2)
(1801667, 2)


## Prep data

In [7]:
# Tokenise by converting each char into int repr (its ord() value).
# Only the 'domain' column is read, so map over that Series directly rather
# than materialising a row object per record with DataFrame.apply(axis=1).
# Each cell of 'domain_char' is a list of ints, one per character.
df_data['domain_char'] = df_data['domain'].map(
    lambda domain: [ord(c) for c in domain]
)

In [8]:
# Create training and test split: 20% of the rows go to the test set.
# random_state pins the shuffle so that re-running the notebook puts the same
# rows in each partition; without it every run evaluates on a different sample.
# NOTE: X_train / X_test are full DataFrames (features and label columns).
X_train, X_test = train_test_split(df_data, test_size=.2, random_state=42)

## Build LSTM model

In [6]:
from sklearn.model_selection import train_test_split

In [9]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Quadro K4000 (CNMeM is enabled with initial size: 90.0% of memory, cuDNN 5110)


In [10]:
# Every sequence is padded/truncated to this many ints so the inputs form a
# rectangular array.
maxlen = 100
# Pad with 0 instead of -1. The padded arrays feed an Embedding layer, which
# looks rows up by index and only defines indices 0..input_dim-1; a negative
# index is outside that contract and only "works" on backends whose indexing
# wraps around. 0 is free to serve as the pad id as long as no domain contains
# a NUL character (ord 0) -- assumed, confirm against the data.
pad_value = 0
x_train = sequence.pad_sequences(X_train.domain_char, maxlen=maxlen, value=pad_value)
x_test = sequence.pad_sequences(X_test.domain_char, maxlen=maxlen, value=pad_value)

In [11]:
# Character-level binary classifier: embedding -> one LSTM layer -> sigmoid unit.
model = Sequential([
    # 256 possible input ids, each mapped to a 128-dim vector;
    # every input sequence is `maxlen` ids long.
    Embedding(256, 128, input_length=maxlen),
    # 128 LSTM units with 20% dropout on the inputs and on the recurrent state.
    LSTM(128, dropout=.2, recurrent_dropout=.2),
    # Single sigmoid output, thresholded later to get the predicted class.
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 5 epochs in mini-batches of 256 samples, holding out 10% of the
# training rows for per-epoch validation.
model.fit(
    x=x_train,
    y=X_train.label,
    validation_split=.1,
    epochs=5,
    batch_size=256,
)

Train on 1297199 samples, validate on 144134 samples
Epoch 1/5

In [None]:
# Score the trained model on the held-out test rows; the cell output is the
# loss followed by the metrics passed to compile() (accuracy).
model.evaluate(x=x_test, y=X_test.label)

In [23]:
# Sigmoid outputs for the test rows, computed in batches of 256 with a
# progress bar (verbose=1).
y_pred = model.predict(x=x_test, verbose=1, batch_size=256)



array([[197905,   2058],
       [  2321, 158050]])

In [30]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# Turn the sigmoid outputs into class decisions once and reuse them for both
# reports: anything above 0.5 counts as the positive (True) class.
y_pred_label = y_pred > .5

# Per-class precision, recall, F-score and support, printed as one tuple.
# print(...) with a single argument gives the same text on Python 2 and 3.
print(precision_recall_fscore_support(X_test.label, y_pred_label))

# Last expression of the cell, so the matrix is shown as the cell output.
confusion_matrix(X_test.label, y_pred_label)

(array([ 0.9884081 ,  0.98714618]), array([ 0.9897081 ,  0.98552731]), array([ 0.98905767,  0.98633608]), array([199963, 160371]))


array([[197905,   2058],
       [  2321, 158050]])

In [31]:
# Write the trained model to an HDF5 file next to the notebook.
model.save(filepath='./dga-bot.h5')