In [2]:
import tldextract
import pandas as pd
import re
from collections import Counter
import requests

def get_alexa():
    """
    提取alexa 域名 全部为正面域名 结构为 [[域名1, 目标值], [域名2, 目标值]..........]
    :return:
    """
    alexa_date = pd.read_csv("./top-1m.csv").iloc[0:200000, :]
    return [("benign", tldextract.extract(row["num1"]).domain) for index, row in alexa_date.iterrows()]

def get_360_DGA():
    """
    提取360dga域名 结构同上
    :return:
    """
    f = open("./360_dga.txt", "r", encoding="utf-8").readlines()[0: 200000]
    # ls = [re.sub('\t+', ' ', i).split(' ')[0] for i in f]
    # d = sorted(Counter(ls).items(), key=lambda x: x[1], reverse=True)[0: 100]
    # print(d)
    return [(re.sub('\t+', ' ', i).split(' ')[0], tldextract.extract(re.sub('\t+', ' ', i).split(' ')[1]).domain) for i in f]



# print(len(get_360_DGA()))
def get_zeus_dga():
    """
    提取zeus域名
    :return:
    """
    f = open("zeus_dga_domains.txt", "r", encoding="utf-8").read()
    return f.split(".")


def get_data():
    """
    拼接返回全部数据 作为数据集
    :return:
    """
    return get_alexa() + get_360_DGA()
date_set = get_data()
print(len(date_set))

400000


In [3]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
import sklearn
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
# date_set[0: 10]
features = [i[1] for i in date_set]   #提取域名
label = [i[0] for i in date_set]      #提取标签
valid_chars = {x:idx+1 for idx, x in enumerate(set(''.join(features)))} #构造检索字典
y = [0 if x == 'benign' else 1 for x in label]   #目标值修改为0或1
max_features = len(valid_chars) + 1  #？？？
maxlen = np.max([len(x) for x in features])   #获得输入特征的最大长度


In [15]:
import numpy as np
def process_features(valid_chars, features, maxlen):
    """将特征替换为 检索字典的值 并根据最长特征长度 构造每个特征 无值填充为0"""
    X = [[valid_chars[y] for y in x] for x in features]
    X = sequence.pad_sequences(X, maxlen=maxlen)
    X = np.array(X)
    return X
X = process_features(valid_chars, features, maxlen)
y = np.array(y)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)   #训练集测试集切割

# print(len(X[0]), X[0])
# print(len(X[1]), X[1])

# print(label[0: 10])

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 24 20 20 24  6
  17]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 19 20 26 25 26 37
  17]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 18 30 16 17 37 20 20
  28]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 37 30  2 21
  26]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 15  2 28  2 22 17 21  2
  30]] <class 'numpy.ndarray'>
[0 0 0 0 0] <class 'numpy.ndarray'>


In [16]:
import matplotlib.pyplot as plt
def show_train_history(train_history,train, velidation):
    """
    可视化训练过程 对比
    :param train_history:
    :param train:
    :param velidation:
    :return:
    """
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[velidation])
    plt.title("Train History")   #标题
    plt.xlabel('Epoch')    #x轴标题
    plt.ylabel(train)  #y轴标题
    plt.legend(['train', 'test'], loc='upper left')  #图例 左上角
    plt.show()

def build_model(max_features, maxlen):
    """Build LSTM model"""
    model = Sequential()
    model.add(Embedding(input_dim=max_features, output_dim=128, input_length=maxlen))
    model.add(LSTM(128))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.summary()
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop')

    return model
def run():
    model = build_model(max_features, maxlen) 
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)   #训练集测试集切割

    history = model.fit(x_train, y_train, epochs=2, batch_size=128, validation_data=(x_test, y_test))   #训练
    
    score, acc = model.evaluate(x_test, y_test, batch_size=128)
    print('Test score:', score)
    print('Test accuracy:', acc)
    
    """可视化训练过程"""
    show_train_history(history, 'acc', 'val_acc')  # 训练集准确率与验证集准确率 折线图
    show_train_history(history, 'loss', 'val_loss')  # 训练集误差率与验证集误差率 折线图

run()

Train on 320000 samples, validate on 80000 samples
Epoch 1/2
Epoch 2/2


TypeError: cannot unpack non-iterable numpy.float64 object