In [3]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
import sklearn
from sklearn.model_selection import train_test_split#分割数据集
import pandas as pd
from keras.models import load_model
from sklearn import feature_extraction

In [4]:
from sklearn.metrics import accuracy_score   #准确率accuracy
from sklearn.metrics import precision_score  #精确率precision
from sklearn.metrics import recall_score     #召回率recall
from collections import Counter              #统计list中各个元素出现的次数
import pickle

In [5]:
# Pickle file written by run() (character vocabulary, maxlen and per-fold metrics)
# and read back by dga_predict().
RESULT_FILE = 'results.pkl'

In [6]:
def build_model(max_features, maxlen):
    """Assemble and compile the character-level LSTM binary classifier.

    Args:
        max_features: size of the embedding vocabulary (number of distinct
            integer codes the input sequences may contain).
        maxlen: length of every (padded) input sequence.

    Returns:
        A compiled Sequential model ending in a single sigmoid unit, trained
        with binary cross-entropy and the rmsprop optimizer.
    """
    # Layers are listed in the order they are stacked.
    layer_stack = [
        Embedding(max_features, 128, input_length=maxlen),
        LSTM(128),
        Dropout(0.5),
        Dense(1),
        Activation('sigmoid'),
    ]

    net = Sequential()
    for layer in layer_stack:
        net.add(layer)

    net.compile(optimizer='rmsprop', loss='binary_crossentropy')
    return net

In [7]:
def run(max_epoch=5, nfolds=10, batch_size=128, data_path="data.csv"):
    """Train and evaluate the character-level LSTM model over repeated random splits.

    The CSV at ``data_path`` is read without a header row; column 0 holds the
    domain strings and column 1 the labels (1 = DGA domain, 0 = normal domain).
    Each domain is encoded as a sequence of integer character codes and padded
    to the length of the longest domain.

    For every fold a fresh model is built, 30% of the data is set aside as the
    test set and 5% of the remaining training data as a hold-out set. The model
    is trained one epoch at a time; whenever the hold-out AUC improves, the test
    metrics are recomputed and remembered. Training of a fold stops once the
    hold-out AUC has failed to improve for two consecutive epochs.

    Side effects:
        * writes ``RESULT_FILE`` (pickle with 'valid_chars', 'maxlen', 'final_data')
        * writes ``model.h5`` holding the last fold's model, restored to the
          weights of its best hold-out epoch so that it matches the reported
          metrics (nothing is saved when ``nfolds`` is 0).

    Args:
        max_epoch: maximum number of training epochs per fold.
        nfolds: number of independent random train/test splits to run.
        batch_size: mini-batch size passed to ``model.fit``.
        data_path: path of the CSV file to load (defaults to "data.csv").

    Returns:
        A list with one dict per fold containing 'train_number', 'test_number',
        'accuracy', 'precision' and 'recall' (an empty dict for a fold in which
        the hold-out AUC never exceeded 0).
    """
    dataSet = pd.read_csv(data_path, header=None)

    # Extract the samples and their labels.
    X = dataSet[0].values
    y = dataSet[1].values

    # Build the dictionary of valid characters. Every character is mapped to
    # an int code; the codes start at 1 (0 is reserved for padding) and are
    # contiguous. Sorting makes the otherwise arbitrary mapping reproducible.
    valid_chars = {ch: idx + 1 for idx, ch in enumerate(sorted(set(''.join(X))))}

    max_features = len(valid_chars) + 1
    maxlen = np.max([len(x) for x in X])

    # Convert characters to int and pad. The inner loop variable is named
    # `ch` so it cannot be confused with the label array `y`.
    X = [[valid_chars[ch] for ch in domain] for domain in X]
    X = sequence.pad_sequences(X, maxlen=maxlen)

    lstm_data = []
    model = None  # stays None when nfolds == 0, in which case nothing is saved

    # A brand-new model is created for every fold.
    for fold in range(nfolds):
        print("fold %u/%u" % (fold+1, nfolds))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
        # Class counts are taken before the hold-out split below, so
        # train_number also includes the hold-out samples.
        train_number = Counter(y_train)
        test_number = Counter(y_test)

        print('Build model...')
        model = build_model(max_features, maxlen)

        print("Train...")
        X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.05)

        best_iter = -1
        best_auc = 0.0
        best_weights = None
        out_data = {}

        # Train the same model one epoch at a time so the hold-out AUC can be
        # checked after every epoch.
        for ep in range(max_epoch):
            model.fit(X_train, y_train, batch_size=batch_size, epochs=1)

            # The sigmoid output has shape (n, 1); flatten it for sklearn.
            t_probs = model.predict(X_holdout).ravel()
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep
                # Remember these weights so the saved model matches the metrics.
                best_weights = model.get_weights()

                # Threshold the probabilities at 0.5 to obtain hard 0/1 labels.
                preds = (model.predict(X_test).ravel() > 0.5).astype(int)
                accuracy = accuracy_score(y_test, preds)
                precision = precision_score(y_test, preds)
                recall = recall_score(y_test, preds)
                out_data = {'train_number':train_number, 'test_number':test_number, 'accuracy': accuracy,
                            'precision': precision,'recall':recall}
                print(out_data)
            else:
                # No longer improving...break and calc statistics
                if (ep-best_iter) > 1:
                    break

        # Roll the model back to its best hold-out epoch.
        if best_weights is not None:
            model.set_weights(best_weights)

        lstm_data.append(out_data)

    results = {'valid_chars':valid_chars,'maxlen':maxlen,'final_data':lstm_data}
    with open(RESULT_FILE, 'wb') as fh:
        pickle.dump(results, fh)
    if model is not None:
        model.save("model.h5")
    return lstm_data

In [8]:
# Quick run: a single fold capped at 3 epochs (run() defaults to 5 epochs and 10 folds).
lstm_data = run(max_epoch=3,nfolds=1)

fold 1/1
Build model...
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train...

Epoch 1/1
Epoch 0: auc = 0.990704 (best=0.000000)
{'precision': 0.9724381625441696, 'test_number': Counter({1: 3001, 0: 2999}), 'accuracy': 0.9455, 'train_number': Counter({0: 7001, 1: 6999}), 'recall': 0.9170276574475175}
Epoch 1/1
Epoch 1: auc = 0.995050 (best=0.990704)
{'precision': 0.9514950166112957, 'test_number': Counter({1: 3001, 0: 2999}), 'accuracy': 0.9528333333333333, 'train_number': Counter({0: 7001, 1: 6999}), 'recall': 0.9543485504831722}
Epoch 1/1
Epoch 2: auc = 0.994576 (best=0.995050)


In [9]:
# Show the metrics dict that run() recorded for each fold.
print(lstm_data)

[{'precision': 0.9514950166112957, 'test_number': Counter({1: 3001, 0: 2999}), 'accuracy': 0.9528333333333333, 'train_number': Counter({0: 7001, 1: 6999}), 'recall': 0.9543485504831722}]


In [10]:
def build_model2(max_features):
    """Assemble and compile a logistic-regression model as a one-layer network.

    Args:
        max_features: dimensionality of each input feature vector.

    Returns:
        A compiled Sequential model made of a single sigmoid unit, trained
        with binary cross-entropy and the adam optimizer.
    """
    net = Sequential()

    # One dense sigmoid unit over the raw features is logistic regression.
    sigmoid_unit = Dense(1, input_dim=max_features, activation='sigmoid')
    net.add(sigmoid_unit)

    net.compile(optimizer='adam', loss='binary_crossentropy')
    return net

In [11]:
def run2(max_epoch=50, nfolds=10, batch_size=128, data_path="data.csv"):
    """Train and evaluate the bigram logistic-regression model over repeated random splits.

    The CSV at ``data_path`` is read without a header row; column 0 holds the
    domain strings and column 1 the labels. Each domain is turned into a
    vector of character-bigram counts.

    For every fold a fresh model is built, 20% of the data is set aside as the
    test set and 5% of the remaining training data as a hold-out set. The model
    is trained one epoch at a time; whenever the hold-out AUC improves, the test
    metrics are recomputed and remembered. Training of a fold stops once the
    hold-out AUC has not improved for more than five epochs.

    Args:
        max_epoch: maximum number of training epochs per fold.
        nfolds: number of independent random train/test splits to run.
        batch_size: mini-batch size passed to ``model.fit``.
        data_path: path of the CSV file to load (defaults to "data.csv").

    Returns:
        A list with one dict per fold containing 'accuracy', 'precision' and
        'recall' (an empty dict for a fold in which the hold-out AUC never
        exceeded 0).
    """
    dataSet = pd.read_csv(data_path, header=None)

    # Extract the samples and their labels.
    X = dataSet[0].values
    y = dataSet[1].values

    # Create feature vectors
    print ("vectorizing data")
    ngram_vectorizer = feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(2, 2))
    count_vec = ngram_vectorizer.fit_transform(X)

    max_features = count_vec.shape[1]

    bigram_data = []

    for fold in range(nfolds):
        print ("fold %u/%u" % (fold+1, nfolds))
        X_train, X_test, y_train, y_test = train_test_split(count_vec,y, test_size=0.2)

        print ('Build model...')
        model = build_model2(max_features)

        print ("Train...")
        X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.05)

        # The splits are sparse matrices; convert each to a dense ndarray once
        # per fold instead of once per epoch. This keeps all three dense
        # arrays in memory for the duration of the fold.
        X_train = X_train.toarray()
        X_holdout = X_holdout.toarray()
        X_test = X_test.toarray()

        best_iter = -1
        best_auc = 0.0
        out_data = {}

        for ep in range(max_epoch):
            model.fit(X_train, y_train, batch_size=batch_size, epochs=1)

            # The sigmoid output has shape (n, 1); flatten it for sklearn.
            t_probs = model.predict(X_holdout).ravel()
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print ('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                # Threshold the probabilities at 0.5 to obtain hard 0/1 labels.
                preds = (model.predict(X_test).ravel() > 0.5).astype(int)
                accuracy = accuracy_score(y_test, preds)
                precision = precision_score(y_test, preds)
                recall = recall_score(y_test, preds)
                out_data = { 'accuracy': accuracy,'precision': precision,'recall':recall}
                print(out_data)
            else:
                # No longer improving...break and calc statistics
                if (ep-best_iter) > 5:
                    break

        bigram_data.append(out_data)

    return bigram_data

In [12]:
# Quick run of the bigram model: a single fold capped at 3 epochs
# (run2() defaults to 50 epochs and 10 folds).
bigram_data = run2(max_epoch=3,nfolds=1)
print(bigram_data)

vectorizing data
fold 1/1
Build model...
Train...




Epoch 1/1
Epoch 0: auc = 0.986617 (best=0.000000)
{'precision': 0.9082217973231358, 'accuracy': 0.92375, 'recall': 0.9438648782911078}
Epoch 1/1



Epoch 1: auc = 0.991563 (best=0.986617)
{'precision': 0.9411182582879762, 'accuracy': 0.9425, 'recall': 0.9448584202682563}
Epoch 1/1



Epoch 2: auc = 0.993071 (best=0.991563)
{'precision': 0.9518314099347717, 'accuracy': 0.947, 'recall': 0.942374565325385}
[{'precision': 0.9518314099347717, 'accuracy': 0.947, 'recall': 0.942374565325385}]


In [13]:
def dga_predict(domain):
    """Print the saved LSTM model's malicious-domain probability for one domain.

    Loads the model from 'model.h5' and the character vocabulary and sequence
    length from ``RESULT_FILE``, encodes ``domain`` with that vocabulary, pads
    it to the stored length and prints the model's prediction.

    Both files are re-read on every call.

    Args:
        domain: the domain name to score, as a string.

    Raises:
        KeyError: if ``domain`` contains a character that is missing from the
            stored vocabulary.
    """
    model = load_model('model.h5')

    # NOTE: unpickling can execute arbitrary code; RESULT_FILE must be a file
    # produced by run(), never one obtained from an untrusted source.
    with open(RESULT_FILE, 'rb') as fh:
        results = pickle.load(fh)
    valid_chars = results['valid_chars']
    maxlen = results['maxlen']

    # Fail with an explicit message instead of a bare KeyError on one character.
    unknown = sorted(set(domain) - set(valid_chars))
    if unknown:
        raise KeyError("domain contains characters missing from the stored vocabulary: %r" % (unknown,))

    X = [[valid_chars[ch] for ch in domain]]
    X = sequence.pad_sequences(X, maxlen=maxlen)
    result = model.predict(X)
    print("这个域名为恶意域名的概率为:",result)

In [14]:
# Spot check with a well-known real domain; a probability close to 0 is expected.
dga_predict("baidu.com")

这个域名为恶意域名的概率为: [[0.00382655]]


In [16]:
# Spot check with a random-looking string; a probability close to 1 is expected.
dga_predict("edjsdjksmxma")

这个域名为恶意域名的概率为: [[0.99884146]]
