In [1]:
from sklearn import datasets
import numpy as np
import pandas as pd
import string
import math
import pickle
import re
import os

# 数据处理

### 马尔科夫链

In [2]:
accepted_chars = 'abcdefghijklmnopqrstuvwxyz '

pos = dict([(char, idx) for idx, char in enumerate(accepted_chars)])

def normalize(line):
    return [c.lower() for c in line if c.lower() in accepted_chars]

def ngram(n, l):
    filtered = normalize(l)
    for start in range(0, len(filtered) - n + 1):
        yield ''.join(filtered[start:start + n])

def train():
    k = len(accepted_chars)
    counts = [[10 for i in range(k)] for i in range(k)]
    for line in open('big.txt'):
        for a, b in ngram(2, line):
            counts[pos[a]][pos[b]] += 1
    for i, row in enumerate(counts):
        s = float(sum(row))
        for j in range(len(row)):
            row[j] = math.log(row[j] / s)
    good_probs = [avg_transition_prob(l, counts) for l in open('good.txt')]
    bad_probs = [avg_transition_prob(l, counts) for l in open('bad.txt')]
    assert min(good_probs) > max(bad_probs)
    thresh = (min(good_probs) + max(bad_probs)) / 2
    pickle.dump({'mat': counts, 'thresh': thresh}, open('gib_model.pki', 'wb'))

def avg_transition_prob(l, log_prob_mat):
    log_prob = 0.0
    transition_ct = 0
    for a, b in ngram(2, l):
        log_prob += log_prob_mat[pos[a]][pos[b]]
        transition_ct += 1
    return math.exp(log_prob / (transition_ct or 1))

if __name__ == '__main__':
    if not os.path.exists("gib_model.pki"):
        train()

### 特征提取函数

In [3]:
# 计算信息熵
def calc_ent(domain):
    dataset = []
    for each in domain:
        dataset.append(each)
    data1 = np.array(dataset)
    x_value_list = set([data1[i] for i in range(data1.shape[0])])
    ent = 0.0
    for x_value in x_value_list:
        p = float(data1[data1 == x_value].shape[0]) / data1.shape[0]
        logp = np.log2(p)
        ent -= p * logp
    return ent

#计算域名长度
def domain_len(domain):
    return len(domain)

#数字出现概率
def digit_probability(domain):
    digit_num = 0
    for each_char in domain:
        if each_char.isdigit():
            digit_num = digit_num + 1
    return digit_num / len(domain)

# 元音出现概率
def vowel_probability(domain):
    vowel = ['a', 'e', 'i', 'o', 'u']
    vowel_num = 0
    for each in domain:
        if each in vowel:
            vowel_num = vowel_num + 1
    return vowel_num / len(domain)

# 辅音组合与长度比值
def consonant_rate(domain):
    perg = re.compile("[bcdfghjklmnpqrstvwxyz]{2,%d}" % len(domain))
    result = perg.findall(domain)
    return float(len(result) / len(domain))

def avg_calc_ent(arr):
    sum = 0
    for each in arr:
        sum = sum + calc_ent(each)
    return sum / len(arr)

def avg_domain_len(arr):
    sum = 0
    for each in arr:
        sum = sum + domain_len(each)
    return sum / len(arr)

def avg_digit_probability(arr):
    sum = 0 
    for each in arr:
        sum = sum + digit_probability(each)
    return sum / len(arr)

def avg_vowel_probability(arr):
    sum = 0 
    for each in arr:
        sum = sum + vowel_probability(each)
    return sum / len(arr)

def avg_consonant_rate(arr):
    sum = 0 
    for each in arr:
        sum = sum + consonant_rate(each)
    return sum / len(arr)

def clean_data(arr):
    for each in arr:
        if "." not in each:
            arr.remove(each)
    for each in arr:
        if len(each) < 4:
            arr.remove(each)
    return arr

### 特征数值计算

In [4]:
DGA_data = pd.read_table("./DGA.txt")
normal_data = pd.read_csv("./normal.csv")

raw_data_DGA = clean_data(DGA_data.iloc[:,1].values.tolist())
raw_data_normal = clean_data(normal_data.iloc[:,1].values.tolist())

DGA_calc_ent = []
normal_calc_ent = []
DGA_domain_len = []
normal_domain_len = []
DGA_digit_probability = []
normal_digit_probability = []
DGA_vowel_probability = []
normal_vowel_probability = []
DGA_consonant_rate = []
normal_consonant_rate = []
print("进入第一个循环")
for each in raw_data_DGA:
    DGA_calc_ent.append(calc_ent(each))
    DGA_domain_len.append(domain_len(each))
    DGA_digit_probability.append(digit_probability(each))
    DGA_vowel_probability.append(vowel_probability(each))
    DGA_consonant_rate.append(consonant_rate(each))
print("进入第二个循环")
for each in raw_data_normal:
    normal_calc_ent.append(calc_ent(each))
    normal_domain_len.append(domain_len(each))
    normal_digit_probability.append(digit_probability(each))
    normal_vowel_probability.append(vowel_probability(each))
    normal_consonant_rate.append(consonant_rate(each))

进入第一个循环
进入第二个循环


### 数据集平均值

In [5]:
DGA_ent_avg = avg_calc_ent(raw_data_DGA)
normal_ent_avg = avg_calc_ent(raw_data_normal)
DGA_len = avg_domain_len(raw_data_DGA)
normal_len = avg_domain_len(raw_data_normal)
DGA_digit = avg_digit_probability(raw_data_DGA)
normal_digit = avg_digit_probability(raw_data_normal)
DGA_vowel = avg_vowel_probability(raw_data_DGA)
normal_vowel = avg_vowel_probability(raw_data_normal)
DGA_consonant = avg_consonant_rate(raw_data_DGA)
normal_consonant = avg_consonant_rate(raw_data_normal)

### 计算结果

In [6]:
print("信息熵最大最小值平均值：")
print("DGA:",max(DGA_calc_ent),min(DGA_calc_ent),DGA_ent_avg)
print("Normal:",max(normal_calc_ent),min(normal_calc_ent),normal_ent_avg)
print("长度最大最小值平均值：")
print("DGA:",max(DGA_domain_len),min(DGA_domain_len),DGA_len)
print("Normal:",max(normal_domain_len),min(normal_domain_len),normal_len)
print("数字概率最大最小值平均值：")
print("DGA:",max(DGA_digit_probability),min(DGA_digit_probability),DGA_digit)
print("Normal:",max(normal_digit_probability),min(normal_digit_probability),normal_digit)
print("元音概率最大最小值平均值：")
print("DGA:",max(DGA_vowel_probability),min(DGA_vowel_probability),DGA_vowel)
print("Normal:",max(normal_vowel_probability),min(normal_vowel_probability),normal_vowel)
print("辅音比值最大最小值平均值：")
print("DGA:",max(DGA_consonant_rate),min(DGA_consonant_rate),DGA_consonant)
print("Normal:",max(normal_consonant_rate),min(normal_consonant_rate),normal_consonant)

信息熵最大最小值平均值：
DGA: 4.868507898969238 1.9056390622295665 3.7075958136654292
Normal: 4.881142681145251 0.5435644431995964 3.259898914769188
长度最大最小值平均值：
DGA: 50 8 19.982632270262283
Normal: 73 4 14.311734311734313
数字概率最大最小值平均值：
DGA: 0.7692307692307693 0.0 0.033869343374289394
Normal: 0.8709677419354839 0.0 0.01285617549978626
元音概率最大最小值平均值：
DGA: 0.7368421052631579 0.0 0.2646734219039188
Normal: 0.875 0.0 0.32633724598410374
辅音比值最大最小值平均值：
DGA: 0.375 0.0 0.1321342106694171
Normal: 0.4 0.0 0.12928901089169562


### 标签转换

In [7]:
def information(domain):
    if abs(calc_ent(domain)- DGA_ent_avg) < abs(calc_ent(domain)- normal_ent_avg):
        return 1
    else:
        return 0
def lens(domain):
    if abs(domain_len(domain)- DGA_len) < abs(domain_len(domain)- normal_len):
        return 1
    else:
        return 0
def dight(domain):
    if abs(digit_probability(domain)- DGA_digit) < abs(digit_probability(domain)- normal_digit):
        return 1
    else:
        return 0 
def vowel(domain):
    if abs(vowel_probability(domain)- DGA_vowel) < abs(vowel_probability(domain)- normal_vowel):
        return 1
    else:
        return 0 
def consonant(domain):
    if abs(consonant_rate(domain)- DGA_consonant) < abs(consonant_rate(domain)- normal_consonant):
        return 1
    else:
        return 0 
import pickle
model_data = pickle.load(open('gib_model.pki', 'rb'))
model_mat = model_data['mat']
threshold = model_data['thresh']
def gid(domain):
    if((avg_transition_prob(domain, model_mat) > threshold) == False):
        return 1
    else:
        return 0

### 标签提取

In [8]:
def characterize(domain):
    features = [information(domain),lens(domain),dight(domain),vowel(domain),consonant(domain),gid(domain)]
    return features
features = []
labels = []

for each in raw_data_DGA:
    features.append(characterize(each))
    labels.append(1)
for each in raw_data_normal:
    features.append(characterize(each))
    labels.append(0)

# 训练

### 随机森林训练

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

train_X, test_X, train_y, test_y = train_test_split(features, labels, test_size=0.3, random_state=0)
clf = RandomForestClassifier()
clf.fit(train_X, train_y)
pred_y = clf.predict(test_X)
print("precision_score:", precision_score(y_true=test_y, y_pred=pred_y))
print("recall_score:", recall_score(y_true=test_y, y_pred=pred_y))



precision_score: 0.8595657057281917
recall_score: 0.8753574404810536


### 随机森林测验函数

In [10]:
def test_domain_RandomForest(domain):
    test_features = []
    test_features.append(characterize(domain))
    pre = clf.predict(test_features)
    print("DGA") if pre == 1 else print("Normal")

### 决策树训练

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score
decision_tree = DecisionTreeClassifier(criterion='entropy')
decision_tree.fit(train_X, train_y)
Y_pred = decision_tree.predict(test_X)
print("precision_score:", precision_score(y_true=test_y, y_pred=pred_y))
print("recall_score:", recall_score(y_true=test_y, y_pred=pred_y))

precision_score: 0.8595657057281917
recall_score: 0.8753574404810536


### 决策树测验函数

In [12]:
def test_domain_DecisionTree(domain):
    test_features = []
    test_features.append(characterize(domain))
    pre = decision_tree.predict(test_features)
    print("DGA") if pre == 1 else print("Normal")

### 逻辑斯蒂回归训练

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score
log_reg = LogisticRegression(solver='liblinear', max_iter = 10000)
log_reg.fit(train_X, train_y)
Y_pred = log_reg.predict(test_X)
print("precision_score:", precision_score(y_true=test_y, y_pred=pred_y))
print("recall_score:", recall_score(y_true=test_y, y_pred=pred_y))

precision_score: 0.8595657057281917
recall_score: 0.8753574404810536


### 逻辑斯蒂回归测验函数

In [14]:
def test_domain_Logistic(domain):
    test_features = []
    test_features.append(characterize(domain))
    pre = log_reg.predict(test_features)
    print("DGA") if pre == 1 else print("Normal")

# 验证

In [None]:
while True:
    test_domain_RandomForest(input("随机森林测试："))
    test_domain_DecisionTree(input("决策树测试："))
    test_domain_Logistic(input("逻辑斯蒂回归测试："))

随机森林测试：youtube.com
Normal
决策树测试：youtube.com
Normal
逻辑斯蒂回归测试：youtube.com
Normal
随机森林测试：fdhsauihedruoisag.cc
DGA
决策树测试：fdhsauihedruoisag.cc
DGA
逻辑斯蒂回归测试：fdhsauihedruoisag.cc
DGA
随机森林测试：1vynq17gk6yo14ndoft2xc.com
DGA
决策树测试：1vynq17gk6yo14ndoft2xc.com
DGA
逻辑斯蒂回归测试：1vynq17gk6yo14ndoft2xc.com
DGA
随机森林测试：pppppp.com.cn.net
Normal
决策树测试：pppppp.com.cn.net
Normal
逻辑斯蒂回归测试：pppppp.com.cn.net
Normal
随机森林测试：ncixjaieuwveskmnb.com
DGA
决策树测试：ncixjaieuwveskmnb.com
DGA
逻辑斯蒂回归测试：ncixjaieuwveskmnb.com
DGA
