In [1]:
import pandas as  pd
import numpy as np
from hmmlearn import hmm
import os
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import sklearn.ensemble as ek
from sklearn.utils import shuffle
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split,cross_val_score,cross_validate
from sklearn.metrics import accuracy_score, recall_score
import math
import joblib
import nltk
from nltk.collocations import *
from sklearn import preprocessing

  from numpy.core.umath_tests import inner1d


In [2]:
# Shannon entropy of the characters in a domain name
def getEntropy(domain):
    """Return the base-2 Shannon entropy of the characters in ``domain``.

    Every distinct character contributes ``-p * log2(p)``, where ``p`` is the
    character's relative frequency within ``domain``.  An empty ``domain``
    has nothing to sum over and yields ``0``.
    """
    total = len(domain)

    # Tally how often each character occurs (dict order = first appearance).
    occurrences = {}
    for ch in domain:
        occurrences[ch] = occurrences.get(ch, 0) + 1

    # Explicit accumulation keeps the floating-point summation order fixed.
    entropy = 0
    for count in occurrences.values():
        share = float(count) / total
        entropy = entropy - share * math.log(share, 2)
    return entropy

In [3]:
# Share of the domain's characters that are vowels
def getVowel(domain):
    """Return the fraction of characters in ``domain`` that are vowels.

    Matching is case-insensitive (the string is lower-cased first) and the
    denominator is the full length of the lower-cased string, so digits,
    dots and hyphens count towards it.  When no vowel is present, including
    for an empty string, the integer ``0`` is returned.
    """
    vowels = ['a', 'e', 'i', 'o', 'u']
    lowered = domain.lower()

    hits = sum(1 for ch in lowered if ch in vowels)
    if hits == 0:
        return 0
    return float(hits) / len(lowered)

In [4]:
# Flag whether the domain ends in one of the listed common TLDs
def getIsTLD(domain):
    """Return 1 if the last dot-separated label of ``domain`` is a listed TLD, else 0.

    The comparison is case-sensitive and only checks the fixed list below.
    """
    common_tlds = ['cn', 'com', 'cc', 'net', 'org', 'gov', 'info']
    last_label = domain.rsplit('.', 1)[-1]
    return 1 if last_label in common_tlds else 0

In [5]:
# Length of the domain name
def getLength(domain):
    """Return the number of characters in ``domain`` (every character counts)."""
    size = len(domain)
    return size

In [6]:
# Number of digit characters in the domain
def getNum(domain):
    """Return how many characters of ``domain`` are ASCII digits ``0``-``9``."""
    return sum(1 for ch in domain if '0' <= ch <= '9')

In [7]:
# Ratio of consonant clusters to domain length
def getConsonant(domain):
    """Return the number of consonant clusters in ``domain`` divided by its length.

    A cluster is a run of two or more consecutive ASCII consonants (letters
    ``a``-``z`` other than a, e, i, o, u; matching is case-insensitive).  A run
    is counted at the moment it is terminated by a vowel or by a non-letter
    character.

    NOTE(review): a run that reaches the end of the string is never terminated
    and is therefore not counted (e.g. the trailing "rg" of "asmdc.org").  This
    is kept as-is so existing feature values stay comparable; confirm whether
    it is intended.

    Returns:
        float: clusters / len(domain), or ``0.0`` for an empty ``domain``
        (previously this raised ``ZeroDivisionError``).
    """
    length = len(domain)
    if length == 0:
        # Nothing to scan and nothing to divide by.
        return 0.0

    lowered = domain.lower()
    vowels = ('a', 'e', 'i', 'o', 'u')

    clusters = 0
    run = 0  # length of the consonant run currently being scanned
    for ch in lowered[:length]:
        if 'a' <= ch <= 'z' and ch not in vowels:
            run += 1
        else:
            # A vowel or a non-letter closes the current run.
            if run >= 2:
                clusters += 1
            run = 0
    return clusters / length

In [8]:
# Aggregate bigram score of the domain
def getBigram(domain):
    """Return the sum of PMI scores over all bigrams of ``domain``, truncated to int.

    ``domain`` is handed directly to ``BigramCollocationFinder.from_words``;
    when it is a string, its individual characters act as the tokens.
    """
    measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(domain)

    # Explicit loop (rather than sum()) keeps the float accumulation order fixed.
    total = 0
    for _ngram, score in finder.score_ngrams(measures.pmi):
        total += score
    return int(total)


In [9]:
# Aggregate trigram score of the domain
def getTrigram(domain):
    """Return the sum of Poisson-Stirling scores over all trigrams of ``domain``, truncated to int.

    ``domain`` is handed directly to ``TrigramCollocationFinder.from_words``;
    when it is a string, its individual characters act as the tokens.
    """
    measures = nltk.collocations.TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(domain)

    # Explicit loop (rather than sum()) keeps the float accumulation order fixed.
    total = 0
    for _ngram, score in finder.score_ngrams(measures.poisson_stirling):
        total += score
    return int(total)
    

In [10]:
# Load the DGA domain list (header-less, one domain per row) and label every row 1.
name = ['domain']
dga = pd.read_csv('all_dga.csv', names=name, encoding='utf-8')
print("dga:", dga.shape)  # shape is reported before the label column is added
dga = dga.assign(Label=1)
print(dga.head())

dga: (801667, 1)
                domain  Label
0  ofdhiydrrttpblp.com      1
1  puciftnfkplcbhp.net      1
2  bowjjxxnhkyvygk.biz      1
3   osvwkptpwqyiqen.ru      1
4  cpmjpnwdgbxyyql.org      1


In [11]:
# Load the benign domain list (header-less, one domain per row) and label every row 0.
name = ['domain']
normal = pd.read_csv('all_normal.csv', names=name, encoding='utf-8')
print("normal:", normal.shape)  # shape is reported before the label column is added
normal = normal.assign(Label=0)
print(normal.head())

normal: (1000000, 1)
                   domain  Label
0             netflix.com      0
1  api-global.netflix.com      0
2        prod.netflix.com      0
3   push.prod.netflix.com      0
4              google.com      0


In [12]:
# Stack the DGA rows on top of the benign rows and renumber the index from 0.
data = pd.concat((dga, normal), ignore_index=True)
print(data.shape)

(1801667, 2)


In [13]:
# Derive one feature column per extractor from the raw domain string.
# The extractors are passed to .map directly; wrapping them in a lambda adds
# nothing but an extra call per row.
data['len'] = data['domain'].map(getLength)
data['Vowel'] = data['domain'].map(getVowel)
data['entropy'] = data['domain'].map(getEntropy)
data['Num'] = data['domain'].map(getNum)
data['IsTLD'] = data['domain'].map(getIsTLD)
data['Consonant'] = data['domain'].map(getConsonant)
data['bigram'] = data['domain'].map(getBigram)
data['trigram'] = data['domain'].map(getTrigram)

# Seeded shuffle so that re-running the notebook yields the same row order.
data = shuffle(data, random_state=42)
print(data.head())

                                           domain  Label  len     Vowel  \
1379298                                 asmdc.org      0    9  0.222222   
1679488       edge-036.hkhkg-4.icloud-content.com      0   35  0.228571   
1300065                     metaphor-platform.com      0   21  0.285714   
1785933  hanjwpc308t4jw.int.marrcorp.marriott.com      0   40  0.200000   
1701958                       media.good-loop.com      0   19  0.421053   

          entropy  Num  IsTLD  Consonant  bigram  trigram  
1379298  3.169925    0      1   0.111111      25       37  
1679488  4.107156    4      1   0.142857      99      203  
1300065  3.558519    0      1   0.190476      55      103  
1785933  3.906198    4      1   0.175000      90      208  
1701958  3.321104    0      1   0.000000      44       80  


In [14]:
# Standardise 'len' and 'entropy' (zero mean, unit variance).
# Each column gets its OWN fitted scaler.  Previously a single StandardScaler
# was refit for the second column, so `len_scale_param` and `shan_scale_param`
# were the same object and the length statistics were overwritten; the fitted
# scaler was also passed as the unused `y` argument of fit_transform.
len_values = data['len'].values.reshape(-1, 1)
len_scale_param = preprocessing.StandardScaler().fit(len_values)
data['len'] = len_scale_param.transform(len_values)

shan_values = data['entropy'].values.reshape(-1, 1)
shan_scale_param = preprocessing.StandardScaler().fit(shan_values)
data['entropy'] = shan_scale_param.transform(shan_values)

# Kept for backward compatibility: `scaler` used to end up fitted on 'entropy'.
scaler = shan_scale_param
print(data.head())
print("  ")



                                           domain  Label       len     Vowel  \
1379298                                 asmdc.org      0 -1.313150  0.222222   
1679488       edge-036.hkhkg-4.icloud-content.com      0  1.565741  0.228571   
1300065                     metaphor-platform.com      0  0.015569  0.285714   
1785933  hanjwpc308t4jw.int.marrcorp.marriott.com      0  2.119374  0.200000   
1701958                       media.good-loop.com      0 -0.205884  0.421053   

          entropy  Num  IsTLD  Consonant  bigram  trigram  
1379298 -0.919221    0      1   0.111111      25       37  
1679488  1.462641    4      1   0.142857      99      203  
1300065  0.068343    0      1   0.190476      55      103  
1785933  0.951930    4      1   0.175000      90      208  
1701958 -0.535017    0      1   0.000000      44       80  
  


In [15]:
# Split into train / test sets (80% / 20%).
# Feature matrix: every column except the raw domain string and the label.
X = data.drop(['domain', 'Label'], axis=1).values
y = data['Label'].values
# random_state makes the split reproducible across runs; stratify keeps the
# class ratio of `y` the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [16]:
# Decision tree classifier
# random_state fixes the tie-breaking between equally good splits so the
# fitted tree (and the scores below) are reproducible.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)
# Training accuracy is reported as a percentage rounded to 2 decimals.
acc_decision_tree = round(decision_tree.score(X_train, y_train) * 100, 2)
print("决策树:")
print("Training accuracy：", acc_decision_tree)

# Test metrics are reported as raw fractions in [0, 1].
y_pred = decision_tree.predict(X_test)
print("Test accuracy：", accuracy_score(y_test, y_pred))
print("Test recall score:", recall_score(y_test, y_pred))

决策树:
Training accuracy： 92.6
Test accuracy： 0.8761676666648165
Test recall score: 0.8479784668872316


In [17]:
# Random forest classifier (50 trees)
# random_state seeds the bootstrap sampling and per-split feature selection so
# the fitted forest (and the scores below) are reproducible.
random_forest = ek.RandomForestClassifier(n_estimators=50, random_state=42)
random_forest.fit(X_train, y_train)
# Training accuracy is reported as a percentage rounded to 2 decimals.
acc_random_forest = round(random_forest.score(X_train, y_train) * 100, 2)
print("随机森林:")
print("Training accuracy：", acc_random_forest)

# Test metrics are reported as raw fractions in [0, 1].
y_pred = random_forest.predict(X_test)
print("Test accuracy：", accuracy_score(y_test, y_pred))
print("Test recall score:", recall_score(y_test, y_pred))

随机森林:
Training accuracy： 92.59
Test accuracy： 0.8851038203444582
Test recall score: 0.8708013233892222


In [18]:
# Feature extraction for a single (DGA) domain
def get_feature(DGA, label):
    """Return the feature row for one domain as a flat list.

    The list holds, in order: the domain string, ``label``, then the values
    of getLength, getVowel, getEntropy, getNum, getIsTLD, getConsonant,
    getBigram and getTrigram applied to ``DGA``.
    """
    return [
        DGA,
        label,
        getLength(DGA),
        getVowel(DGA),
        getEntropy(DGA),
        getNum(DGA),
        getIsTLD(DGA),
        getConsonant(DGA),
        getBigram(DGA),
        getTrigram(DGA),
    ]

In [19]:
# Classify one hand-entered domain with the fitted decision tree.
test = 'github.com'
# NOTE(review): 'len' and 'entropy' are fed to the model raw here, whereas the
# training frame had both columns passed through StandardScaler — confirm
# whether the same scaling must be applied before predicting.
result = pd.DataFrame(
    columns=('domain', 'Label', 'len', 'Vowel', 'entropy', 'Num', 'IsTLD', 'Consonant', 'bigram', 'trigram')
)
results = get_feature(test, 1)  # the label is only a placeholder; it is dropped below
result.loc[0] = results
# Keep only the feature columns, in the order built above.
result = result.drop(columns=['domain', 'Label']).values
print(decision_tree.predict(result))

[0]
