In [1]:
from __future__ import division
import os
import sys
import re
import matplotlib
import pandas as pd
import numpy as np
#from os.path import splitext
#import ipaddress as ip
sys.path.append('/usr/local/lib/python3.7/site-packages')
import tldextract
import datetime
from urllib.parse import urlparse
import matplotlib.pyplot as plt
import pandas as pd
import pickle as pkl
from collections import Counter
import nltk
from itertools import groupby
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder
from sklearn.model_selection import train_test_split

In [2]:
#香农熵
def count_entropy(domain):
    domain_len = len(domain)
    count = Counter(i for i in domain).most_common()
    entropy = -sum(fre / domain_len * (np.log(fre/domain_len)) for ch, fre in count)
    return entropy

In [3]:
#元音字母比重
def count_vowel_ratio(domain):
    vowels=list('aeiou')
    sum_vowel = sum(vowels.count(i) for i in domain.lower())
    return sum_vowel / len(domain)

In [4]:
#数字占比
def count_digits_ratio(word):#how many digits
    digits=list('0123456789')
    sum_digits = sum(digits.count(i) for i in word.lower())
    return sum_digits/len(word)

In [5]:
#重复字母占比
def count_repeat_letter(word):
    count = Counter(i for i in word.lower() if i.isalpha()).most_common()
    cnt = 0
    for letter,ct in count:
        if ct>1:
            cnt+=1
    return cnt / len(word)

In [6]:
#连续数字占比
def consecutive_digits_ratio(word):#how many consecutive digit
    cnt = 0
    digit_map = [int(i.isdigit()) for i in word]
    consecutive=[(k,len(list(g))) for k, g in groupby(digit_map)]
    count_consecutive = sum(j for i,j in consecutive if j>1 and i==1)
    return count_consecutive / len(word)

In [7]:
#2-gram
def count_Bigram(domain):
    bigram_finder = BigramCollocationFinder.from_words(domain)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    scored = bigram_finder.score_ngrams(bigram_measures.pmi)
    number = 0
    for i in scored:
        number = number + i[1]
    return int(number)

In [8]:
#3-gram
def count_Trigram(domain):
    trigram_finder = TrigramCollocationFinder.from_words(domain)
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    scored = trigram_finder.score_ngrams(trigram_measures.poisson_stirling)
    number = 0
    for i in scored:
        number = number + i[1]
    return int(number)

In [9]:
#判断是否是顶级域名
def count_TLD(domain):
    root = domain.split('.')[-1]   
    TLDlist = ['cn','com','cc','net','org','gov','info']
    if root in TLDlist:
        return 1
    else:
        return 0

## 导入数据

In [10]:
df_dga=pd.read_csv("all_dga.txt", sep=' ',names=['domain','label'],header=None)
#df_dga.head()
df_dga.shape

(801667, 2)

In [11]:
df = pd.read_csv("top-1m.csv", names=['domain'],encoding='utf-8')
df['label']=0
df=df.sample(frac=1)
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,domain,label
0,www.jaguarforums.com,0
1,timesofindia-indiatimes-com.cdn.ampproject.org,0
2,placer.ca.gov,0
3,businessoffashion.eu.auth0.com,0
4,inspired-beauty.com,0


## 特征提取

In [12]:
featureSet = pd.DataFrame(columns=('d','entropy','vowel_ratio','digits_ratio',\
'repeat_letter','consecutive_digits_ratio','2-gram','3-gram','tld','len','label'))

In [13]:
from urllib.parse import urlparse
import tldextract
def getFeatures(domain, label): 
    result = []
    url = str(domain)
    result.append(domain)
    result.append(count_entropy(domain))
    result.append(count_vowel_ratio(domain))
    result.append(count_digits_ratio(domain))
    result.append(count_repeat_letter(domain))    
    result.append(consecutive_digits_ratio(domain))
    result.append(count_Bigram(domain))
    result.append(count_Trigram(domain))
    result.append(count_TLD(domain))
    result.append(len(domain))
    result.append(label)
    return result
                  
 

In [14]:
for i in range(5000):
    features = getFeatures(df["domain"].loc[i], df["label"].loc[i])    
    featureSet.loc[i] = features
for i in range(5000):
    features = getFeatures(df_dga["domain"].loc[i], df_dga["label"].loc[i])    
    featureSet.loc[i+5000] = features
featureSet.head(10)

Unnamed: 0,d,entropy,vowel_ratio,digits_ratio,repeat_letter,consecutive_digits_ratio,2-gram,3-gram,tld,len,label
0,www.jaguarforums.com,2.415052,0.3,0.0,0.3,0.0,51,95,1,20,0
1,timesofindia-indiatimes-com.cdn.ampproject.org,2.731751,0.347826,0.0,0.26087,0.0,98,242,1,46,0
2,placer.ca.gov,2.245035,0.307692,0.0,0.153846,0.0,32,52,1,13,0
3,businessoffashion.eu.auth0.com,2.609516,0.4,0.033333,0.3,0.0,73,147,1,30,0
4,inspired-beauty.com,2.798513,0.368421,0.0,0.105263,0.0,69,117,1,19,0
5,guideit.com,2.271869,0.454545,0.0,0.090909,0.0,30,47,1,11,0
6,app-measurement.com,2.451583,0.368421,0.0,0.210526,0.0,51,93,1,19,0
7,d-23.winudf.com,2.523211,0.2,0.133333,0.066667,0.133333,47,78,1,15,0
8,apd-p2plogin.teg.tencent-cloud.net,2.598567,0.264706,0.029412,0.264706,0.0,76,166,1,34,0
9,gamma.getresponse-mail.com,2.578019,0.346154,0.0,0.230769,0.0,69,133,1,26,0


## 训练

In [15]:
X = featureSet.drop(['d','label'],axis=1).values
y = featureSet['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.2)

In [16]:
from sklearn import tree, linear_model
clf = tree.DecisionTreeClassifier(max_depth=10)
clf.fit(X_train,y_train.astype('int'))
score = clf.score(X_test,y_test.astype('int'))
print ("%s"%(score))
    

0.922


In [17]:
from sklearn.metrics import confusion_matrix
res = clf.predict(X)
#recall = confusion_matrix(y, res)
mt=confusion_matrix(y.astype('int'), res)
print("%f,%f" % (mt[0][0],mt[0][1]))
print("%f,%f" % (mt[1][0],mt[1][1]))
print("Accuracy rate : %f %%" % (((mt[0][0]+mt[1][1]) / float(mt.sum()))*100))
print('Recall rate : %f %%' % ( (mt[0][0] / float(mt.sum(axis=0)[0])*100)))

4507.000000,493.000000
183.000000,4817.000000
Accuracy rate : 93.240000 %
Recall rate : 96.098081 %


## test

In [20]:
#输入域名判断是否为DGA域名
test = 'baidu.com'
result = pd.DataFrame(columns=('d','entropy','vowel_ratio','digits_ratio',\
'repeat_letter','consecutive_digits_ratio','2-gram','3-gram','tld','len','label'))
results = getFeatures(test, 0)
result.loc[0] = results
result = result.drop(['d', 'label'], axis = 1).values
if(clf.predict(result)):
    print("this is a DGA url" )
else:
    print("this is a benign url" )

this is a benign url


In [21]:
#输入域名判断是否为DGA域名
test = 'gllcsbqrbfch.com'
result = pd.DataFrame(columns=('d','entropy','vowel_ratio','digits_ratio',\
'repeat_letter','consecutive_digits_ratio','2-gram','3-gram','tld','len','label'))
results = getFeatures(test, 0)
result.loc[0] = results
result = result.drop(['d', 'label'], axis = 1).values
if(clf.predict(result)):
    print("this is a DGA url" )
else:
    print("this is a benign url" )

this is a DGA url
