In [2]:
import pandas as pd
import numpy as np
from math import log
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
dga_names = ['dga_family', 'domain', 'start_date', 'start_time', 'end_date', 'end_time']
dga = pd.read_csv('dga.txt',header=None, sep='\s+')
dga.columns = dga_names
dga.head()

Unnamed: 0,dga_family,domain,start_date,start_time,end_date,end_time
0,nymaim,ooloda.com,2019-10-23,00:00:00,2019-10-23,23:59:59
1,nymaim,ovpxymiqg.biz,2019-10-23,00:00:00,2019-10-23,23:59:59
2,nymaim,klaidigdl.com,2019-10-23,00:00:00,2019-10-23,23:59:59
3,nymaim,moftmgt.com,2019-10-23,00:00:00,2019-10-23,23:59:59
4,nymaim,inxpnnpx.biz,2019-10-23,00:00:00,2019-10-23,23:59:59


In [4]:
normal = pd.read_csv('all_legit.txt', header=None, sep='\s+')
legit_names = ['domain', 'label']
normal.columns = legit_names 

In [5]:
dga = dga.drop('dga_family', axis=1)
dga = dga.drop('start_date', axis=1)
dga = dga.drop('start_time', axis=1)
dga = dga.drop('end_date', axis=1)
dga = dga.drop('end_time', axis=1)
dga['label'] = 1
dga.head()

Unnamed: 0,domain,label
0,ooloda.com,1
1,ovpxymiqg.biz,1
2,klaidigdl.com,1
3,moftmgt.com,1
4,inxpnnpx.biz,1


In [6]:
data = pd.concat([normal, dga], axis=0, ignore_index=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2224172 entries, 0 to 2224171
Data columns (total 2 columns):
domain    object
label     int64
dtypes: int64(1), object(1)
memory usage: 33.9+ MB


In [7]:
def get_main_domain(domain):
    domain_list = domain.split('.')
    result = ''
    for i in range(len(domain_list) - 1):
        result = result + domain_list[i] + '.'
    result = result.rstrip('.')
    if result == '':
        return domain
    else:
        return result

data['main_domain'] = data['domain'].map(lambda x: get_main_domain(x))

In [8]:
data.drop_duplicates('main_domain', inplace=True)
data = data.dropna()

In [9]:
data.reset_index(drop=True, inplace=True)

In [10]:
data['len'] = data['main_domain'].map(lambda x: len(x))

In [11]:
def get_vowel(domain):
    length = len(domain)
    domain = domain.lower()
    result = domain.count('a') + domain.count('e') + domain.count('i') + domain.count('o') + domain.count('u')
    return result / length

In [12]:
data['vowel'] = data['main_domain'].map(lambda x: get_vowel(x))

In [13]:
import re
def get_consonants(domain):
    it = re.finditer(r'([bcdfghjklmnpqrstvwxyz])*', domain)
    count = 0
    for node in it:
        #print node.group()
        len_node = len(node.group())
        if len_node > 1:
            count += len_node
    return count

In [14]:
data['consonants'] = data['main_domain'].map(lambda x: get_consonants(x))

In [15]:
import math
def get_shannon(x):
    url = x
    tmp_dict = {}
    url_len = len(url)
    for i in range(0,url_len):
        if url[i] in tmp_dict.keys():
            tmp_dict[url[i]] = tmp_dict[url[i]] + 1
        else:
            tmp_dict[url[i]] = 1
    shannon = 0
    for i in tmp_dict.keys():
        p = float(tmp_dict[i]) / url_len
        shannon = shannon - p * math.log(p,2)
    return shannon

data['shannon'] = data['main_domain'].map(lambda x: get_shannon(x))

In [16]:
def get_num(domain):
    length = len(domain)
    domain = domain.lower()
    result = domain.count('0') + domain.count('1') + domain.count('2') + domain.count('3') + domain.count('4') + domain.count('5') + \
            domain.count('6') + domain.count('7') + domain.count('8') + domain.count('9')
    return result / length

In [17]:
data['num'] = data['main_domain'].map(lambda x: get_num(x))

In [18]:
from nltk.collocations import *
import nltk
def get_bigram(domain):
    finder = BigramCollocationFinder.from_words(domain)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    scored = finder.score_ngrams(bigram_measures.pmi)
    result = 0
    for i in scored:
        result = result + i[1]
    return int(result)

In [19]:
data['bigram'] = data['main_domain'].map(lambda x: get_bigram(x))

In [20]:
def get_trigram(domain):
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    trigram_finder = TrigramCollocationFinder.from_words(domain)
    scored = trigram_finder.score_ngrams(trigram_measures.poisson_stirling)
    result = 0
    for i in scored:
        result = result + i[1]
    return int(result)

In [21]:
data['trigram'] = data['main_domain'].map(lambda x: get_trigram(x))

In [22]:
data

Unnamed: 0,domain,label,main_domain,len,vowel,consonants,shannon,num,bigram,trigram
0,google.com,0,google,6,0.500000,2,1.918296,0.0,5,7
1,facebook.com,0,facebook,8,0.500000,0,2.750000,0.0,17,25
2,youtube.com,0,youtube,7,0.571429,0,2.521641,0.0,12,17
3,baidu.com,0,baidu,5,0.600000,0,2.321928,0.0,9,10
4,yahoo.com,0,yahoo,5,0.600000,0,1.921928,0.0,6,7
5,amazon.com,0,amazon,6,0.500000,0,2.251629,0.0,9,12
6,wikipedia.org,0,wikipedia,9,0.555556,0,2.641604,0.0,15,26
7,qq.com,0,qq,2,0.000000,2,0.000000,0.0,-1,0
8,twitter.com,0,twitter,7,0.285714,4,2.128085,0.0,8,11
9,taobao.com,0,taobao,6,0.666667,0,1.918296,0.0,6,8


In [23]:
data = data.drop('domain', axis = 1)
data = data.drop('main_domain', axis = 1)

In [24]:
X = data.drop('label', axis=1)
Y = data['label']

In [25]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
Xmin_max = min_max_scaler.fit_transform(X)

In [26]:
from sklearn.model_selection import train_test_split
x,y = data.ix[:,1:],data.ix[:,0]
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(x, y, train_size = 0.8, test_size=0.2, random_state=0, stratify = y)
X_train, X_test, Y_train, Y_test = train_test_split(Xmin_max, Y, train_size = 0.8, test_size=0.2, random_state=0, stratify = Y)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


## 决策树

In [27]:
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(criterion='entropy')
decision_tree.fit(X_train1, Y_train1)
acc_decision_tree = round(decision_tree.score(X_train1, Y_train1) * 100, 2)
print("Training accuracy：", acc_decision_tree)

Training accuracy： 92.76


In [28]:
from sklearn.metrics import accuracy_score, recall_score
Y_pred = decision_tree.predict(X_test1)
print("Test accuary：", accuracy_score(Y_test1, Y_pred))
print("Recall score:", recall_score(Y_test1, Y_pred))

Test accuary： 0.9148607286779565
Recall score: 0.916929360721307


In [29]:
from sklearn.model_selection import StratifiedKFold,cross_val_score
strKFold = StratifiedKFold(n_splits=10,shuffle=True,random_state=0)
scores = cross_val_score(decision_tree,X,Y,cv=strKFold)
print("Mean score of straitified cross validation:{:.2f}".format(scores.mean()))

Mean score of straitified cross validation:0.91


## Logistics Regression

In [30]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(solver='liblinear', max_iter = 10000)
log_reg.fit(X_train, Y_train)
acc_log = round(log_reg.score(X_train, Y_train) * 100, 2)
print("Training accuracy：", acc_log)

Training accuracy： 84.06


In [31]:
from sklearn.metrics import accuracy_score, recall_score
Y_pred = log_reg.predict(X_test)
print("Test accuracy：", accuracy_score(Y_test, Y_pred))
print("Recall score:", recall_score(Y_test, Y_pred))

Test accuracy： 0.8419669341400867
Recall score: 0.8556411429733577


In [32]:
from sklearn.model_selection import StratifiedKFold,cross_val_score
strKFold = StratifiedKFold(n_splits=10,shuffle=True,random_state=0)
scores = cross_val_score(log_reg,X,Y,cv=strKFold)
print("Mean score of straitified cross validation:{:.2f}".format(scores.mean()))

Mean score of straitified cross validation:0.84


In [38]:
def get_features(domain):
    main_domain = get_main_domain(domain)
    length = len(main_domain)
    vowel = get_vowel(main_domain)
    consonants = get_consonants(main_domain)
    shannon = get_shannon(main_domain)
    num = get_num(main_domain)
    bigram = get_bigram(main_domain)
    trigram = get_trigram(main_domain)
    return [length, vowel, consonants, shannon, num, bigram, trigram]

In [40]:
def test_domain(domain):
    test_features = []
    test_features.append(get_features(domain))
    pre = decision_tree.predict(test_features)
    print("DGA") if pre == 1 else print("Normal")

In [41]:
domain = 'jianshu.com'
test_domain(domain)

Normal


In [42]:
domain = 'promotion.aliyun.com'
test_domain(domain)

Normal


In [43]:
domain = 'kajugcffktsgskchaymsj.infosec'
test_domain(domain)

DGA
