In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import os
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import TfidfTransformer
import tensorflow as tf
import gensim
import re
from collections import namedtuple
from random import shuffle
import multiprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import scale
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn import preprocessing
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
dga_file="/Users/leeyn/desktop/work/prac/dga.txt"
alexa_file="/Users/leeyn/desktop/work/prac/top-1m.csv"

In [3]:
def load_alexa():
    x=[]
    data = pd.read_csv(alexa_file, sep=",",header=None)
    x=[i[1] for i in data.values]
    return x

def load_dga():
    x=[]
    data = pd.read_csv(dga_file, sep="\t", header=None,
                      skiprows=18)
    x=[i[1] for i in data.values]
    return x

def get_feature_charseq():
    alexa=load_alexa()
    dga=load_dga()
    x=alexa+dga
    max_features=10000
    y=[0]*len(alexa)+[1]*len(dga)

    t=[]
    for i in x:
        v=[]
        for j in range(0,len(i)):
            v.append(ord(i[j]))
        t.append(v)

    x=t
    x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.4)

    return x_train, x_test, y_train, y_test


def get_aeiou(domain):
    count = len(re.findall(r'[aeiou]', domain.lower()))
    count = (0.0 + count) / len(domain)
    return count

def get_uniq_char_num(domain):
    count=len(set(domain))
    count=(0.0+count)/len(domain)
    return count

def get_uniq_num_num(domain):
    count = len(re.findall(r'[1234567890]', domain.lower()))
    count = (0.0 + count) / len(domain)
    return count

#单纯的文本特征
def get_feature():
    from sklearn import preprocessing
    alexa=load_alexa()
    dga=load_dga()
    v=alexa+dga
    y=[0]*len(alexa)+[1]*len(dga)
    x=[]

    for vv in v:
        vvv=[get_aeiou(vv),get_uniq_char_num(vv),get_uniq_num_num(vv),len(vv)]
        x.append(vvv)

    x=preprocessing.scale(x)
    x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.4)
    return x_train, x_test, y_train, y_test

# 2-gram特征提取
def get_feature_2gram():
    alexa=load_alexa()
    dga=load_dga()
    x=alexa+dga
    max_features=10000
    y=[0]*len(alexa)+[1]*len(dga)

    CV = CountVectorizer(
                                    ngram_range=(2, 2),
                                    token_pattern=r'\w',
                                    decode_error='ignore',
                                    strip_accents='ascii',
                                    max_features=max_features,
                                    stop_words='english',
                                    max_df=1.0,
                                    min_df=1)
    x = CV.fit_transform(x)
    x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.4)

    return x_train.toarray(), x_test.toarray(), y_train, y_test

#2，3，4进行分
def get_feature_234gram():
    alexa=load_alexa()
    dga=load_dga()
    x=alexa+dga
    max_features=10000
    y=[0]*len(alexa)+[1]*len(dga)

    CV = CountVectorizer(
                                    ngram_range=(2, 4),
                                    token_pattern=r'\w',
                                    decode_error='ignore',
                                    strip_accents='ascii',
                                    max_features=max_features,
                                    stop_words='english',
                                    max_df=1.0,
                                    min_df=1)
    x = CV.fit_transform(x)
    x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.4)

    return x_train.toarray(), x_test.toarray(), y_train, y_test


In [15]:
from sklearn import metrics
from sklearn.externals import joblib
def do_mlp(x_train, x_test, y_train, y_test):
    global max_features
    clf = MLPClassifier(solver='lbfgs',
                        alpha=1e-5,
                        hidden_layer_sizes = (5, 2),
                        random_state = 1)
    clf.fit(x_train, y_train)
    joblib.dump(clf,"mlp.m")
    y_pred = clf.predict(x_test)
    print(classification_report(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))
def do_xgboost(x_train, x_test, y_train, y_test):
    xgb_model = xgb.XGBClassifier().fit(x_train, y_train)
    y_pred = xgb_model.predict(x_test)
    print(classification_report(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))



In [7]:
if __name__ == "__main__":
    print("text feature & xgboost")
    x_train, x_test, y_train, y_test = get_feature()
    do_xgboost(x_train, x_test, y_train, y_test)
    print("2gram & xgboost")
    x_train, x_test, y_train, y_test = get_feature_2gram()
    do_xgboost(x_train, x_test, y_train, y_test)
    
    #do_nb(x_train, x_test, y_train, y_test)

text feature & xgboost
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      3976
           1       0.90      0.82      0.86      3970

    accuracy                           0.87      7946
   macro avg       0.87      0.87      0.87      7946
weighted avg       0.87      0.87      0.87      7946

[[3628  348]
 [ 713 3257]]


  'stop_words.' % sorted(inconsistent))


              precision    recall  f1-score   support

           0       0.88      0.75      0.81      3952
           1       0.78      0.89      0.84      3994

    accuracy                           0.82      7946
   macro avg       0.83      0.82      0.82      7946
weighted avg       0.83      0.82      0.82      7946

[[2973  979]
 [ 422 3572]]


In [8]:
x_train, x_test, y_train, y_test = get_feature_234gram()
do_xgboost(x_train, x_test, y_train, y_test)

  'stop_words.' % sorted(inconsistent))


              precision    recall  f1-score   support

           0       0.88      0.76      0.82      3852
           1       0.80      0.90      0.85      4094

    accuracy                           0.83      7946
   macro avg       0.84      0.83      0.83      7946
weighted avg       0.84      0.83      0.83      7946

[[2938  914]
 [ 412 3682]]


In [16]:
from sklearn.neural_network import MLPClassifier
print("2gram & MLP")
x_train, x_test, y_train, y_test = get_feature_2gram()
do_mlp(x_train, x_test, y_train, y_test)

2gram & MLP


  'stop_words.' % sorted(inconsistent))


              precision    recall  f1-score   support

           0       0.94      0.96      0.95      3959
           1       0.96      0.94      0.95      3987

    accuracy                           0.95      7946
   macro avg       0.95      0.95      0.95      7946
weighted avg       0.95      0.95      0.95      7946

[[3792  167]
 [ 226 3761]]


In [17]:
print(x_train)

[[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
