In [1]:
from sklearn import datasets
from urllib.parse import unquote,quote
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
import numpy as np
import pandas as pd
import string
import math
import pickle
import re
import os

## 导入数据集 

In [2]:
normal_train = []
sql_train = []
for x in open("./dataset/normal_train.csv",'rb').readlines():
    normal_train.append(unquote(x[:-1].decode()))
for x in open("./dataset/sql_train.csv",'rb').readlines():
    sql_train.append(unquote(x[:-1].decode()))

## 处理特征(9个)

In [3]:
def calc_ent(domain):  # 计算信息熵
    dataset = []
    for each in domain:
        dataset.append(each)
    data1 = np.array(dataset)
    x_value_list = set([data1[i] for i in range(data1.shape[0])])
    ent = 0.0
    for x_value in x_value_list:
        p = float(data1[data1 == x_value].shape[0]) / data1.shape[0]
        logp = np.log2(p)
        ent -= p * logp
    return ent

In [4]:
def make_feature(obj):
    result = []
    for domain in obj:
        feature = []
        key_num = 0
        special_proportion = 0
        prefix_proportion = 0
        length = len(domain)  # 长度
        if length == 0:
            continue
        num_proportion = len(re.compile(r'\d').findall(domain))/length   # 数字比例
        cap_proportion = len(re.compile(r'[A-Z]').findall(domain))/length  # 大写字母比例
        space_proportion = domain.count(" ")/length  # 空格比例
        domain=domain.lower()
        keyword = ['and','or','xor','sysobjects','version','substr','len','substring','exists','mid','asc','inner join','xp_cmdshell','exec'\
                  ,'having','union','order','information schema','load_file','load data infile','into outfile','into dumpfile','select']
        special_char = ['{','}','(',')','NULL','=','?','[',']']
        prefix = ['\\x','&','\\u','%']
        for i in keyword:
            key_num += domain.count(i)  # 关键词数量
        for i in special_char:
            special_proportion += domain.count(i)/length  # 特殊符号比例
        for i in prefix:
            prefix_proportion += domain.count(i)/length  # 前缀比例
        close = (1 if (domain.count("'")%2 == 0) & (domain.count("\"")%2 == 0) else 0)  # 引号是否封闭
        info = calc_ent(domain)  # 信息熵
        feature = [length,num_proportion,cap_proportion,space_proportion,key_num,special_proportion,prefix_proportion,close,info]
        result.append(feature)
    return result

In [5]:
print("正常域名：",len(make_feature(normal_train)))
print("SQL域名：",len(make_feature(sql_train)))

正常域名： 5000
SQL域名： 4974


## 添加标签

In [6]:
features = []
labels = []
for feature in make_feature(sql_train):
    features.append(feature)
    labels.append(1)
for feature in make_feature(normal_train):
    features.append(feature)
    labels.append(0)

## 训练模型

## K阶近邻及其K折交叉验证

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import cross_val_score 
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=4)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))
scores = cross_val_score(knn, features, labels, cv=10, scoring='accuracy')
print(scores)
print(scores.mean())

0.9488807216839291
[0.83967936 0.9008016  0.94589178 0.9739479  0.97993982 0.95887663
 0.95486459 0.94383149 0.98094283 0.89167503]
0.9370451032456085


## 随机森林

In [9]:
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn", lineno=245)
train_X, test_X, train_y, test_y = train_test_split(features, labels, test_size=0.3, random_state=5)
clf = RandomForestClassifier()
clf.fit(train_X, train_y)
pred_y = clf.predict(test_X)
print("precision_score:", precision_score(y_true=test_y, y_pred=pred_y))
print("recall_score:", recall_score(y_true=test_y, y_pred=pred_y))

precision_score: 0.9993297587131368
recall_score: 0.9933377748167888


## 决策树

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score
decision_tree = DecisionTreeClassifier(criterion='entropy')
decision_tree.fit(train_X, train_y)
Y_pred = decision_tree.predict(test_X)
print("precision_score:", precision_score(y_true=test_y, y_pred=pred_y))
print("recall_score:", recall_score(y_true=test_y, y_pred=pred_y))

precision_score: 0.9993297587131368
recall_score: 0.9933377748167888


## 逻辑回归

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score
log_reg = LogisticRegression(solver='liblinear', max_iter = 10000)
log_reg.fit(train_X, train_y)
Y_pred = log_reg.predict(test_X)
print("precision_score:", precision_score(y_true=test_y, y_pred=pred_y))
print("recall_score:", recall_score(y_true=test_y, y_pred=pred_y))

precision_score: 0.9993297587131368
recall_score: 0.9933377748167888


## 使用KNN进行检测

In [None]:
def test_domain_KNN(domain):
    test_feature = []
    test_feature.append(domain)
    sample = make_feature(test_feature)
    pre = knn.predict(sample)
    print("SQL") if pre == 1 else print("Normal")

while True:
    test_domain_KNN(input("KNN测试："))

KNN测试：baidu
Normal
KNN测试：123
Normal
KNN测试：123abc
Normal
KNN测试：'and 1=1
SQL
KNN测试：?id=243234
Normal
KNN测试：?name=2eoyh
Normal
