In [1]:
import pandas as pd
import numpy as np
from math import log
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data_header = ['url']
sql = pd.read_csv('validation_sql.txt',header=None, sep='\n')
sql.columns = data_header
sql['label'] = 1
sql.head()

Unnamed: 0,url,label
0,1%25%27%29%29%20AND%204264%3D9725%23,1
1,1%25%27%29%29%20AND%208787%3D8787%23,1
2,1%25%27%29%29%29%20AND%201781%3D6047%23,1
3,1%25%27%29%29%29%20AND%208787%3D8787%23,1
4,1%25%27%20AND%209027%3D9130%23,1


In [3]:
normal = pd.read_csv('validation_normal.txt',header=None, sep='\n')
normal.columns = data_header
normal['label'] = 0
normal.head()

Unnamed: 0,url,label
0,/wp-login.php?action=0,0
1,/wp-login.php?action=h5bLUVrUAZPRFE7gNEPKkLNCS...,0
2,/wp-login.php?action=3,0
3,/wp-login.php?action=1492877888208,0
4,/wp-login.php?action=b6f01e808cfd526e827fe0233...,0


In [4]:
data = pd.concat([sql, normal], axis=0, ignore_index=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 2 columns):
url      19999 non-null object
label    19999 non-null int64
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


In [5]:
import nltk
import re
from urllib.parse import unquote
def urldecode(url):
    url = unquote(url, 'utf-8')
    return url

In [6]:
data['url_decode'] = data['url'].map(lambda x: urldecode(x))

In [7]:
data.head()

Unnamed: 0,url,label,url_decode
0,1%25%27%29%29%20AND%204264%3D9725%23,1,1%')) AND 4264=9725#
1,1%25%27%29%29%20AND%208787%3D8787%23,1,1%')) AND 8787=8787#
2,1%25%27%29%29%29%20AND%201781%3D6047%23,1,1%'))) AND 1781=6047#
3,1%25%27%29%29%29%20AND%208787%3D8787%23,1,1%'))) AND 8787=8787#
4,1%25%27%20AND%209027%3D9130%23,1,1%' AND 9027=9130#


In [8]:
from urllib import parse
def get_payload(url):
    if '?' in url:
        values = url.split('?')[-1]
        key_value = values.split("&")
        try:
            dict_values = dict(item.split("=", 1) for item in key_value)
            length = 0
            temp = ''
            for key, value in dict_values.items():
                if len(value) > length:
                    length = len(value)
                    temp = key
            return dict_values[temp]
        except:
            return str(key_value)
    else:
        return url

In [9]:
data['payload'] = data['url_decode'].map(lambda x: get_payload(x))

In [10]:
def get_length(payload):
    return len(payload)

In [11]:
data['payload_len'] = data['payload'].map(lambda x: get_length(x))

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 5 columns):
url            19999 non-null object
label          19999 non-null int64
url_decode     19999 non-null object
payload        19999 non-null object
payload_len    19999 non-null int64
dtypes: int64(2), object(3)
memory usage: 781.3+ KB


In [13]:
import re
def get_num_freq(payload):
    if len(payload) == 0:
        return 0
    num_len=len(re.compile(r'\d').findall(payload))
    return num_len / len(payload)

In [14]:
data['num_freq'] = data['payload'].map(lambda x: get_num_freq(x))

In [15]:
def get_capital_freq(payload):
    capital_len = len(re.compile(r'[A-Z]').findall(payload))
    if len(payload)!=0:
        return capital_len / len(payload)
    else:
        return 0

In [16]:
data['capital_freq'] = data['payload'].map(lambda x: get_capital_freq(x))

In [17]:
def get_space_freq(payload):
    if len(payload) == 0:
        return 0
    return (payload.count(" ") + payload.count("%20")) / len(payload)

In [18]:
data['space_freq'] = data['payload'].map(lambda x: get_space_freq(x))

In [19]:
words = [
    "select", "from", "insert", "delete", "having", 
    "union", "count", "drop table", "update",
    "truncate", "asc", "mid", "char", "xp_cmdshell", 
    "exec", "master", "net", "and", "or", "where", "substr", 
    "information schema", "xor", "version", "set",
    "where", "group", "order", "create", "sum", "max",
    "min", "avg", "having", "except"
]
def get_key_num(payload):
    payload = payload.lower()
    count = 0
    for word in words:
        count = count + payload.count(word)
    return count

In [20]:
data['key_num'] = data['payload'].map(lambda x: get_key_num(x))

In [21]:
data

Unnamed: 0,url,label,url_decode,payload,payload_len,num_freq,capital_freq,space_freq,key_num
0,1%25%27%29%29%20AND%204264%3D9725%23,1,1%')) AND 4264=9725#,1%')) AND 4264=9725#,20,0.450000,0.150000,0.100000,1
1,1%25%27%29%29%20AND%208787%3D8787%23,1,1%')) AND 8787=8787#,1%')) AND 8787=8787#,20,0.450000,0.150000,0.100000,1
2,1%25%27%29%29%29%20AND%201781%3D6047%23,1,1%'))) AND 1781=6047#,1%'))) AND 1781=6047#,21,0.428571,0.142857,0.095238,1
3,1%25%27%29%29%29%20AND%208787%3D8787%23,1,1%'))) AND 8787=8787#,1%'))) AND 8787=8787#,21,0.428571,0.142857,0.095238,1
4,1%25%27%20AND%209027%3D9130%23,1,1%' AND 9027=9130#,1%' AND 9027=9130#,18,0.500000,0.166667,0.111111,1
5,1%25%27%20AND%208787%3D8787%23,1,1%' AND 8787=8787#,1%' AND 8787=8787#,18,0.500000,0.166667,0.111111,1
6,1%00%27%29%20AND%207841%3D4363%23,1,1�') AND 7841=4363#,1�') AND 7841=4363#,19,0.473684,0.157895,0.105263,1
7,1%00%27%29%20AND%208787%3D8787%23,1,1�') AND 8787=8787#,1�') AND 8787=8787#,19,0.473684,0.157895,0.105263,1
8,1%00%27%20AND%203807%3D5244%23,1,1�' AND 3807=5244#,1�' AND 3807=5244#,18,0.500000,0.166667,0.111111,1
9,1%00%27%20AND%208787%3D8787%23,1,1�' AND 8787=8787#,1�' AND 8787=8787#,18,0.500000,0.166667,0.111111,1


In [22]:
data = data.drop('url', axis = 1)
data = data.drop('url_decode', axis = 1)
data = data.drop('payload', axis = 1)

In [23]:
X = data.drop('label', axis=1)
Y = data['label']

In [24]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
Xmin_max = min_max_scaler.fit_transform(X)

In [25]:
from sklearn.model_selection import train_test_split
x,y = data.ix[:,1:],data.ix[:,0]
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(x, y, train_size = 0.8, test_size=0.2, random_state=0, stratify = y)
X_train, X_test, Y_train, Y_test = train_test_split(Xmin_max, Y, train_size = 0.8, test_size=0.2, random_state=0, stratify = Y)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


## 决策树

In [26]:
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(criterion='entropy')
decision_tree.fit(X_train1, Y_train1)
acc_decision_tree = round(decision_tree.score(X_train1, Y_train1) * 100, 2)
print("Training accuracy：", acc_decision_tree)

Training accuracy： 99.91


In [27]:
from sklearn.metrics import accuracy_score, recall_score
Y_pred = decision_tree.predict(X_test1)
print("Test accuary：", accuracy_score(Y_test1, Y_pred))
print("Recall score:", recall_score(Y_test1, Y_pred))

Test accuary： 0.99625
Recall score: 0.996


In [28]:
from sklearn.model_selection import StratifiedKFold,cross_val_score
strKFold = StratifiedKFold(n_splits=10,shuffle=True,random_state=0)
scores = cross_val_score(decision_tree,X,Y,cv=strKFold)
print("Mean score of straitified cross validation:{:.2f}".format(scores.mean()))

Mean score of straitified cross validation:0.99


## Logistics回归

In [29]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(solver='liblinear', max_iter = 10000)
log_reg.fit(X_train, Y_train)
acc_log = round(log_reg.score(X_train, Y_train) * 100, 2)
print("Training accuracy：", acc_log)

Training accuracy： 96.64


In [30]:
from sklearn.metrics import accuracy_score, recall_score
Y_pred = log_reg.predict(X_test)
print("Test accuracy：", accuracy_score(Y_test, Y_pred))
print("Recall score:", recall_score(Y_test, Y_pred))

Test accuracy： 0.96625
Recall score: 0.959


In [31]:
from sklearn.model_selection import StratifiedKFold,cross_val_score
strKFold = StratifiedKFold(n_splits=10,shuffle=True,random_state=0)
scores = cross_val_score(log_reg,X,Y,cv=strKFold)
print("Mean score of straitified cross validation:{:.2f}".format(scores.mean()))

Mean score of straitified cross validation:0.97


## 输入测试

In [32]:
def get_features(url):
    url = urldecode(url)
    payload = get_payload(url)
    length = get_length(payload)
    num_freq = get_num_freq(payload)
    capital_freq = get_capital_freq(payload)
    space_freq = get_space_freq(payload)
    key_num = get_key_num(payload)
    return [length, num_freq, capital_freq, space_freq, key_num]

In [33]:
def test(url):
    test_features = []
    test_features.append(get_features(url))
    pre = decision_tree.predict(test_features)
    print("SQL injection") if pre == 1 else print("Normal url")

In [34]:
test("https://github.com/scusec/Data-Mining-for-Cybersecurity/tree/master/Homework/2019/Task5")

Normal url


In [35]:
test("http://localhost/sqlilabs/Less-2/?id=-1 union select 1,2,SCHEMA_NAME, from information_schema.SCHEMATA limit 1,1")

SQL injection


## 用另一个数据集进行测试

In [36]:
data_header = ['url']
test_sql = pd.read_csv("sqlnew.csv", engine="python", sep='\n', error_bad_lines=False)
test_sql.columns = data_header
test_sql['label'] = 1
test_sql.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4942 entries, 0 to 4941
Data columns (total 2 columns):
url      4942 non-null object
label    4942 non-null int64
dtypes: int64(1), object(1)
memory usage: 77.3+ KB


Skipping line 167: Expected 1 fields in line 167, saw 2
Skipping line 287: Expected 1 fields in line 287, saw 2
Skipping line 290: Expected 1 fields in line 290, saw 2
Skipping line 294: Expected 1 fields in line 294, saw 2
Skipping line 334: Expected 1 fields in line 334, saw 2


In [37]:
test_normal = pd.read_csv('normal_less.csv',header=None, sep='\n')
test_normal.columns = data_header
test_normal['label'] = 0
test_normal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
url      5000 non-null object
label    5000 non-null int64
dtypes: int64(1), object(1)
memory usage: 78.2+ KB


In [38]:
test_data = pd.concat([test_sql, test_normal], axis=0, ignore_index=True)
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9942 entries, 0 to 9941
Data columns (total 2 columns):
url      9942 non-null object
label    9942 non-null int64
dtypes: int64(1), object(1)
memory usage: 155.4+ KB


In [39]:
test_data['url_decode'] = test_data['url'].map(lambda x: urldecode(x))

In [40]:
test_data['payload'] = test_data['url_decode'].map(lambda x: get_payload(x))

In [41]:
test_data['payload_len'] = test_data['payload'].map(lambda x: get_length(x))
test_data['num_freq'] = test_data['payload'].map(lambda x: get_num_freq(x))
test_data['capital_freq'] = test_data['payload'].map(lambda x: get_capital_freq(x))
test_data['space_freq'] = test_data['payload'].map(lambda x: get_space_freq(x))
test_data['key_num'] = test_data['payload'].map(lambda x: get_key_num(x))

In [42]:
test_data = test_data.drop('url', axis = 1)
test_data = test_data.drop('url_decode', axis = 1)
test_data = test_data.drop('payload', axis = 1)

In [43]:
test_X = test_data.drop('label', axis=1)
test_Y = test_data['label']

In [44]:
test_Y_pred = decision_tree.predict(test_X)
print("Test accuary：", accuracy_score(test_Y, test_Y_pred))

Test accuary： 0.9392476362904848
