In [32]:
import pandas as pd
import numpy as np
from math import log
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [33]:
# Each CSV is read into a one-column frame; because `names` is supplied,
# the first line of each file is treated as data rather than as a header.
names = ['payload']
good, bad = (
    pd.read_csv(csv_path, encoding='utf-8', names=names)
    for csv_path in ('dmzo_nomal.csv', 'xssed.csv')
)

In [34]:
# (rows, columns) of the frame read from 'dmzo_nomal.csv'.
good.shape

(31407, 1)

In [35]:
# (rows, columns) of the frame read from 'xssed.csv'.
bad.shape

(33426, 1)

In [36]:
# Target encoding used by every model below: rows of `good` get 1, rows of `bad` get 0.
for frame, class_id in ((good, 1), (bad, 0)):
    frame['label'] = class_id

In [37]:
# Preview the first rows of `good`, now carrying the `label` column.
good.head()

Unnamed: 0,payload,label
0,sid=&amp;ring=hentff98&amp;id=&amp;list,1
1,ring=yaoi28,1
2,mode=navigation&amp;categoryid=1141,1
3,lang=en,1
4,ring=bettie,1


In [38]:
# Stack the two labelled frames row-wise and renumber the index from 0,
# then print the column summary of the combined frame.
data = pd.concat((good, bad), axis='index', ignore_index=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64833 entries, 0 to 64832
Data columns (total 2 columns):
payload    64833 non-null object
label      64833 non-null int64
dtypes: int64(1), object(1)
memory usage: 1013.1+ KB


In [39]:
# Hand-crafted features: payload length plus occurrence counts (str.count) of a
# fixed list of substrings. Each pair is (new column name, substring counted);
# the tuple order fixes the column order of the resulting frame.
counted_substrings = (
    ('script', 'script'),
    ('java', 'java'),
    ('iframe', 'iframe'),
    ('body', 'body'),
    ('style', 'style'),
    ('marquee', 'marquee'),
    ('quot_1', '<'),
    ('quot_2', '>'),
    ('quot_3', '"'),
    ('quot_4', "'"),
    ('quot_5', '%'),
    ('quot_6', '('),
    ('quot_7', ')'),
)

payloads = data['payload']
data['len'] = payloads.map(len)
for column_name, needle in counted_substrings:
    # Bind `needle` as a default so each lambda counts its own substring.
    data[column_name] = payloads.map(lambda text, needle=needle: text.count(needle))
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64833 entries, 0 to 64832
Data columns (total 16 columns):
payload    64833 non-null object
label      64833 non-null int64
len        64833 non-null int64
script     64833 non-null int64
java       64833 non-null int64
iframe     64833 non-null int64
body       64833 non-null int64
style      64833 non-null int64
marquee    64833 non-null int64
quot_1     64833 non-null int64
quot_2     64833 non-null int64
quot_3     64833 non-null int64
quot_4     64833 non-null int64
quot_5     64833 non-null int64
quot_6     64833 non-null int64
quot_7     64833 non-null int64
dtypes: int64(15), object(1)
memory usage: 7.9+ MB


In [40]:
# Remove the raw text column so only the label and numeric features remain.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
data = data.drop('payload', axis=1, errors='ignore')

In [41]:
# Target vector and feature matrix over the full data set
# (passed to cross_val_score in the cells below).
Y = data['label']
X = data.drop(['label'], axis='columns')

In [42]:
from sklearn.model_selection import train_test_split

# Select target and features by column label rather than with DataFrame.ix,
# which was deprecated in pandas 0.20 and removed in pandas 1.0. Label-based
# selection also does not depend on `label` being the first column.
x, y = data.drop('label', axis=1), data['label']

# 80/20 split; stratify=y passes the label vector for stratified sampling and
# random_state=0 fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, train_size=0.8, test_size=0.2, random_state=0, stratify=y
)

In [43]:
# Preview the first rows of the training feature matrix.
X_train.head()

Unnamed: 0,len,script,java,iframe,body,style,marquee,quot_1,quot_2,quot_3,quot_4,quot_5,quot_6,quot_7
57582,11,0,0,0,0,0,0,0,0,0,0,0,0,0
32273,250,0,0,0,0,0,0,5,8,3,3,0,10,10
18194,20,0,0,0,0,0,0,0,0,0,0,0,0,0
39545,73,2,0,0,0,0,0,1,1,0,0,6,1,1
46291,85,0,0,1,0,0,0,1,1,0,0,5,0,0


In [44]:
# Preview the labels that go with the training rows shown above.
Y_train.head()

57582    0
32273    0
18194    1
39545    0
46291    0
Name: label, dtype: int64

### 决策树

In [45]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyper-parameters. random_state is pinned (same
# seed as the train/test split) so that re-running the notebook rebuilds the
# same tree and reports the same scores.
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, Y_train)
# score() on the training split is the training accuracy; shown as a percentage.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
print("Training accuracy：", acc_decision_tree)

Training accuracy： 98.41


In [46]:
from sklearn.metrics import accuracy_score, recall_score

# Evaluate the fitted tree on the held-out test split.
Y_pred = decision_tree.predict(X_test)
print("Test accuracy：", accuracy_score(Y_test, Y_pred))
# recall_score uses pos_label=1 by default, so this is the recall on rows
# labelled 1 (the `good` frame), not on the label-0 (`bad`) rows.
print("Recall score:", recall_score(Y_test, Y_pred))

Test accuary： 0.9828024986504204
Recall score: 0.9969754855141675


In [47]:
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split

# 5-fold cross-validation of the decision tree over the full X / Y.
scores = cross_val_score(decision_tree, X, Y, cv=5)
# Report the mean fold score and twice the standard deviation across folds.
fold_mean, fold_spread = scores.mean(), scores.std() * 2
print("Cross val score: %0.2f (+/- %0.2f)" % (fold_mean, fold_spread))

Cross val score: 0.98 (+/- 0.01)


### KNN

In [48]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k=3, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
# Training accuracy as a percentage, rounded to two decimals.
train_score = knn.score(X_train, Y_train)
acc_knn = round(train_score * 100, 2)
print("Training accuracy：", acc_knn)

Training accuracy： 98.21


In [49]:
from sklearn.metrics import accuracy_score, recall_score

# Score the fitted KNN model on the held-out test split.
Y_pred = knn.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_pred)
# recall_score's default positive class is label 1 (the `good` rows).
test_recall = recall_score(Y_test, Y_pred)
print("Test accuracy：", test_accuracy)
print("Recall score:", test_recall)

Test accuracy： 0.9806431711267063
Recall score: 0.9955428207577205


In [50]:
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split

# 5-fold cross-validation of the KNN model over the full X / Y.
scores = cross_val_score(knn, X, Y, cv=5)
# Report the mean fold score and twice the standard deviation across folds.
fold_mean, fold_spread = scores.mean(), scores.std() * 2
print("Cross val score: %0.2f (+/- %0.2f)" % (fold_mean, fold_spread))

Cross val score: 0.97 (+/- 0.01)


### Logistic 回归

In [51]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyper-parameters, fitted on the training split.
log_reg = LogisticRegression()
log_reg.fit(X_train, Y_train)
# Training accuracy as a percentage, rounded to two decimals.
train_score = log_reg.score(X_train, Y_train)
acc_log = round(train_score * 100, 2)
print("Training accuracy：", acc_log)

Training accuracy： 98.19


In [53]:
from sklearn.metrics import accuracy_score, recall_score

# Score the fitted logistic-regression model on the held-out test split.
Y_pred = log_reg.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_pred)
# recall_score's default positive class is label 1 (the `good` rows).
test_recall = recall_score(Y_test, Y_pred)
print("Test accuracy：", test_accuracy)
print("Recall score:", test_recall)

Test accuracy： 0.9834194493714814
Recall score: 0.9987265202164916


In [54]:
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split

# 5-fold cross-validation of the logistic-regression model over the full X / Y.
scores = cross_val_score(log_reg, X, Y, cv=5)
# Report the mean fold score and twice the standard deviation across folds.
fold_mean, fold_spread = scores.mean(), scores.std() * 2
print("Cross val score: %0.2f (+/- %0.2f)" % (fold_mean, fold_spread))

Cross val score: 0.98 (+/- 0.00)


### SVM

In [55]:
from sklearn.svm import LinearSVC

# Linear SVM with an explicit iteration cap of 10000. random_state is pinned
# (same seed as the train/test split) because the solver's data shuffling is
# otherwise unseeded, which makes the fitted model differ between runs.
linear_svc = LinearSVC(max_iter=10000, random_state=0)
linear_svc.fit(X_train, Y_train)
# Training accuracy as a percentage, rounded to two decimals.
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
print("Training accuracy：", acc_linear_svc)

Training accuracy： 98.17


In [56]:
from sklearn.metrics import accuracy_score, recall_score

# Score the fitted linear SVM on the held-out test split.
Y_pred = linear_svc.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_pred)
# recall_score's default positive class is label 1 (the `good` rows).
test_recall = recall_score(Y_test, Y_pred)
print("Test accuracy：", test_accuracy)
print("Recall score:", test_recall)

Test accuracy： 0.9828024986504204
Recall score: 0.9972938554600446


In [57]:
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split

# 5-fold cross-validation of the linear SVM over the full X / Y.
scores = cross_val_score(linear_svc, X, Y, cv=5)
# Report the mean fold score and twice the standard deviation across folds.
fold_mean, fold_spread = scores.mean(), scores.std() * 2
print("Cross val score: %0.2f (+/- %0.2f)" % (fold_mean, fold_spread))

Cross val score: 0.98 (+/- 0.00)


### 输入的URL判定

In [58]:
def get_features(url):
    """Return the 14-element feature list for one payload string.

    Element 0 is the length of ``url``. The remaining 13 elements are the
    ``str.count`` occurrence counts of a fixed list of substrings, in the same
    order as the feature columns built for the training frame
    (script, java, iframe, body, style, marquee, <, >, ", ', %, (, )).
    """
    needles = ('script', 'java', 'iframe', 'body', 'style', 'marquee',
               '<', '>', '"', "'", '%', '(', ')')
    features = [len(url)]
    features.extend(url.count(needle) for needle in needles)
    return features
def test(url):
    """Classify one payload string with the fitted decision tree and print the result.

    Prints ``1`` when the model predicts label 1 (the label given to the
    ``good`` frame) and ``0`` otherwise. Nothing is returned. Relies on the
    notebook-level ``decision_tree`` and ``get_features``.
    """
    # predict() takes a 2-D collection of samples, so the single feature
    # vector is wrapped in a list.
    pre = decision_tree.predict([get_features(url)])
    # Exactly one sample was passed in, so read that one prediction explicitly
    # instead of relying on the truthiness of a one-element array comparison,
    # and print a value rather than using a conditional expression for its
    # side effects.
    print(1 if pre[0] == 1 else 0)

In [59]:
# Backslashes are doubled so the payload keeps its literal "\p", "\j", "\v"
# and "\r" pairs. With single backslashes, "\v" and "\r" are real string
# escapes (vertical tab / carriage return) and "\p", "\j" are invalid escape
# sequences, so the string tested would not be the intended payload.
test_url = "<STYLE>@im\\port'\\ja\\vasc\\ript:alert(\"XSS\")';</STYLE>"
test(test_url)

0


In [60]:
# An <IMG> tag whose src attribute is a URL with a query string; of the
# counted tokens it contains only '<', '>' and double quotes.
test_url = '<IMG src="http://www.thesiteyouareon.com/somecommand.php?somevariables=maliciouscode">'
test(test_url)

0


In [61]:
# A plain URL containing none of the counted substrings or characters, so
# only the length feature is non-zero.
test_url = 'https://promotion.aliyun.com/ntms/act/qwbk.html?spm=5176.64391.592490.3.36311a4bUWGtIe#floor3'
test(test_url)

1


In [62]:
# The hex character references after SRC= spell out javascript:alert('XSS').
# In this encoded form none of the keyword counters ('script', 'java', ...)
# match; only the length, '<' and '>' features are non-zero.
test_url = '<IMG SRC=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>'
test(test_url)

0
