In [1]:
import numpy as np 
import pandas as pd 
import tldextract

"""
    XSS检测的相关特征：script,java,iframe,左右括号,style,alert,%,\",\'
    等常见的字符。
"""
def count_script(payload):
    return payload.lower().count("script")

def count_java(payload):
    return payload.lower().count("java")

def count_iframe(payload):
    return payload.lower().count("iframe")

def count_style(payload):
    return payload.lower().count("style")

def count_alert(payload):
    return payload.lower().count("alert")

def count_leftBracket(payload):
    return payload.lower().count("(")

def count_rightBracket(payload):
    return payload.lower().count(")")

def count_leftAngleBracket(payload):
    return payload.lower().count("<")

def count_rightAngleBracket(payload):
    return payload.lower().count(">")

def count_percent(payload):
    return payload.lower().count("%")

def count_slash1(payload):
    return payload.lower().count("\"")

def count_slash2(payload):
    return payload.lower().count("\'")



In [2]:
#获取对应特征的函数；
def getFeature(payload,label):
    result = []
    payload = str(payload)

    result.append(payload)
    result.append(count_script(payload))
    result.append(count_java(payload))
    result.append(count_iframe(payload))
    result.append(count_style(payload))
    result.append(count_alert(payload))
    result.append(count_leftBracket(payload))
    result.append(count_rightBracket(payload))
    result.append(count_leftAngleBracket(payload))
    result.append(count_rightAngleBracket(payload))
    result.append(count_percent(payload))
    result.append(count_slash1(payload))
    result.append(count_slash2(payload))
    result.append(str(label))
    return result


In [3]:
featureSet = pd.DataFrame(columns=('payload','script','java','iframe',\
    'style','alert','leftBracket','rightBracket','leftAngleBracket',\
    'rightAngleBracket','percent','slash1','slash2','label'))

In [4]:
df_xss = pd.read_csv('xss.csv')
#print(df_xss.head())
df_normal = pd.read_csv('normal.csv')
df_normal['label'] = 1
df_xss['label'] = 0
df_total = df_xss.append(df_normal)
df_total = df_total.sample(frac=1).reset_index(drop=True)
print(df_total.head())
length_data = len(df_total)
print("总共的样本数：",length_data)
PERCENTAGE = 1
length=int(length_data*PERCENTAGE)


                                             payload  label
0                                            id=2800      1
1                               area=4&aName=Council      0
2  q=FTP&%27%22%3E%3C/title%3E%3Cscript%3Ealert(<...      0
3  c=&search_country=&search_state=&search_city=&...      0
4  element=%22%3E%3CscRipT%3Ealert%28123%29%3C%2F...      0
总共的样本数： 64833


In [5]:
for i in range(length):
    feature = getFeature(df_total["payload"].loc[i],df_total["label"].loc[i])
    featureSet.loc[i] = feature
print(featureSet.head())


                                             payload script java iframe style  \
0                                            id=2800      0    0      0     0   
1                               area=4&aName=Council      0    0      0     0   
2  q=FTP&%27%22%3E%3C/title%3E%3Cscript%3Ealert(<...      2    0      0     0   
3  c=&search_country=&search_state=&search_city=&...      2    0      0     0   
4  element=%22%3E%3CscRipT%3Ealert%28123%29%3C%2F...      1    0      0     0   

  alert leftBracket rightBracket leftAngleBracket rightAngleBracket percent  \
0     0           0            0                0                 0       0   
1     0           0            0                0                 0       0   
2     1           1            1                1                 1      20   
3     4          10           10                3                 3      14   
4     1           0            0                1                 1      10   

  slash1 slash2 label  
0      0      

In [6]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
X = featureSet.drop(['payload','label'],axis=1).values
y = featureSet['label'].values
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [7]:
#使用决策树模型，并计算其对应的召回率和准确率
from sklearn import tree
from sklearn.metrics import confusion_matrix
dt = tree.DecisionTreeClassifier(max_depth = 10)
dt.fit(X_train,y_train)
y_predict = dt.predict(X_test)
matrix = confusion_matrix(y_test, y_predict)
TP = matrix[0][0]
FP = matrix[0][1]
FN = matrix[1][0]
TN = matrix[1][1]
Acc = (TP + TN) / (TP + FP + TN + FN)
Rec = TP / (TP + FN)
print("Accuracy: %.2f  Recall: %.2f" % (Acc, Rec))
print(confusion_matrix(y_test,y_predict))

Accuracy: 0.98  Recall: 1.00
[[6429  219]
 [  16 6303]]


In [27]:
#输入用于XSS测试的payload；
payload1 = '<isindexformaction="javascript:alert(1)"     type=image>'
result = pd.DataFrame(columns=('payload','script','java','iframe',\
    'style','alert','leftBracket','rightBracket','leftAngleBracket',\
    'rightAngleBracket','percent','slash1','slash2','label'))
results = getFeature(payload1, 0)
result.loc[0] = results
result = result.drop(['payload','label'],axis=1).values
print(dt.predict(result))

['0']


In [15]:
#Random Forest
import sklearn.ensemble as ek
clf = ek.RandomForestClassifier(n_estimators=50)
clf.fit(X_train, y_train)
r = clf.predict(X)
matrix = confusion_matrix(y, r)
TP = matrix[0][0]
FP = matrix[0][1]
FN = matrix[1][0]
TN = matrix[1][1]
print(TP)
print(matrix)
Acc = (TP + TN) / (TP + FP + TN + FN)
Rec = TP / (TP + FN)
print("Accuracy: %.2f  Recall: %.2f" % (Acc, Rec))

32347
[[32347  1079]
 [   43 31364]]
Accuracy: 0.98  Recall: 1.00


In [29]:
#输入用于XSS测试的payload；
payload2 = '<input type="image" formaction=JaVaScript:alert(0)>'
result = pd.DataFrame(columns=('payload','script','java','iframe',\
    'style','alert','leftBracket','rightBracket','leftAngleBracket',\
    'rightAngleBracket','percent','slash1','slash2','label'))
results = getFeature(payload2, 0)
result.loc[0] = results
result = result.drop(['payload','label'],axis=1).values
print(dt.predict(result))

['0']
