# XSS Dectection Based On Heuristic Method

## Method of Cleaning data And Extracting features

In [41]:
######## clean data ########
def to_lower_case(obj):
    return obj.lower()

def remove_line_break(obj):
    return obj.replace("<br/>", "")

def clean(obj):
    obj = to_lower_case(obj)
    obj = remove_line_break(obj)
    return obj

######## extract feature ########
def is_angle_brackets_closed(obj):
    left = 0
    right = 0
    for c in obj:
        if(c == '<'):
            left += 1
        elif(c == '>'):
            right += 1
    return 1 if (left == right) else 0

def has_pop_up_window(obj):
    return 1 if("alert" in obj) else 0

def length(obj):
    return len(obj)

def is_script_embedded(obj):
    flag = 0
    if((obj.count("script") % 2) == 0):
        flag = 1
    elif(obj.count("javascript:")):
        flag = 1
    return flag

def has_iframe(obj):
    return 1 if "iframe" in obj else 0

def per_cent_sign_num(obj):
    return obj.count("%")

def has_backslash(obj):
    return 1 if "\\" in obj else 0

def has_closed_sign(obj):
    return 1 if "=\'" in obj else 0

def has_document_cookie(obj):
    return 1 if "document.cookie" in obj else 0


######## interface for user ########
def characterize(obj):
    obj = clean(obj)
    features = [is_angle_brackets_closed(obj),
                has_pop_up_window(obj), has_iframe(obj),
               is_script_embedded(obj), length(obj),
               per_cent_sign_num(obj), has_backslash(obj),
               has_closed_sign(obj), has_document_cookie(obj)]
    return features



## Load Raw Data

In [42]:
import pandas as pd

normal = pd.read_csv("./dmzo_nomal.csv")
xssed = pd.read_csv("./xssed.csv")

raw_data_xssed = xssed.iloc[:,0].values
raw_data_normal = normal.iloc[:,0].values

## Characterize Data

In [43]:
features = []
labels = []

for data in raw_data_xssed:
    features.append(characterize(data))
    labels.append(1)

for data in raw_data_normal:
    features.append(characterize(data))
    labels.append(0)

## Training Data

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

train_X, test_X, train_y, test_y = train_test_split(features, labels, test_size=0.3, random_state=0)
clf = RandomForestClassifier()
clf.fit(train_X, train_y)
pred_y = clf.predict(test_X)
print("precision_score:", precision_score(y_true=test_y, y_pred=pred_y))
print("recall_score:", recall_score(y_true=test_y, y_pred=pred_y))

accuracy_score: 0.9933263816475495
recall_score: 0.9585429663916281


## Model App

In [45]:
def pop_up_box():
    """
    使用tkinter弹出输入框输入data, 输出result
    """

    import tkinter

    
    def inputint():
        nonlocal test_data
        test_data = var.get().strip()
        test = []
        test.append(characterize(test_data))
        if clf.predict(test)[0]:
            l.config(text="xss dectected")
        else:
            l.config(text="no xss found")
        var.set('')
        test_data = ''

    test_data = 0
    root = tkinter.Tk(className='Here is some interpretation')  # 弹出框框名
    root.geometry('270x60')     # 设置弹出框的大小 w x h

    var = tkinter.StringVar()   # 这即是输入框中的内容
    var.set('<img src=\'#\' onerror=javascript:alert(1)>') # 通过var.get()/var.set() 来 获取/设置var的值
    entry1 = tkinter.Entry(root, textvariable=var)  # 设置"文本变量"为var
    entry1.pack()   # 将entry"打上去"
    l =tkinter.Label(root, bg = 'yellow', width = 20, text = '')
    l.pack()
    btn1 = tkinter.Button(root, text='Input', command=inputint)     # 按下此按钮(Input), 触发inputint函数

    # 按钮定位
    btn1.pack(side='right')

    # 上述完成之后, 开始真正弹出弹出框
    root.mainloop()

In [46]:
pop_up_box()