In [None]:
# Dataset from https://github.com/xuantrung1803/XSS-Detection


In [None]:
!pip install transformers

In [2]:
import sklearn
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# 1. Read Data

In [3]:

df = pd.read_csv('XSS_dataset.csv', encoding='utf-8-sig')
df=df[['Sentence','Label']]
df

Unnamed: 0,Sentence,Label
0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,"\t </span> <span class=""reference-text"">Steeri...",0
3,"\t </span> <span class=""reference-text""><cite ...",0
4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0
...,...,...
13681,<img onpointerenter=alert(1)>XSS</img>,1
13682,"<source onbeforepaste=""alert(1)"" contenteditab...",1
13683,"<div draggable=""true"" contenteditable>drag me<...",1
13684,"<li><cite id=""CITEREFDomingos2015"" class=""cita...",0


In [4]:

def get_len(x):
  return len(x.split(' '))
tqdm.pandas()
df['len']=df['Sentence'].progress_apply(lambda x: get_len(x))

100%|██████████| 13686/13686 [00:00<00:00, 594519.59it/s]


In [5]:
# df['len'].plot(kind='bar')

In [6]:
df['len'].mean()

7.4049393540844655

# 2. char2index code

In [7]:
def data2char_index(X, max_len):
    alphabet = " abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    result = []
    for data in X:
        mat = []
        for ch in data:
            if ch not in alphabet:
                continue
            mat.append(alphabet.index(ch))
        result.append(mat)
    X_char = tf.keras.preprocessing.sequence.pad_sequences(np.array(result, dtype=object), padding='post',
                                                           truncating='post', maxlen=max_len)
    return X_char


In [8]:
data = df['Sentence'].values
label = df['Label'].values

# 3. train and test split

In [9]:
x_train1, x_test1, y_train, y_test = train_test_split(
    data, label, test_size=0.2, random_state=42)

x_train = data2char_index(x_train1, max_len=200)
x_test = data2char_index(x_test1, max_len=200)




In [10]:
x_train.shape


(10948, 200)

In [11]:
x_test.shape

(2738, 200)

# 4. CNN model

In [57]:
def get_charcnn_model(max_len):
    main_input = tf.keras.layers.Input(shape=(max_len,))

    embedder = tf.keras.layers.Embedding(
        input_dim=70,
        output_dim=80,
        input_length=max_len,
        trainable=False
    )
    embed = embedder(main_input)
    #chập
    cnn1 = tf.keras.layers.Conv1D(
        32, 5, padding='same', strides=1, activation='relu')(embed)
    cnn1 = tf.keras.layers.MaxPooling1D(pool_size=12)(cnn1)

    cnn2 = tf.keras.layers.Conv1D(
        32, 10, padding='same', strides=1, activation='relu')(embed)
    cnn2 = tf.keras.layers.MaxPooling1D(pool_size=11)(cnn2)

    cnn3 = tf.keras.layers.Conv1D(
        32, 15, padding='same', strides=1, activation='relu')(embed)
    cnn3 = tf.keras.layers.MaxPooling1D(pool_size=10)(cnn3)

    cnn = tf.keras.layers.concatenate([cnn1, cnn2, cnn3], axis=1)

    flat = tf.keras.layers.Flatten()(cnn)

    drop = tf.keras.layers.Dropout(0.2)(flat)


    dense1 = tf.keras.layers.Dense(1024, activation='relu')(drop)
    dense2 = tf.keras.layers.Dense(128, activation='relu')(dense1)

    main_output = tf.keras.layers.Dense(1, activation='sigmoid')(dense2)
    model = tf.keras.Model(inputs=main_input, outputs=main_output)
    return model



In [58]:
model = get_charcnn_model(max_len=200)

In [59]:
tensorboard = tf.keras.callbacks.TensorBoard(
    log_dir='logs/', histogram_freq=0, write_graph=True, write_images=True)


In [60]:
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    metrics=['accuracy']

)
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 200)]        0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 200, 80)      5600        ['input_2[0][0]']                
                                                                                                  
 conv1d_3 (Conv1D)              (None, 200, 32)      12832       ['embedding_1[0][0]']            
                                                                                                  
 conv1d_4 (Conv1D)              (None, 200, 32)      25632       ['embedding_1[0][0]']            
                                                                                            

In [61]:
batch_size = 512
num_epoch = 20
model_log = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epoch,
    verbose=1,
    validation_data=(x_test, y_test),
    callbacks=[tensorboard]
)

model.save('model.h5')



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [62]:
pred = model.predict(x_test)
y_pred = np.int64(pred > 0.5)



In [63]:
all_pred=y_pred
all_labels=y_test

In [64]:
import sklearn
confusion = sklearn.metrics.confusion_matrix(y_true=all_labels, y_pred=all_pred)
print('Confusion matrix: \n',confusion)

tn, fp, fn, tp = confusion.ravel()
print('\nTP:',tp)
print('FP:',fp)
print('TN:',tn)
print('FN:',fn)



## Performance measure
print('\nAccuracy: '+ str(sklearn.metrics.accuracy_score(y_true=all_labels, y_pred=all_pred)))
print('Precision: '+ str(sklearn.metrics.precision_score(y_true=all_labels, y_pred=all_pred)))
print('Recall: '+ str(sklearn.metrics.recall_score(y_true=all_labels, y_pred=all_pred)))
print('F-measure: '+ str(sklearn.metrics.f1_score(y_true=all_labels, y_pred=all_pred)))


Confusion matrix: 
 [[1154  106]
 [  30 1448]]

TP: 1448
FP: 106
TN: 1154
FN: 30

Accuracy: 0.9503287070854638
Precision: 0.9317889317889317
Recall: 0.979702300405954
F-measure: 0.955145118733509


# 5 VulBERTa

In [18]:
print('x_train shape: ',x_train1.shape,np.round(x_train1.shape[0]/data.shape[0],2),'%')
print('x_test.shape:  ',x_test1.shape,np.round(x_test1.shape[0]/data.shape[0],2),'%')

x_train shape:  (10948,) 0.8 %
x_test.shape:   (2738,) 0.2 %


In [19]:
mx_len=514

In [20]:
#define a tokenizer object
from transformers import RobertaTokenizer
model_name="cardiffnlp/twitter-roberta-base-emotion"
tokenizer = RobertaTokenizer.from_pretrained(model_name,num_labels=2,model_max_length=mx_len)

train1_encodings = tokenizer(list(x_train1),
                            truncation=True,
                            padding=True,)

test1_encodings = tokenizer(list(x_test1),
                            truncation=True,
                            padding=True)



Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

In [21]:
train_final_encodings=dict()
train_final_encodings['input_ids1']=train1_encodings['input_ids']
train_final_encodings['attention_mask1']=train1_encodings['input_ids']

In [22]:
test_final_encodings=dict()
test_final_encodings['input_ids1']=test1_encodings['input_ids']
test_final_encodings['attention_mask1']=test1_encodings['input_ids']


In [23]:

train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_final_encodings),
                                    list(y_train)))
test_dataset= tf.data.Dataset.from_tensor_slices((dict(test_final_encodings),
                                    list(y_test)))

In [None]:
# configuration={
#   "attention_probs_dropout_prob": 0.1,
#   "bos_token_id": 0,
#   "classifier_dropout": 0,
#   "eos_token_id": 2,
#   "hidden_act": "gelu",
#   "hidden_dropout_prob": 0.1,
#   "hidden_size": 768,
#   "initializer_range": 0.02,
#   "intermediate_size": 3072,
#   "layer_norm_eps": 1e-12,
#   "max_position_embeddings": mx_len,
#   "model_type": "roberta",
#   "num_attention_heads": 3,
#   "num_hidden_layers": 3,
#   "pad_token_id": 1,
#   "position_embedding_type": "absolute",
#   "type_vocab_size": 1,
#   "use_cache": True,
#   "vocab_size": 50265,
# }
# from transformers import RobertaConfig
# new_config=RobertaConfig.from_dict(configuration)

In [35]:
## VulBERTa-small config
from transformers import RobertaConfig
new_config_small = RobertaConfig(
                              max_position_embeddings=514,
                              num_attention_heads=3,
                              num_hidden_layers=3,
                              type_vocab_size=1,)


In [36]:
## VulBERTa-medium config
from transformers import RobertaConfig
new_config_med= RobertaConfig(
                              max_position_embeddings=514,
                              num_attention_heads=12,
                              num_hidden_layers=6,
                              type_vocab_size=1,)

In [37]:
## VulBERTa-base config
from transformers import RobertaConfig
new_config_big = RobertaConfig(max_position_embeddings=514,
                           num_attention_heads=12,
                           num_hidden_layers=12,
                           type_vocab_size=1,)

In [46]:
from transformers import TFRobertaModel
# get TF bert
new_config=new_config_small
new_config.output_hidden_states = False
transformer_model = TFRobertaModel.from_pretrained(model_name, config = new_config)


# get input bert1
input_ids_in1 = tf.keras.layers.Input(shape=(new_config.max_position_embeddings,), name='input_ids1', dtype='int32')
input_masks_in1 = tf.keras.layers.Input(shape=(new_config.max_position_embeddings,), name='attention_mask1', dtype='int32')
X= transformer_model(input_ids=input_ids_in1,attention_mask=input_masks_in1)[0]


# extra layer
X = tf.keras.layers.GlobalMaxPool1D()(X)

X = tf.keras.layers.Dense(256, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(1, activation='sigmoid')(X)
model = tf.keras.Model(inputs=[input_ids_in1, input_masks_in1], outputs = X)

for layer in model.layers[:3]:
  layer.trainable = False

# setting optimizer and loss
optimizerr = tf.keras.optimizers.Adam(learning_rate=0.001)
losss_fun = tf.keras.losses.SparseCategoricalCrossentropy()
# compile model
# model.compile(optimizer=optimizerr,loss=losss_fun,
#               metrics=['accuracy'])# train the model
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    metrics=['accuracy'])

Some layers from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion were not used when initializing TFRobertaModel: ['classifier', 'roberta/encoder/layer_._5/output/LayerNorm/gamma:0', 'roberta/encoder/layer_._9/output/LayerNorm/gamma:0', 'roberta/encoder/layer_._4/output/dense/kernel:0', 'roberta/encoder/layer_._10/attention/self/value/bias:0', 'roberta/encoder/layer_._11/attention/self/query/bias:0', 'roberta/encoder/layer_._9/attention/self/value/kernel:0', 'roberta/encoder/layer_._8/attention/self/value/bias:0', 'roberta/encoder/layer_._3/attention/self/key/kernel:0', 'roberta/encoder/layer_._10/attention/self/value/kernel:0', 'roberta/encoder/layer_._9/intermediate/dense/bias:0', 'roberta/encoder/layer_._8/output/LayerNorm/gamma:0', 'roberta/encoder/layer_._6/attention/output/LayerNorm/beta:0', 'roberta/encoder/layer_._6/attention/output/LayerNorm/gamma:0', 'roberta/encoder/layer_._9/attention/self/query/bias:0', 'roberta/encoder/layer_._5/attention/self/key/bias:0', 

In [47]:
model.summary()

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids1 (InputLayer)        [(None, 514)]        0           []                               
                                                                                                  
 attention_mask1 (InputLayer)   [(None, 514)]        0           []                               
                                                                                                  
 tf_roberta_model_8 (TFRobertaM  TFBaseModelOutputWi  60854784   ['input_ids1[0][0]',             
 odel)                          thPoolingAndCrossAt               'attention_mask1[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 514,                                         

In [48]:
n_patch=300
model.fit(train_dataset.shuffle(len(train_dataset)).batch(n_patch),
          epochs=2,
          validation_data=test_dataset.shuffle(len(test_dataset)).batch(n_patch))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7b8d006eacb0>

In [54]:
pred = model.predict(test_dataset.batch(n_patch))
y_pred = np.int64(pred > 0.5)




In [55]:
all_pred=y_pred
all_labels=y_test

In [56]:

confusion = sklearn.metrics.confusion_matrix(y_true=all_labels, y_pred=all_pred)
print('Confusion matrix: \n',confusion)

tn, fp, fn, tp = confusion.ravel()
print('\nTP:',tp)
print('FP:',fp)
print('TN:',tn)
print('FN:',fn)



## Performance measure
print('\nAccuracy: '+ str(sklearn.metrics.accuracy_score(y_true=all_labels, y_pred=all_pred)))
print('Precision: '+ str(sklearn.metrics.precision_score(y_true=all_labels, y_pred=all_pred)))
print('Recall: '+ str(sklearn.metrics.recall_score(y_true=all_labels, y_pred=all_pred)))
print('F-measure: '+ str(sklearn.metrics.f1_score(y_true=all_labels, y_pred=all_pred)))


Confusion matrix: 
 [[ 575  685]
 [  23 1455]]

TP: 1455
FP: 685
TN: 575
FN: 23

Accuracy: 0.7414170927684441
Precision: 0.6799065420560748
Recall: 0.9844384303112313
F-measure: 0.8043117744610283
