In [1]:
import os
import re
from my_tokenize3 import my_token_get_all, feature_vector_helper
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
# 读取目录下所有php文件的路径，返回一个路径的列表
def load_all_php_path(dir):
    filelist = []
    
#     root为dir目录地址
#     dirs是一个该文件夹下所有目录名的list
#     filelist是文件名的list
    for root, dirs, files in os.walk(dir):
        for file in files:
            if file.endswith('.php'):
                fullpath = os.path.join(root, file)
#                 print('Loading %s' % fullpath)
                filelist.append(fullpath)
    return filelist

# 读取每一份代码，返回代码内容
def load_one_file(filename):
    context = ''
    with open(filename, 'r', encoding='utf-8', errors='ignore') as codes:
        for line in codes:
            line = line.strip('\r')
            context += line
    return context

In [3]:
black_file_paths = load_all_php_path('./dataset-clean/bad/')
white_file_paths = load_all_php_path('./dataset-clean/good/')
all_paths = black_file_paths + white_file_paths
print(len(black_file_paths))
print(len(white_file_paths))
print(len(all_paths))

4352
5728
10080


In [4]:
token_list = []
label_list = []

for file in black_file_paths:
    token = feature_vector_helper(my_token_get_all(file))
    if len(token) != 0:
        token_list.append(token)
        label_list.append('1')
    else:
        print('get code token error')
        
for file in white_file_paths:
    token = feature_vector_helper(my_token_get_all(file))
    if len(token) != 0:
        token_list.append(token)
        label_list.append('0')
    else:
        print('get code token error')
        
token_list = np.array(token_list)
label_list = np.array(label_list)
print(token_list.shape)
print(label_list.shape)
print(token_list[0])

(10080,)
(10080,)
['T_TAG_DOCTYPE_HTML', 'T_TAG_HTML', 'T_TAG_BODY', 'T_TAG_PHP', 'T_VAR_ASSIGN_BY_ORDINARY_FUNCTION', 'T_VAR_ASSIGN_COMMON', 'T_VAR_SOURCE_$_GET', 'T_VAR_ASSIGN_COMMON', 'T_VAR_ASSIGN_COMMON', 'T_VAR_SANITIZE_BY_ADDSLASHES', 'T_SINK_TYPE_15', 'T_TAG_PHP', 'T_TAG_DIV', 'T_TAG_BODY', 'T_TAG_HTML']


In [5]:
print(token_list[20:30])

[list(['T_TAG_DOCTYPE_HTML', 'T_TAG_HTML_START', 'T_TAG_HEAD_END', 'T_TAG_BODY_START', 'T_TAG_PHP_START', 'T_VAR_ASSIGN_BY_ORDINARY_FUNCTION', 'T_VAR_ASSIGN_COMMON', 'T_VAR_SOURCE_$_GET', 'T_VAR_ASSIGN_COMMON', 'T_VAR_ASSIGN_COMMON', 'T_VAR_SANITIZE_BY_FILTER_VAR_FILTER_SANITIZE_EMAIL', 'T_LOGIC_IF', 'T_VAR_ASSIGN_COMMON', 'T_LOGIC_ELSE', 'T_VAR_ASSIGN_COMMON', 'T_SINK_SINGLE_QUOTE_ATTR_VAL', 'T_TAG_PHP_END', 'T_TAG_BODY_END', 'T_TAG_HTML_END'])
 list(['T_TAG_DOCTYPE_HTML', 'T_TAG_HTML_START', 'T_TAG_HEAD_END', 'T_TAG_BODY_START', 'T_TAG_PHP_START', 'T_VAR_ASSIGN_BY_ORDINARY_FUNCTION', 'T_VAR_ASSIGN_COMMON', 'T_VAR_SOURCE_$_GET', 'T_VAR_ASSIGN_COMMON', 'T_VAR_ASSIGN_COMMON', 'T_VAR_SANITIZE_BY_FILTER_VAR_FILTER_SANITIZE_EMAIL', 'T_LOGIC_IF', 'T_VAR_ASSIGN_COMMON', 'T_LOGIC_ELSE', 'T_VAR_ASSIGN_COMMON', 'T_SINK_NO_QUOTE_ATTR_VAL', 'T_TAG_PHP_END', 'T_TAG_BODY_END', 'T_TAG_HTML_END'])
 list(['T_TAG_DOCTYPE_HTML', 'T_TAG_HTML_START', 'T_TAG_HEAD_START', 'T_TAG_STYLE_START', 'T_TAG_PHP_STA

In [5]:
from nltk import wordpunct_tokenize

raw_token_list = []
label_list = []

for file in black_file_paths:
    with open(file, 'r', encoding='utf-8', errors='ignore') as f:
        content = wordpunct_tokenize(f.read())
        raw_token_list.append(content)
        f.close()
        label_list.append('1')
        
for file in white_file_paths:
    with open(file, 'r', encoding='utf-8', errors='ignore') as f:
        content = wordpunct_tokenize(f.read())
        raw_token_list.append(content)
        f.close()
        label_list.append('0')
        
raw_token_list = np.array(raw_token_list)
label_list = np.array(label_list)
print(raw_token_list.shape)
print(label_list.shape)

(10080,)
(10080,)


# doc2Vec分词

In [6]:
tgd_document = [TaggedDocument(doc, [i]) for doc, i in zip(token_list, all_paths)]
print(len(tgd_document))
doc_model = Doc2Vec(min_count=1, workers=4)

10080


In [7]:
doc_model.build_vocab(tgd_document)
doc_model.train(tgd_document, total_examples=doc_model.corpus_count, epochs=10)
print('finished training')

finished training


In [8]:
print(doc_model.corpus_total_words)

157956


In [50]:
doc_model.save('doc2Vec_raw_token.model')

In [10]:
doc_model = Doc2Vec.load('doc2Vec_token.model')

In [9]:
print(len(doc_model.docvecs))
print(doc_model.corpus_count)
print(doc_model.docvecs[0].shape)
doc_vec_list = doc_model.docvecs.vectors_docs
print(doc_vec_list.shape)

10080
10080
(100,)
(10080, 100)


In [30]:
import pandas as pd

In [31]:
token_str_list = []
for token in token_list:
    token = ' '.join(token)
    token_str_list.append(token)
    
doc2Vec_dict = {
    'file_path': all_paths,
    'token': token_str_list,
#     'vector': doc_vec_list,
    'label': label_list
}
doc2Vec_df = pd.DataFrame(doc2Vec_dict)
doc2Vec_df.to_csv('testtttttt.csv', index=None)

# Word2Vec 分词

In [21]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import nltk

In [30]:
corpus = []

for token in token_list:
#     print(token)
    corpus.append(token)
corpus = np.array(corpus)
print(corpus[0])
w2v_model = Word2Vec(corpus, size=100, window=5, min_count=1)

['T_VAR_SOURCE_$_GET', 'T_VAR_SANITIZE_BY_ADDSLASHES', 'T_SINK_ATTR_NAME']


In [31]:
wv = np.array(w2v_model.wv.vectors)
words = np.array(w2v_model.wv.index2word)
print(wv.shape)
print(words.shape)

(29, 100)
(29,)


# Data preprocessing

In [10]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

X_train, X_test, y_train, y_test = train_test_split(doc_vec_list, label_list, 
                                                    random_state=2019, test_size = 0.2, stratify=label_list)


In [11]:
y_train = to_categorical(y_train, 2)

In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train).astype('float32').reshape(-1, 100, 1)
X_test_std = scaler.fit_transform(X_test).astype('float32').reshape(-1, 100, 1)

In [13]:
index = 5
print(X_train_std[index])
print(y_train[index])

[[-0.98713356]
 [ 0.23830652]
 [ 1.3215753 ]
 [-0.39758846]
 [ 1.1049883 ]
 [-0.1060916 ]
 [-1.5417783 ]
 [-0.39726794]
 [-0.7075006 ]
 [-1.0773247 ]
 [-0.0627264 ]
 [ 0.33799082]
 [ 0.9908572 ]
 [-0.36377394]
 [ 0.13662373]
 [ 0.56917447]
 [-1.5725927 ]
 [ 0.6334588 ]
 [ 1.779405  ]
 [-0.9299604 ]
 [ 0.42101318]
 [-0.030726  ]
 [ 0.06546658]
 [ 0.28070685]
 [-0.27652687]
 [ 0.45746678]
 [-0.2790404 ]
 [ 1.3612814 ]
 [-0.53467274]
 [-1.2548751 ]
 [-0.51373434]
 [ 0.472557  ]
 [ 1.0438993 ]
 [ 0.7619127 ]
 [ 0.92010343]
 [ 2.0587711 ]
 [ 2.1426845 ]
 [-1.3844591 ]
 [-0.6839878 ]
 [-0.45605454]
 [-0.10832098]
 [ 0.5458253 ]
 [-0.07392456]
 [-1.9689234 ]
 [ 0.08535058]
 [ 0.6397421 ]
 [ 0.45846584]
 [ 0.3553909 ]
 [ 0.01542908]
 [-0.7291864 ]
 [-1.8248147 ]
 [-0.65871817]
 [-1.297065  ]
 [-0.12290539]
 [ 0.48453587]
 [-0.6841087 ]
 [ 0.703199  ]
 [ 0.36211398]
 [ 0.47200456]
 [-0.05922444]
 [ 1.1127754 ]
 [ 0.8170991 ]
 [-1.0728669 ]
 [-1.8385667 ]
 [-0.32584313]
 [ 0.96379113]
 [-1.94562

In [14]:
print(X_train_std.shape)
print(y_train.shape)
print(X_test_std.shape)
print(y_test.shape)

(8064, 100, 1)
(8064, 2)
(2016, 100, 1)
(2016,)


# Bi LSTM model

In [15]:
from tensorflow.keras.layers import Dense, Flatten, Embedding, Dropout, Bidirectional, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import SGD


In [17]:
model = Sequential()
# model.add(Embedding(X_train.shape[0], 100, input_length=100))
model.add(Bidirectional(LSTM(128, dropout=0.25, recurrent_dropout=0.25)))
model.add(Dropout(0.25))
model.add(Dense(128, activation='relu'))
# model.add(Dropout(0.4))
model.add(Dense(2, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam',
             metrics=['accuracy'])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [51]:
from tensorflow.keras.models import load_model
model = load_model('rawtoken_doc2Vec.h5')



In [18]:
history1 = model.fit(X_train_std, y_train, epochs=400, validation_split=0.2)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 6451 samples, validate on 1613 samples
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 

KeyboardInterrupt: 

In [37]:
model.save('mytoken3_doc2Vec.h5')

In [55]:
y_pred = model.predict_classes(X_test_std).astype('str')
print(y_pred[:100])
print(y_test[:100])
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred, 
                                    target_names=['Good sanitize', 'Bad sanitize']))

['0' '0' '0' '1' '0' '1' '1' '0' '1' '1' '0' '0' '1' '1' '1' '1' '0' '1'
 '1' '0' '0' '1' '0' '1' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '1'
 '1' '0' '0' '0' '0' '0' '1' '0' '1' '0' '0' '1' '1' '0' '1' '1' '1' '0'
 '0' '0' '0' '0' '0' '1' '0' '0' '1' '1' '0' '0' '0' '0' '0' '1' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '1'
 '0' '0' '0' '0' '0' '1' '0' '0' '1' '0']
['0' '0' '0' '0' '0' '1' '1' '1' '1' '1' '1' '0' '1' '0' '0' '1' '0' '1'
 '1' '0' '0' '1' '0' '1' '0' '0' '0' '1' '1' '0' '1' '0' '0' '0' '0' '0'
 '1' '0' '0' '0' '0' '0' '1' '0' '1' '0' '0' '0' '1' '0' '1' '1' '0' '0'
 '1' '1' '0' '0' '0' '1' '0' '0' '1' '0' '0' '1' '1' '0' '0' '1' '0' '0'
 '1' '0' '1' '0' '1' '1' '0' '1' '1' '0' '0' '0' '0' '0' '0' '0' '0' '1'
 '1' '0' '0' '0' '0' '1' '0' '0' '1' '0']
               precision    recall  f1-score   support

Good sanitize       0.75      0.77      0.76      1718
 Bad sanitize       0.69      0.66      0.67      1306

     accuracy      

# SVM

In [29]:
from sklearn import svm

svc = svm.SVC(kernel='linear', C=1.0, gamma='auto', probability=True, random_state=None).fit(X_train_std, y_train)  # 线性核
# rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=1.0).fit(X_train, y_train)  # 径向基核
# poly_svc = svm.SVC(kernel='poly', degree=3, C=1.0, gamma='auto').fit(X_train, y_train)  # 多项式核
print('Finished training.\n');

from sklearn import metrics
predict_target = svc.predict(X_test_std)
print(metrics.classification_report(y_test, predict_target, 
                                    target_names=['Good sanitize', 'Bad sanitize']))

Finished training.

               precision    recall  f1-score   support

Good sanitize       0.65      0.80      0.72      1718
 Bad sanitize       0.63      0.44      0.52      1306

     accuracy                           0.65      3024
    macro avg       0.64      0.62      0.62      3024
 weighted avg       0.64      0.65      0.63      3024

