In [1]:
import numpy as np
import os
import re

In [6]:
# 读取Document-to-Vector的模型
def load_d2v_model(d2v_path):
    from gensim.models.doc2vec import Doc2Vec
    d2v_model = Doc2Vec.load(d2v_path)
    return d2v_model


# 读取神经网络模型
def load_nn_model(nn_path):
    from tensorflow.keras.models import load_model
    nn_model = load_model(nn_path)
    return nn_model


# 读源码文件，转vld文本
def get_vld(codepath):
    command = 'php -dvld.active=1 -dvld.execute=0 -dvld.verbosity=3 ' + str(codepath) + ' 2>&1'
    return os.popen(command).read()
    

# 由vld文本提取出opcode操作码
def get_opcode(vld):
    opcode_list = []
    tokens=re.findall(r'\s(\b[A-Z_]+\b)\s', vld)
    for token in tokens:
        if len(token) != 1:
            opcode_list.append(token)
    return ' '.join(opcode_list)
    

# opcode转文档向量，shape=(256, )
def opcode2vector(opcode_list, d2v_model):
    vector = d2v_model.infer_vector(opcode_list.split(' '))
    vector = np.array(vector)
    print(vector)
    return vector
    

# vector归一化
def std_vector(sample_vector, vec_list):
    from sklearn.preprocessing import StandardScaler
    std_scaler = StandardScaler().fit(vec_list)
    sample_vector_std = std_scaler.transform(sample_vector)
    sample_vector_std = sample_vector_std.reshape(-1, 16, 16, 1)
    return sample_vector_std
    
    
# 读一份代码文件，取出代码文本并进行一系列转换，并进行数据清洗
def transfer_one(code_path):
    d2v_model = load_d2v_model('opcode_doc.d2v')
    vec_list = d2v_model.docvecs.vectors_docs
    vld = get_vld(code_path)
    opcode_list = get_opcode(vld)
    sample_vector = opcode2vector(opcode_list, d2v_model)
    sample_vector_std = std_vector(sample_vector, vec_list)
    return sample_vector_std


# 预测一个样本的情况，返回标签值和预测概率
def predict_one(pred_vec):
#     nn_model = load_nn_model('doc2vec_cov2d.h5')
    nn_model = load_nn_model('doc2vec_lstm.h5')
    pred_label = nn_model.predict(pred_vec)
    pred_proba = nn_model.predict_proba(pred_vec)
    return pred_label, pred_proba

In [7]:
file_path = input('Input code file to be tested > ')
pred_vec = transfer_one(file_path)
label, proba = predict_one(pred_vec)

Input code file to be tested >  .reshape(-1, 1)


php -dvld.active=1 -dvld.execute=0 -dvld.verbosity=3 .reshape(-1, 1) 2>&1
[ 1.9067775e-04  8.4058347e-04  4.0141944e-04  1.7532494e-04
 -2.9822343e-04  5.6989887e-04 -2.4379995e-04  1.5303633e-03
  1.8111827e-03 -4.5530658e-04  1.1395509e-03  1.1287078e-04
  2.6579906e-04  1.6624868e-03 -1.6756403e-03 -1.6127762e-03
 -1.8741469e-03  1.2992963e-03  1.0865498e-03  1.4453599e-03
  1.8696029e-03  1.1685882e-03 -1.5047124e-04  1.0958171e-03
 -1.4911155e-03  5.4656650e-04 -1.3931512e-03  1.7369880e-03
  8.5345004e-05 -3.3335181e-04 -9.1970462e-04  1.0712254e-03
 -1.7128777e-04  2.6732011e-04 -1.8797274e-03  4.5951366e-04
  4.3787391e-04  4.5677341e-04  1.7333910e-03  7.1023556e-04
 -5.4879725e-04 -2.4596893e-04  7.7199686e-04 -1.7178693e-03
  6.5143249e-04  6.6655420e-04 -1.1313182e-03 -1.4495066e-03
 -7.2098302e-04 -5.3237978e-04  2.7420613e-04 -2.3983784e-04
  1.9077103e-03 -1.5545124e-03 -1.1372002e-03 -1.3230097e-03
  5.9807941e-04 -9.6370466e-04 -1.3159854e-04 -9.9833752e-04
 -1.3321501

ValueError: Expected 2D array, got 1D array instead:
array=[ 1.9067775e-04  8.4058347e-04  4.0141944e-04  1.7532494e-04
 -2.9822343e-04  5.6989887e-04 -2.4379995e-04  1.5303633e-03
  1.8111827e-03 -4.5530658e-04  1.1395509e-03  1.1287078e-04
  2.6579906e-04  1.6624868e-03 -1.6756403e-03 -1.6127762e-03
 -1.8741469e-03  1.2992963e-03  1.0865498e-03  1.4453599e-03
  1.8696029e-03  1.1685882e-03 -1.5047124e-04  1.0958171e-03
 -1.4911155e-03  5.4656650e-04 -1.3931512e-03  1.7369880e-03
  8.5345004e-05 -3.3335181e-04 -9.1970462e-04  1.0712254e-03
 -1.7128777e-04  2.6732011e-04 -1.8797274e-03  4.5951366e-04
  4.3787391e-04  4.5677341e-04  1.7333910e-03  7.1023556e-04
 -5.4879725e-04 -2.4596893e-04  7.7199686e-04 -1.7178693e-03
  6.5143249e-04  6.6655420e-04 -1.1313182e-03 -1.4495066e-03
 -7.2098302e-04 -5.3237978e-04  2.7420613e-04 -2.3983784e-04
  1.9077103e-03 -1.5545124e-03 -1.1372002e-03 -1.3230097e-03
  5.9807941e-04 -9.6370466e-04 -1.3159854e-04 -9.9833752e-04
 -1.3321501e-03 -1.5219721e-03  6.1066245e-04 -1.4133479e-03
 -1.1852252e-03 -5.1279232e-04  1.2538799e-03 -1.5738232e-03
  1.3200972e-03 -1.5777407e-03  1.8611698e-03 -1.2245624e-04
  1.8623481e-03  4.0955280e-04  9.3462336e-04 -1.8000477e-03
 -8.4841030e-04 -1.4836072e-03 -7.9632737e-04 -1.4893449e-03
 -7.1100320e-04 -3.3491018e-04 -1.7025488e-03  7.5184420e-04
  2.6016193e-04 -9.1644732e-04  9.0812711e-05 -1.5861699e-03
  2.9666600e-04  1.6769383e-03 -7.0871500e-04  6.5394677e-04
 -1.4382896e-03  8.4502815e-04 -8.2263246e-04 -1.2375338e-03
  3.3794116e-04 -1.8745799e-03  1.2849220e-03 -1.9347833e-03
  6.9459586e-04 -8.9840638e-04  9.1872667e-04  1.8054240e-03
 -9.8143308e-04  2.9748958e-04  3.5953880e-04  2.8223402e-04
 -1.0817124e-03  1.7685508e-03 -2.0654149e-04  1.3531588e-03
  7.7921594e-04 -7.9126190e-04  1.2257728e-03 -4.0427444e-04
  1.4886843e-03  3.1747215e-04  1.4911537e-03  7.5207651e-04
  8.7989954e-04  5.1733668e-06  1.7815767e-03  5.6246173e-04
 -2.9744121e-04  4.1559848e-04 -1.8781516e-03 -7.7509839e-04
  6.2567787e-04 -8.2000933e-04  4.6099778e-04 -2.7824726e-04
 -1.4239295e-03 -7.8795967e-04  2.7330042e-04  3.5497174e-04
  2.9033300e-04  5.9844070e-04  5.9415342e-04 -2.6789674e-04
  1.5490102e-03 -5.1733642e-04 -2.5052764e-04  1.5309507e-03
  1.1960702e-03  7.9643977e-04 -1.5616138e-03  1.6386040e-03
  8.3688006e-04  1.9486211e-03 -1.3693426e-03  1.4379924e-03
 -1.3183870e-03  4.5140454e-04 -1.4694532e-03  1.3594071e-03
  1.2004647e-03  2.6992476e-04 -3.6256525e-04 -1.6829415e-03
  7.7120616e-04 -1.8147389e-04  8.6740468e-04  1.4311810e-03
  1.8575059e-03  1.3898568e-03 -1.9073669e-03 -5.4696068e-04
  8.9840061e-04 -1.2826966e-03  8.2174243e-05 -1.7408673e-03
 -1.1718886e-03 -1.8807743e-03  1.1472567e-03 -1.0784192e-03
 -6.0409500e-04  1.6721926e-03  7.9849374e-04 -1.8287542e-03
 -1.3097884e-03  4.7452500e-04  3.0167418e-04 -1.0238562e-03
  1.6961484e-03  4.4517952e-04  1.3919064e-04  3.5121085e-04
  8.9891418e-04 -7.3458988e-04 -3.9757398e-04 -1.1334228e-03
 -1.2258085e-03  1.7358296e-03  9.3574531e-04 -3.7270278e-05
 -1.0647866e-03 -9.5954497e-04 -1.7264485e-03 -2.5618507e-04
 -7.3517236e-04  7.6696672e-04 -4.7753187e-04 -1.2515482e-03
 -1.8567237e-03 -1.6904311e-03  7.0075301e-04 -1.8087171e-04
  1.4288754e-04  1.5494972e-03  1.9153865e-03 -1.1058712e-03
  6.3702423e-04 -9.2452194e-04 -1.8724570e-03  1.0092916e-03
 -7.0305803e-04 -4.5521915e-04  3.4498872e-04  1.2931580e-03
  5.0383533e-04  1.4556666e-03 -8.8460144e-04  1.1642454e-03
 -1.2279846e-03  1.7687174e-03  7.3237607e-04 -1.1112981e-03
  1.7475414e-03  9.0178050e-04 -9.6116547e-04 -1.1198751e-03
  7.1096540e-05 -1.8528800e-03 -1.1426950e-03 -2.9419738e-04
 -4.9152353e-04 -1.4228350e-04 -8.6863787e-04  3.3900136e-04
  1.4213109e-03 -1.4940161e-03  6.7887137e-05 -1.4372340e-03
  8.4710814e-04 -4.0601680e-04  2.5555200e-04 -1.2371881e-03
 -1.3873135e-03 -4.6655154e-05 -5.6401273e-04  1.7204373e-03
  1.0364268e-03  9.7134226e-04  1.5770303e-03 -1.6272562e-03].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.