### 数据清洗

In [1]:
# 预处理
import os
import numpy as np
import re
import pandas as pd

# 读取每一份代码，返回代码内容
def load_one_file(filename):
    context = ''
    with open(filename, 'r', encoding='utf-8', errors='ignore') as codes:
        for line in codes:
            line = line.strip('\r')
            context += line
    return context

# 读取目录下所有php文件的路径，返回一个路径的列表
def load_all_php_path(dir):
    filelist = []
    
#     root为dir目录地址
#     dirs是一个该文件夹下所有目录名的list
#     filelist是文件名的list
    for root, dirs, files in os.walk(dir):
        for file in files:
            if file.endswith('.php'):
                fullpath = os.path.join(root, file)
#                 print('Loading %s' % fullpath)
                filelist.append(fullpath)
    return filelist

In [None]:
# 读入shell名单
shell_list = []
file_list = load_all_php_path('black_php/')
print(np.array(file_list).shape)

for file in file_list:
    codes = load_one_file(file)
#     这里清洗掉代码内无关的注释
#     1. 以//开头的注释
    for note1 in re.findall('//.', codes):
        codes = codes.replace(note1, '')
#     2. 以/* */标示的多行注释
    for note2 in re.findall("/\*{1,2}[\s\S]*?\*/", codes):
        codes = codes.replace(note2, '')
    shell_list.append(codes)
shell_y = [1] * len(shell_list)
print(len(shell_y))

# 读入normal名单
normal_list = []
file_list = load_all_php_path('white_php/')
print(np.array(file_list).shape)

for file in file_list:
    codes = load_one_file(file)
#     这里清洗掉代码内无关的注释
#     1. 以//开头的注释
    for note1 in re.findall('//.', codes):
        codes = codes.replace(note1, '')
#     2. 以/* */标示的多行注释
    for note2 in re.findall("/\*{1,2}[\s\S]*?\*/", codes):
        codes = codes.replace(note2, '')
    normal_list.append(codes)
normal_y = [0] * len(normal_list)
print(len(normal_y))

In [None]:
# 保存向量到本地
save_df = pd.DataFrame({'phpcode':X, 'label':y})
print(save_df)
save_df.to_csv('raw_dataset.csv', index=False, encoding='utf-8')

### 开始提取特征

In [2]:
def is_letter(char):
    letter = char.lower()
    return True if letter >= 'a' and letter <= 'z' else False


def is_num(char):
    if char >= '0' and char <= '9':
        return True
    return False


def get_freq_dict(domain):
    freq_dict = dict(Counter(domain))
    return freq_dict

In [3]:
import re
from collections import Counter

# 特征提取，基于每一份源代码phpcode
def count_lines(phpcode):
    return phpcode.count('\n')


def calc_entropy(phpcode):
    ent = 0
    l = len(phpcode)
    all_letters = dict(Counter(phpcode)).keys()
    freq_dict = get_freq_dict(phpcode)
    for letter in all_letters:
        frequency = freq_dict[letter]
        ent -= (frequency/l) * np.log2(frequency/l)
    return ent

def string_function_score(phpcode):
    str_func_list = [
        'addcslashses', 'addslashes', 'bin2hex', 'chop', 'chr', 
        'chunk_split', 'convert_cyr_string', 'convert_uudecode', 'covert_uuencode', 
        'echo', 'str_replace', 'str_ireplace', 'htmlspecialchars', 
        'trim', 'rtrim', 'ltrim', 'strlen', 'strchr', 'strtr', 'substr', 
        'print', 'parse', 'ord'
    ]
    phpcode = phpcode.lower().replace('\\','').replace('#','').replace('%','').replace('/','')
    phpcode = phpcode.replace('@', '').replace('&', '').replace('*', '')
    score = 0
    step = 0
    for word in str_func_list:
        if word in phpcode:
            score += np.exp2(step)
            step += 1
    return score


def system_function_score(phpcode):
    sys_func_list = [
        'eval', 'exec', 'system', 'popen', 'passthru'
    ]
    phpcode = phpcode.lower().replace('\\','').replace('#','').replace('%','').replace('/','')
    phpcode = phpcode.replace('@', '').replace('&', '').replace('*', '')
    score = 0
    step = 0
    for word in sys_func_list:
        if word in phpcode:
            score += np.exp2(step)
            step += 1
    return score


def count_longest_string_length(phpcode):
    phpcode = phpcode.lower().replace('\n', ' ').replace('\t', ' ')
    longest_length = 0
    substr_length = 0
    flag = 0
    for char in phpcode:
        if char != ' ':
            if flag == 0:  #字符串首位
                flag = 1
                substr_length = 0 
            substr_length += 1
        else:
            flag = 0
            if substr_length > longest_length:
                longest_length = substr_length
    return longest_length


def count_words(phpcode):
    len_word = 0
    count = 0
    flag = 0
    for char in phpcode:
        if is_letter(char):  # 是字母
            if flag == 0:  # 是首字母
                flag = 1  # 表示进入单词当中
            len_word += 1
        else:
            if flag == 1:  # 上一个是字母
                if len_word != 1:  # 不是单个字母
                    count += 1
            flag = 0
    return count


def length_of(phpcode):
    return len(str(phpcode))


def count_get_post(phpcode):
    count = 0
    phpcode = phpcode.lower()
    count += phpcode.count('get')
    count += phpcode.count('post')
    return count

In [4]:
raw_data = pd.read_csv('raw_dataset.csv')
print(raw_data)

                                                phpcode  label
0     <?php \n \n$auth_pass = "9c80a1eaca699e2fc6b99...      1
1     <?php\nfunction rooting()\n{\necho '<b>Sw Bilg...      1
2     <?php if (!function_exists("getmicrotime")) {f...      1
3     <?php\n\n$m7caaa45="\142\141\x73\145\66\x34\x5...      1
4     <?php\n\n\n\nif(preg_match("/bot/", $_SERVER[H...      1
...                                                 ...    ...
8410  <?php \n\tdefined('IN_ADMIN') or exit('No perm...      0
8411  <?php \n\tdefined('IN_ADMIN') or exit('No perm...      0
8412  <?php \n\tdefined('IN_ADMIN') or exit('No perm...      0
8413  <?php\ndefined('IN_PHPCMS') or exit('No permis...      0
8414  <?php\ndefined('IN_PHPCMS') or exit('No permis...      0

[8415 rows x 2 columns]


In [5]:
php_codes = np.array(raw_data['phpcode'].astype('str'))
y = np.array(raw_data['label'].astype('int'))
print(php_codes.shape)
print(type(php_codes))
print(y.shape)
print(type(y))

(8415,)
<class 'numpy.ndarray'>
(8415,)
<class 'numpy.ndarray'>


In [6]:
X = []
feature_func_list = [count_lines, calc_entropy, string_function_score, count_get_post, 
                     system_function_score, count_longest_string_length, count_words]
for php_code in php_codes:
    feature = []
    for func in feature_func_list:
        feature.append(func(php_code))
    X.append(feature)
X = np.array(X)
print(X.shape)

(8415, 7)


### 预处理

In [44]:
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=2019, test_size = 0.1, stratify=y)
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
print(X_train_std.shape)
print(X_test_std[:5])

(7573, 7)
[[-0.13953418  0.0964765  -0.14772157  0.33596235 -0.37071943 -0.15298484
  -0.18514884]
 [-0.23658741 -0.87898993 -0.15198335 -0.10842046 -0.24362762 -0.15202746
  -0.29043108]
 [-0.28899615  0.57904244 -0.15269364 -0.33533935 -0.37071943 -0.1469027
  -0.26967295]
 [-0.23949901  0.20822278 -0.15198335 -0.28806458 -0.24362762 -0.13000789
  -0.24008951]
 [ 1.31335272  0.59497253  2.75597172  0.76143527  3.56912694 -0.12110995
   1.08656621]]


### 模型训练

In [51]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

dtree = DecisionTreeClassifier(random_state=None).fit(X_train_std, y_train)
print('训练完成')

训练完成


In [52]:
# 模型评估报告
from sklearn import metrics
predict_target = dtree.predict(X_test_std)
print(metrics.classification_report(y_test, predict_target, 
                                    target_names=['Normal php', 'Webshell']))


              precision    recall  f1-score   support

  Normal php       0.96      0.96      0.96       520
    Webshell       0.93      0.93      0.93       322

    accuracy                           0.95       842
   macro avg       0.95      0.95      0.95       842
weighted avg       0.95      0.95      0.95       842



In [124]:
from sklearn.preprocessing import MinMaxScaler

# 数据归一化
minmax = MinMaxScaler()
X_trian_std = minmax.fit_transform(X_train_std)
X_test_std = minmax.transform(X_test_std)
print(X_train_std.shape)
print(X_test_std[:5])

(6732, 8)
[[1.08764221e-04 5.78417559e-01 0.00000000e+00 5.88832373e-05
  0.00000000e+00 0.00000000e+00 1.18832589e-04 4.85960598e-06]
 [4.56809728e-04 6.97706938e-01 0.00000000e+00 1.82257639e-04
  1.17050332e-03 0.00000000e+00 6.33773806e-05 1.65226603e-04]
 [2.41456570e-03 6.82517070e-01 0.00000000e+00 2.35252553e-03
  5.46234881e-03 3.22580645e-02 2.58790971e-04 1.38984731e-03]
 [2.17528442e-05 7.14912880e-01 1.52590219e-05 4.32287081e-02
  0.00000000e+00 0.00000000e+00 3.88186456e-04 4.34934735e-02]
 [3.04539819e-04 6.73512416e-01 0.00000000e+00 1.68938812e-04
  0.00000000e+00 0.00000000e+00 8.45031742e-05 1.21490150e-04]]


In [79]:
# 测试用例
input_file = 'test1.php'  # 为了测试方便，文件直接放在当前目录下就行

test_code = load_one_file(input_file)

for note1 in re.findall('//.', test_code):
    test_code = test_code.replace(note1, '')
for note2 in re.findall("/\*{1,2}[\s\S]*?\*/", test_code):
    test_code = test_code.replace(note2, '')
    
feature_vec = [feature_fuc(test_code) for feature_fuc in feature_func_list]

conclusion = dtree.predict([feature_vec])
print('Webshell') if conclusion == 1 else print('Normal php')

Normal php
