In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras

import os
import matplotlib.pyplot as plt

<h3>모델 생성 및 불러오기</h3>

In [2]:
def create_model(input_dim):
    model = Sequential([
        Dense(input_dim, activation = 'tanh', input_shape = (input_dim, )),
        Dense(input_dim//3, activation = 'relu'),
        Dense(input_dim//3, activation = 'tanh'),
        Dense(input_dim, activation = 'sigmoid')
    ])
    
    model.compile(optimizer = Adam(), loss = 'mse')
    model.summary()
    
    return model

In [3]:
def load_model(model, checkpoint_path):
    model.load_weights(checkpoint_path)
    
    return model

In [4]:
model_wmi = create_model(29)
model_wmi = load_model(model_wmi, "training_1/cpwmi.ckpt")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 29)                870       
                                                                 
 dense_1 (Dense)             (None, 9)                 270       
                                                                 
 dense_2 (Dense)             (None, 9)                 90        
                                                                 
 dense_3 (Dense)             (None, 29)                290       
                                                                 
Total params: 1,520
Trainable params: 1,520
Non-trainable params: 0
_________________________________________________________________


<h3>이상탐지</h3>

In [5]:
def load_data(system):
    wmi_mal = []
    
    #데이테 불러오기
    csv1 = pd.read_csv(system + "/wmi.csv", index_col = 0)
    csv_wmi = csv1.drop(["Name", "f30"], axis = 1)

    y = csv1['f30'].values
    y = pd.DataFrame(y, columns = ["label"])
    
    #데이터 정규화
    scaler = MinMaxScaler()
    scaled_wmi = scaler.fit_transform(csv_wmi.to_numpy())
    
    return scaled_wmi, y, csv1

In [6]:
def detect(test_wmi):
    predictions = model_wmi.predict(test_wmi)
    mse = np.mean(np.power(test_wmi - predictions, 2), axis=1)
    
    return mse

In [26]:
path = "infected system/"
listdir = os.listdir(path)
result = []

for system in listdir:
    test_wmi, y, csv1 = load_data(path + system)
    mse = detect(test_wmi)
    
    error_df = pd.DataFrame({'reconstruction_error': mse})
    
    mse_y = pd.concat([error_df, y], axis = 1)
    
    sorted_df = mse_y.sort_values(by = 'reconstruction_error', ascending = False)
    sorted_df = sorted_df.reset_index(drop = True)
    
    t10 = 0
    t20 = 0
    t30 = 0
    tmal = 0
    
    for i in range(0, len(sorted_df)):
        if sorted_df.loc[i]["label"] == 1:
            tmal = tmal + 1
            if i <= len(sorted_df)/20:
                t10 = t10 + 1
            if i <= len(sorted_df)/10:
                t20 = t20 + 1
            if i <= len(sorted_df)/7:
                t30 = t30 + 1
        
        
        
    prc10 = t10 / (len(sorted_df)/20)
    prc20 = t20 / (len(sorted_df)/10)
    prc30 = t30 / (len(sorted_df)/7)
    
    if tmal != 0:
        rcl10 = t10 / tmal
        rcl20 = t20 / tmal
        rcl30 = t30 / tmal
        
    else:
        rcl10 = 0
        rcl20 = 0
        rcl30 = 0
    
    if (prc10 + rcl10) != 0:
        fs10 = 2 * (prc10 * rcl10) / (prc10 + rcl10)
    else:
        fs10 = 0
    if (prc20 + rcl20) != 0:
        fs20 = 2 * (prc20 * rcl20) / (prc20 + rcl20)
    else:
        fs20 = 0
    if (prc30 + rcl30) != 0:
        fs30 = 2 * (prc30 * rcl30) / (prc30 + rcl30)
    else:
        fs30 = 0
    result.append([system, prc10, rcl10, fs10, prc20, rcl20, fs20, prc30, rcl30, fs30])



<h3>엔트로피</h3>

In [29]:
import pymysql
import math

In [17]:
def connectdb():
    db = pymysql.connect(
        user = 'root',
        passwd = '110419',
        host = '127.0.0.1',
        db = 'knowledgebase',
        charset = 'utf8'
    )
    
    cursor = db.cursor(pymysql.cursors.DictCursor)
    
    return db, cursor

In [18]:
def separate_dict(data):
    names = []
    pathes = []
    
    for line in data:
        names.append(line['name'])
        
        path = line['path']
        path_split = path.split('\\')[:-1]
        pathes.append(path_split)
        
    return names, pathes

In [19]:
def E(p):
    return - math.log2(1 + p)

def E2(p):
    return math.log2(p) * p

In [20]:
def cal_entropy(name, path):
    db, cursor = connectdb()
    
    sql = "select * from `exelist`;"
    cursor.execute(sql)
    result = cursor.fetchall()
    
    names, pathes = separate_dict(result)
    path = path.split('\\')[:-1]
    
    #name의 엔트로피
#    e = 0
#    for i in names:
#        if i == name:
#            p = 1 + 1/(names.count(name) + 1)
#        else:
#            p = 1
#        e += E(p)
#    name_e = 1 + e/math.log2(len(names))
    p = names.count(name)/len(names)
    name_e = E(p)

    #path의 각 단어를 반복문으로 돌면서 path의 등장 비율 계산하고 엔트로피계산해서 모두 더함    
    e = 0
    for i in path:
        tmp = 0
        total_i = 0
        for k in pathes:
            total_i += k.count(i)
        for j in pathes:
            p = 1 + j.count(i)/(total_i + 1)
            tmp += E2(p)
        e += tmp
    path_e = 1 + e/math.log2(len(pathes))
    
    #return name_e+path_2
    return name_e

In [21]:
def scoring(path):
    e_list = []
    infected_list = []
        
    df = pd.read_csv(path, index_col = 0)
    for idx in range(0, len(df)):
        name = df.loc[idx]["Name"]
        Path = df.loc[idx]["Path"]
            
        e = cal_entropy(name, Path)
        
        if df.loc[idx]["Infected"] == 1:
            infected_list.append(e)
        e_list.append(e)
        
    return e_list

In [40]:
path = "infected system/"
listdir = os.listdir(path)

for system in listdir:
    print("========================================================")
    print(system)
    test_wmi, wmi_mal, csv1= load_data(path + system)
    wmi_mse = detect(test_wmi)
    
    error_df = pd.DataFrame({'reconstruction_error': wmi_mse})
    e_list = scoring("is_after/" + system + "_wmi.csv")
    
    error_df = error_df.assign(Entropy = e_list)
    sorted_df = error_df.sort_values(by = 'reconstruction_error', ascending = False)
    
    namelist = []
    for i in sorted_df.head(len(sorted_df)//10).index:
    #for i in sorted_df.index:
        name = csv1.loc[i]["Name"]
        if i in wmi_mal:
            print("  ", name, sorted_df.index.tolist().index(i), "/", len(sorted_df))

    part2_df = sorted_df[len(sorted_df)//10:]
    part2_df = part2_df.sort_values(by = 'Entropy', ascending = False)
    for i in part2_df.index:
        name = csv1.loc[i]["Name"]
        if i in wmi_mal:
            print("  ", name, part2_df.index.tolist().index(i), "/", len(part2_df))

2a198c0616908013a49e856678b5143c
4684f096da1f045a4e5c4ee7ca945259
4996efb901d5c379c575bb7908651071
4aa53791373b8d09b5773f3ea086b803
5771a7f5aaa45c2a12fa7d1d043ae222
6b8d9642391687d948129206b805b980
6e2da504218d67df1a4d8db18e9be706
6ea487c55f789e2d2c0b3c20c8674214
6eedee0bc42edb6922e6e78f749bab88
6f194c2bf64aa91f8b25768aaedf954e
6f3c7fa16f0e61f01c9ca873ce257981
6fd75941954dda0a8d6bd5180b233a37
8102acd2e76cdc88d8c893ed30e56f46
8821a5f07134fbaacf35ba62ba50aaed
9629bdb89f6d9fa96162f5d4a0be3e38
