<a href="https://colab.research.google.com/github/stwater20/AIS3-2024-Material/blob/main/AIS3_Lab6_Malware_API_Sequence_Using_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
!ls -al

total 350456
drwxr-xr-x 1 root root      4096 Jul 28 18:00 .
drwxr-xr-x 1 root root      4096 Jul 28 17:27 ..
drwxrwxr-x 4 root root      4096 Jul 23 08:55 ais3
-rw-r--r-- 1 root root 195282493 Jul 28 18:00 ais3.zip
drwxr-xr-x 4 root root      4096 Jul 25 13:17 .config
-rw-r--r-- 1 root root 160032752 Oct 28  2021 data_demo.npz
-rw-r--r-- 1 root root   3525445 Jul 28 17:50 data_demo.zip
drwxr-xr-x 1 root root      4096 Jul 25 13:18 sample_data


In [48]:
!unzip data_demo.zip

Archive:  data_demo.zip
replace data_demo.npz? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: data_demo.npz           


In [49]:
import numpy as np

# 讀取 npz 檔案
data = np.load('data_demo.npz')

# 列出檔案中所有的陣列
print(list(data.keys()))

['x_name', 'x_semantic', 'y']


In [50]:
data['x_name']

array([[  1,   4,  59, ...,   0,   0,   0],
       [ 10,  11,  10, ...,   0,   0,   0],
       [  1,   4,  59, ...,   0,   0,   0],
       ...,
       [123, 253, 253, ...,   0,   0,   0],
       [ 15,   3,   3, ...,  27,  27,  27],
       [  1,  10,  11, ...,   9,  76,  52]])

In [51]:
len(data['x_name'][0]) # 每個軟體有 1000 個 API 序列

1000

In [52]:
data['x_semantic']

array([[  4, 101,  21, ...,  14,  14,  14],
       [  0,  25, 255, ...,  14,  14,  14],
       [  4, 101,  21, ...,  14,  14,  14],
       ...,
       [  0,   1, 163, ...,  14,  14,  14],
       [  0,   5,  61, ...,  14,  32,  33],
       [  4, 101,  21, ...,   5,  79,  80]])

In [53]:
len(data['x_semantic'][0])

4000

In [54]:
data['y'],len(data['y'])

(array([[0],
        [0],
        [0],
        ...,
        [1],
        [1],
        [1]]),
 4000)

In [55]:
flat_list = [item for sublist in data['y'].tolist() for item in sublist]

In [56]:
from collections import Counter
Counter(flat_list)

Counter({0: 2000, 1: 2000})

In [57]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import torch.nn.functional as F
from torch.nn import Embedding, LSTM, Linear, BCELoss
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np
import warnings

# 忽略警告
warnings.filterwarnings("ignore")

# 定義 LSTM 神經網路模型
class LSTMNet(torch.nn.Module):
    def __init__(self):
        super(LSTMNet, self).__init__()
        # 定義嵌入層，輸入為316，嵌入向量維度為16
        self.embedder = Embedding(num_embeddings=316, embedding_dim=16)
        # 定義雙向LSTM層，輸入維度為16，隱藏層維度為100，設置 batch_first=True
        self.lstm = LSTM(input_size=16, hidden_size=100, bidirectional=True, batch_first=True)
        # 定義全連接層1，輸入維度為200，輸出維度為64
        self.lin1 = Linear(200, 64)
        # 定義全連接層2，輸入維度為64，輸出維度為1
        self.lin2 = Linear(64, 1)

    # 定義前向傳播
    def forward(self, x):
        # 通過嵌入層
        x = self.embedder(x)
        # 通過LSTM層，並獲取最後一層的隱藏狀態
        x, (h_n, c_n) = self.lstm(x)
        # 拼接雙向LSTM的最後一層隱藏狀態
        x = torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim=1)
        # 通過全連接層1，並應用ReLU激活函數
        x = F.relu(self.lin1(x))
        # 通過全連接層2，並應用Sigmoid激活函數
        x = torch.sigmoid(self.lin2(x))
        return x

# 測試模型
def test(model, device, test_loader):
    model.eval()  # 設置模型為評估模式
    all_preds = []  # 用來存儲所有預測結果
    all_targets = []  # 用來存儲所有真實標籤
    with torch.no_grad():  # 在不需要梯度計算的上下文中
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)  # 獲取模型的輸出
            preds = output.round()  # 將輸出四捨五入為0或1
            all_preds.extend(preds.view(-1).cpu().numpy())  # 將預測結果添加到列表中
            all_targets.extend(target.cpu().numpy())  # 將真實標籤添加到列表中

    # 計算指標
    precision = precision_score(all_targets, all_preds)  # 計算精確度
    recall = recall_score(all_targets, all_preds)  # 計算召回率
    f1 = f1_score(all_targets, all_preds)  # 計算F1分數

    return precision, recall, f1  # 返回精確度、召回率和F1分數


In [58]:
data = np.load('./data_demo.npz')
train_x_name = data['x_name']
train_y = data['y']
test_x_name = data['x_name']
test_y = data['y']

In [59]:
# 特殊用途，第一次不要執行，看到後面就知道了XD
# train_x_name = train_x
# train_y = train_y
# test_x_name = test_x
# test_y = test_y

In [60]:
# 將 NumPy 陣列轉換為 PyTorch 張量並設置類型為長整數（long），適用於分類任務中的索引
train_xt = torch.from_numpy(train_x_name).long()
test_xt = torch.from_numpy(test_x_name).long()
train_yt = torch.from_numpy(train_y.astype(np.float32))
test_yt = torch.from_numpy(test_y.astype(np.float32))

In [61]:
# 創建 TensorDataset，用於將輸入和標籤配對
train_data = TensorDataset(train_xt, train_yt)
test_data = TensorDataset(test_xt, test_yt)

In [62]:
train_loader = DataLoader(dataset=train_data, batch_size=128, shuffle=True, num_workers=1)
test_loader = DataLoader(dataset=test_data, batch_size=128, num_workers=1)

In [63]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMNet().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01) #使用 Adam 優化器，設置學習率為 0.01


In [64]:
# Training loop
for epoch in range(1, 6):  # 5 epochs
    model.train()
    total_loss = 0
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad() # 梯度清零，因為在每次反向傳播（backpropagation）之前，我們需要將上一次計算的梯度清除，以防止它們與新計算的梯度累積。
        output = model(data) # 前向傳播
        if target.dim() == 1:
            target = target.unsqueeze(1)  # target 現在應該是 [batch_size, 1]
        loss = BCELoss()(output, target) # 二元交叉熵損失
        loss.backward() # 反向傳播計算梯度
        optimizer.step() # 更新模型參數
        total_loss += loss.item() # 累加損失
    print(f'Epoch {epoch}, Train Loss:  {total_loss / len(train_loader)}')


# Test
precision, recall, f1 = test(model, device, test_loader)
print(f'Test Precision: {precision:.4f}, Test Recall: {recall:.4f}, Test F1: {f1:.4f}')


Epoch 1, Train Loss:  0.4657269483432174
Epoch 2, Train Loss:  0.31567000364884734
Epoch 3, Train Loss:  0.24410137068480253
Epoch 4, Train Loss:  0.23557070130482316
Epoch 5, Train Loss:  0.21346177766099572
Test Precision: 0.9226, Test Recall: 0.9295, Test F1: 0.9260


# 換您試試
我們設計了新資料集提供給同學訓練與測試，
請參考上方的程式碼，
讓模型能夠讀取新資料來訓練模型!

以下提供下載我們提供資料集連結的方法，以及一些必要的前處理

In [65]:
import gdown

In [66]:
!pip install gdown

import gdown

file_id = '10mcBZM8Aub5WiLZO92iuLfvWwjvn7gp_'
url = f"https://drive.google.com/uc?id={file_id}"
gdown.download(url, 'ais3.zip', quiet=False)




Downloading...
From (original): https://drive.google.com/uc?id=10mcBZM8Aub5WiLZO92iuLfvWwjvn7gp_
From (redirected): https://drive.google.com/uc?id=10mcBZM8Aub5WiLZO92iuLfvWwjvn7gp_&confirm=t&uuid=6e32d704-9025-4bf2-bf6c-ff9fba1270ab
To: /content/ais3.zip
100%|██████████| 195M/195M [00:01<00:00, 141MB/s]


'ais3.zip'

In [67]:
!unzip ais3.zip

Archive:  ais3.zip
replace ais3/mal/4d423b3ed84f438fb8d96ed825d93ff5/summary_report.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: ais3/mal/4d423b3ed84f438fb8d96ed825d93ff5/summary_report.json  
 extracting: ais3/mal/4d423b3ed84f438fb8d96ed825d93ff5/winlogbeat.json  
  inflating: ais3/mal/4d423b3ed84f438fb8d96ed825d93ff5/dynamic_report.json  
  inflating: ais3/mal/4d423b3ed84f438fb8d96ed825d93ff5/features.json  
  inflating: ais3/mal/4d423b3ed84f438fb8d96ed825d93ff5/static_report.json  
 extracting: ais3/mal/4d423b3ed84f438fb8d96ed825d93ff5/etw_log.json  
  inflating: ais3/mal/81190efd451140d899d1dfada6554b6b/summary_report.json  
  inflating: ais3/mal/81190efd451140d899d1dfada6554b6b/winlogbeat.json  
  inflating: ais3/mal/81190efd451140d899d1dfada6554b6b/dynamic_report.json  
  inflating: ais3/mal/81190efd451140d899d1dfada6554b6b/features.json  
  inflating: ais3/mal/81190efd451140d899d1dfada6554b6b/static_report.json  
 extracting: ais3/mal/81190efd451140d899d1dfada6554b

In [68]:
import os

def count_subdirectories(root_dir, label):
    label_dir = os.path.join(root_dir, label)
    if not os.path.isdir(label_dir):
        return 0
    return sum([1 for name in os.listdir(label_dir) if os.path.isdir(os.path.join(label_dir, name))])


root_dir = 'ais3'
ben_count = count_subdirectories(root_dir, 'ben')
mal_count = count_subdirectories(root_dir, 'mal')

print(f"'ben' 資料夾中的子資料夾數量: {ben_count}")
print(f"'mal' 資料夾中的子資料夾數量: {mal_count}")

'ben' 資料夾中的子資料夾數量: 200
'mal' 資料夾中的子資料夾數量: 135


In [69]:
import os
import json
import numpy as np
from collections import defaultdict

def read_api_calls_from_json(json_path):
    with open(json_path, 'r') as f:
        data = json.load(f)
    api_calls = []
    for process in data['processes']:
        for call in process.get('calls', []):
            api = call.get('api')
            if api:
                api_calls.append(api)
    return api_calls

def collect_api_calls(root_dir):
    api_sequences = []
    labels = []
    for label, label_value in zip(['ben', 'mal'], [0, 1]):
        label_dir = os.path.join(root_dir, label)
        if not os.path.isdir(label_dir):
            continue
        for subdir in os.listdir(label_dir):
            subdir_path = os.path.join(label_dir, subdir)
            json_path = os.path.join(subdir_path, 'dynamic_report.json')
            if os.path.isfile(json_path):
                api_calls = read_api_calls_from_json(json_path)
                if api_calls:
                    api_sequences.append(api_calls)
                    labels.append(label_value)
    return api_sequences, labels

def build_vocabulary(api_sequences):
    vocabulary = defaultdict(lambda: len(vocabulary) + 1)
    for sequence in api_sequences:
        for api in sequence:
            _ = vocabulary[api]
    return dict(vocabulary)

def encode_sequences(api_sequences, vocabulary):
    encoded_sequences = []
    for sequence in api_sequences:
        encoded_sequence = [vocabulary[api] for api in sequence]
        encoded_sequences.append(encoded_sequence)
    return encoded_sequences

def pad_sequences(encoded_sequences, max_length):
    padded_sequences = np.zeros((len(encoded_sequences), max_length), dtype=int)
    for i, sequence in enumerate(encoded_sequences):
        length = min(len(sequence), max_length)
        padded_sequences[i, :length] = sequence[:length]
    return padded_sequences

root_dir = 'ais3'
api_sequences, labels = collect_api_calls(root_dir)

# 建立 API 調用的字典映射
vocabulary = build_vocabulary(api_sequences)

# 將 API 調用序列轉換為數字序列
encoded_sequences = encode_sequences(api_sequences, vocabulary)

max_length = 1000

# 將數字序列填充到相同的長度
padded_sequences = pad_sequences(encoded_sequences, max_length)

# 將標籤轉換為 NumPy 陣列
labels_array = np.array(labels).reshape(-1, 1)

print("Padded API sequences:")
print(padded_sequences[0])
print("Labels:")
print(labels_array[0])


Padded API sequences:
[ 1  2  3  3  3  3  3  3  3  3  1  1  1  4  5  6  5  6  7  8  7  9 10 11
 12 13 14 15 10 15 15 16 10 11 12 13 14 15 10 15 15 16 10 11 12 13 14 15
 10 15 15 16 13 14 15 10 15  2 17  9 10 11 12  1  1 15 15 18 10 15 19  7
  3 16 15 13 10 10 11 12 15  1  1  1  1 13 14 15 13 14 15  6  1  1  1  1
  1  1  1  1 20  9 10 11 12  1  1  1  1  1 15 15  1 18 10 15 19  1  1  1
  1 21 22 23 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 15  6 21 22 14
 14 15 25 26 11 12  6  6 27 27 27  6 27  6  6  5  6  5  5 27 27 27  6 21
 22 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 15 21 22 23 28 28 28
 28 22 15  1  1  2 19  3 13 2

In [70]:
from sklearn.model_selection import train_test_split

labels_array = np.array(labels).reshape(-1, 1)

# 切分數據集為訓練集和測試集，比例為8:2
train_x, test_x, train_y, test_y = train_test_split(padded_sequences, labels_array, test_size=0.2, random_state=42)


In [71]:
train_x

array([[ 1,  2,  3, ..., 14, 14, 14],
       [48,  5,  6, ...,  0,  0,  0],
       [ 1,  2,  3, ...,  0,  0,  0],
       ...,
       [ 1,  2,  3, ...,  0,  0,  0],
       [ 1,  2,  3, ...,  1,  1,  1],
       [ 1,  2,  3, ...,  0,  0,  0]])

In [72]:
train_y

array([[1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
    