In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import multiprocessing as mp
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.optim import Adam
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
# from src.model import *
# from src.util import *
os.environ["CUDA_VISIBLE_DEVICES"] = '2'

In [2]:
trained_model_path = '/workdir/security/home/junjiehuang2468/paper/trained_models_weight/ember/'
best_trained_model = '2022-01-18 13:36/1w_epoch:1_test_acc:0.890742.pt'
data_path = "/workdir/security/home/junjiehuang2468/paper/data/ember2018/"
train_data_path = data_path + "malwares/"  # Training data
train_label_path = data_path + "train_labels.csv"  # Training label

In [3]:
CUDA = True if torch.cuda.is_available() else False
NUM_WORKERS = 24  # Number of cores to use for data loader
BATCH_SIZE = 256  #
LEAVE_BIT_NUMBER = 10000
KERNEL_SIZE = 500  # Kernel size & stride for Malconv (defualt : 500)

In [4]:
trainset = pd.read_csv(data_path + 'train_dataset.csv')
validset = pd.read_csv(data_path + 'valid_dataset.csv')
validset = validset.iloc[np.argwhere(validset['labels'].values == 1).squeeze(),:]

In [5]:
class ExeDataset(Dataset):
    def __init__(self, malware_names, data_path, labels, leave_bit_num):
        self.malware_names = malware_names
        self.data_path = data_path
        self.labels = labels
        self.leave_bit_num = leave_bit_num

    def __len__(self):
        return len(self.malware_names)

    def __getitem__(self, idx):
        with open(self.data_path + self.malware_names[idx] + '.txt','rb') as fp:
            data = [bit+1 for bit in fp.read()[:self.leave_bit_num]]
            padding = [0]*(self.leave_bit_num-len(data))
            data = data + padding

        return np.array(data), np.array([self.labels[idx]])

In [6]:
train_dataset = ExeDataset(
    trainset["id"].tolist(), 
    train_data_path, 
    trainset["labels"].tolist(), 
    LEAVE_BIT_NUMBER
)
valid_dataset = ExeDataset(
    validset["id"].tolist(), 
    train_data_path, 
    validset["labels"].tolist(), 
    LEAVE_BIT_NUMBER
)

In [7]:
trainloader = DataLoader(
    dataset = train_dataset,
    batch_size = BATCH_SIZE,
    shuffle = False,
    num_workers = NUM_WORKERS,
    pin_memory = True
)
validloader = DataLoader(
    dataset = valid_dataset,
    batch_size = BATCH_SIZE,
    shuffle = False,
    num_workers = NUM_WORKERS,
    pin_memory = True
)

In [8]:
class Model(nn.Module):
    def __init__(self, data_length = 2e6, kernel_size = 500):
        super().__init__()
        self.embedding = nn.Embedding(257, 8, padding_idx=0)
        self.conv_layer_1 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
        # self.bn_1 = nn.BatchNorm1d(128)
        self.conv_layer_2 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
        self.pool_layer_2 = nn.MaxPool1d(data_length//kernel_size)
        self.fc_layer_3 = nn.Linear(128, 128)
        self.fc_layer_4 = nn.Linear(128, 2)
        
    def forward(self,input_):
        input_ = self.embedding(input_)
        input_.retain_grad()
        x = input_.transpose(-1,-2)
        x_conv_1 = self.conv_layer_1(x[:,:4,:])
        x_conv_2 = torch.sigmoid(self.conv_layer_2(x[:,4:,:]))
        x = x_conv_1*x_conv_2
        del x_conv_1,x_conv_2
        x = self.pool_layer_2(x).squeeze()
        x = self.fc_layer_3(x)
        x = self.fc_layer_4(x)
        return x,input_

In [9]:
def mp_func(batch_idx,data,embedded_data,embedded_data_grad,mis):
    for j,(num,zj,wj) in enumerate(zip(data,embedded_data,embedded_data_grad)):
        if num != 0: continue
        nj = -wj/np.linalg.norm(wj,2) if np.sum(wj) != 0 else np.zeros_like(wj)
        choose = 0
        max_di = float('inf')
        for i,mi in enumerate(mis):
            si = nj.reshape(1,-1) @ (mi - zj)
            di = np.linalg.norm(mi - (zj + si*nj),ord=2)
            if si > 0 and di < max_di:
                choose = i
                max_idx = di
        data[j] = choose
    return [batch_idx,data]

In [10]:
model = Model(data_length=LEAVE_BIT_NUMBER,kernel_size=KERNEL_SIZE)

ce_loss = nn.CrossEntropyLoss()
optim = Adam(model.parameters())

model = model.cuda() if CUDA else model
ce_loss = ce_loss.cuda() if CUDA else ce_less

In [11]:
model.load_state_dict(torch.load(trained_model_path + best_trained_model))

<All keys matched successfully>

In [12]:
model_embedding_layer = model.embedding

In [None]:
model.eval()
results = {}

for step, (batch_data,batch_label) in enumerate(tqdm(validloader)):
    batch_data = batch_data.cuda() if CUDA else batch_data
    batch_label = batch_label.cuda() if CUDA else batch_label
    ls = []
    bar = tqdm(range(5))
    for run in bar:
        pred,embedded_batch_data = model(batch_data)
        temp_label = torch.zeros((len(batch_label),2))
        for idx,target in enumerate(batch_label.squeeze()): temp_label[idx,target] = 1
        temp_label = temp_label.cuda() if CUDA else temp_label
        loss = ce_loss(pred,temp_label)
        loss.backward()
        
        miss = torch.LongTensor([list(range(257))]).cuda() if CUDA else torch.LongTensor([list(range(257))])
        miss = model_embedding_layer(miss).squeeze()
        miss = miss.detach().cpu().numpy()

        zip_data = zip(
            batch_data.detach().cpu().numpy(),
            embedded_batch_data.detach().cpu().numpy(),
            embedded_batch_data.grad.detach().cpu().numpy(),
            np.expand_dims(miss,axis=0).repeat(len(batch_data),axis=0)
        )
        mp_data = [(batch_idx,data,embedded_data,embedded_data_grad,mis) \
           for batch_idx,(data,embedded_data,embedded_data_grad,mis) in enumerate(zip_data)]
        with mp.Pool(processes=24 if len(mp_data) > 24 else len(mp_data)) as pool:
            mp_results = pool.starmap(mp_func,mp_data)
        mp_results = sorted(mp_results,key = lambda x: x[0])
        for i in range(len(mp_results)):
            batch_data.data[i] = torch.tensor(mp_results[i][1], dtype=torch.float, requires_grad=True).cuda()
        
        pred,embedded_batch_data = model(batch_data)
        temp_label = torch.zeros((len(batch_label),2))
        for idx,target in enumerate(batch_label.squeeze()): temp_label[idx,target] = 1
        temp_label = temp_label.cuda() if CUDA else temp_label
        pred = np.argmax(pred.detach().cpu().numpy(),1)
        temp_acc = (batch_label.cpu().data.numpy() == pred).mean()
        ls.append(temp_acc)
        bar.set_description(f'acc: {temp_acc:.5f}')
    results[step] = ls

  0%|          | 0/235 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s][A
acc: 0.72656:   0%|          | 0/5 [06:29<?, ?it/s][A
acc: 0.72656:  20%|██        | 1/5 [06:29<25:59, 389.75s/it][A
acc: 0.67188:  20%|██        | 1/5 [12:07<25:59, 389.75s/it][A
acc: 0.67188:  40%|████      | 2/5 [12:07<17:57, 359.26s/it][A
acc: 0.67188:  40%|████      | 2/5 [17:13<17:57, 359.26s/it][A
acc: 0.67188:  60%|██████    | 3/5 [17:13<11:09, 334.64s/it][A

In [None]:
# zip_data = zip(
#     batch_data.detach().cpu().numpy(),
#     embedded_batch_data.detach().cpu().numpy(),
#     embedded_batch_data.grad.detach().cpu().numpy()
# )
# for batch_idx,(data,embedded_data,embedded_data_grad) in enumerate(zip_data):
#     for j,(num,zj,wj) in enumerate(zip(data,embedded_data,embedded_data_grad)):
#         if num != 0: continue
#         nj = -wj/np.linalg.norm(wj,2) if np.sum(wj) != 0 else np.zeros_like(wj)
#         choose = 0
#         max_di = float('inf')
#         mis = torch.LongTensor([list(range(257))]).cuda() if CUDA else torch.LongTensor([list(range(257))])
#         mis = model_embedding_layer(mis).squeeze()
#         mis = mis.detach().cpu().numpy()
#         for i,mi in enumerate(mis):
#             si = nj.reshape(1,-1) @ (mi - zj)
#             di = np.linalg.norm(mi - (zj + si*nj),ord=2)
#             if si > 0 and di < max_di:
#                 choose = i
#                 max_idx = di
#         batch_data.data[batch_idx][j] = choose