In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import multiprocessing as mp
from functools import partial
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.optim import Adam
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
# from src.model import *
# from src.util import *
# Expose only physical GPU 1 to this process (it then appears to PyTorch as
# device 0). The variable is read when CUDA is first initialised, so this has
# to run before the first CUDA call in the notebook.
os.environ["CUDA_VISIBLE_DEVICES"] = '1'

In [2]:
# Locations of the trained checkpoint and of the EMBER-2018 derived data.
# Every directory string ends with "/" because the rest of the notebook joins
# paths by plain string concatenation.
trained_model_path = '/workdir/security/home/junjiehuang2468/paper/trained_models_weight/ember/'
best_trained_model = '2022-01-18 14:55/2w_epoch:13_test_acc:0.896058.pt'
data_path = "/workdir/security/home/junjiehuang2468/paper/data/ember2018/"
train_data_path = f"{data_path}malwares/"
train_label_path = f"{data_path}train_labels.csv"

In [3]:
# Run-time configuration for the notebook.
CUDA = torch.cuda.is_available()  # move model/tensors to the GPU only when one is usable
NUM_WORKERS = 18  # number of worker processes used by each DataLoader
BATCH_SIZE = 64  # samples per batch
LEAVE_BIT_NUMBER = 20000  # every sample is truncated / zero-padded to this many bytes
KERNEL_SIZE = 500  # kernel size and stride of the MalConv convolutions (default: 500)

In [4]:
# Load the train / validation index tables (the later cells read their "id"
# and "labels" columns).
trainset = pd.read_csv(data_path + 'train_dataset.csv')
validset = pd.read_csv(data_path + 'valid_dataset.csv')

# Keep only the validation rows whose label is 1 (presumably the malware
# class - confirm against the label file). A boolean mask always yields a
# DataFrame; the previous `np.argwhere(...).squeeze()` collapsed to a scalar
# index when exactly one row matched, which made `iloc` return a Series.
validset = validset.loc[validset['labels'] == 1]

In [5]:
class ExeDataset(Dataset):
    """Dataset of fixed-length byte sequences read from ``<data_path><name>.txt``.

    Each item is a pair ``(tokens, label)``:

    * ``tokens`` - 1-D int64 array of length ``leave_bit_num``. Every byte
      value ``b`` read from the file is stored as ``b + 1`` (so 1..256) and
      the tail is filled with 0, which therefore acts as the padding token.
    * ``label`` - 1-element array holding ``labels[idx]``.

    Args:
        malware_names: sequence of file stems (without the ``.txt`` suffix).
        data_path: directory prefix; joined by plain concatenation, so it
            must end with a path separator.
        labels: sequence of labels aligned with ``malware_names``.
        leave_bit_num: number of leading bytes kept per file (shorter files
            are zero-padded up to this length).
    """

    def __init__(self, malware_names, data_path, labels, leave_bit_num):
        self.malware_names = malware_names
        self.data_path = data_path
        self.labels = labels
        self.leave_bit_num = leave_bit_num

    def __len__(self):
        """Return the number of samples."""
        return len(self.malware_names)

    def __getitem__(self, idx):
        """Read sample ``idx`` from disk and return ``(tokens, label)``."""
        file_path = self.data_path + self.malware_names[idx] + '.txt'
        with open(file_path, 'rb') as fp:
            # Bounded read: only the first leave_bit_num bytes are needed, so
            # do not pull the whole (possibly multi-megabyte) file into memory.
            raw = fp.read(self.leave_bit_num)

        # Start from an all-padding buffer and overwrite the prefix that was
        # actually read; int64 is the index dtype nn.Embedding expects.
        data = np.zeros(self.leave_bit_num, dtype=np.int64)
        if raw:
            data[:len(raw)] = np.frombuffer(raw, dtype=np.uint8).astype(np.int64) + 1

        return data, np.array([self.labels[idx]])

In [6]:
# Wrap the index tables in datasets. Both splits read their byte files from
# the same directory (train_data_path) and use the same sequence length.
train_dataset = ExeDataset(
    malware_names=trainset["id"].tolist(),
    data_path=train_data_path,
    labels=trainset["labels"].tolist(),
    leave_bit_num=LEAVE_BIT_NUMBER,
)
valid_dataset = ExeDataset(
    malware_names=validset["id"].tolist(),
    data_path=train_data_path,
    labels=validset["labels"].tolist(),
    leave_bit_num=LEAVE_BIT_NUMBER,
)

In [7]:
# Both loaders share the same settings; batches are served in dataset order
# (shuffle=False), so repeated runs see the same first batch.
_loader_options = dict(
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)
trainloader = DataLoader(dataset=train_dataset, **_loader_options)
validloader = DataLoader(dataset=valid_dataset, **_loader_options)

In [8]:
class Model(nn.Module):
    """Gated-convolution byte classifier that also exposes its embedded input.

    Token ids are embedded into 8 channels; channels 0-3 feed a plain
    convolution and channels 4-7 feed a sigmoid-gated convolution. The two
    are multiplied, max-pooled over the sequence and passed through two
    linear layers producing 2 logits.

    Args:
        data_length: length of the token sequences the model will receive.
            Accepts a float such as ``2e6``; it is truncated to an int.
        kernel_size: kernel size and stride shared by both convolutions.
    """

    def __init__(self, data_length = 2e6, kernel_size = 500):
        super().__init__()
        # 257 ids: 0 is the padding token (excluded from gradient updates via
        # padding_idx), the remaining 256 cover every byte value.
        self.embedding = nn.Embedding(257, 8, padding_idx=0)
        self.conv_layer_1 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
        self.conv_layer_2 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
        # The pooling window must be an int. The float default (2e6) used to
        # produce a float window, which max_pool1d rejects at forward time.
        self.pool_layer_2 = nn.MaxPool1d(int(data_length) // int(kernel_size))
        self.fc_layer_3 = nn.Linear(128, 128)
        self.fc_layer_4 = nn.Linear(128, 2)

    def forward(self,x):
        """Classify a batch of token sequences.

        Args:
            x: integer tensor of token ids, shape ``(batch, data_length)``.

        Returns:
            ``(logits, embedd_x)`` where ``logits`` has shape ``(batch, 2)``
            and ``embedd_x`` is the ``(batch, data_length, 8)`` embedding
            output. When autograd is active, ``embedd_x.grad`` is populated
            by a later ``backward()`` call.
        """
        embedd_x = self.embedding(x)
        # retain_grad() raises on tensors that do not require grad (e.g. under
        # torch.no_grad()), so only request it when autograd is recording.
        if embedd_x.requires_grad:
            embedd_x.retain_grad()
        x = embedd_x.transpose(-1,-2)
        x_conv_1 = self.conv_layer_1(x[:,:4,:])
        x_conv_2 = torch.sigmoid(self.conv_layer_2(x[:,4:,:]))
        x = x_conv_1*x_conv_2
        del x_conv_1,x_conv_2
        # Drop only the pooled length axis; a bare squeeze() would also drop
        # the batch axis when the batch holds a single sample.
        x = self.pool_layer_2(x).squeeze(-1)
        x = self.fc_layer_3(x)
        x = self.fc_layer_4(x)
        return x,embedd_x

In [9]:
# Instantiate the classifier for the truncated sequence length used by the
# datasets, together with its loss.
model = Model(data_length=LEAVE_BIT_NUMBER,kernel_size=KERNEL_SIZE)

ce_loss = nn.CrossEntropyLoss()
# In the cells visible here the optimizer is only used to clear gradients
# (optim.zero_grad()); no optimisation step is taken.
optim = Adam(model.parameters())

# Move to the GPU only when one is available. The previous CPU branch
# referenced the misspelled name `ce_less` and raised NameError.
if CUDA:
    model = model.cuda()
    ce_loss = ce_loss.cuda()

In [10]:
# Deserialise the checkpoint onto the CPU first: load_state_dict copies the
# values into the model's existing parameters on whatever device they live,
# so this works both without a GPU and when the device index stored in the
# file is not visible to this process.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(
    torch.load(trained_model_path + best_trained_model, map_location='cpu')
)

<All keys matched successfully>

In [11]:
# Convenience alias for the model's embedding layer (not referenced by the
# later cells visible in this notebook).
model_embedding_layer = model.embedding

In [12]:
# Single forward/backward pass over the FIRST validation batch only (note the
# `break`). Its purpose is to populate `embedd_x.grad`, the gradient of the
# loss with respect to the embedded input, for the cells that follow.
total_acc = []
bar = tqdm(validloader)
for step, (batch_data,batch_label) in enumerate(bar):
    optim.zero_grad()
    if CUDA:
        batch_data = batch_data.cuda()
        batch_label = batch_label.cuda()
    # Flatten the (batch, 1) labels to (batch,). reshape(-1) keeps a 1-D
    # tensor even for a batch of one, where squeeze() would give a 0-d tensor.
    batch_label = batch_label.reshape(-1).long()
    # One-hot float targets for the two classes, built on the labels' device
    # without a per-sample Python loop.
    temp = F.one_hot(batch_label, num_classes=2).float()
    temp_label = temp
    pred,embedd_x = model(batch_data)
    loss = ce_loss(pred, temp_label)
    loss.backward()
    break

  0%|          | 0/938 [00:02<?, ?it/s]


In [13]:
# True where the token is 0, i.e. the padding that ExeDataset appends after
# the real bytes (real bytes are stored as 1..256). These are the only
# positions that get overwritten further down.
mask = batch_data == 0

In [14]:
# Gradient of the loss w.r.t. the embedded input as a NumPy array. It is
# available on this non-leaf tensor because Model.forward calls retain_grad().
embedd_x_grad = embedd_x.grad.detach().cpu().numpy()

In [15]:
# Embedded input as a NumPy array. NOTE: this rebinds `embedd_x` from a tensor
# to an ndarray, so the cell cannot be re-run without first re-running the
# forward pass that produced the tensor.
embedd_x = embedd_x.detach().cpu().numpy()

In [16]:
# Embedding vector of every token id, one row per id. The ids are created on
# the embedding's own device so this works with or without a GPU, and the
# vocabulary size is taken from the layer instead of being hard-coded.
all_token_ids = torch.arange(
    model.embedding.num_embeddings,
    device=model.embedding.weight.device,
)
all_embedd = model.embedding(all_token_ids)
all_embedd = all_embedd.detach().cpu().numpy()

In [17]:
# Drop the references created by the gradient pass that are no longer needed,
# then ask PyTorch's CUDA caching allocator to release unused cached blocks.
del temp,temp_label,pred
torch.cuda.empty_cache()

In [18]:
# L2 norm of each position's gradient vector, kept as a trailing axis of
# size 1 so that it broadcasts against the gradient itself.
embedded_x_grad_two_norm = np.linalg.norm(embedd_x_grad, ord=2, axis=-1, keepdims=True)

# Unit-length direction of steepest loss DEcrease (hence the minus sign).
# Positions whose gradient norm is exactly zero are left at 0 instead of
# being divided by zero.
grad = -np.divide(
    embedd_x_grad,
    embedded_x_grad_two_norm,
    out=np.zeros_like(embedd_x_grad),
    where=(embedded_x_grad_two_norm != 0),
)

In [19]:
def mp_func(idx,one_of_all_embedd,embedd_x,grad):
    """Score one candidate embedding against every position of the batch.

    Args:
        idx: identifier of the candidate, returned unchanged.
        one_of_all_embedd: candidate embedding vector; must broadcast against
            the last axis of ``embedd_x``.
        embedd_x: array of current embeddings, last axis = embedding dim.
        grad: array with the same shape as ``embedd_x`` giving the direction
            to move along at each position.

    Returns:
        ``[idx, sb, db, sb_gz]`` where, with a trailing axis of size 1,
        ``sb`` is the projection of ``(candidate - embedd_x)`` onto ``grad``,
        ``db`` is the L1 distance from the candidate to the point
        ``embedd_x + sb * grad`` on that line, and ``sb_gz`` is 1.0 where
        ``sb > 0`` and 0.0 elsewhere.
    """
    offset = one_of_all_embedd - embedd_x
    # Signed length of the offset along the gradient direction.
    projection = np.sum(grad * offset, axis=-1, keepdims=True)
    is_positive = (projection > 0) * 1.0
    # Distance between the candidate and its projection onto the line.
    projected_point = embedd_x + projection * grad
    distance = np.linalg.norm(
        one_of_all_embedd - projected_point, ord=1, axis=-1, keepdims=True
    )
    return [idx, projection, distance, is_positive]

In [20]:
# Bind the batch-wide arrays once so each pool task only has to supply
# (idx, candidate_embedding).
partial_mp_func = partial(mp_func,embedd_x=embedd_x,grad=grad)

In [21]:
# One task per row of the embedding table: (row index, embedding vector).
data = list(enumerate(all_embedd))

# Score every candidate in parallel; starmap unpacks each tuple into the
# (idx, one_of_all_embedd) arguments of mp_func.
with mp.Pool(processes=12) as pool:
    results = pool.starmap(partial_mp_func, data)

In [22]:
# Order the per-candidate results by their candidate index so that position i
# in the list corresponds to token id i.
results = sorted(results, key=lambda entry: entry[0])

In [23]:
# total_sb = np.concatenate(np.array(list(r[1] for r in results)),axis=-1)

In [None]:
# Join the per-candidate distances along the trailing axis, giving one
# distance per candidate token at every position.
total_db = np.concatenate([entry[2] for entry in results], axis=-1)

In [None]:
# Same layout as total_db: 1.0 where the candidate lies in the positive
# gradient direction at that position, 0.0 otherwise.
total_sb_gz = np.concatenate([entry[3] for entry in results], axis=-1)

In [None]:
# total_sb = torch.from_numpy(total_sb).cuda()
# total_db = torch.from_numpy(total_db).cuda()
# total_sb_gz = torch.from_numpy(total_sb_gz).cuda()

In [None]:
# Candidate cost: the distance for admissible candidates (sb > 0), 0 for the
# rest (those are penalised in the next cell). The tensors are placed on the
# batch's device instead of an unconditional .cuda(), matching the CUDA guard
# used elsewhere in the notebook.
total = (
    torch.from_numpy(total_sb_gz).to(batch_data.device)
    * torch.from_numpy(total_db).to(batch_data.device)
)

In [None]:
# Push inadmissible candidates (sb <= 0) out of reach of the argmin by adding
# a large constant. The penalty is built on `total`'s own device rather than
# via an unconditional .cuda().
# NOTE: in-place update - re-running this cell alone adds the penalty again.
total += (1 - torch.from_numpy(total_sb_gz).to(total.device)) * 1e5

In [None]:
# For every position pick the candidate token id with the smallest cost.
# NOTE: `total` is rebound from the cost tensor to the chosen ids.
total = total.argmin(dim=-1)

In [None]:
# Overwrite only the padding positions (mask == True) with the selected token
# ids; real bytes are kept as they were. Writing through `.data` replaces the
# tensor's contents without autograd tracking.
batch_data.data = (~mask)*batch_data + mask*total

In [None]:
# Re-score the batch after its padding bytes were replaced. Note that
# `embedd_x` is rebound to a tensor again here.
pred, embedd_x = model(batch_data)
_, predicted = pred.max(dim=1)

# Fraction of samples still classified as their original label.
acc = np.mean(batch_label.cpu().data.numpy() == predicted.cpu().data.numpy())

In [None]:
# Display the accuracy on the modified batch.
acc