In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from MalConv import MalConv
import numpy as np
import pandas as pd
import sys
import struct
import os
from tqdm import tqdm
os.environ["CUDA_VISIBLE_DEVICES"] = '1'


In [2]:
kernel_size = 512
eps = 0.7
target = 0  # benign
loop_num = 10

In [3]:
def reconstruction(x, y):
    """
    reconstruction restore original bytes from embedding matrix.

    Args:
        x torch.Tensor:
            x is word embedding

        y torch.Tensor:
            y is embedding matrix

    Returns:
        torch.Tensor:
    """
    x_size = x.size()[0]
    y_size = y.size()[0]
    # print(x_size, y_size)

    z = torch.zeros(x_size)

    for i in tqdm(range(x_size)):
        dist = torch.zeros(256)

        for j in range(y_size):
            dist[j] = torch.dist(x[i], y[j])  # computation of euclidean distance

        z[i] = dist.argmin()

    return z

In [4]:
# with open(sys.argv[1], 'rb') as f:
#     bytez = f.read()

In [5]:
trained_model_path = '/workdir/security/home/junjiehuang2468/paper/trained_models_weight/ember/'
best_trained_model = '2022-01-18 14:55/2w_epoch:13_test_acc:0.896058.pt'
data_path = "/workdir/security/home/junjiehuang2468/paper/data/ember2018/"
train_data_path = data_path + "malwares/" 
train_label_path = data_path + "train_labels.csv" 

In [6]:
trainset = pd.read_csv(data_path + 'train_dataset.csv')
validset = pd.read_csv(data_path + 'valid_dataset.csv')
validset = validset.iloc[np.argwhere(validset['labels'].values == 1).squeeze(),:]

In [7]:
LEAVE_BIT_NUMBER = 20000

In [8]:
with open(train_data_path + trainset["id"].tolist()[0] + '.txt','rb') as fp:
    data = [bit+1 for bit in fp.read()[:LEAVE_BIT_NUMBER]]
    padding = [0]*(LEAVE_BIT_NUMBER-len(data))
    data = data + padding

In [9]:
bytez = data

In [10]:


# Create malconv
malconv = MalConv(channels=256, window_size=512, embd_size=8)
weights = torch.load('./malconv.checkpoint', map_location='cpu')
malconv.load_state_dict(weights['model_state_dict'])
malconv.eval()

# Create optimizer
opt = torch.optim.SGD(malconv.parameters(), lr=0.01, momentum=0.9)

# Compute payload size
payload_size = kernel_size + (kernel_size - np.mod(len(bytez), kernel_size))

# Creat embedding matrix
embed = malconv.embd
m = embed(torch.arange(0, 256))

# Make label from target
label = torch.tensor([target], dtype=torch.long)

perturbation = np.random.randint(0, 256, payload_size, dtype=np.uint8)

# Make input file x as numpy array
# x = np.frombuffer(bytez, dtype=np.uint8)
x = np.array(bytez)

In [19]:
np.concatenate([x, perturbation])[np.newaxis, :]

(1, 20992)

In [11]:


for i in range(loop_num):
    print('[{}]'.format(str(i + 1)))
    opt.zero_grad()  # initialize grad

    # Make input of malconv
    inp = torch.from_numpy(np.concatenate([x, perturbation])[np.newaxis, :]).float()
    inp_adv = inp.requires_grad_()
    embd_x = embed(inp_adv.long()).detach()
    embd_x.requires_grad = True
    # embd_x.retain_grad()

    outputs = malconv(embd_x)
    results = F.softmax(outputs, dim=1)

    r = results.detach().numpy()[0]
    print('Benign: {:.5g}'.format(r[0]), ', Malicious: {:.5g}'.format(r[1]))

    # Compute loss
    loss = nn.CrossEntropyLoss()(results, label)
    print('Loss: {:.5g}'.format(loss.item()))

    # Update
    loss.backward()
    opt.step()

    grad = embd_x.grad
    grad_sign = grad.detach().sign()[0][-payload_size:]  # extract only grad_sign of perturbation

    # Change types to numpy to prevent Error: Leaf variable was used in an inplace operation
    perturbation = embed(torch.from_numpy(perturbation).long())

    # Compute perturbation
    perturbation = (perturbation - eps * grad_sign).detach().numpy()

    embd_x = embd_x.detach().numpy()
    embd_x[0][-payload_size:] = perturbation  # update perturbation

    print('Reconstruction phase:')
    perturbation = reconstruction(torch.from_numpy(perturbation), m).detach().numpy()
    print('sum of perturbation: ', perturbation.sum(), '\n')  # for debug

#     # Generate perturbation file
#     with open('perturb.bin', 'wb') as out:
#         for s in perturbation:
#             out.write(struct.pack('B', int(s)))

#     # Make a decision on evasion rates
#     if results[0][0] > 0.5:
#         print('Evasion rates: {:.5g}'.format(results[0][0].item()), '\n')
#         aes_name = os.path.splitext(sys.argv[1])[0] + '_AEs.exe'

#         with open(aes_name, 'wb') as out:
#             aes = np.concatenate([x, perturbation.astype(np.uint8)])

#             for s in aes:
#                 out.write(struct.pack('B', int(s)))

#         print(aes_name, ' has been created.')
        
#         break

# print('Adversarial Examples is not found.')

[1]
Benign: 0.096648 , Malicious: 0.90335
Loss: 1.1757
Reconstruction phase:


100%|██████████| 992/992 [00:04<00:00, 220.25it/s]


sum of perturbation:  125291.0 

[2]
Benign: 1 , Malicious: 1.8236e-14
Loss: 0.31326
Reconstruction phase:


100%|██████████| 992/992 [00:04<00:00, 199.98it/s]


sum of perturbation:  122779.0 

[3]
Benign: 1 , Malicious: 2.2009e-36
Loss: 0.31326
Reconstruction phase:


100%|██████████| 992/992 [00:04<00:00, 206.91it/s]


sum of perturbation:  126310.0 

[4]
Benign: 1 , Malicious: 0
Loss: 0.31326
Reconstruction phase:


100%|██████████| 992/992 [00:05<00:00, 192.73it/s]


sum of perturbation:  126310.0 

[5]
Benign: 1 , Malicious: 0
Loss: 0.31326
Reconstruction phase:


100%|██████████| 992/992 [00:05<00:00, 195.55it/s]


sum of perturbation:  126310.0 

[6]
Benign: 1 , Malicious: 0
Loss: 0.31326
Reconstruction phase:


100%|██████████| 992/992 [00:05<00:00, 190.62it/s]


sum of perturbation:  126310.0 

[7]
Benign: 1 , Malicious: 0
Loss: 0.31326
Reconstruction phase:


100%|██████████| 992/992 [00:04<00:00, 223.19it/s]


sum of perturbation:  126310.0 

[8]
Benign: 1 , Malicious: 0
Loss: 0.31326
Reconstruction phase:


100%|██████████| 992/992 [00:04<00:00, 215.33it/s]


sum of perturbation:  126310.0 

[9]
Benign: 1 , Malicious: 0
Loss: 0.31326
Reconstruction phase:


100%|██████████| 992/992 [00:04<00:00, 198.53it/s]


sum of perturbation:  126310.0 

[10]
Benign: 1 , Malicious: 0
Loss: 0.31326
Reconstruction phase:


100%|██████████| 992/992 [00:04<00:00, 202.90it/s]

sum of perturbation:  126310.0 






In [20]:
perturbation

array([ 55., 157., 244., 230., 213.,  70.,  51., 165., 221.,  16., 219.,
        26.,  51.,  48.,   9., 122.,   4., 105., 246., 239.,  21., 133.,
       216., 126., 196., 113., 166., 193.,  52.,  73.,  65.,  22.,  69.,
        46., 236., 184., 137., 209., 123., 129., 243., 181., 101.,  66.,
       187., 100.,  73.,  20.,  14., 132.,  49., 197., 221., 154., 109.,
       108.,  60., 140., 184., 151.,  67., 186., 117., 142.,  67., 255.,
        78., 187.,  49., 108., 238.,  94., 138., 193., 109.,  96., 206.,
        33., 188.,  30.,  27., 142., 163., 106.,  59., 210., 181., 120.,
        45., 111., 238., 239., 244., 144., 125., 138., 214.,  57., 208.,
         9.,  28., 132., 115.,  12., 155., 254., 240., 107.,  92., 209.,
         2.,  62.,  31., 165.,  85., 154.,  11.,  94., 248.,  35., 243.,
        94., 104., 249.,   5., 246.,  69.,  59.,  22., 215., 243., 152.,
        88., 235., 217., 150.,  61.,  50.,  79.,  89.,  20.,  32., 181.,
       180., 241., 203., 252., 206., 186.,  93.,  7