In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import multiprocessing as mp
from datetime import datetime
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.optim import Adam
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
# Expose only the GPU with index 3 to this process (value is hard-coded for
# the original machine). NOTE(review): this runs after `import torch`; it is
# expected to take effect as long as no CUDA call has happened yet — confirm.
os.environ.update(CUDA_VISIBLE_DEVICES='3')

In [2]:
# Filesystem layout. Every value is a directory prefix that ends with '/',
# because file names are appended to it by plain string concatenation.
trained_model_path = '/workdir/security/home/junjiehuang2468/paper/trained_models_weight/ember/adversarial_training/my_way/'
data_path = "/workdir/security/home/junjiehuang2468/paper/data/ember2018/"
# Per-sample byte dumps used for the train/valid split and for the test split.
train_data_path = f"{data_path}malwares/"
test_data_path = f"{data_path}test_malwares/"

In [3]:
# Run on the GPU whenever PyTorch can see one.
CUDA = torch.cuda.is_available()
# Worker processes per DataLoader.
NUM_WORKERS = 24
# Samples per batch.
BATCH_SIZE = 40
# BATCH_SIZE = 10
# Number of leading bytes kept from every file (also the model's input length).
LEAVE_BIT_NUMBER = 20_000
# Kernel width passed to the model (used there as both conv width and stride).
KERNEL_SIZE = 500

In [4]:
# Split tables; the cells below read the "id" and "labels" columns from them.
trainset = pd.read_csv(f'{data_path}train_dataset.csv')
validset = pd.read_csv(f'{data_path}valid_dataset.csv')
# Test split, currently disabled (the second line keeps only rows with label 1).
# testset = pd.read_csv(data_path + 'test_dataset.csv')
# testset = testset.iloc[np.argwhere(testset['labels'].values == 1).squeeze(),:]

In [5]:
class ExeDataset(Dataset):
    """Serve the leading bytes of each sample file as a fixed-length int array.

    Sample ``idx`` is read from ``data_path + malware_names[idx] + '.txt'``
    in binary mode. Every byte value is shifted by +1 so that the value 0 is
    left free for padding; files shorter than ``leave_bit_num`` are padded
    with 0 on the right.
    """

    def __init__(self, malware_names, data_path, labels, leave_bit_num):
        """
        Args:
            malware_names: sequence of file stems, one per sample.
            data_path: directory prefix; it is concatenated with the stem
                as-is, so it must already end with a path separator.
            labels: sequence of labels aligned with ``malware_names``.
            leave_bit_num: number of leading bytes kept per file, which is
                also the length of every returned array.
        """
        self.malware_names = malware_names
        self.data_path = data_path
        self.labels = labels
        self.leave_bit_num = leave_bit_num

    def __len__(self):
        """Return the number of samples."""
        return len(self.malware_names)

    def __getitem__(self, idx):
        """Return ``(tokens, label)`` for sample ``idx``.

        ``tokens`` is an int64 array of length ``leave_bit_num`` holding
        byte values + 1 followed by zero padding; ``label`` is a length-1
        array wrapping ``labels[idx]``.
        """
        keep = self.leave_bit_num
        with open(self.data_path + self.malware_names[idx] + '.txt','rb') as fp:
            # Read only the prefix that is kept instead of the whole file.
            raw = fp.read(keep)

        # Pre-filled with the padding value; the byte prefix is written on top.
        data = np.zeros(keep, dtype=np.int64)
        used = len(raw)
        data[:used] = np.frombuffer(raw, dtype=np.uint8)
        # Shift after widening to int64 so that byte 255 becomes 256, not 0.
        data[:used] += 1

        return data, np.array([self.labels[idx]])

In [6]:
# Both splits read their byte dumps from the same directory (train_data_path);
# only the id/label lists differ.
train_dataset = ExeDataset(
    malware_names=trainset["id"].tolist(),
    data_path=train_data_path,
    labels=trainset["labels"].tolist(),
    leave_bit_num=LEAVE_BIT_NUMBER,
)
valid_dataset = ExeDataset(
    malware_names=validset["id"].tolist(),
    data_path=train_data_path,
    labels=validset["labels"].tolist(),
    leave_bit_num=LEAVE_BIT_NUMBER,
)
# Test split, currently disabled.
# test_dataset = ExeDataset(
#     testset["id"].tolist(), 
#     test_data_path, 
#     testset["labels"].tolist(), 
#     LEAVE_BIT_NUMBER
# )

In [7]:
# Training batches are reshuffled every epoch; with shuffle=False the model
# would see exactly the same batch sequence in each of the training epochs.
trainloader = DataLoader(
    dataset = train_dataset,
    batch_size = BATCH_SIZE,
    shuffle = True,
    num_workers = NUM_WORKERS,
    # Pinned host memory is only useful when batches are copied to a GPU.
    pin_memory = CUDA
)
# Validation keeps a fixed order so per-batch numbers are comparable between epochs.
validloader = DataLoader(
    dataset = valid_dataset,
    batch_size = BATCH_SIZE,
    shuffle = False,
    num_workers = NUM_WORKERS,
    pin_memory = CUDA
)
# Test loader, currently disabled.
# testloader = DataLoader(
#     dataset = test_dataset,
#     batch_size = BATCH_SIZE,
#     shuffle = False,
#     num_workers = NUM_WORKERS,
#     pin_memory = True
# )

In [8]:
class Model(nn.Module):
    """Gated 1-D convolutional byte classifier with an in-forward padding attack.

    Tokens (0 = padding, 1..256 = byte value + 1) are embedded into 8
    channels. Channels 0-3 go through ``conv_layer_1``, channels 4-7 through
    ``conv_layer_2`` followed by a sigmoid, and the two results are
    multiplied. The product is max-pooled with a window of
    ``data_length // kernel_size`` and passed through two linear layers that
    produce 2 logits.

    ``forward`` does more than classify: it first overwrites the padding
    positions of the input with bytes chosen by a gradient-guided search and
    returns the logits of that modified input.
    """

    def __init__(self, data_length = 2e6, kernel_size = 500):
        """
        Args:
            data_length: length of the token sequences fed to the model.
            kernel_size: width and stride of both convolutions.
        """
        super().__init__()
        self.embedding = nn.Embedding(257, 8, padding_idx=0)
        self.conv_layer_1 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
        # self.bn_1 = nn.BatchNorm1d(128)
        self.conv_layer_2 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
        # The default data_length (2e6) is a float, and so is the result of
        # `//` on it; the pooling window has to be an integer.
        self.pool_layer_2 = nn.MaxPool1d(int(data_length // kernel_size))
        self.fc_layer_3 = nn.Linear(128, 128)
        self.fc_layer_4 = nn.Linear(128, 2)

    @staticmethod
    def _predicted_class_mean(logits):
        """Return the mean arg-max class index of ``logits`` as a Python float."""
        # argmax of the logits equals argmax of their softmax, so the softmax is skipped.
        return torch.argmax(logits, dim=-1).float().mean().item()

    def sequence(self,embedd_x):
        """Map embedded input of shape (batch, length, 8) to logits of shape (batch, 2)."""
        x = embedd_x.transpose(-1,-2)
        x_conv_1 = self.conv_layer_1(x[:,:4,:])
        x_conv_2 = torch.sigmoid(self.conv_layer_2(x[:,4:,:]))
        x = x_conv_1*x_conv_2
        del x_conv_1,x_conv_2
        # Drop only the pooled length axis; a bare squeeze() would also drop
        # the batch axis for a batch of one.
        x = self.pool_layer_2(x).squeeze(-1)
        x = self.fc_layer_3(x)
        x = self.fc_layer_4(x)
        return x

    def forward(self, input_, loss_fn ,true_label, repeat = 8):
        """Rewrite the padding region adversarially, then classify the result.

        Args:
            input_: integer token tensor of shape (batch, length); positions
                equal to 0 are treated as padding and are the only ones
                that get modified. The caller's tensor is left untouched.
            loss_fn: callable ``loss_fn(logits, target)`` returning a scalar.
            true_label: labels encoded so that ``|true_label - 1|`` is the
                opposite label (e.g. one-hot rows over 2 classes).
            repeat: number of gradient-guided update rounds.

        Returns:
            Tuple ``(x, batch_acc, batch_grad, one_hot_x)``:
            ``x`` are the logits for the final modified input;
            ``batch_acc`` holds ``repeat + 1`` floats, one for the clean
            input and one per round measured before that round's update;
            ``batch_grad`` holds ``repeat`` 0-d numpy arrays with the mean
            gradient over the padding positions;
            ``one_hot_x`` is the final (batch, length, 257) one-hot input.

        Notes:
            * ``batch_acc`` values are the mean predicted class index, i.e.
              the fraction of samples predicted as class 1. They are not
              compared against ``true_label``.
              NOTE(review): this equals accuracy only if every label is 1.
            * The ``backward()`` calls in here also deposit gradients on the
              model parameters; callers must zero them before an optimizer
              step.
        """
        batch_grad = []

        # Metric on the untouched input; no graph is needed for it.
        with torch.no_grad():
            batch_acc = [self._predicted_class_mean(self.sequence(self.embedding(input_)))]

        padding_place_mask = input_ == 0
        random_padding = torch.randint_like(input=padding_place_mask.float(),low=1,high=257) * padding_place_mask.float()
        # Out-of-place on purpose: `+=` would overwrite the caller's batch.
        input_ = input_ + random_padding.long()

        one_hot_x = F.one_hot(input_,num_classes=257).float()
        one_hot_x.requires_grad = True

        # Loop invariants. The mask keeps a trailing axis of size 1 and is
        # broadcast over the 257 classes instead of being materialised.
        pad_mask = padding_place_mask.unsqueeze(-1)
        pad_mask_f = pad_mask.float()
        mislead_labels = torch.abs(true_label - 1)

        for _ in range(repeat):
            # The matmul with the embedding matrix is the differentiable
            # equivalent of the lookup, so the one-hot input gets a gradient.
            x = self.sequence(one_hot_x @ self.embedding.weight)
            batch_acc.append(self._predicted_class_mean(x))

            # Loss towards the flipped labels.
            loss = loss_fn(x,mislead_labels)
            loss.backward()
            # NOTE(review): one_hot_x.grad is never reset, so from the second
            # round on this is the sum of all rounds' gradients. Kept as in
            # the original; confirm that this is intended.
            grad = one_hot_x.grad

            batch_grad.append(torch.div(
                input=(pad_mask_f*grad).sum(),
                other=pad_mask_f.sum()*grad.shape[-1]
            ).detach().cpu().numpy())

            with torch.no_grad():
                # Per position, the token with the smallest gradient, i.e. the
                # one expected to lower the mislead loss the most.
                best_token = F.one_hot(torch.argmin(grad,dim=-1),num_classes=257).float()
                # Only padding positions are replaced; real bytes stay as they are.
                one_hot_x.data = torch.where(pad_mask, best_token, one_hot_x)

        embedd_x = one_hot_x @ self.embedding.weight
        x = self.sequence(embedd_x)
        return x,batch_acc,batch_grad,one_hot_x

In [9]:
# class Model(nn.Module):
#     def __init__(self, data_length = 2e6, kernel_size = 500):
#         super().__init__()
#         self.embedding = nn.Embedding(257, 8, padding_idx=0)
#         self.conv_layer_1 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
#         # self.bn_1 = nn.BatchNorm1d(128)
#         self.conv_layer_2 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
#         self.pool_layer_2 = nn.MaxPool1d(data_length//kernel_size)
#         self.fc_layer_3 = nn.Linear(128, 128)
#         self.fc_layer_4 = nn.Linear(128, 2)
        
#     def sequence(self,embedd_x):
#         x = embedd_x.transpose(-1,-2)
#         x_conv_1 = self.conv_layer_1(x[:,:4,:])
#         x_conv_2 = torch.sigmoid(self.conv_layer_2(x[:,4:,:]))
#         x = x_conv_1*x_conv_2
#         del x_conv_1,x_conv_2
#         x = self.pool_layer_2(x).squeeze()
#         x = self.fc_layer_3(x)
#         x = self.fc_layer_4(x)
#         return x
        
#     def forward(self, input_, loss_fn, true_label, repeat = 8):
#         batch_acc = []
#         batch_grad = []
        
#         embedd_x = self.embedding(input_)
#         x = self.sequence(embedd_x)
#         acc = torch.argmax(torch.softmax(x,dim=-1),dim=-1).float().mean()
#         batch_acc.append(acc.tolist())
        
#         padding_place_mask = input_ == 0
#         random_padding = torch.randint_like(input=padding_place_mask.float(),low=1,high=257) * padding_place_mask.float()
#         input_ += random_padding.long()
        
#         one_hot_x = F.one_hot(input_,num_classes=257).float()
#         one_hot_x.requires_grad = True
#         one_hot_x.retain_grad()
#         for _ in range(repeat):
#             embedd_x = one_hot_x @ self.embedding.weight
#             x = self.sequence(embedd_x)
            
#             acc = torch.argmax(torch.softmax(x,dim=-1),dim=-1).float().mean()
#             batch_acc.append(acc.tolist())
            
#             mislead_labels = torch.abs(true_label - 1)
#             loss = loss_fn(x,mislead_labels)
#             loss.backward()
            
#             shape = padding_place_mask.shape
#             expand_padding_place_mask = padding_place_mask.unsqueeze(-1).expand(shape[0],shape[1],257)
#             grad_min_idx = torch.argmin(one_hot_x.grad,dim=-1)
#             grad_sez_mask = (one_hot_x.grad <= 0)
#             grad_mask = (expand_padding_place_mask & grad_sez_mask)
#             batch_grad.append(torch.div(
#                 input=(expand_padding_place_mask.float()*one_hot_x.grad).sum(),
#                 other=expand_padding_place_mask.float().sum()
#             ).detach().cpu().numpy())
            
#             one_hot_x.data = F.one_hot(torch.argmin(
#                 one_hot_x.grad,
#                 dim=-1
#             ),num_classes=257).float()*(expand_padding_place_mask.float()) + one_hot_x*(1 - expand_padding_place_mask.float())
            
#         embedd_x = one_hot_x @ self.embedding.weight
#         x = self.sequence(embedd_x)
#         return x,batch_acc,batch_grad,one_hot_x

In [10]:
def train_def(model,trainloader,loss_fn,optim,CUDA):
    """Run one training epoch on adversarially modified batches.

    For every batch, ``model(batch_data, loss_fn, true_label)`` is called; it
    returns the logits of the modified input together with per-round metrics.
    The model is then updated with ``loss_fn`` on those logits against the
    true labels.

    Args:
        model: module whose call returns ``(pred, batch_acc, batch_grad, one_hot_x)``.
        trainloader: iterable of ``(batch_data, batch_label)`` tensors.
        loss_fn: loss taking logits and a (batch, 2) float one-hot target.
        optim: optimizer over the model's parameters.
        CUDA: move each batch to the GPU when true.

    Returns:
        The same ``model`` object after the epoch.

    The progress bar shows the running mean of ``batch_acc + [accuracy]``,
    where the last entry is the accuracy of the final prediction.
    """
    model.train()
    running_sum = None  # element-wise sum of the metric rows seen so far
    bar = tqdm(trainloader)
    # bar = tqdm(testloader)
    # bar = tqdm(validloader)
    for step, (batch_data,batch_label) in enumerate(bar, start=1):
        if CUDA:
            batch_data = batch_data.cuda()
            batch_label = batch_label.cuda()
        # Flatten (batch, 1) to (batch,); unlike squeeze() this keeps a
        # 1-D tensor for a batch of one.
        batch_label = batch_label.reshape(-1)

        # One-hot targets, created on whatever device the labels live on.
        true_label = F.one_hot(batch_label.long(), num_classes=2).float()

        pred,batch_acc,batch_grad,one_hot_x = model(batch_data,loss_fn,true_label)

        # The call above may leave gradients on the parameters (the model in
        # this notebook back-propagates inside forward); clear them before
        # the actual training step.
        optim.zero_grad()
        loss = loss_fn(pred,true_label)
        loss.backward()
        optim.step()

        pred = np.argmax(pred.detach().cpu().numpy(),1)
        batch_label = batch_label.cpu().numpy()
        temp_acc = (batch_label == pred).mean()

        # Running mean instead of re-averaging the full history every step.
        row = np.asarray(batch_acc + [temp_acc], dtype=np.float64)
        running_sum = row if running_sum is None else running_sum + row
        total_batch_acc_str = '[' + ' '.join('%.10f' % value for value in running_sum / step) + ']'
        bar.set_description(total_batch_acc_str)
    return model

In [11]:
def valid_def(model,validloader,loss_fn,CUDA=True):
    """Evaluate the model on adversarially modified validation batches.

    Each batch goes through ``model(batch_data, loss_fn, true_label, repeat = 1)``
    and accuracy is measured on the logits returned for the modified input.

    Args:
        model: module whose call returns ``(pred, batch_acc, batch_grad, one_hot_x)``.
        validloader: iterable of ``(batch_data, batch_label)`` tensors.
        loss_fn: loss taking logits and a (batch, 2) float one-hot target.
        CUDA: move each batch to the GPU when true.

    Returns:
        ``(model, mean_accuracy)`` where ``mean_accuracy`` is the unweighted
        mean of the per-batch accuracies.
    """
    model.eval()
    acc = []
    # bar = tqdm(trainloader)
    # bar = tqdm(testloader)
    bar = tqdm(validloader)
    for step, (batch_data,batch_label) in enumerate(bar):
        if CUDA:
            batch_data = batch_data.cuda()
            batch_label = batch_label.cuda()
        # Flatten (batch, 1) to (batch,); unlike squeeze() this keeps a
        # 1-D tensor for a batch of one.
        batch_label = batch_label.reshape(-1)

        # The model call needs one-hot targets; they were never defined in
        # this function, so the call used to raise NameError.
        true_label = F.one_hot(batch_label.long(), num_classes=2).float()

        pred,batch_acc,batch_grad,one_hot_x = model(batch_data,loss_fn,true_label,repeat = 1)
        # Gradient computation cannot be disabled here because the model call
        # back-propagates internally; drop what it left on the parameters.
        model.zero_grad()

        pred = np.argmax(pred.detach().cpu().numpy(),1)
        batch_label = batch_label.cpu().numpy()

        temp_acc = (batch_label == pred).mean()
        acc.append(temp_acc)
        bar.set_description(f"test：{temp_acc:4f}, test mean: {np.mean(acc):4f}")
    return model,np.mean(acc)

In [12]:
model = Model(data_length=LEAVE_BIT_NUMBER,kernel_size=KERNEL_SIZE)
ce_loss = nn.CrossEntropyLoss()

# Move model and loss to the GPU first, then build the optimizer on the
# parameters in their final location. (The CPU branch used to reference the
# misspelled name `ce_less` and raised NameError.)
if CUDA:
    model = model.cuda()
    ce_loss = ce_loss.cuda()

optim = Adam(model.parameters())

In [13]:
# Checkpoints of this run go into a directory named after the start time,
# truncated to the minute ("YYYY-MM-DD HH:MM").
time_dir = datetime.now().strftime('%Y-%m-%d %H:%M')
# makedirs + exist_ok: re-running the cell within the same minute, or on a
# machine where the parent directories do not exist yet, must not fail.
os.makedirs(f'{trained_model_path}{time_dir}', exist_ok=True)

In [None]:
# 20 epochs; each one trains, validates and then writes a checkpoint whose
# file name records the epoch index and the validation accuracy.
for epoch in range(20):
    print(epoch)
    model = train_def(model, trainloader, ce_loss, optim, CUDA)
    model, test_acc = valid_def(model, validloader, ce_loss, CUDA)
    checkpoint_name = f'my_way_epoch:{epoch}_test_acc:{test_acc:.6f}.pt'
    save_path = f'{trained_model_path}{time_dir}/{checkpoint_name}'
    torch.save(model.state_dict(), save_path)

0


[0.1902826856 0.1924911662 0.3172261519 0.3440812760 0.3566254459 0.3659894038 0.3724381670 0.3773851637 0.3792402875 0.2832155477]:   2%|▏         | 282/12000 [02:27<1:40:44,  1.94it/s]