In [1]:
import multiprocessing as mp
import os
import sys
import time
from datetime import datetime

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.autograd import Variable
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

from src.model import *
# from src.util import *
os.environ["CUDA_VISIBLE_DEVICES"] = '2'

In [2]:
# Directory under which one checkpoint folder per run is created.
trained_model_path = '/workdir/security/home/junjiehuang2468/paper/trained_models_weight/ember/'
# Dataset root; the two entries below are resolved relative to it.
data_path = "/workdir/security/home/junjiehuang2468/paper/data/ember2018/"
train_data_path = os.path.join(data_path, "malwares/")  # Training data
train_label_path = os.path.join(data_path, "train_labels.csv")  # Training label

In [3]:
# Use the GPU only when torch reports a usable CUDA device.
CUDA = bool(torch.cuda.is_available())
NUM_WORKERS = 24  # Number of cores to use for data loader
BATCH_SIZE = 25  # Samples per training batch
LEAVE_BIT_NUMBER = 20000  # Number of leading bytes kept per sample file
KERNEL_SIZE = 500  # Kernel size & stride for Malconv (default : 500)

In [4]:
# Split tables stored next to the raw data; the "id" and "labels" columns of
# each are consumed when the datasets are built.
train_csv = data_path + 'train_dataset.csv'
valid_csv = data_path + 'valid_dataset.csv'
trainset = pd.read_csv(train_csv)
validset = pd.read_csv(valid_csv)

In [5]:
class ExeDataset(Dataset):
    """Serve the leading bytes of each sample file as a fixed-length integer sequence.

    Every byte value is shifted up by one so that 0 stays free to act as the
    padding token; files shorter than ``leave_bit_num`` are right-padded with 0.

    Args:
        malware_names: Sequence of file stems; ``<data_path><name>.txt`` is
            opened for each item.
        data_path: Directory prefix. It is concatenated as-is, so it must end
            with a path separator.
        labels: Sequence of labels aligned index-for-index with
            ``malware_names``.
        leave_bit_num: Number of leading bytes kept per file, which is also
            the length of every returned sequence.
    """

    def __init__(self, malware_names, data_path, labels, leave_bit_num):
        self.malware_names = malware_names
        self.data_path = data_path
        self.labels = labels
        self.leave_bit_num = leave_bit_num

    def __len__(self):
        """Return the number of samples."""
        return len(self.malware_names)

    def __getitem__(self, idx):
        """Return ``(sequence, label)`` for sample ``idx``.

        ``sequence`` is an int64 array of length ``leave_bit_num`` holding
        ``byte + 1`` for each byte read and 0 for padding; ``label`` is a
        one-element array wrapping ``labels[idx]``.
        """
        sample_path = self.data_path + self.malware_names[idx] + '.txt'
        with open(sample_path, 'rb') as fp:
            # Read only the prefix that is kept instead of loading the whole file.
            raw = fp.read(self.leave_bit_num)

        # Start from all-padding and overwrite the part covered by real bytes.
        data = np.zeros(self.leave_bit_num, dtype=np.int64)
        # Widen before adding 1: as uint8, a byte value of 255 would wrap to 0.
        data[:len(raw)] = np.frombuffer(raw, dtype=np.uint8).astype(np.int64) + 1

        return data, np.array([self.labels[idx]])

In [6]:
# Both splits read their sample files from the same directory; only the
# id/label lists taken from the split tables differ.
train_dataset = ExeDataset(
    malware_names=trainset["id"].tolist(),
    data_path=train_data_path,
    labels=trainset["labels"].tolist(),
    leave_bit_num=LEAVE_BIT_NUMBER,
)
valid_dataset = ExeDataset(
    malware_names=validset["id"].tolist(),
    data_path=train_data_path,
    labels=validset["labels"].tolist(),
    leave_bit_num=LEAVE_BIT_NUMBER,
)

In [7]:
# Training batches are reshuffled every epoch.
trainloader = DataLoader(
    dataset = train_dataset,
    batch_size = BATCH_SIZE,
    shuffle = True,
    num_workers = NUM_WORKERS,
    pin_memory = True
)
# Validation only measures accuracy, so the sample order is irrelevant to the
# result; keeping it fixed makes every evaluation pass deterministic.
validloader = DataLoader(
    dataset = valid_dataset,
    batch_size = 1024,
    shuffle = False,
    num_workers = NUM_WORKERS,
    pin_memory = True
)

In [8]:
class Model(nn.Module):
    """Gated 1-D convolution classifier over integer token sequences.

    Tokens are embedded into 8 channels. The first 4 channels feed a plain
    convolution and the last 4 feed a second convolution squashed by a
    sigmoid; the two results are multiplied element-wise, max-pooled with a
    window of ``data_length // kernel_size`` positions, and passed through two
    linear layers that emit 2 scores per sample.

    Args:
        data_length: Length of every input sequence. The pooling window is
            derived from it; non-integer values such as the default ``2e6``
            are accepted and the resulting window is converted to ``int``.
        kernel_size: Kernel width and stride shared by both convolutions.
    """

    def __init__(self, data_length = 2e6, kernel_size = 500):
        super().__init__()
        # 257 rows: token ids 0..256, with id 0 reserved as the padding index.
        self.embedding = nn.Embedding(257, 8, padding_idx=0)
        self.conv_layer_1 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
        self.conv_layer_2 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
        # MaxPool1d needs an integer window; the float default would give 4000.0.
        self.pool_layer_2 = nn.MaxPool1d(int(data_length // kernel_size))
        self.fc_layer_3 = nn.Linear(128, 128)
        self.fc_layer_4 = nn.Linear(128, 2)

    def forward(self,x):
        """Map a ``(batch, length)`` tensor of token ids to ``(batch, 2)`` scores."""
        # (batch, length, 8) -> (batch, 8, length) so Conv1d sees channels first.
        x = self.embedding(x).transpose(-1,-2)
        gate = torch.sigmoid(self.conv_layer_2(x[:,4:,:]))
        x = self.conv_layer_1(x[:,:4,:]) * gate
        # Drop only the pooled length axis: a bare squeeze() would also remove
        # the batch axis when the batch holds a single sample.
        x = self.pool_layer_2(x).squeeze(-1)
        x = self.fc_layer_3(x)
        return self.fc_layer_4(x)

In [9]:
def train_def(model,trainloader,loss_fn,optim,cuda=True):
    """Run one training epoch and show the running accuracy on a tqdm bar.

    Args:
        model: Module mapping a data batch to scores of shape
            ``(batch, classes)``.
        trainloader: Iterable of ``(batch_data, batch_label)`` tensor pairs;
            labels hold integer class indices.
        loss_fn: Callable invoked as ``loss_fn(scores, one_hot_targets)`` and
            returning a scalar tensor.
        optim: Optimizer stepping the model's parameters.
        cuda: When true, each batch is moved to the GPU before the forward pass.

    Returns:
        The same ``model`` object, updated in place.
    """
    model.train()
    hits = []
    bar = tqdm(trainloader)
    for batch_data, batch_label in bar:
        optim.zero_grad()
        if cuda:
            batch_data, batch_label = batch_data.cuda(), batch_label.cuda()
        # Flatten instead of squeeze(): a final batch of one sample must stay
        # 1-D, otherwise it collapses to a 0-d tensor and len() raises.
        batch_label = batch_label.reshape(-1)
        pred = model(batch_data)
        # One-hot float targets built with a single scatter, already on the
        # same device and dtype as the scores.
        target = torch.zeros_like(pred).scatter_(1, batch_label.long().unsqueeze(1), 1.0)
        loss = loss_fn(pred, target)
        loss.backward()
        optim.step()
        predicted = pred.detach().argmax(dim=1)
        hits.extend((predicted == batch_label).tolist())
        bar.set_description(f'train: {np.mean(hits):.6}')
    return model

In [10]:
def valid_def(model,validloader,cuda=True):
    """Evaluate ``model`` on ``validloader`` and report its accuracy.

    Args:
        model: Module mapping a data batch to scores of shape
            ``(batch, classes)``.
        validloader: Iterable of ``(batch_data, batch_label)`` tensor pairs;
            labels hold integer class indices.
        cuda: When true, each batch is moved to the GPU before the forward pass.

    Returns:
        Tuple ``(model, accuracy)`` where ``accuracy`` is the mean of the
        per-sample correctness flags over the whole loader.
    """
    model.eval()
    hits = []
    bar = tqdm(validloader)
    # No optimizer and no gradients are involved in evaluation; disabling
    # autograd avoids building a graph for every (large) validation batch.
    with torch.no_grad():
        for batch_data, batch_label in bar:
            if cuda:
                batch_data, batch_label = batch_data.cuda(), batch_label.cuda()
            # Flatten instead of squeeze() so a batch of one sample stays 1-D.
            batch_label = batch_label.reshape(-1)
            predicted = model(batch_data).argmax(dim=1)
            hits.extend((predicted == batch_label).tolist())
            bar.set_description(f'test: {np.mean(hits):.6}')
    return model,np.mean(hits)

In [12]:
# Name the run directory after the current local time at minute resolution
# ("YYYY-MM-DD HH:MM"), the same text as str(datetime.now()) cut at its last colon.
time_dir = datetime.now().isoformat(sep=' ', timespec='minutes')
# exist_ok keeps the cell re-runnable within the same minute; makedirs also
# creates the parent directory if it is missing.
os.makedirs(f'{trained_model_path}{time_dir}', exist_ok=True)

In [13]:
# Build the network for the configured sequence length and kernel size.
model = Model(data_length=LEAVE_BIT_NUMBER,kernel_size=KERNEL_SIZE)
ce_loss = nn.CrossEntropyLoss()

# Move to the GPU before creating the optimizer, so the optimizer is built
# over the parameters in their final location. On CPU both objects are left
# as they are (the original fallback referenced an undefined name, `ce_less`).
if CUDA:
    model = model.cuda()
    ce_loss = ce_loss.cuda()

optim = Adam(model.parameters())

In [14]:
# 25 epochs: train, validate, then checkpoint the weights with the epoch index
# and validation accuracy encoded in the file name.
for i in range(25):
    print(i)
    model = train_def(
        model=model,
        trainloader=trainloader,
        loss_fn=ce_loss,
        optim=optim,
        cuda=CUDA,
    )
    model, test_acc = valid_def(model=model, validloader=validloader, cuda=CUDA)
    save_path = f'{trained_model_path}{time_dir}/2w_epoch:{i}_test_acc:{test_acc:.6f}.pt'
    torch.save(model.state_dict(), save_path)

0


train: 0.85696: 100%|██████████| 19200/19200 [08:35<00:00, 37.27it/s] 
test: 0.880583: 100%|██████████| 118/118 [00:26<00:00,  4.53it/s]


1


train: 0.910431: 100%|██████████| 19200/19200 [08:23<00:00, 38.16it/s]
test: 0.88955: 100%|██████████| 118/118 [00:25<00:00,  4.55it/s] 


2


train: 0.935844: 100%|██████████| 19200/19200 [08:15<00:00, 38.74it/s]
test: 0.893517: 100%|██████████| 118/118 [00:25<00:00,  4.62it/s]


3


train: 0.95345: 100%|██████████| 19200/19200 [08:13<00:00, 38.94it/s] 
test: 0.890633: 100%|██████████| 118/118 [00:25<00:00,  4.62it/s]


4


train: 0.963206: 100%|██████████| 19200/19200 [08:12<00:00, 38.98it/s]
test: 0.89335: 100%|██████████| 118/118 [00:25<00:00,  4.62it/s] 


5


train: 0.970092: 100%|██████████| 19200/19200 [08:30<00:00, 37.61it/s]
test: 0.895942: 100%|██████████| 118/118 [00:26<00:00,  4.53it/s]


6


train: 0.974992: 100%|██████████| 19200/19200 [08:16<00:00, 38.70it/s]
test: 0.895858: 100%|██████████| 118/118 [00:25<00:00,  4.58it/s]


7


train: 0.977944: 100%|██████████| 19200/19200 [08:11<00:00, 39.07it/s]
test: 0.895258: 100%|██████████| 118/118 [00:25<00:00,  4.60it/s]


8


train: 0.980673: 100%|██████████| 19200/19200 [08:17<00:00, 38.58it/s]
test: 0.891858: 100%|██████████| 118/118 [00:25<00:00,  4.58it/s]


9


train: 0.982406: 100%|██████████| 19200/19200 [08:09<00:00, 39.25it/s]
test: 0.8942: 100%|██████████| 118/118 [00:25<00:00,  4.62it/s]  


10


train: 0.983752: 100%|██████████| 19200/19200 [08:09<00:00, 39.19it/s]
test: 0.894433: 100%|██████████| 118/118 [00:25<00:00,  4.57it/s]


11


train: 0.985356: 100%|██████████| 19200/19200 [08:20<00:00, 38.35it/s]
test: 0.895417: 100%|██████████| 118/118 [00:26<00:00,  4.48it/s]


12


train: 0.986369: 100%|██████████| 19200/19200 [08:18<00:00, 38.52it/s]
test: 0.893383: 100%|██████████| 118/118 [00:25<00:00,  4.54it/s]


13


train: 0.987156: 100%|██████████| 19200/19200 [08:08<00:00, 39.27it/s]
test: 0.896192: 100%|██████████| 118/118 [00:25<00:00,  4.60it/s]


14


train: 0.987738: 100%|██████████| 19200/19200 [08:10<00:00, 39.15it/s]
test: 0.896383: 100%|██████████| 118/118 [00:25<00:00,  4.56it/s]


15


train: 0.988642: 100%|██████████| 19200/19200 [08:10<00:00, 39.18it/s]
test: 0.893042: 100%|██████████| 118/118 [00:25<00:00,  4.62it/s]


16


train: 0.988952: 100%|██████████| 19200/19200 [08:24<00:00, 38.08it/s]
test: 0.897317: 100%|██████████| 118/118 [00:26<00:00,  4.38it/s]


17


train: 0.989471: 100%|██████████| 19200/19200 [08:25<00:00, 37.98it/s]
test: 0.895033: 100%|██████████| 118/118 [00:25<00:00,  4.57it/s]


18


train: 0.991459:  44%|████▎     | 8352/19200 [02:46<04:42, 38.45it/s]Exception in thread Thread-41:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/workdir/security/home/junjiehuang2468/.local/lib/python3.6/site-packages/torch/utils/data/_utils/pin_memory.py", line 28, in _pin_memory_loop
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.6/multiprocessing/queues.py", line 113, in get
    return _ForkingPickler.loads(res)
  File "/workdir/security/home/junjiehuang2468/.local/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
    fd = df.detach()
  File "/usr/lib/python3.6/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  File "/usr/lib/python3.6/multiprocessin

KeyboardInterrupt: 