<a href="https://colab.research.google.com/github/rimbourouphael/NSY107/blob/main/Model-2_Balanced_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Deep Graph Convolutional Neural Network (DGCNN)

This code is part of our research on malware detection and classification using Deep Learning and Deep Graph Convolutional Neural Networks.

For more information or citation, please refer to our research paper:

"Oliveira, Angelo; Sassi, Renato José (2019): Behavioral Malware Detection Using Deep Graph Convolutional Neural Networks. TechRxiv. Preprint." at https://doi.org/10.36227/techrxiv.10043099.v1

For the dataset, please refer to our repository:

https://ieee-dataport.org/open-access/malware-analysis-datasets-api-call-sequences

#### Model-2, Balanced Dataset

In [1]:
!pip install skorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting skorch
  Downloading skorch-0.11.0-py3-none-any.whl (155 kB)
[K     |████████████████████████████████| 155 kB 7.2 MB/s 
Installing collected packages: skorch
Successfully installed skorch-0.11.0


In [2]:
import numpy as np
# Single seed reused across the notebook (NumPy, PyTorch, under-sampler, data split, CV folds).
SEED = 137
# Seed NumPy's global generator right away, before any other library is imported.
np.random.seed(SEED)

import time
from collections import Counter

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
from sklearn.metrics import precision_score, f1_score, roc_auc_score

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import torch
# Seed PyTorch's generator; the training cells call this again just before building a network.
torch.manual_seed(SEED)
import torch.nn as nn
import torch.nn.functional as F

from skorch.classifier import NeuralNetBinaryClassifier

### Data Preprocessing

In [3]:
# Read the API call sequence CSV directly from the GitHub repository and preview the first rows.
df = pd.read_csv(
    'https://raw.githubusercontent.com/rimbourouphael/NSY107/main/dynamic_api_call_sequence_per_malware_100_0_306.csv'
)
df.head(n = 5)

Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,malware
0,071e8c3f8922e186e57548cd4c703a5d,112,274,158,215,274,158,215,298,76,...,71,297,135,171,215,35,208,56,71,1
1,33f8e6d08a6aae939f25a8e0d63dd523,82,208,187,208,172,117,172,117,172,...,81,240,117,71,297,135,171,215,35,1
2,b68abd064e975e1c6d5f25e748663076,16,110,240,117,240,117,240,117,240,...,65,112,123,65,112,123,65,113,112,1
3,72049be7bd30ea61297ea624ae198067,82,208,187,208,172,117,172,117,172,...,208,302,208,302,187,208,302,228,302,1
4,c9b3700a77facf29172f32df6bc77f48,82,240,117,240,117,240,117,240,117,...,209,260,40,209,260,141,260,141,260,1


In [4]:
# Summarise the loaded frame: number of rows and columns, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43876 entries, 0 to 43875
Columns: 102 entries, hash to malware
dtypes: int64(101), object(1)
memory usage: 34.1+ MB


In [5]:
# Features are every column except the sample hash and the label; the label is the 'malware' column.
X = df.drop(columns = ['hash', 'malware']).values.astype(int)
y = df['malware'].values.astype(int)
# One shape per line: features first, labels second.
print(X.shape, y.shape, sep = '\n')

(43876, 100)
(43876,)


In [6]:
# Smallest and largest value found in the feature matrix, one per line.
print(X.min(), X.max(), sep = '\n')

0
306


In [7]:
def check_imbalance(dataset):
    """Print the per-class sample counts of `dataset` and the ratio between the first two classes.

    Parameters
    ----------
    dataset : iterable of hashable, sortable labels
        Class labels, for example a 1-D array of 0/1 values.

    Prints
    ------
    First the list of ``(label, count)`` pairs sorted by label, then the count
    of the second label divided by the count of the first label. When fewer
    than two distinct labels are present there is no ratio to compute, so a
    short notice is printed instead of failing with ``IndexError``.

    Returns
    -------
    None
    """
    count = sorted(Counter(dataset).items())
    print(count)
    if len(count) < 2:
        # Empty or single-class input: there is no second class to compare against.
        print('Imbalance ratio undefined: fewer than two classes present')
        return
    print(count[1][1] / count[0][1])

In [8]:
# Class distribution of the full dataset, before any re-sampling.
check_imbalance(y)

[(0, 1079), (1, 42797)]
39.66357738646895


In [9]:
# Balance the classes by random under-sampling; seeded so the same rows are selected on every run.
random_undersampler = RandomUnderSampler(random_state = SEED)

# NOTE: X and y are rebound to the re-sampled data, so the original (imbalanced) arrays are no longer
# reachable through these names after this cell has run.
X, y = random_undersampler.fit_resample(X, y)

# Confirm the class counts after re-sampling.
check_imbalance(y)

[(0, 1079), (1, 1079)]
1.0


In [10]:
# Hold out 30% of the balanced data for testing; seeded so the split is repeatable.
# NOTE(review): no `stratify` argument is passed, so the class proportions of the two parts can differ slightly.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = SEED)

In [11]:
# Class distribution of the training part, then of the test part.
check_imbalance(y_train)
check_imbalance(y_test)

# Remove the names bound to the full dataset; the cells below work on the train/test parts only.
del df, X, y

[(0, 738), (1, 772)]
1.046070460704607
[(0, 341), (1, 307)]
0.9002932551319648


### Architecture

In [12]:
def norn_adj(X, input_dim_1):
    """Build self-looped adjacency matrices and their inverse-degree matrices for a batch of sequences.

    For every pair of consecutive items in a sequence, the entry
    ``A[b, X[b, t], X[b, t + 1]]`` is set to 1. The identity matrix is then
    added to obtain ``A_hat``.

    Parameters
    ----------
    X : integer (int64) tensor of shape (batch, sequence_length)
        Node indices; every value must lie in ``[0, input_dim_1)``.
    input_dim_1 : int
        Number of nodes, i.e. the side length of each adjacency matrix.

    Returns
    -------
    A_hat : float tensor of shape (batch, input_dim_1, input_dim_1)
        Adjacency matrix plus identity.
    D_hat : float tensor of shape (batch, input_dim_1, input_dim_1)
        Diagonal matrices whose diagonal holds the reciprocal of ``A_hat``
        summed over dim 1.

    Notes
    -----
    All tensors are created on the device of ``X`` rather than on a hard-coded
    CUDA device, so the function works on CPU and on any GPU.
    """
    device = X.device

    A = torch.zeros((X.size(0), input_dim_1, input_dim_1), dtype = torch.float, device = device)

    # Flatten each (input_dim_1 x input_dim_1) matrix so a single scatter can write all edges of a sample.
    A_view = A.view(A.size(0), -1)
    x_size = X.size(-1)
    # Flat position of edge (source, target): source * row stride + target * column stride.
    indices = X.narrow(-1, 0, x_size - 1) * A.stride(1) * A.stride(2) + X.narrow(-1, 1, x_size - 1) * A.stride(2)
    # A_view shares storage with A, so this in-place write fills A.
    A_view.scatter_(1, indices, 1)

    A_hat = A + torch.eye(input_dim_1, dtype = torch.float, device = device)
    # The added identity keeps every sum >= 1, so the reciprocal is always finite.
    D_hat = A_hat.sum(dim = 1).pow(-1.0).diag_embed()

    return A_hat, D_hat

def to_one_hot(X, input_dim_1):
    """One-hot encode a batch of index sequences and put the class axis before the sequence axis.

    Parameters
    ----------
    X : integer tensor of shape (batch, sequence_length)
        Values in ``[0, input_dim_1)``.
    input_dim_1 : int
        Number of classes of the one-hot encoding.

    Returns
    -------
    Float tensor of shape (batch, input_dim_1, sequence_length).
    """
    encoded = F.one_hot(X, num_classes = input_dim_1)
    # (batch, sequence, class) -> (batch, class, sequence)
    return encoded.float().permute(0, 2, 1)

class DGCNN_network(nn.Module):
    """One graph-convolution step: ``D_hat @ A_hat @ X @ W`` with a learnable weight matrix ``W``."""

    def __init__(self, weight_dim_1, weight_dim_2):
        """Create the layer with a (weight_dim_1, weight_dim_2) weight matrix sampled uniformly from [0, 1)."""
        super().__init__()
        self.weight_dim_1 = weight_dim_1
        self.weight_dim_2 = weight_dim_2
        initial_weights = torch.rand(weight_dim_1, weight_dim_2, dtype = torch.float)
        # nn.Parameter registers the tensor with the module and enables gradients for it.
        self.weights = nn.Parameter(initial_weights)

    def forward(self, A_hat, D_hat, X):
        """Return ``D_hat @ A_hat @ X @ weights``, multiplied left to right."""
        propagation = torch.matmul(D_hat, A_hat)
        aggregated = torch.matmul(propagation, X)
        return torch.matmul(aggregated, self.weights)

class Model_2_network(nn.Module):
    """Two stacked graph-convolution layers followed by a linear layer that outputs one score per sample.

    Parameters
    ----------
    input_dim_1 : int
        Number of graph nodes (size of the one-hot encoding of each sequence item).
    input_dim_2 : int
        Sequence length; it is also the input width of the first graph-convolution layer.
    weight_dim_2 : int
        Output width of the first graph-convolution layer (and input width of the second).
    weight_dim_4 : int
        Output width of the second graph-convolution layer.
    dropout_rate : float
        Dropout probability applied after each graph-convolution layer.
    """

    def __init__(self, input_dim_1, input_dim_2, weight_dim_2, weight_dim_4, dropout_rate):

        super(Model_2_network, self).__init__()

        self.input_dim_1 = input_dim_1
        self.input_dim_2 = input_dim_2
        self.weight_dim_1 = input_dim_2
        self.weight_dim_2 = weight_dim_2
        self.weight_dim_3 = weight_dim_2
        self.weight_dim_4 = weight_dim_4
        self.dropout_rate = dropout_rate

        self.dgcnn_1 = DGCNN_network(self.weight_dim_1, self.weight_dim_2)
        self.dropout = nn.Dropout(p = self.dropout_rate)
        self.dgcnn_2 = DGCNN_network(self.weight_dim_3, self.weight_dim_4)
        # The outputs of both layers are concatenated per node and flattened before the linear layer.
        self.fc = nn.Linear(self.input_dim_1 * (self.weight_dim_2 + self.weight_dim_4), 1)

    def forward(self, X):
        """Compute one raw score per sample.

        Parameters
        ----------
        X : integer tensor of shape (batch, input_dim_2)
            Sequences of node indices in ``[0, input_dim_1)``.

        Returns
        -------
        Float tensor of shape (batch,).
        """
        # Use the instance's own node count. Reading a notebook-level `input_dim_1` global made the
        # model depend on cell execution order and on that global matching the constructor argument.
        A_hat, D_hat = norn_adj(X, self.input_dim_1)
        X = to_one_hot(X, self.input_dim_1)

        H_1 = self.dgcnn_1(A_hat, D_hat, X)
        H_1 = self.dropout(H_1)
        H_1 = torch.relu(H_1)
        H_2 = self.dgcnn_2(A_hat, D_hat, H_1)
        H_2 = self.dropout(H_2)
        H_2 = torch.relu(H_2)
        # Concatenate the features of both layers along the feature axis, then flatten per sample.
        H_2 = torch.cat([H_1, H_2], 2)
        H_2 = H_2.view(H_2.size(0), -1)
        H_2 = self.fc(H_2)

        # Remove only the trailing singleton dimension; a bare squeeze() would also drop the batch
        # axis when the batch holds a single sample.
        return H_2.squeeze(-1)
    
# Sanity check: build the network once, show its layers and its number of parameters, then discard it.
model = Model_2_network(
    input_dim_1 = 307, input_dim_2 = 100,
    weight_dim_2 = 71, weight_dim_4 = 17,
    dropout_rate = 0.4,
)

print(model)
print(f'\nParameters: {sum(param.numel() for param in model.parameters())}')
del model

Model_2_network(
  (dgcnn_1): DGCNN_network()
  (dropout): Dropout(p=0.4, inplace=False)
  (dgcnn_2): DGCNN_network()
  (fc): Linear(in_features=27016, out_features=1, bias=True)
)

Parameters: 35324


### Hyperparameter Optimization

In [None]:
# Exhaustive grid search over layer widths, dropout rate, batch size and epoch count,
# scored by ROC AUC with 5-fold stratified cross-validation on the training part.
start_training = time.time()

torch.manual_seed(SEED)

input_dim_1 = 307
input_dim_2 = 100

net = NeuralNetBinaryClassifier(
    Model_2_network,
    module__input_dim_1 = input_dim_1,
    module__input_dim_2 = input_dim_2,
    module__weight_dim_2 = 17,
    module__weight_dim_4 = 17,
    module__dropout_rate = 0.4,
    batch_size = 32,
    max_epochs = 10,
    # No internal validation split: cross-validation below provides the held-out folds.
    train_split = None,
    optimizer = torch.optim.Adam,
    iterator_train__shuffle = True,
    # Use the GPU when there is one instead of failing outright on CPU-only machines.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
)

pipe = Pipeline([
    ('net', net)
])

# Layer widths and resulting parameter counts of the DGCNN next to the LSTM sizes they are paired with
# (presumably the LSTM baselines' parameter counts - TODO confirm against the paper):
# LSTM(10) => 12771 => DGCNN(17, 17) => 12428
# LSTM(40) => 55881 => DGCNN(71, 71) => 55736
# LSTM(70) => 106191 => DGCNN(126, 126) => 105841
# LSTM(100) => 163701 => DGCNN(182, 182) => 163073

# 4 * 4 * 3 * 3 * 3 = 432 combinations, each fitted on 5 folds.
params = {
    'net__module__weight_dim_2' : [17, 71, 126, 182],
    'net__module__weight_dim_4' : [17, 71, 126, 182],
    'net__module__dropout_rate' : [0.4, 0.5, 0.6],
    'net__batch_size' : [32, 64, 128],
    'net__max_epochs' : [10, 20, 30]
}

gs = GridSearchCV(
    pipe,
    params,
    # The final model is trained explicitly in the Training section, so no refit here.
    refit = False,
    cv = StratifiedKFold(n_splits = 5, random_state = SEED, shuffle = True),
    # predict_proba returns one column per class; ROC AUC needs the positive-class column only,
    # the same column the evaluation cell uses.
    scoring = lambda net_gs, X_gs, y_gs : roc_auc_score(y_gs, net_gs.predict_proba(X_gs)[:, 1]),
    verbose = 2
)

# `np.float` was deprecated in NumPy 1.20 and later removed; it was an alias of the builtin
# float, so float64 keeps the same target dtype.
gs.fit(X_train, y_train.astype(np.float64))

print(gs.best_score_, gs.best_params_)

end_training = (time.time() - start_training) / 60

print(f'\nDone! Time: {end_training:.2f} min')

#### Best Configuration:

0.9745231287489504 {'net__batch_size': 32, 'net__max_epochs': 30, 'net__module__dropout_rate': 0.6, 'net__module__weight_dim_2': 71, 'net__module__weight_dim_4': 17}

### Training

In [13]:
# Train the final model on the whole training part with the best configuration found by the grid search.
torch.manual_seed(SEED)

# To reuse the result of a grid search run in this session instead of the hard-coded values below:
# weight_dim_2 = gs.best_params_['net__module__weight_dim_2']
# weight_dim_4 = gs.best_params_['net__module__weight_dim_4']
# dropout_rate = gs.best_params_['net__module__dropout_rate']
# batch_size = gs.best_params_['net__batch_size']
# max_epochs = gs.best_params_['net__max_epochs']

# Values copied from the "Best Configuration" reported above, so this cell runs without repeating the search.
input_dim_1 = 307
input_dim_2 = 100
weight_dim_2 = 71
weight_dim_4 = 17
dropout_rate = 0.6
batch_size = 32
max_epochs = 30

net = NeuralNetBinaryClassifier(
    Model_2_network,
    module__input_dim_1 = input_dim_1,
    module__input_dim_2 = input_dim_2,
    module__weight_dim_2 = weight_dim_2,
    module__weight_dim_4 = weight_dim_4,
    module__dropout_rate = dropout_rate,
    batch_size = batch_size,
    max_epochs = max_epochs,
    # Train on every training sample; evaluation uses the separate test part.
    train_split = None,
    optimizer = torch.optim.Adam,
    iterator_train__shuffle = True,
    # Use the GPU when there is one instead of failing outright on CPU-only machines.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
)

pipe = Pipeline([
    ('net', net)
])

# `np.float` was deprecated in NumPy 1.20 and later removed; it was an alias of the builtin
# float, so float64 keeps the same target dtype.
pipe.fit(X_train, y_train.astype(np.float64))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


  epoch    train_loss     dur
-------  ------------  ------
      1       [36m17.7610[0m  0.5903
      2        [36m3.0380[0m  0.3783
      3        [36m0.6344[0m  0.3680
      4        [36m0.2701[0m  0.3699
      5        [36m0.1933[0m  0.3621
      6        [36m0.1601[0m  0.3642
      7        [36m0.1355[0m  0.3534
      8        [36m0.1150[0m  0.3702
      9        0.1170  0.3611
     10        [36m0.0815[0m  0.3589
     11        0.0832  0.3622
     12        [36m0.0730[0m  0.3503
     13        [36m0.0676[0m  0.3565
     14        [36m0.0664[0m  0.3532
     15        0.0683  0.3547
     16        [36m0.0539[0m  0.3571
     17        0.0607  0.3548
     18        [36m0.0383[0m  0.3494
     19        [36m0.0365[0m  0.3539
     20        0.0394  0.3667
     21        0.0462  0.3579
     22        0.0477  0.3546
     23        [36m0.0314[0m  0.3610
     24        0.0351  0.3508
     25        0.0384  0.3595
     26        [36m0.0254[0m  0.3548
     27

Pipeline(steps=[('net',
                 <class 'skorch.classifier.NeuralNetBinaryClassifier'>[initialized](
  module_=Model_2_network(
    (dgcnn_1): DGCNN_network()
    (dropout): Dropout(p=0.6, inplace=False)
    (dgcnn_2): DGCNN_network()
    (fc): Linear(in_features=27016, out_features=1, bias=True)
  ),
))])

### Evaluation

In [14]:
def model_evaluate(y, pred, threshold = 0.5):
    """Print the confusion matrix and the main binary-classification metrics.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        True labels encoded as 0/1.
    pred : array-like of shape (n_samples,)
        Predicted scores or probabilities for the positive class.
    threshold : float, default 0.5
        Scores greater than or equal to this value count as positive
        predictions. The default reproduces the previous fixed cut-off.

    Prints
    ------
    Confusion matrix, accuracy, precision, recall and F1-score computed on the
    thresholded predictions, followed by the ROC AUC computed on the raw
    scores.

    Returns
    -------
    None
    """
    # Accept plain lists as well as arrays; the comparisons below need array semantics.
    y = np.asarray(y)
    pred = np.asarray(pred)

    # Binarise once instead of repeating the comparison for every metric.
    y_true = y >= 0.5
    y_pred = pred >= threshold

    print('Confusion matrix\n[TN FP]\n[FN TP]')
    print(confusion_matrix(y_true, y_pred))
    print(f'Accuracy: {accuracy_score(y_true, y_pred):.4f}')
    print(f'Precision: {precision_score(y_true, y_pred):.4f}')
    print(f'Recall: {recall_score(y_true, y_pred):.4f}')
    print(f'F1-Score: {f1_score(y_true, y_pred):.4f}')
    # ROC AUC is threshold-free, so it is computed from the raw scores.
    print(f'ROC AUC: {roc_auc_score(y, pred):.4f}')

In [15]:
# Baseline for comparison: score a trivial predictor that outputs 1 for every test sample.
model_evaluate(y_test, np.ones(len(y_test)))

Confusion matrix
[TN FP]
[FN TP]
[[  0 341]
 [  0 307]]
Accuracy: 0.4738
Precision: 0.4738
Recall: 1.0000
F1-Score: 0.6429
ROC AUC: 0.5000


In [18]:
# Keep the second column of predict_proba (presumably the positive-class probability - confirm in the skorch docs).
# NOTE: despite its name, X_test_1 holds model outputs for the test set, not input features.
X_test_1 = pipe.predict_proba(X_test)[:, 1]
model_evaluate(y_test, X_test_1)

Confusion matrix
[TN FP]
[FN TP]
[[312  29]
 [ 21 286]]
Accuracy: 0.9228
Precision: 0.9079
Recall: 0.9316
F1-Score: 0.9196
ROC AUC: 0.9687


In [None]:
%%javascript
// Ask the notebook front end to save the notebook.
// NOTE(review): `IPython.notebook` looks like the classic Jupyter Notebook JavaScript API -
// confirm it is available in the front end in use (e.g. Colab, JupyterLab).
IPython.notebook.save_notebook()

<IPython.core.display.Javascript object>