<a href="https://colab.research.google.com/github/rimbourouphael/NSY107_test/blob/main/Copy_of_LSTM_Balanced_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Long Short-Term Memory (LSTM) Network

This code is part of our research on malware detection and classification using Deep Learning and Deep Graph Convolutional Neural Networks.

For more information or citation, please refer to our research paper:

"Oliveira, Angelo; Sassi, Renato José (2019): Behavioral Malware Detection Using Deep Graph Convolutional Neural Networks. TechRxiv. Preprint." at https://doi.org/10.36227/techrxiv.10043099.v1

For the dataset, please refer to our repository:

https://ieee-dataport.org/open-access/malware-analysis-datasets-api-call-sequences

#### Balanced Dataset

In [None]:
!pip install skorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting skorch
  Downloading skorch-0.11.0-py3-none-any.whl (155 kB)
[K     |████████████████████████████████| 155 kB 31.5 MB/s 
Installing collected packages: skorch
Successfully installed skorch-0.11.0


In [None]:
import numpy as np
SEED = 137
np.random.seed(SEED)

import time
from collections import Counter

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
from sklearn.metrics import precision_score, f1_score, roc_auc_score

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import torch
torch.manual_seed(SEED)
import torch.nn as nn
import torch.nn.functional as F

from skorch.classifier import NeuralNetBinaryClassifier

### Data Preprocessing

In [None]:
# Raw string literal: in a normal string the '\U' of 'C:\Users' starts a unicode
# escape, so the cell fails with a SyntaxError before anything is read.
# NOTE(review): this is a machine-specific absolute path; point it at your local
# copy of the dataset CSV.
df = pd.read_csv(r'C:\Users\rimbo\OneDrive\Desktop\Rim\dynamic_api_call_sequence_per_malware_100_0_306.csv')
df.head()

SyntaxError: ignored

In [None]:
# Summarise the loaded frame: index range, column count, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43876 entries, 0 to 43875
Columns: 102 entries, hash to malware
dtypes: int64(101), object(1)
memory usage: 34.1+ MB


In [None]:
# Feature matrix: every column except the sample hash and the label.
X = df.drop(columns = ['hash', 'malware']).values.astype(int)
# Label vector taken from the `malware` column.
y = df['malware'].values.astype(int)
print(X.shape, y.shape, sep = '\n')

(43876, 100)
(43876,)


In [None]:
# Smallest and largest value found anywhere in the feature matrix, one per line.
print(X.min(), X.max(), sep = '\n')

0
306


In [None]:
def check_imbalance(dataset):
    """Print the per-label counts of `dataset` and the ratio between the first two.

    The counts are printed as a list of (label, count) pairs sorted by label,
    followed by count-of-second-label divided by count-of-first-label.
    Returns None.
    """
    label_counts = Counter(dataset)
    count = sorted(label_counts.items())
    print(count)
    (_, first_total), (_, second_total) = count[0], count[1]
    print(second_total / first_total)

In [None]:
# Undersampler seeded with SEED so the resampling is repeatable.
random_undersampler = RandomUnderSampler(random_state = SEED)

# X and y are overwritten with the resampled arrays; the originals are not kept.
X, y = random_undersampler.fit_resample(X, y)

# Report per-class counts and their ratio after resampling.
check_imbalance(y)

[(0, 1079), (1, 1079)]
1.0


In [None]:
# Same class-balance report as the previous cell, run again on the resampled labels.
check_imbalance(y)

[(0, 1079), (1, 1079)]
1.0


In [None]:
# Hold out 30% of the samples for testing, with a fixed seed for repeatability.
# No `stratify` argument is given, so the class ratio is not forced to match
# between the two splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = SEED)

In [None]:
# Report class counts and ratio for each split.
check_imbalance(y_train)
check_imbalance(y_test)

# Drop the full frame and arrays; the cells below only use the train/test splits.
del df, X, y

[(0, 738), (1, 772)]
1.046070460704607
[(0, 341), (1, 307)]
0.9002932551319648


### Architecture

In [None]:
class LSTM_network(nn.Module):
    """Single-layer LSTM binary classifier over sequences of integer token ids.

    Each token id is one-hot encoded to `input_dim` features, the sequence is
    run through an LSTM, and the final hidden state goes through dropout and a
    linear layer to give one raw logit per sequence (no sigmoid is applied here).

    Args:
        input_dim: Number of distinct token ids, i.e. the one-hot width.
        hidden_dim: Size of the LSTM hidden state.
        dropout_rate: Dropout probability applied to the final hidden state.
    """

    def __init__(self, input_dim, hidden_dim, dropout_rate):

        super(LSTM_network, self).__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.dropout_rate = dropout_rate

        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, batch_first = True)
        self.dropout = nn.Dropout(p = self.dropout_rate)
        self.fc = nn.Linear(self.hidden_dim, 1)

    def forward(self, X):
        """Return one logit per input sequence.

        Args:
            X: Integer tensor of token ids with shape (batch_size, seq_len);
                every id must lie in [0, input_dim).

        Returns:
            Float tensor of shape (batch_size,) holding the raw logits.
        """
        # Follow the device the module's weights live on instead of forcing CUDA,
        # so the same network runs on CPU-only hosts as well as on a GPU.
        device = self.fc.weight.device

        # F.one_hot only accepts int64 indices; .long() is a no-op when X already is.
        X = F.one_hot(X.to(device).long(), num_classes = self.input_dim).float()

        # Hidden layer shape: (num_layers, batch_size, hidden_dim).
        # new_zeros creates the initial states with X's dtype and device.
        hidden_0 = (X.new_zeros(1, X.size(0), self.hidden_dim),
                    X.new_zeros(1, X.size(0), self.hidden_dim))

        # Input/Output shape: (batch_size, seq_len, input_dim)
        _, self.hidden = self.lstm(X, hidden_0)

        # Index the layer axis explicitly: a bare squeeze() would also drop the
        # batch axis when the batch holds a single sequence.
        H = self.hidden[0][-1]
        H = self.dropout(H)
        H = self.fc(H)

        # Drop only the trailing feature axis so the result stays (batch_size,).
        return H.squeeze(-1)

# Build a throwaway instance only to display the layer layout and parameter count.
model = LSTM_network(input_dim = 307, hidden_dim = 10, dropout_rate = 0.4)
print(model)

# Total number of scalar values across all parameter tensors.
print(f'\nParameters: {sum(param.numel() for param in model.parameters())}')

# The instance is not needed afterwards; the network is re-created for training.
del model

LSTM_network(
  (lstm): LSTM(307, 10, batch_first=True)
  (dropout): Dropout(p=0.4, inplace=False)
  (fc): Linear(in_features=10, out_features=1, bias=True)
)

Parameters: 12771


### Hyperparameter Optimization

In [None]:
# Wall-clock start, used to report the total search time at the end of the cell.
start_training = time.time()

torch.manual_seed(SEED)

input_dim = 307

# Use the GPU when one is present instead of assuming CUDA is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

net = NeuralNetBinaryClassifier(
    LSTM_network,
    module__input_dim = input_dim,
    module__hidden_dim = 10,
    module__dropout_rate = 0.4,
    batch_size = 32,
    max_epochs = 10,
    train_split = None,
    optimizer = torch.optim.Adam,
    iterator_train__shuffle = True,
    device = device
)

pipe = Pipeline([
    ('net', net)
])

# Grid of candidate values; keys address parameters of the 'net' pipeline step.
params = {
    'net__module__hidden_dim' : [10, 40, 70, 100],
    'net__module__dropout_rate' : [0.4, 0.5, 0.6],
    'net__batch_size' : [32, 64, 128],
    'net__max_epochs' : [10, 20, 30]
}


def positive_class_auc(net_gs, X_gs, y_gs):
    """Score one validation fold by ROC AUC of the positive-class probability.

    `predict_proba` may return either a 1-D vector of positive-class
    probabilities or a 2-D array with one column per class, depending on the
    skorch release; `roc_auc_score` needs a 1-D score for binary labels, so the
    last column is used in the 2-D case.
    """
    proba = np.asarray(net_gs.predict_proba(X_gs))
    if proba.ndim == 2:
        proba = proba[:, -1]
    return roc_auc_score(y_gs, proba)


gs = GridSearchCV(
    pipe,
    params,
    refit = False,
    cv = StratifiedKFold(n_splits = 5, random_state = SEED, shuffle = True),
    scoring = positive_class_auc,
    verbose = 2
)

# `np.float` was only an alias of the builtin `float` and was removed in
# NumPy 1.24; the builtin yields the same float64 labels on every NumPy version.
gs.fit(X_train, y_train.astype(float))

print(gs.best_score_, gs.best_params_)

# Elapsed search time in minutes.
end_training = (time.time() - start_training) / 60

print(f'\nDone! Time: {end_training:.2f} min')

#### Best configuration:

0.9404987294237913 {'net__batch_size': 32, 'net__max_epochs': 30, 'net__module__dropout_rate': 0.4, 'net__module__hidden_dim': 10}

### Training

In [None]:
torch.manual_seed(SEED)

# To take the values straight from the grid search instead, use:
# hidden_dim = gs.best_params_['net__module__hidden_dim']
# dropout_rate = gs.best_params_['net__module__dropout_rate']
# batch_size = gs.best_params_['net__batch_size']
# max_epochs = gs.best_params_['net__max_epochs']

# Hyperparameters fixed to the best configuration reported by the grid search.
input_dim = 307
hidden_dim = 10
dropout_rate = 0.4
batch_size = 32
max_epochs = 30

# Use the GPU when one is present instead of assuming CUDA is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

net = NeuralNetBinaryClassifier(
    LSTM_network,
    module__input_dim = input_dim,
    module__hidden_dim = hidden_dim,
    module__dropout_rate = dropout_rate,
    batch_size = batch_size,
    max_epochs = max_epochs,
    train_split = None,
    optimizer = torch.optim.Adam,
    iterator_train__shuffle = True,
    device = device
)

pipe = Pipeline([
    ('net', net)
])

# `np.float` was only an alias of the builtin `float` and was removed in
# NumPy 1.24; the builtin yields the same float64 labels on every NumPy version.
pipe.fit(X_train, y_train.astype(float))

  epoch    train_loss     dur
-------  ------------  ------
      1        [36m0.6664[0m  0.4579
      2        [36m0.5257[0m  0.1993
      3        [36m0.4366[0m  0.1719
      4        [36m0.4106[0m  0.1722
      5        [36m0.3575[0m  0.1737
      6        [36m0.3386[0m  0.2183
      7        [36m0.3072[0m  0.1774
      8        0.3523  0.1717
      9        0.3200  0.1784
     10        0.6659  0.1823
     11        0.3816  0.1814
     12        0.3830  0.1776
     13        0.3078  0.1760
     14        [36m0.2816[0m  0.1780
     15        [36m0.2746[0m  0.1738
     16        [36m0.2595[0m  0.1703
     17        [36m0.2463[0m  0.1694
     18        [36m0.2186[0m  0.1709
     19        [36m0.2099[0m  0.1726
     20        [36m0.1955[0m  0.1740
     21        [36m0.1932[0m  0.1733
     22        0.2224  0.1695
     23        0.1950  0.1806
     24        [36m0.1703[0m  0.2185
     25        [36m0.1659[0m  0.1912
     26        [36m0.1553[0m  0.180

Pipeline(memory=None,
         steps=[('net',
                 <class 'skorch.classifier.NeuralNetBinaryClassifier'>[initialized](
  module_=LSTM_network(
    (lstm): LSTM(307, 10, batch_first=True)
    (dropout): Dropout(p=0.4, inplace=False)
    (fc): Linear(in_features=10, out_features=1, bias=True)
  ),
))],
         verbose=False)

### Evaluation

In [None]:
def model_evaluate(y, pred):
    """Print a confusion matrix and classification metrics for binary scores.

    Args:
        y: True binary labels, array-like of shape (n_samples,).
        pred: Predicted scores for the positive class. Either shape
            (n_samples,), or a 2-D `predict_proba`-style array with one column
            per class, in which case the last column is used.

    Labels and scores are binarised at 0.5 for the confusion matrix, accuracy,
    precision, recall and F1; ROC AUC is computed from the un-thresholded scores.
    Returns None.
    """
    y = np.asarray(y)
    pred = np.asarray(pred)

    # A 2-column probability matrix would make `pred >= 0.5` a 2-D indicator
    # array that the binary metrics below reject; reduce it to one score per sample.
    if pred.ndim == 2:
        pred = pred[:, -1]

    # Binarise once and reuse for every thresholded metric.
    y_bin = y >= 0.5
    pred_bin = pred >= 0.5

    print('Confusion matrix\n[TN FP]\n[FN TP]')
    print(confusion_matrix(y_bin, pred_bin))
    print(f'Accuracy: {accuracy_score(y_bin, pred_bin):.4f}')
    print(f'Precision: {precision_score(y_bin, pred_bin):.4f}')
    print(f'Recall: {recall_score(y_bin, pred_bin):.4f}')
    print(f'F1-Score: {f1_score(y_bin, pred_bin):.4f}')
    print(f'ROC AUC: {roc_auc_score(y, pred):.4f}')
    return

In [None]:
# Baseline for comparison: a constant score of 1 for every test sample.
model_evaluate(y_test, np.ones(len(y_test)))

Confusion matrix
[TN FP]
[FN TP]
[[  0 341]
 [  0 307]]
Accuracy: 0.4738
Precision: 0.4738
Recall: 1.0000
F1-Score: 0.6429
ROC AUC: 0.5000


In [None]:
# Evaluate the fitted pipeline's predicted probabilities on the held-out test split.
model_evaluate(y_test, pipe.predict_proba(X_test))

Confusion matrix
[TN FP]
[FN TP]
[[288  53]
 [ 45 262]]
Accuracy: 0.8488
Precision: 0.8317
Recall: 0.8534
F1-Score: 0.8424
ROC AUC: 0.9270


In [None]:
%%javascript
// Calls save_notebook() on the front end's notebook object.
// NOTE(review): relies on a global `IPython.notebook` existing in the browser — confirm for your front end.
IPython.notebook.save_notebook()

<IPython.core.display.Javascript object>