## Malware Detection
* 0 -> Benign
* 1 -> Malware

In [None]:
import torch
import pandas as pd
from torch import nn

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
device

'cuda'

## Importing Dataset and extracting training, validation and test data

* Feature scaling is performed first, using scikit-learn's `StandardScaler`
* The data is split 60:20:20 into training, validation and test sets
* All data is converted to `torch.Tensor` with dtype `torch.float32`

In [None]:
df = pd.read_excel('malware.xlsx')

# Every column except the last is used as a feature; the last column is the label.
X = torch.tensor(df.iloc[:, :-1].to_numpy()).to(torch.float32)
# Labels are kept as a float column vector of shape (n_samples, 1).
y = torch.tensor(df.iloc[:, -1].to_numpy(), dtype=torch.float32).unsqueeze(1)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise each feature column, then convert the result back to a float32 tensor.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/validation/test split, so held-out samples influence the scaling
# statistics; consider fitting on the training split only.
scaler = StandardScaler()
X = torch.from_numpy(scaler.fit_transform(X)).type(torch.float32)

In [None]:
# Sanity check: X should be (n_samples, n_features) and y should be (n_samples, 1).
X.shape, y.shape

(torch.Size([58596, 52]), torch.Size([58596, 1]))

In [None]:
from sklearn.model_selection import train_test_split

# Seed both splits so the train/validation/test partitions (and therefore the
# reported metrics) are reproducible from run to run.
# First carve off 20% of the data as the test set ...
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)
# ... then take 25% of the remaining 80% (20% of the total) as the validation
# set, leaving 60% of the data for training.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, shuffle=True, random_state=42)

In [None]:
# Move every feature tensor onto the selected device.
X_train, X_val, X_test = (t.to(device) for t in (X_train, X_val, X_test))

# Move the labels as well and drop their size-1 dimensions, so they line up
# with the squeezed logits produced during training and evaluation.
y_train, y_val, y_test = (t.to(device).squeeze() for t in (y_train, y_val, y_test))

## Malware Detection Model
* 1 Input Layer of size 52
* 1 Hidden Layer of size 110
* 1 Output Layer of size 1
* Non-linear activation function `ReLU()` is used

In [None]:
class MalwareDetection(nn.Module):
  """
  Feed-forward binary classifier that emits one raw logit per sample.

  Architecture: Linear(in_features -> hidden_units) -> ReLU -> Linear(hidden_units -> 1).
  No sigmoid is applied here; the output is a logit.

  Args:
    in_features: Number of input features per sample (defaults to 52).
    hidden_units: Width of the single hidden layer (defaults to 110).
  """

  def __init__(self, in_features: int = 52, hidden_units: int = 110):
    super().__init__()
    self.model = nn.Sequential(
        nn.Linear(in_features=in_features, out_features=hidden_units),
        nn.ReLU(),
        nn.Linear(in_features=hidden_units, out_features=1),
    )

  def forward(self, x):
    """Run the network on `x` and return logits with a trailing dimension of size 1."""
    return self.model(x)

# Build the classifier with its default layer sizes, then move its parameters
# to the selected device (nn.Module.to moves the module in place).
model = MalwareDetection()
model.to(device)

In [None]:
def accuracy_fn(y_true, y_pred):
  """
  Returns the accuracy (as a percentage in [0, 100]) based on the number of
  matches between corresponding elements of the true and predicted labels.

  Both tensors are flattened before comparison, so a column vector (N, 1) and
  a flat vector (N,) are compared element by element instead of being
  broadcast against each other. 0-d (single value) tensors are supported.

  Args:
    y_true: Tensor of ground-truth labels.
    y_pred: Tensor of predicted labels with the same number of elements.

  Returns:
    Percentage of matching elements as a float; 0.0 when the input is empty.

  Raises:
    ValueError: If the two tensors hold a different number of elements.
  """
  total = y_pred.numel()
  if y_true.numel() != total:
    raise ValueError(
        f"y_true has {y_true.numel()} elements but y_pred has {total}"
    )
  if total == 0:
    # Nothing to score; avoid dividing by zero.
    return 0.0

  # torch.eq compares corresponding values of the two (flattened) tensors
  correct = torch.eq(y_true.reshape(-1), y_pred.reshape(-1)).sum().item()
  return (correct / total) * 100

In [None]:
def test(model: nn.Module, X_test: torch.Tensor, y_test: torch.Tensor, loss_fn: nn.Module) -> dict:
  """
  Evaluates the model on the given data without tracking gradients.

  The model is switched to eval mode, its squeezed logits are scored with
  `loss_fn`, and predictions are obtained by rounding the sigmoid of the logits.

  Returns a dict with the keys "Model Name" (class name of the model),
  "Test Loss" (the value returned by `loss_fn`) and "Test Accuracy"
  (the percentage returned by `accuracy_fn`).
  """
  model.eval()
  with torch.inference_mode():
    logits = model(X_test).squeeze()
    loss_value = loss_fn(logits, y_test)
    # Sigmoid maps logits to probabilities; rounding thresholds them at 0.5.
    predictions = torch.sigmoid(logits).round()
    accuracy = accuracy_fn(y_test, predictions)

  return {
      "Model Name": type(model).__name__,
      "Test Loss": loss_value,
      "Test Accuracy": accuracy,
  }

In [None]:
def train(model: nn.Module, X_train: torch.Tensor, y_train: torch.Tensor, X_test: torch.Tensor, y_test: torch.Tensor, loss_fn: nn.Module, optimizer: torch.optim.Optimizer, epochs: int=1000):
  """
  Function to run the training loop (full-batch: one optimizer step per epoch).

  Args:
    model: Network to optimise; its squeezed output is fed to `loss_fn`.
    X_train, y_train: Training features and labels.
    X_test, y_test: Held-out features and labels used for progress reporting
      (the caller may pass a validation split here).
    loss_fn: Loss applied to (logits, labels).
    optimizer: Optimizer already bound to the model's parameters.
    epochs: Number of passes over the training data.

  Progress is printed every 100 epochs, starting with epoch 0. The held-out
  data is only evaluated on those epochs, since the metrics are not used
  anywhere else; this avoids an extra forward pass on every other epoch.
  """
  log_every = 100

  print(f"Model: {model.__class__.__name__}")
  for epoch in range(epochs):
    # test() switches the model to eval mode, so re-enable train mode each epoch.
    model.train()
    y_logits = model(X_train).squeeze()
    loss = loss_fn(y_logits, y_train)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % log_every != 0:
      continue

    report = test(model, X_test, y_test, loss_fn)
    test_loss = report["Test Loss"]
    test_acc = report["Test Accuracy"]
    print(f"Epoch: {epoch} | Loss: {loss:.5f} | Test Loss: {test_loss:.5f} | Test Acc: {test_acc:.5f}")

  if epochs > 0:
    # Previously every epoch ended with an evaluation pass, which left the
    # model in eval mode after training; keep that post-condition.
    model.eval()

In [None]:
# Plain SGD over all model parameters with a learning rate of 0.1.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# Binary cross-entropy computed directly on raw logits, matching the model's
# un-activated output layer.
loss_fn = nn.BCEWithLogitsLoss()

In [None]:
# Train for 1000 epochs; the validation split is passed through the
# X_test / y_test parameters so the progress lines report validation metrics.
train(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_val,
    y_test=y_val,
    loss_fn=loss_fn,
    optimizer=optimizer,
    epochs=1000,
)

Model: MalwareDetection
Epoch: 0 | Loss: 0.74860 | Test Loss: 0.67409 | Test Acc: 47.80271
Epoch: 100 | Loss: 0.03190 | Test Loss: 0.02831 | Test Acc: 99.45388
Epoch: 200 | Loss: 0.01544 | Test Loss: 0.01412 | Test Acc: 99.76107
Epoch: 300 | Loss: 0.01173 | Test Loss: 0.01053 | Test Acc: 99.76107
Epoch: 400 | Loss: 0.01004 | Test Loss: 0.00891 | Test Acc: 99.78667
Epoch: 500 | Loss: 0.00902 | Test Loss: 0.00797 | Test Acc: 99.83787
Epoch: 600 | Loss: 0.00832 | Test Loss: 0.00736 | Test Acc: 99.88907
Epoch: 700 | Loss: 0.00781 | Test Loss: 0.00694 | Test Acc: 99.91467
Epoch: 800 | Loss: 0.00740 | Test Loss: 0.00663 | Test Acc: 99.92320
Epoch: 900 | Loss: 0.00707 | Test Loss: 0.00640 | Test Acc: 99.93173


In [None]:
# Score the trained model once on the held-out test split and display the result.
report = test(model=model, X_test=X_test, y_test=y_test, loss_fn=loss_fn)
report

{'Model Name': 'MalwareDetection',
 'Test Loss': tensor(0.0064, device='cuda:0'),
 'Test Accuracy': 99.9061433447099}

## Final Accuracy

* 99.90614% on the held-out test set
* 99.93173% on the validation set (last logged value, epoch 900)