In [1]:
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

In [3]:
X_input_path = "./data/numpy/dataset_X.npy"
y_input_path = "./data/numpy/dataset_y.npy"

In [4]:
X = np.load(X_input_path)
y = np.load(y_input_path)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
class SystemCallAnomalyDetector(nn.Module):
    def __init__(
        self,
        vocab_size=325,
        embedding_dim=32,
        num_filters=64,
        kernel_size=3,
        lstm_hidden=128,
        dropout_rate=0.3
        ):
        super(SystemCallAnomalyDetector, self).__init__()

        # Layer 1
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0
        )
        self.embedding_dropout = nn.Dropout(dropout_rate)

        # Layer 2
        self.conv1d = nn.Conv1d(
            in_channels=embedding_dim,
            out_channels=num_filters,
            kernel_size=kernel_size,
            padding=kernel_size//2
        )
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool1d(kernel_size=2)

        # Layer 3
        self.lstm = nn.LSTM(
            input_size=num_filters,
            hidden_size=lstm_hidden,
            num_layers=1,
            batch_first=True,
            dropout=0 if dropout_rate == 0 else dropout_rate
        )

        # Layer 4 - Output for EACH timestep
        self.fc = nn.Linear(lstm_hidden, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # x shape: (batch, 100)
        embedded = self.embedding(x)
        embedded = self.embedding_dropout(embedded)
        # embedded shape: (batch, 100, 32)

        embedded = embedded.permute(0, 2, 1)
        # embedded shape: (batch, 32, 100)

        conv_out = self.conv1d(embedded)
        conv_out = self.relu(conv_out)
        conv_out = self.maxpool(conv_out)
        # conv_out shape: (batch, 64, 50) after maxpool

        conv_out = conv_out.permute(0, 2, 1)
        # conv_out shape: (batch, 50, 64)

        lstm_out, (hidden, cell) = self.lstm(conv_out)
        # lstm_out shape: (batch, 50, 128)

        # CHANGE: Use lstm_out for all timesteps, not just final hidden
        logits = self.fc(lstm_out)  # (batch, 50, 1)
        probability = self.sigmoid(logits)  # (batch, 50, 1)

        return probability

In [17]:
class SystemCallDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, index):
        sequence = torch.LongTensor(self.sequences[index])  # (100,)
        label = torch.FloatTensor(self.labels[index])       # (100,)
        
        # Downsample labels from 100 to 50 to match maxpool output
        label = label[::2]  # Take every 2nd element: (50,)
        label = label.unsqueeze(1)  # Add dimension: (50, 1)
        
        return sequence, label

In [15]:
def train_model(model, train_loader, num_epochs=10, learning_rate=0.001):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0

        for sequences, labels in train_loader:
            # DEBUG: Print shapes on first batch
            if epoch == 0 and correct == 0:
                print(f"sequences shape: {sequences.shape}")
                print(f"labels shape BEFORE to(device): {labels.shape}")
            
            sequences = sequences.to(device)
            labels = labels.to(device)
            
            if epoch == 0 and correct == 0:
                print(f"labels shape AFTER to(device): {labels.shape}")

            outputs = model(sequences)
            
            if epoch == 0 and correct == 0:
                print(f"outputs shape: {outputs.shape}")
            
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            predictions = (outputs > 0.5).float()
            correct += (predictions == labels).sum().item()
            total += labels.numel()

        accuracy = 100 * correct / total
        avg_loss = total_loss / len(train_loader)

        print(f'Epoch [{epoch+1}/{num_epochs}], '
              f'Loss: {avg_loss:.4f}, '
              f'Accuracy: {accuracy:.2f}%')

In [9]:
# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.LongTensor(X_train)  # LongTensor for integer IDs
X_test_tensor = torch.LongTensor(X_test)
y_train_tensor = torch.FloatTensor(y_train).unsqueeze(1)  # FloatTensor for labels, add dimension
y_test_tensor = torch.FloatTensor(y_test).unsqueeze(1)

In [18]:
# Create TensorDatasets
train_dataset = SystemCallDataset(X_train, y_train)
test_dataset = SystemCallDataset(X_test, y_test)

In [19]:
# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [20]:
# Initialize model
model = SystemCallAnomalyDetector(
    vocab_size=340,  # Adjust based on your max system call ID
    embedding_dim=32,
    num_filters=64,
    kernel_size=3,
    lstm_hidden=128,
    dropout_rate=0.3
)



In [21]:
# Train the model
train_model(model, train_loader, num_epochs=10, learning_rate=0.001)

sequences shape: torch.Size([32, 100])
labels shape BEFORE to(device): torch.Size([32, 50, 1])
labels shape AFTER to(device): torch.Size([32, 50, 1])
outputs shape: torch.Size([32, 50, 1])
Epoch [1/10], Loss: 0.0001, Accuracy: 100.00%
Epoch [2/10], Loss: 0.0000, Accuracy: 100.00%
Epoch [3/10], Loss: 0.0000, Accuracy: 100.00%
Epoch [4/10], Loss: 0.0000, Accuracy: 100.00%
Epoch [5/10], Loss: 0.0000, Accuracy: 100.00%
Epoch [6/10], Loss: 0.0000, Accuracy: 100.00%
Epoch [7/10], Loss: 0.0000, Accuracy: 100.00%
Epoch [8/10], Loss: 0.0000, Accuracy: 100.00%
Epoch [9/10], Loss: 0.0000, Accuracy: 100.00%
Epoch [10/10], Loss: 0.0000, Accuracy: 100.00%


In [22]:
torch.save(model.state_dict(), "./data/models/model_weights1.pth")

In [24]:
# Identify the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device) # Ensure model is on device

model.eval()
correct = 0
total = 0

with torch.no_grad():
    for sequences, labels in test_loader:
        # --- FIX: Move data to the same device as the model ---
        sequences = sequences.to(device)
        labels = labels.to(device)
        
        outputs = model(sequences)
        predictions = (outputs > 0.5).float()
        
        # Calculate accuracy across all timesteps (matching your training logic)
        correct += (predictions == labels).sum().item()
        total += labels.numel() # Use numel() to count every prediction in the (32, 50, 1) tensor

print(f'Test Accuracy: {100 * correct / total:.2f}%')

Test Accuracy: 100.00%


In [25]:
print(model)

SystemCallAnomalyDetector(
  (embedding): Embedding(340, 32, padding_idx=0)
  (embedding_dropout): Dropout(p=0.3, inplace=False)
  (conv1d): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu): ReLU()
  (maxpool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (lstm): LSTM(64, 128, batch_first=True, dropout=0.3)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)
