In [1]:
import torch 

print("Number of GPU: ", torch.cuda.device_count())
print("GPU Name: ", torch.cuda.get_device_name())


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Number of GPU:  1
GPU Name:  Quadro P1000
Using device: cuda


In [6]:
# Import required libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.nn.functional as functional
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from torchmetrics import Accuracy
# from sklearn.metrics import accuracy_score  # uncomment to use sklearn

In [7]:
# Load preprocessed data
train_df = pd.read_csv('dataset/labelled_train.csv')
test_df = pd.read_csv('dataset/labelled_test.csv')
val_df = pd.read_csv('dataset/labelled_validation.csv')

# View the first 5 rows of training set
train_df.head()

Unnamed: 0,processId,threadId,parentProcessId,userId,mountNamespace,argsNum,returnValue,sus_label
0,381,7337,1,100,4026532231,5,0,1
1,381,7337,1,100,4026532231,1,0,1
2,381,7337,1,100,4026532231,0,0,1
3,7347,7347,7341,0,4026531840,2,-2,1
4,7347,7347,7341,0,4026531840,4,0,1


In [8]:
# Start coding here
# Define Features and Labels for training, val, and test df
X_train = train_df.drop(columns=['sus_label']).values 
y_train = train_df['sus_label'].values

X_test = test_df.drop(columns=['sus_label']).values
y_test = test_df['sus_label'].values

X_val = val_df.drop(columns=['sus_label']).values
y_val = val_df['sus_label'].values

# Scaling Features 
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

# Converting into PyTorch tensors 
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

print("Training Features Shape:", X_train_tensor.shape)
print("Training Labels Shape:", y_train_tensor.shape)
print("Validation Features Shape:", X_val_tensor.shape)
print("Validation Labels Shape:", y_val_tensor.shape)
print("Test Features Shape:", X_test_tensor.shape)
print("Test Labels Shape:", y_test_tensor.shape)
 

Training Features Shape: torch.Size([763144, 7])
Training Labels Shape: torch.Size([763144])
Validation Features Shape: torch.Size([188967, 7])
Validation Labels Shape: torch.Size([188967])
Test Features Shape: torch.Size([188967, 7])
Test Labels Shape: torch.Size([188967])


In [9]:
# Creating TensorDataset
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create a dataloader using above dataset 
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=64, num_workers=2)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=64)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=64)


In [10]:
# Defining Neural Network 
input_size = X_train_tensor.shape[1]

model = nn.Sequential(
    nn.Linear(input_size, 32), 
    nn.ReLU(), 
    nn.Dropout(p=0.5),
    nn.Linear(32, 16), 
    nn.ReLU(), 
    nn.Dropout(p=0.5),
    nn.Linear(16, 1), 
    nn.Sigmoid() #for binary classification 
)


In [17]:
# Define the device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the device (GPU or CPU)
model.to(device)
# Show the device being used
if device.type == 'cuda':
    device_name = torch.cuda.get_device_name(device)
    print(f"Training on: {device_name}")
else:
    print("Training on: CPU")

Training on: Quadro P1000


In [18]:
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-3)



In [21]:
import torch
num_epochs = 5

for epoch in range(num_epochs): 
    model.train()
    running_loss = 0.0 

    for features, labels in train_dataloader: 

        features, labels = features.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(features).view(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
    # Add a print statement to monitor progress and potentially identify slow iterations
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_dataloader)}")

    # Add a break condition to handle KeyboardInterrupt gracefully
    try:
        pass
    except KeyboardInterrupt:
        print("Training interrupted. Exiting...")
        break 

Epoch [1/5], Loss: 0.004557683898252679
Epoch [2/5], Loss: 0.004599323497865002
Epoch [3/5], Loss: 0.004565871231076337
Epoch [4/5], Loss: 0.0045929145017337145
Epoch [5/5], Loss: 0.004528082815136573


In [24]:
# Initialize torchmetrics Accuracy metric
train_accuracy_metric = Accuracy(task='binary').to(device)
val_accuracy_metric = Accuracy(task='binary').to(device)
test_accuracy_metric = Accuracy(task='binary').to(device)

# Function to calculate accuracy
def calculate_accuracy(loader, model, accuracy_metric, device):
    model.eval()
    with torch.no_grad():
        for features, labels in loader:

            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            _, predicted = torch.max(outputs, 1)
            accuracy_metric.update(predicted, labels)
    return accuracy_metric.compute().item()

# Calculate accuracy for training, validation, and testing datasets
train_accuracy = calculate_accuracy(train_dataloader, model, train_accuracy_metric, device)
val_accuracy = calculate_accuracy(val_dataloader, model, val_accuracy_metric, device)
test_accuracy = calculate_accuracy(test_dataloader, model, test_accuracy_metric, device)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

Training Accuracy: 0.9983
Validation Accuracy: 0.9958
Testing Accuracy: 0.0927
