In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import time
import seaborn as sns
import plotly.express as px

<h3>Preprocessing (Loading, removing and showing graphs)</h3>

In [2]:
from sklearn.preprocessing import LabelEncoder

# Load the CSV dataset from the notebook's working directory
data = pd.read_csv('Obfuscated-MalMem2022-v3.csv')

# Integer-encode the string 'Class' label into a new 'class' column, which is
# appended as the last column of the frame
lab = LabelEncoder()
data['class'] = lab.fit_transform(data['Class'])

# Raw label columns, dropped once the encoded 'class' column exists
label_columns = ['Class', 'Category', 'Family']

# columns with 0 values
zero_value_columns = [
    'pslist.nprocs64bit',
    'handles.nport',
    'psxview.not_in_eprocess_pool',
    'psxview.not_in_eprocess_pool_false_avg',
    'svcscan.interactive_process_services',
    'callbacks.nanonymous',
    'modules.nmodules',
    'callbacks.ngeneric',
    'svcscan.fs_drivers',
]

# Remove all of the unwanted columns in a single pass
data = data.drop(columns=label_columns + zero_value_columns)

<h3>Normalization</h3>

In [3]:
from sklearn.preprocessing import MinMaxScaler

# Create a Min-Max scaler instance (rescales each column to the [0, 1] range)
scaler = MinMaxScaler()

# Scale the feature columns only. The encoded target column ('class') is left
# out so the labels keep the integer codes produced by LabelEncoder; passing
# them through the scaler would turn them into floats, and with more than two
# classes a later integer cast would collapse the labels.
columns_to_scale = data.columns.drop('class')

# Fit the scaler on the selected columns and transform the data
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before any
# train/test split, so the min/max of held-out rows influence the scaling of
# the training rows. Fit it on the training portion only if a leakage-free
# evaluation is required.
data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])

<h3>Splitting for Training and Testing</h3>

In [4]:
# Separate features and labels
# The encoded label is the last column; every column before it is a feature
feature_frame = data.iloc[:, :-1]
label_series = data.iloc[:, -1]

X = feature_frame.to_numpy()  # Features
y = label_series.to_numpy()   # Class labels
X.shape

(58596, 46)

<h3>MLP with 20 Epochs Implementation</h3>

In [7]:
# Single-hidden-layer perceptron used as the classifier
class MLPModel(nn.Module):
    """Feed-forward network: Linear -> ReLU -> Linear.

    Args:
        input_size: number of input features per sample.
        hidden_size: width of the hidden layer.
        num_classes: number of output scores (one per class).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw class scores (no softmax applied) for batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Define the k-fold cross-validation
num_splits = 5  # Number of folds
skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

# Per-fold evaluation metrics
accuracies = []
confusion_matrices = []

# Per-fold wall-clock durations (seconds) for training and for inference
training_times = []
testing_times = []

# Define hyperparameters
input_size = X.shape[1]  # Number of features
hidden_size = 128
num_classes = len(np.unique(y))  # Number of unique classes
learning_rate = 0.001
num_epochs = 20
batch_size = 32

# Seed PyTorch so weight initialisation and mini-batch shuffling are
# repeatable across runs, like the fixed random_state of the fold split.
torch.manual_seed(42)

# Loop over the folds; a fresh model is trained from scratch for each one
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    # Slice this fold's train/test rows and convert them to PyTorch tensors
    X_train = torch.tensor(X[train_idx], dtype=torch.float32)
    X_test = torch.tensor(X[test_idx], dtype=torch.float32)
    y_train = torch.tensor(y[train_idx], dtype=torch.long)
    y_test = torch.tensor(y[test_idx], dtype=torch.long)

    # Wrap the tensors in DataLoaders; only the training data is shuffled
    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Create an instance of the MLP model, its loss function and optimizer
    model = MLPModel(input_size, hidden_size, num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop (timed with a monotonic clock)
    start_time = time.perf_counter()
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()
    training_time = time.perf_counter() - start_time
    training_times.append(training_time)

    # Evaluation on the test set (timed); gradients are not needed here
    start_time = time.perf_counter()
    model.eval()
    test_predicted = []
    test_true = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            # Predicted class = index of the largest output score
            predicted = model(inputs).argmax(dim=1)
            test_predicted.extend(predicted.tolist())
            test_true.extend(labels.tolist())
    testing_time = time.perf_counter() - start_time
    testing_times.append(testing_time)

    # Calculate accuracy for this fold
    accuracy = accuracy_score(test_true, test_predicted)
    accuracies.append(accuracy)

    # Calculate and store the confusion matrix for this fold
    cm = confusion_matrix(test_true, test_predicted)
    confusion_matrices.append(cm)

    print(f"Fold {fold + 1}/{num_splits} - Accuracy: {accuracy * 100:.2f}%")

# Calculate mean and standard deviation of accuracies
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)

# Display the mean accuracy and standard deviation
print(f"Mean Accuracy: {mean_accuracy * 100:.2f}%")
print(f"Standard Deviation of Accuracy: {std_accuracy * 100:.2f}%")

# Plot the confusion matrix of the first fold (change the index to view another)
plt.figure(figsize=(8, 6))
plt.imshow(confusion_matrices[0], interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Print training and testing times. The lists are named training_times and
# testing_times; the former train_times/test_times names were never defined.
print(f'Mean Training Time: {np.mean(training_times):.2f} seconds')
print(f'Mean Testing Time: {np.mean(testing_times):.2f} seconds')

Fold 1/5 - Accuracy: 50.91%


KeyboardInterrupt: 

<h3>MLP with 30 Epochs Implementation</h3>

In [None]:
# Single-hidden-layer perceptron used as the classifier
class MLPModel(nn.Module):
    """Feed-forward network: Linear -> ReLU -> Linear.

    Args:
        input_size: number of input features per sample.
        hidden_size: width of the hidden layer.
        num_classes: number of output scores (one per class).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw class scores (no softmax applied) for batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Define the k-fold cross-validation
num_splits = 5  # Number of folds
skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

# Per-fold evaluation metrics
accuracies = []
confusion_matrices = []

# Per-fold wall-clock durations (seconds) for training and for inference
training_times = []
testing_times = []

# Define hyperparameters
input_size = X.shape[1]  # Number of features
hidden_size = 128
num_classes = len(np.unique(y))  # Number of unique classes
learning_rate = 0.001
num_epochs = 30
batch_size = 32

# Seed PyTorch so weight initialisation and mini-batch shuffling are
# repeatable across runs, like the fixed random_state of the fold split.
torch.manual_seed(42)

# Loop over the folds; a fresh model is trained from scratch for each one
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    # Slice this fold's train/test rows and convert them to PyTorch tensors
    X_train = torch.tensor(X[train_idx], dtype=torch.float32)
    X_test = torch.tensor(X[test_idx], dtype=torch.float32)
    y_train = torch.tensor(y[train_idx], dtype=torch.long)
    y_test = torch.tensor(y[test_idx], dtype=torch.long)

    # Wrap the tensors in DataLoaders; only the training data is shuffled
    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Create an instance of the MLP model, its loss function and optimizer
    model = MLPModel(input_size, hidden_size, num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop (timed with a monotonic clock)
    start_time = time.perf_counter()
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()
    training_time = time.perf_counter() - start_time
    training_times.append(training_time)

    # Evaluation on the test set (timed); gradients are not needed here
    start_time = time.perf_counter()
    model.eval()
    test_predicted = []
    test_true = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            # Predicted class = index of the largest output score
            predicted = model(inputs).argmax(dim=1)
            test_predicted.extend(predicted.tolist())
            test_true.extend(labels.tolist())
    testing_time = time.perf_counter() - start_time
    testing_times.append(testing_time)

    # Calculate accuracy for this fold
    accuracy = accuracy_score(test_true, test_predicted)
    accuracies.append(accuracy)

    # Calculate and store the confusion matrix for this fold
    cm = confusion_matrix(test_true, test_predicted)
    confusion_matrices.append(cm)

    print(f"Fold {fold + 1}/{num_splits} - Accuracy: {accuracy * 100:.2f}%")

# Calculate mean and standard deviation of accuracies
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)

# Display the mean accuracy and standard deviation
print(f"Mean Accuracy: {mean_accuracy * 100:.2f}%")
print(f"Standard Deviation of Accuracy: {std_accuracy * 100:.2f}%")

# Plot the confusion matrix of the first fold (change the index to view another)
plt.figure(figsize=(8, 6))
plt.imshow(confusion_matrices[0], interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Print training and testing times. The lists are named training_times and
# testing_times; the former train_times/test_times names were never defined.
print(f'Mean Training Time: {np.mean(training_times):.2f} seconds')
print(f'Mean Testing Time: {np.mean(testing_times):.2f} seconds')

<h3>MLP with 50 Epochs Implementation</h3>

In [None]:
# Single-hidden-layer perceptron used as the classifier
class MLPModel(nn.Module):
    """Feed-forward network: Linear -> ReLU -> Linear.

    Args:
        input_size: number of input features per sample.
        hidden_size: width of the hidden layer.
        num_classes: number of output scores (one per class).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw class scores (no softmax applied) for batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Define the k-fold cross-validation
num_splits = 5  # Number of folds
skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

# Per-fold evaluation metrics
accuracies = []
confusion_matrices = []

# Per-fold wall-clock durations (seconds) for training and for inference
training_times = []
testing_times = []

# Define hyperparameters
input_size = X.shape[1]  # Number of features
hidden_size = 128
num_classes = len(np.unique(y))  # Number of unique classes
learning_rate = 0.001
num_epochs = 50
batch_size = 32

# Seed PyTorch so weight initialisation and mini-batch shuffling are
# repeatable across runs, like the fixed random_state of the fold split.
torch.manual_seed(42)

# Loop over the folds; a fresh model is trained from scratch for each one
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    # Slice this fold's train/test rows and convert them to PyTorch tensors
    X_train = torch.tensor(X[train_idx], dtype=torch.float32)
    X_test = torch.tensor(X[test_idx], dtype=torch.float32)
    y_train = torch.tensor(y[train_idx], dtype=torch.long)
    y_test = torch.tensor(y[test_idx], dtype=torch.long)

    # Wrap the tensors in DataLoaders; only the training data is shuffled
    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Create an instance of the MLP model, its loss function and optimizer
    model = MLPModel(input_size, hidden_size, num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop (timed with a monotonic clock)
    start_time = time.perf_counter()
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()
    training_time = time.perf_counter() - start_time
    training_times.append(training_time)

    # Evaluation on the test set (timed); gradients are not needed here
    start_time = time.perf_counter()
    model.eval()
    test_predicted = []
    test_true = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            # Predicted class = index of the largest output score
            predicted = model(inputs).argmax(dim=1)
            test_predicted.extend(predicted.tolist())
            test_true.extend(labels.tolist())
    testing_time = time.perf_counter() - start_time
    testing_times.append(testing_time)

    # Calculate accuracy for this fold
    accuracy = accuracy_score(test_true, test_predicted)
    accuracies.append(accuracy)

    # Calculate and store the confusion matrix for this fold
    cm = confusion_matrix(test_true, test_predicted)
    confusion_matrices.append(cm)

    print(f"Fold {fold + 1}/{num_splits} - Accuracy: {accuracy * 100:.2f}%")

# Calculate mean and standard deviation of accuracies
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)

# Display the mean accuracy and standard deviation
print(f"Mean Accuracy: {mean_accuracy * 100:.2f}%")
print(f"Standard Deviation of Accuracy: {std_accuracy * 100:.2f}%")

# Plot the confusion matrix of the first fold (change the index to view another)
plt.figure(figsize=(8, 6))
plt.imshow(confusion_matrices[0], interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Print training and testing times. The lists are named training_times and
# testing_times; the former train_times/test_times names were never defined.
print(f'Mean Training Time: {np.mean(training_times):.2f} seconds')
print(f'Mean Testing Time: {np.mean(testing_times):.2f} seconds')

<h3>MLP with 100 Epochs Implementation</h3>

In [None]:
# Single-hidden-layer perceptron used as the classifier
class MLPModel(nn.Module):
    """Feed-forward network: Linear -> ReLU -> Linear.

    Args:
        input_size: number of input features per sample.
        hidden_size: width of the hidden layer.
        num_classes: number of output scores (one per class).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw class scores (no softmax applied) for batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Define the k-fold cross-validation
num_splits = 5  # Number of folds
skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

# Per-fold evaluation metrics
accuracies = []
confusion_matrices = []

# Per-fold wall-clock durations (seconds) for training and for inference
training_times = []
testing_times = []

# Define hyperparameters
input_size = X.shape[1]  # Number of features
hidden_size = 128
num_classes = len(np.unique(y))  # Number of unique classes
learning_rate = 0.001
num_epochs = 100
batch_size = 32

# Seed PyTorch so weight initialisation and mini-batch shuffling are
# repeatable across runs, like the fixed random_state of the fold split.
torch.manual_seed(42)

# Loop over the folds; a fresh model is trained from scratch for each one
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    # Slice this fold's train/test rows and convert them to PyTorch tensors
    X_train = torch.tensor(X[train_idx], dtype=torch.float32)
    X_test = torch.tensor(X[test_idx], dtype=torch.float32)
    y_train = torch.tensor(y[train_idx], dtype=torch.long)
    y_test = torch.tensor(y[test_idx], dtype=torch.long)

    # Wrap the tensors in DataLoaders; only the training data is shuffled
    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Create an instance of the MLP model, its loss function and optimizer
    model = MLPModel(input_size, hidden_size, num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop (timed with a monotonic clock)
    start_time = time.perf_counter()
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()
    training_time = time.perf_counter() - start_time
    training_times.append(training_time)

    # Evaluation on the test set (timed); gradients are not needed here
    start_time = time.perf_counter()
    model.eval()
    test_predicted = []
    test_true = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            # Predicted class = index of the largest output score
            predicted = model(inputs).argmax(dim=1)
            test_predicted.extend(predicted.tolist())
            test_true.extend(labels.tolist())
    testing_time = time.perf_counter() - start_time
    testing_times.append(testing_time)

    # Calculate accuracy for this fold
    accuracy = accuracy_score(test_true, test_predicted)
    accuracies.append(accuracy)

    # Calculate and store the confusion matrix for this fold
    cm = confusion_matrix(test_true, test_predicted)
    confusion_matrices.append(cm)

    print(f"Fold {fold + 1}/{num_splits} - Accuracy: {accuracy * 100:.2f}%")

# Calculate mean and standard deviation of accuracies
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)

# Display the mean accuracy and standard deviation
print(f"Mean Accuracy: {mean_accuracy * 100:.2f}%")
print(f"Standard Deviation of Accuracy: {std_accuracy * 100:.2f}%")

# Plot the confusion matrix of the first fold (change the index to view another)
plt.figure(figsize=(8, 6))
plt.imshow(confusion_matrices[0], interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Print training and testing times. The lists are named training_times and
# testing_times; the former train_times/test_times names were never defined.
print(f'Mean Training Time: {np.mean(training_times):.2f} seconds')
print(f'Mean Testing Time: {np.mean(testing_times):.2f} seconds')

<h3>MLP with 150 Epochs Implementation</h3>

In [None]:
# Single-hidden-layer perceptron used as the classifier
class MLPModel(nn.Module):
    """Feed-forward network: Linear -> ReLU -> Linear.

    Args:
        input_size: number of input features per sample.
        hidden_size: width of the hidden layer.
        num_classes: number of output scores (one per class).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw class scores (no softmax applied) for batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Define the k-fold cross-validation
num_splits = 5  # Number of folds
skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

# Per-fold evaluation metrics
accuracies = []
confusion_matrices = []

# Per-fold wall-clock durations (seconds) for training and for inference
training_times = []
testing_times = []

# Define hyperparameters
input_size = X.shape[1]  # Number of features
hidden_size = 128
num_classes = len(np.unique(y))  # Number of unique classes
learning_rate = 0.001
num_epochs = 150
batch_size = 32

# Seed PyTorch so weight initialisation and mini-batch shuffling are
# repeatable across runs, like the fixed random_state of the fold split.
torch.manual_seed(42)

# Loop over the folds; a fresh model is trained from scratch for each one
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    # Slice this fold's train/test rows and convert them to PyTorch tensors
    X_train = torch.tensor(X[train_idx], dtype=torch.float32)
    X_test = torch.tensor(X[test_idx], dtype=torch.float32)
    y_train = torch.tensor(y[train_idx], dtype=torch.long)
    y_test = torch.tensor(y[test_idx], dtype=torch.long)

    # Wrap the tensors in DataLoaders; only the training data is shuffled
    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Create an instance of the MLP model, its loss function and optimizer
    model = MLPModel(input_size, hidden_size, num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop (timed with a monotonic clock)
    start_time = time.perf_counter()
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()
    training_time = time.perf_counter() - start_time
    training_times.append(training_time)

    # Evaluation on the test set (timed); gradients are not needed here
    start_time = time.perf_counter()
    model.eval()
    test_predicted = []
    test_true = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            # Predicted class = index of the largest output score
            predicted = model(inputs).argmax(dim=1)
            test_predicted.extend(predicted.tolist())
            test_true.extend(labels.tolist())
    testing_time = time.perf_counter() - start_time
    testing_times.append(testing_time)

    # Calculate accuracy for this fold
    accuracy = accuracy_score(test_true, test_predicted)
    accuracies.append(accuracy)

    # Calculate and store the confusion matrix for this fold
    cm = confusion_matrix(test_true, test_predicted)
    confusion_matrices.append(cm)

    print(f"Fold {fold + 1}/{num_splits} - Accuracy: {accuracy * 100:.2f}%")

# Calculate mean and standard deviation of accuracies
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)

# Display the mean accuracy and standard deviation
print(f"Mean Accuracy: {mean_accuracy * 100:.2f}%")
print(f"Standard Deviation of Accuracy: {std_accuracy * 100:.2f}%")

# Plot the confusion matrix of the first fold (change the index to view another)
plt.figure(figsize=(8, 6))
plt.imshow(confusion_matrices[0], interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Print training and testing times. The lists are named training_times and
# testing_times; the former train_times/test_times names were never defined.
print(f'Mean Training Time: {np.mean(training_times):.2f} seconds')
print(f'Mean Testing Time: {np.mean(testing_times):.2f} seconds')