In [14]:
# -*- coding: utf-8 -*- 
# This code is written in Python 3.7. To install the required packages, run the following commands:
# pip install pandas numpy matplotlib seaborn scikit-learn sympy
# This code is applicable to the Simargl 2022 dataset. 
# implemented Simple Ensamble Learning ('whighted Averaging') with Decision Tree, Random Forest and KNN 

import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import LabelEncoder
import tensorflow as tf
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [15]:
# Load the cleaned Simargl 2022 dataset
df = pd.read_csv(
    '/home/ibibers@ads.iu.edu/IDS_Datasets/Combined_datasets/Simargl_cleaned_dataset.csv')

In [16]:
df = df.sample(frac=0.75)

In [17]:
# Extract subsample of data
print (" Extract subsample of data: ")
print(df['ALERT'].value_counts())

 Extract subsample of data: 
ALERT
Normal               11287883
Denial of Service     3854136
Port Scanning         3126855
Malware                   427
Name: count, dtype: int64


In [18]:
dropped_df = ['ALERT']
X = df.drop(dropped_df, axis=1)
y = df['ALERT']

# # Label encoding for the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

print(df['ALERT'].value_counts())

ALERT
Normal               11287883
Denial of Service     3854136
Port Scanning         3126855
Malware                   427
Name: count, dtype: int64


In [19]:
# Initial train test split set and split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Standardize numerical features if necessary (replace with appropriate scaling if needed)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize models and create VotingClassifier ensemble
model1 = DecisionTreeClassifier()
model2 = KNeighborsClassifier()
model3 = RandomForestClassifier()
ensemble = VotingClassifier(estimators=[('dt', model1), ('knn', model2), ('rf', model3)],
                           voting='hard', weights=[0.4, 0.3, 0.3])  # Experiment with weights

# Initialize MirroredStrategy for GPU acceleration
strategy = tf.distribute.MirroredStrategy()

# Train the models
with strategy.scope():
    ensemble.fit(X_train, y_train)  # Train the ensemble with early stopping if necessary

# Make predictions
predictions = ensemble.predict_proba(X_test)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


2024-03-02 18:55:26.828579: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1046 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:67:00.0, compute capability: 8.6
2024-03-02 18:55:26.829179: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 1047 MB memory:  -> device: 1, name: NVIDIA RTX A6000, pci bus id: 0000:68:00.0, compute capability: 8.6


AttributeError: predict_proba is not available when voting='hard'

In [None]:
y_pred = np.argmax(predictions, axis=1)
# Evaluate the model (consider using more comprehensive metrics)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Final prediction and 0.4 0.3 0.3 weights are used
# and 0.4 0.3 0.3 weights mean that 40% of the weight is given to the first model, 30% to the second model, and 30% to the third model.
finalpred = ((pred1 * 0.4 ) + (pred2*0.3)+(pred3*0.3))

In [None]:
# Convert probabilities to class predictions
# argmax returns the index of the maximum value in a row
final_class_pred_enc = np.argmax(finalpred, axis=1)

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, final_class_pred_enc)
print(f"Accuracy: {accuracy}")

In [None]:
# Use inverse_transform to get original labels
y_pred = label_encoder.inverse_transform(final_class_pred_enc)
y_test_labels = label_encoder.inverse_transform(y_test)

In [None]:
#  Classificaiton Report 
print("\nClassification Report:")
print(classification_report(y_test_labels, y_pred))

In [None]:
import numpy as np
# Confusion matrix with string labels
conf_mat = confusion_matrix(y_test_labels, y_pred)

# Get unique class labels from y_test and y_pred_encoded
unique_labels = np.unique(np.concatenate((y_test_labels, y_pred)))

# Plotting the Confusion Matrix with class labels
plt.figure(figsize=(6, 6))
sns.heatmap(conf_mat, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=unique_labels, yticklabels=unique_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()