In [None]:
# -*- coding: utf-8 -*- 
# This code is written in Python 3.7. To install the required packages, run the following command:
# pip install pandas numpy matplotlib seaborn scikit-learn sympy tensorflow
# This code is applicable to the Simargl 2022 dataset.
# It implements simple ensemble learning (weighted averaging) with Decision Tree, Random Forest and KNN classifiers.

import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import LabelEncoder
import tensorflow as tf
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [None]:
# Read the cleaned Simargl 2022 dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; it must exist on
# the host running the notebook.
dataset_path = '/home/ibibers@ads.iu.edu/IDS_Datasets/Combined_datasets/Simargl_cleaned_dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Show how many rows fall under each value of the 'ALERT' column.
# (No subsampling happens here; the banner text is only a printed heading.)
print (" Extract subsample of data: ")
alert_counts = df['ALERT'].value_counts()
print(alert_counts)

In [None]:
# Columns removed from the feature matrix: only the target column 'ALERT'.
dropped_df = ['ALERT']

# Target labels as stored in the dataset, before encoding.
alert_labels = df['ALERT']

# Feature matrix: every column except the target.
X = df.drop(columns=dropped_df)

# Encode the target labels as integers; the fitted encoder is kept so the
# predictions can be mapped back to the original labels later.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(alert_labels)

# Class distribution of the target column.
print(df['ALERT'].value_counts())

In [None]:
# Seed shared by the split and the randomized estimators so that a fresh
# "Restart & Run All" reproduces the same metrics.
RANDOM_STATE = 42

# Hold out 20% of the samples for testing; stratify on y so the train and test
# sets keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# The base learners are scikit-learn estimators. A tf.distribute.MirroredStrategy
# scope only affects TensorFlow variables/ops, so wrapping scikit-learn calls in
# it provided no acceleration; the scope has been removed.
# Decision trees and random forests are randomized, so they are seeded here;
# KNeighborsClassifier takes no random_state.
model1 = DecisionTreeClassifier(random_state=RANDOM_STATE)
model2 = KNeighborsClassifier()
model3 = RandomForestClassifier(random_state=RANDOM_STATE)

# Fit every base learner on the same training split.
for model in (model1, model2, model3):
    model.fit(X_train, y_train)

# Per-class probabilities on the test set (one row per sample, one column per
# class). predict_proba is used instead of predict so the three models can be
# combined by weighted averaging afterwards.
pred1 = model1.predict_proba(X_test)
pred2 = model2.predict_proba(X_test)
pred3 = model3.predict_proba(X_test)

In [None]:
# Weighted soft vote over the three probability matrices.
# pred1 contributes 40%; pred2 and pred3 contribute 30% each.
weight_1, weight_2, weight_3 = 0.4, 0.3, 0.3
finalpred = weight_1 * pred1 + weight_2 * pred2 + weight_3 * pred3

In [None]:
# Turn the averaged probabilities into class predictions: for each row, take
# the column index holding the largest probability.
final_class_pred_enc = finalpred.argmax(axis=1)

In [None]:
# Score the ensemble: compare the predicted encoded classes against the
# encoded test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=final_class_pred_enc)
print(f"Accuracy: {accuracy}")

In [None]:
# Map the integer-encoded classes back to the original labels, for both the
# true test targets and the ensemble predictions.
y_test_labels, y_pred = (
    label_encoder.inverse_transform(encoded)
    for encoded in (y_test, final_class_pred_enc)
)

In [None]:
# Classification report, computed on the original (decoded) labels.
print("\nClassification Report:")
report_text = classification_report(y_test_labels, y_pred)
print(report_text)

In [None]:
import numpy as np
# Class labels that occur in either the true test labels or the predictions;
# np.unique returns them sorted, and they are used as the heatmap tick labels.
unique_labels = np.unique(np.concatenate((y_test_labels, y_pred)))

# Confusion matrix computed on the decoded (original) labels.
conf_mat = confusion_matrix(y_test_labels, y_pred)

# Draw the matrix as an annotated heatmap with integer cell counts.
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    conf_mat,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=unique_labels,
    yticklabels=unique_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()