In [None]:
# -*- coding: utf-8 -*-
# This code is written in Python 3.7. To install the required packages, run the following commands:
# pip install pandas numpy matplotlib seaborn scikit-learn sympy tensorflow
# This code is applicable to the Simargl 2022 dataset.
# Implements an advanced ensemble learning technique: stacking.

import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import LabelEncoder



In [None]:
# Clear Keras' global state, then read the cleaned Simargl 2022 dataset
# into a DataFrame.
tf.keras.backend.clear_session()
df = pd.read_csv(
    filepath_or_buffer='/home/ibibers@ads.iu.edu/IDS_Datasets/Combined_datasets/Simargl_cleaned_dataset.csv',
)

In [None]:
# Columns removed from the feature matrix (the target column only).
dropped_df = ['ALERT']

# Integer-encode the ALERT target. The fitted encoder stays in scope so that
# encoded values can be mapped back to the original labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['ALERT'])

# Feature matrix: every column except the target.
X = df.drop(columns=dropped_df)


In [None]:
# Split the data, train the stacking ensemble once, and predict on the test set.
from sklearn.ensemble import StackingClassifier

# Single seed shared by the split and every stochastic estimator so that a
# Restart & Run All reproduces the same numbers.
RANDOM_STATE = 42

# Stratified 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# TensorFlow copies of the split. y_train is converted as-is: passing it
# through pd.factorize would renumber the classes by order of first appearance,
# so the tensor labels would no longer match the encoding used by y_train/y_test.
X_train_tf = tf.convert_to_tensor(X_train.values, dtype=tf.float32)
y_train_tf = tf.convert_to_tensor(
    np.asarray(y_train, dtype=np.int64), dtype=tf.int64)
X_test_tf = tf.convert_to_tensor(X_test.values, dtype=tf.float32)

# MirroredStrategy only distributes TensorFlow/Keras computation. The
# scikit-learn estimators below are not affected by it, so they are trained
# outside of strategy.scope(); the object is kept for any TensorFlow code.
strategy = tf.distribute.MirroredStrategy()

# Base learners (level 0) and the meta learner (level 1) of the stack.
base_estimators = [
    ('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('nn', MLPClassifier(random_state=RANDOM_STATE)),
    ('lr', LogisticRegression(random_state=RANDOM_STATE)),
    ('dst', DecisionTreeClassifier(random_state=RANDOM_STATE)),
]
meta_model = DecisionTreeClassifier(random_state=RANDOM_STATE)

# StackingClassifier clones and fits every base estimator itself (on the full
# training set, plus cross-validated fits to build the meta features), so the
# base models are NOT fitted separately beforehand -- doing so would train each
# of them an extra time for no benefit.
stacked_model = StackingClassifier(
    estimators=base_estimators, final_estimator=meta_model)
stacked_model.fit(X_train, y_train)

# Expose the fitted base models under their usual names so they can still be
# inspected or compared individually.
rf_model = stacked_model.named_estimators_['rf']
nn_model = stacked_model.named_estimators_['nn']
lr_model = stacked_model.named_estimators_['lr']
dst_model = stacked_model.named_estimators_['dst']

# Per-model test predictions (encoded labels), for comparison with the stack.
rf_pred = rf_model.predict(X_test)
nn_pred = nn_model.predict(X_test)
lr_pred = lr_model.predict(X_test)
dst_pred = dst_model.predict(X_test)

# Test predictions of the stacked model (encoded labels).
stacked_pred__encoded = stacked_model.predict(X_test)

In [None]:
# Overall accuracy of the stacked model on the held-out test split.
accuracy = accuracy_score(y_true=y_test, y_pred=stacked_pred__encoded)
print(f"Accuracy: {accuracy}")

In [None]:
# Map the integer-encoded ground truth and predictions back to the original
# label values.
y_test_labels = label_encoder.inverse_transform(y_test)
y_pred = label_encoder.inverse_transform(stacked_pred__encoded)

# Text classification report computed on the decoded labels.
print("\nClassification Report:")
print(classification_report(y_true=y_test_labels, y_pred=y_pred))

In [None]:

# Confusion matrix computed on the decoded (original) labels.
conf_mat = confusion_matrix(y_true=y_test_labels, y_pred=y_pred)

# Sorted union of the labels occurring in the ground truth or the predictions;
# used as tick labels for both axes of the heatmap.
unique_labels = np.union1d(y_test_labels, y_pred)

# Draw the matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(conf_mat, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=unique_labels, yticklabels=unique_labels, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Collect feature importances from every fitted base model that exposes a
# `feature_importances_` attribute; models without it are skipped.
base_models_importances = {
    name: model.feature_importances_
    for name, model in stacked_model.named_estimators_.items()
    if hasattr(model, 'feature_importances_')
}

# One row per feature, one column per contributing base model.
importance_df = pd.DataFrame(base_models_importances, index=X.columns)

# Mean importance of each feature across the contributing models.
average_importance = importance_df.mean(axis=1)

# Highest average importance first.
sorted_importance = average_importance.sort_values(ascending=False)

# Horizontal bar chart of the ranked features.
fig, ax = plt.subplots(figsize=(14, 20))
sns.barplot(x=sorted_importance.values,
            y=sorted_importance.index, palette='mako', ax=ax)
ax.set_xlabel('Average Importance Value')
ax.set_ylabel('Feature Name')
ax.set_title('Average Feature Importance in StackingClassifier')
plt.show()